In [104]:
import pickle
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from matplotlib import rcParams
from matplotlib.cm import rainbow

from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [105]:
warnings.filterwarnings('ignore')

In [106]:
dataset = pd.read_csv('./corpora/x_train.csv')
labels = pd.read_csv('./corpora/y_train.csv')

In [107]:
dataset.drop('id', axis=1, inplace=True)
dataset.drop('patient_no', axis=1, inplace=True)
dataset.drop('weight', axis=1, inplace=True)
dataset.drop('payer_code', axis=1, inplace=True)
dataset.drop('medical_specialty', axis=1, inplace=True)
dataset.drop('diag_1', axis=1, inplace=True)
dataset.drop('diag_2', axis=1, inplace=True)
dataset.drop('diag_3', axis=1, inplace=True)
dataset.drop('admission_typeid', axis=1, inplace=True)
dataset.drop('discharge_disposition_id', axis=1, inplace=True)
dataset.drop('admission_source_id', axis=1, inplace=True)
dataset.drop('time_in_hospital', axis=1, inplace=True)
dataset.drop('num_lab_procedures', axis=1, inplace=True)
dataset.drop('num_procedures', axis=1, inplace=True)
dataset.drop('number_outpatient', axis=1, inplace=True)
dataset.drop('number_emergency', axis=1, inplace=True)
dataset.drop('number_inpatient', axis=1, inplace=True)
dataset.drop('number_diagnoses', axis=1, inplace=True)
dataset.drop('num_medications', axis=1, inplace=True)

In [108]:
dataset['cast'].replace({'?': 'Other'}, inplace=True)
dataset['cast'].replace({
    'Caucasian': 1, 
    'AfricanAmerican': 2,
    'Asian': 3,  
    'Hispanic': 4,
    'Other': 5
}, inplace=True)
dataset['gender'].replace({
    'Male': '1',    
    'Female': '2',
    'Unknown/Invalid': '3'
}, inplace=True)

In [109]:
for i in dataset.index:
    age_group = dataset.at[i, 'age group']
    age_group = age_group.replace('[', '')    
    age_group = age_group.replace(')', '')
    age_group = age_group.split('-')
    age = int((int(age_group[0]) + int(age_group[0])) / 2)
    dataset.at[i, 'age group'] = age

In [110]:
dataset['max_glu_serum'].replace({
    'None': 0, 
    'Norm': 120,
    '>300': 320,  
    '>200': 220
}, inplace=True)

dataset['A1Cresult'].replace({
    'None': 0, 
    'Norm': 5,
    '>8': 8,  
    '>7': 7
}, inplace=True)

dataset['metformin'].replace({
    'No': 0, 
    'Up': 1,
    'Steady': 2,  
    'Down': 3
}, inplace=True)

dataset['repaglinide'].replace({
    'No': 0, 
    'Up': 1,
    'Steady': 2,  
    'Down': 3
}, inplace=True)

dataset['nateglinide'].replace({
    'No': 0, 
    'Up': 1,
    'Steady': 2,  
    'Down': 3
}, inplace=True)

dataset['chlorpropamide'].replace({
    'No': 0, 
    'Up': 1,
    'Steady': 2
}, inplace=True)

dataset['glimepiride'].replace({
    'No': 0, 
    'Up': 1,
    'Steady': 2,
    'Down': 3
}, inplace=True)

dataset['acetohexamide'].replace({
    'No': 0, 
    'Steady': 2
}, inplace=True)

dataset['glipizide'].replace({
    'No': 0, 
    'Up': 1,
    'Steady': 2,
    'Down': 3
}, inplace=True)

dataset['glyburide'].replace({
    'No': 0, 
    'Up': 1,
    'Steady': 2,
    'Down': 3
}, inplace=True)

dataset['tolbutamide'].replace({
    'No': 0, 
    'Steady': 2
}, inplace=True)

dataset['pioglitazone'].replace({
    'No': 0, 
    'Up': 1,
    'Steady': 2,
    'Down': 3
}, inplace=True)

dataset['rosiglitazone'].replace({
    'No': 0, 
    'Up': 1,
    'Steady': 2,
    'Down': 3
}, inplace=True)

dataset['acarbose'].replace({
    'No': 0, 
    'Up': 1,
    'Steady': 2,
    'Down': 3
}, inplace=True)

dataset['miglitol'].replace({
    'No': 0, 
    'Up': 1,
    'Steady': 2,
    'Down': 3
}, inplace=True)

dataset['troglitazone'].replace({
    'No': 0,
    'Steady': 2
}, inplace=True)

dataset['tolazamide'].replace({
    'No': 0, 
    'Up': 1,
    'Steady': 2
}, inplace=True)

dataset['examide'].replace({
    'No': 0
}, inplace=True)

dataset['citoglipton'].replace({
    'No': 0
}, inplace=True)

dataset['insulin'].replace({
    'No': 0, 
    'Up': 1,
    'Steady': 2,
    'Down': 3
}, inplace=True)

dataset['glyburide-metformin'].replace({
    'No': 0, 
    'Up': 1,
    'Steady': 2,
    'Down': 3
}, inplace=True)

dataset['glipizide-metformin'].replace({
    'No': 0, 
    'Steady': 2
}, inplace=True)

dataset['glimepiride-pioglitazone'].replace({
    'No': 0,
}, inplace=True)

dataset['metformin-rosiglitazone'].replace({
    'No': 0, 
    'Steady': 2
}, inplace=True)

dataset['metformin-pioglitazone'].replace({
    'No': 0, 
    'Steady': 2
}, inplace=True)

dataset['change'].replace({
    'No': 0, 
    'Ch': 1
}, inplace=True)

dataset['Med'].replace({
    'No': 0, 
    'Yes': 1
}, inplace=True)

In [111]:
dataset['gender'] = pd.to_numeric(dataset['gender'])
dataset['age group'] = pd.to_numeric(dataset['age group'])
dataset['max_glu_serum'] = pd.to_numeric(dataset['max_glu_serum'])
dataset['A1Cresult'] = pd.to_numeric(dataset['A1Cresult'])
dataset['metformin'] = pd.to_numeric(dataset['metformin'])
dataset['repaglinide'] = pd.to_numeric(dataset['repaglinide'])
dataset['nateglinide'] = pd.to_numeric(dataset['nateglinide'])
dataset['chlorpropamide'] = pd.to_numeric(dataset['chlorpropamide'])
dataset['glimepiride'] = pd.to_numeric(dataset['glimepiride'])
dataset['acetohexamide'] = pd.to_numeric(dataset['acetohexamide'])
dataset['glipizide'] = pd.to_numeric(dataset['glipizide'])
dataset['glyburide'] = pd.to_numeric(dataset['glyburide'])
dataset['tolbutamide'] = pd.to_numeric(dataset['tolbutamide'])
dataset['pioglitazone'] = pd.to_numeric(dataset['pioglitazone'])
dataset['rosiglitazone'] = pd.to_numeric(dataset['rosiglitazone'])
dataset['acarbose'] = pd.to_numeric(dataset['acarbose'])
dataset['miglitol'] = pd.to_numeric(dataset['miglitol'])
dataset['troglitazone'] = pd.to_numeric(dataset['troglitazone'])
dataset['tolazamide'] = pd.to_numeric(dataset['tolazamide'])
dataset['examide'] = pd.to_numeric(dataset['examide'])
dataset['citoglipton'] = pd.to_numeric(dataset['citoglipton'])
dataset['insulin'] = pd.to_numeric(dataset['insulin'])
dataset['glyburide-metformin'] = pd.to_numeric(dataset['glyburide-metformin'])
dataset['glipizide-metformin'] = pd.to_numeric(dataset['glipizide-metformin'])
dataset['glimepiride-pioglitazone'] = pd.to_numeric(dataset['glimepiride-pioglitazone'])
dataset['metformin-rosiglitazone'] = pd.to_numeric(dataset['metformin-rosiglitazone'])
dataset['metformin-pioglitazone'] = pd.to_numeric(dataset['metformin-pioglitazone'])
dataset['change'] = pd.to_numeric(dataset['change'])
dataset['Med'] = pd.to_numeric(dataset['Med'])

In [112]:
standardScaler = StandardScaler()
columns_to_scale = ['age group', 'max_glu_serum', 'A1Cresult']
dataset[columns_to_scale] = standardScaler.fit_transform(dataset[columns_to_scale])

In [113]:
labels['label'].replace({
    'NO': 0,
    '>5': 1,
    '<30': 1
}, inplace=True)
labels['label'] = pd.to_numeric(labels['label'])

In [114]:
x_train, y_train = dataset, labels

In [115]:
knn_classifier = KNeighborsClassifier(n_neighbors=20)
knn_classifier.fit(x_train, y_train)
pickle.dump(knn_classifier, open('knn_classifier.sav', 'wb'))

In [117]:
dt_classifier = DecisionTreeClassifier(max_features=len(dataset.columns), random_state = 0)
dt_classifier.fit(x_train, y_train)
pickle.dump(dt_classifier, open('dt_classifier.sav', 'wb'))

In [119]:
svc_classifier = SVC(kernel='linear')
svc_classifier.fit(x_train, y_train)
pickle.dump(svc_classifier, open('svc_classifier.sav', 'wb'))

In [None]:
lg_classifier = LogisticRegression()
lg_classifier.fit(x_train, y_train)
pickle.dump(lg_classifier, open('lg_classifier.sav', 'wb'))

In [None]:
rf_scores = []
estimators = [10, 100, 200, 500, 1000]
for i in estimators:
    rf_classifier = RandomForestClassifier(n_estimators = i, random_state = 0)
    rf_classifier.fit(x_train, y_train)
    rf_scores.append(rf_classifier.score(x_test, y_test))
colors = rainbow(np.linspace(0, 1, len(estimators)))
plt.bar([i for i in range(len(estimators))], rf_scores, color = colors, width = 0.8)
for i in range(len(estimators)):
    plt.text(i, rf_scores[i], rf_scores[i])
plt.xticks(ticks = [i for i in range(len(estimators))], labels = [str(estimator) for estimator in estimators])
plt.xlabel('Number of estimators')
plt.ylabel('Scores')
plt.title('Random Forest Classifier scores for different number of estimators')

NameError: name 'rf_scores' is not defined