In [1]:
import numpy as np
import pandas as pd


In [79]:
# Location of the raw hypothyroid CSV; '?' entries are read as missing values.
path = '../res/dataset_57_hypothyroid.csv'
# Load the file and discard the two TBG columns in a single chained expression.
df = pd.read_csv(path, na_values='?').drop(columns=['TBG', 'TBG_measured'])

In [80]:
# Lab-test value columns; each one is paired with a '<name>_measured' flag column.
lab_tests = ['TSH', 'T3', 'TT4', 'T4U', 'FTI']
measured_flags = [test + '_measured' for test in lab_tests]

# View holding every flag column followed by its value column.
sub_df = df[[col for test in lab_tests for col in (test + '_measured', test)]]

# Show how often each flag column takes each of its values.
for flag in measured_flags:
    print(sub_df[flag].value_counts())

# The flag columns are not kept as features.
df = df.drop(columns=measured_flags)

t    3403
f     369
Name: TSH_measured, dtype: int64
t    3003
f     769
Name: T3_measured, dtype: int64
t    3541
f     231
Name: TT4_measured, dtype: int64
t    3385
f     387
Name: T4U_measured, dtype: int64
t    3387
f     385
Name: FTI_measured, dtype: int64


In [81]:
# Keep every row except those labelled 'secondary_hypothyroid'.
df = df[df.Class != 'secondary_hypothyroid']

# Replace the single-letter codes with 0/1 wherever they occur in the frame
# ('f'/'t' flags; 'F'/'M' is presumably the sex column -- confirm).
binary_codes = {'f': 0, 't': 1, 'F': 0, 'M': 1}
df = df.replace(binary_codes)

# Number the remaining class labels 0.0, 1.0, ... following the order
# returned by value_counts(), then substitute them frame-wide.
class_codes = {}
for rank, label in enumerate(df.Class.value_counts().index):
    class_codes[label] = float(rank)
df = df.replace(class_codes)


In [None]:
# One-hot encode referral_source; indicator columns are named 'category_<value>'.
df_rs = pd.get_dummies(df['referral_source'].astype('category'), prefix='category')
# Drop the original column and append the indicator columns on the right.
df = pd.concat([df.drop(columns=['referral_source']), df_rs], axis=1)

In [94]:
# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22;
# SimpleImputer (mean strategy by default, like the old class) replaces it.
try:
    from sklearn.impute import SimpleImputer
except ImportError:  # scikit-learn < 0.20 only ships the old class
    from sklearn.preprocessing import Imputer as SimpleImputer
from sklearn.model_selection import train_test_split

# Feature matrix (every column except the target) and target vector.
x = df.drop(columns=['Class']).values
y = df['Class'].values

# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Fit the imputer on the training rows only, so statistics from the test rows
# never influence the fill values, then apply the same fill to both splits.
imputer = SimpleImputer()
x_train = imputer.fit_transform(x_train)
x_test = imputer.transform(x_test)



In [96]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn import neighbors


# Each candidate standardises the features before fitting its classifier.
pipe_lr = Pipeline([('scl', StandardScaler()),
            ('clf', LogisticRegression())])

pipe_n = Pipeline([('scl', StandardScaler()),
            ('clf', neighbors.KNeighborsClassifier(n_neighbors=5))])

pipelines = [pipe_lr, pipe_n]

# Display names, keyed by position in `pipelines`.
pipe_dict = {
    0: 'Logistic Regression',
    1: 'Nearest Neighbors Classification'
}

# Train on the arrays produced by train_test_split. They are named
# x_train / x_test (lowercase); the uppercase spellings are never defined
# in this notebook and fail on a fresh kernel.
for pipe in pipelines:
    pipe.fit(x_train, y_train)

# Score every pipeline once and reuse the result for reporting and selection.
test_scores = [pipe.score(x_test, y_test) for pipe in pipelines]

for idx, acc in enumerate(test_scores):
    print('%s pipeline test accuracy: %.3f' % (pipe_dict[idx], acc))

# Pick the highest-scoring pipeline; max() returns the first maximum, so the
# earlier pipeline wins a tie and best_pipe is always a fitted Pipeline.
# NOTE(review): selecting on test accuracy makes the reported score optimistic;
# a validation split or cross-validation would be a cleaner basis.
best_clf = max(range(len(pipelines)), key=test_scores.__getitem__)
best_acc = test_scores[best_clf]
best_pipe = pipelines[best_clf]
print('Classifier with best accuracy: %s' % pipe_dict[best_clf])



Logistic Regression pipeline test accuracy: 0.951
Nearest Neighbors Classification pipeline test accuracy: 0.944
Classifier with best accuracy: Logistic Regression


In [99]:
# Predict labels for the held-out test features with the pipeline stored in best_pipe.
y_pred = best_pipe.predict(x_test)
