In [15]:
import numpy as np
import pandas as pd


In [46]:
path = '../res/dataset_57_hypothyroid.csv'
df = pd.read_csv(path, na_values='?')
df = df.drop(columns=['TBG', 'TBG_measured'])

In [47]:
sub_df = df[['TSH_measured', 'TSH',
    'T3_measured', 'T3',
    'TT4_measured', 'TT4', 
    'T4U_measured', 'T4U',
    'FTI_measured', 'FTI']]


for x in ['TSH_measured','T3_measured','TT4_measured','T4U_measured','FTI_measured']:
    print(sub_df[x].value_counts())


# df = df.drop(columns=['TSH_measured','T3_measured','TT4_measured','T4U_measured','FTI_measured'])

t    3403
f     369
Name: TSH_measured, dtype: int64
t    3003
f     769
Name: T3_measured, dtype: int64
t    3541
f     231
Name: TT4_measured, dtype: int64
t    3385
f     387
Name: T4U_measured, dtype: int64
t    3387
f     385
Name: FTI_measured, dtype: int64


In [48]:
df = df.drop(df[df.Class == 'secondary_hypothyroid'].index)
df = df.replace({'f': 0, 't': 1, 'F': 0, 'M': 1})
df = df.replace({v: float(k) for k, v in enumerate(df.Class.value_counts().keys())})


In [49]:
df['referral_source'] =  pd.Categorical(df.referral_source)
df_rs = pd.get_dummies(df['referral_source'], prefix = 'category')
df = pd.concat([df, df_rs], axis=1)
df = df.drop(columns=['referral_source'])
df

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,T4U_measured,T4U,FTI_measured,FTI,Class,category_STMW,category_SVHC,category_SVHD,category_SVI,category_other
0,41.0,0.0,0,0,0,0,0,0,0,0,...,1,1.14,1,109.0,0.0,0,1,0,0,0
1,23.0,0.0,0,0,0,0,0,0,0,0,...,0,,0,,0.0,0,0,0,0,1
2,46.0,1.0,0,0,0,0,0,0,0,0,...,1,0.91,1,120.0,0.0,0,0,0,0,1
3,70.0,0.0,1,0,0,0,0,0,0,0,...,0,,0,,0.0,0,0,0,0,1
4,70.0,0.0,0,0,0,0,0,0,0,0,...,1,0.87,1,70.0,0.0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,30.0,0.0,0,0,0,0,0,0,0,0,...,0,,0,,0.0,0,0,0,0,1
3768,68.0,0.0,0,0,0,0,0,0,0,0,...,1,1.08,1,114.0,0.0,0,0,0,1,0
3769,74.0,0.0,0,0,0,0,0,0,0,0,...,1,1.07,1,105.0,0.0,0,0,0,0,1
3770,72.0,1.0,0,0,0,0,0,0,0,0,...,1,0.94,1,87.0,0.0,0,0,0,1,0


In [50]:
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split

imputer = Imputer(strategy = 'median')

x = df.drop(columns=['Class']).values
x = imputer.fit_transform(x)
y = df['Class'].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42, stratify = y)



In [51]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn import neighbors


pipe_lr = Pipeline([('scl', StandardScaler()),
            ('clf', LogisticRegression())])


pipe_n = Pipeline([('scl', StandardScaler()),
            ('clf', neighbors.KNeighborsClassifier(n_neighbors=5))])


pipelines = [pipe_lr, pipe_n]


pipe_dict = {
    0: 'Logistic Regression', 
    1: 'Nearest Neighbors Classification'
}


for pipe in pipelines:
    pipe.fit(x_train, y_train)


for idx, val in enumerate(pipelines):
    print('%s pipeline test accuracy: %.3f' % (pipe_dict[idx], val.score(x_test, y_test)))


best_acc = 0.0
best_clf = 0
best_pipe = ''
for idx, val in enumerate(pipelines):
    if val.score(x_test, y_test) > best_acc:
        best_acc = val.score(x_test, y_test)
        best_pipe = val
        best_clf = idx
print('Classifier with best accuracy: %s' % pipe_dict[best_clf])



Logistic Regression pipeline test accuracy: 0.947
Nearest Neighbors Classification pipeline test accuracy: 0.941
Classifier with best accuracy: Logistic Regression


In [52]:
from sklearn.metrics import classification_report
print(classification_report(y_test, best_pipe.predict(x_test)))

              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97      1044
         1.0       0.83      0.09      0.16        58
         2.0       0.79      0.79      0.79        29

    accuracy                           0.95      1131
   macro avg       0.86      0.63      0.64      1131
weighted avg       0.94      0.95      0.93      1131

