In [219]:
import numpy as np
import pandas as pd


In [223]:
# Load the hypothyroid dataset; the file encodes missing values as '?'.
path = '../res/dataset_57_hypothyroid.csv'

# The two TBG columns are removed *before* dropna() so that missing values in
# them cannot cause rows to be discarded; after that, any row with a remaining
# missing value is dropped.
df = (
    pd.read_csv(path, na_values='?')
    .drop(columns=['TBG', 'TBG_measured'])
    .dropna()
)


In [224]:
# Each lab value column (e.g. 'TSH') has a companion '<name>_measured' flag
# column -- presumably recording whether the value was taken. The list is
# defined once and reused for the selection, the inspection and the drop.
measured_flag_cols = ['TSH_measured', 'T3_measured', 'TT4_measured',
                      'T4U_measured', 'FTI_measured']
lab_value_cols = [name[:-len('_measured')] for name in measured_flag_cols]

# Interleave flag/value pairs: TSH_measured, TSH, T3_measured, T3, ...
sub_df = df[[col
             for pair in zip(measured_flag_cols, lab_value_cols)
             for col in pair]]

# Show the value distribution of every flag column.
# (The loop variable is deliberately not called `x`, which is the name used
# for the feature matrix further down the notebook.)
for flag_col in measured_flag_cols:
    print(sub_df[flag_col].value_counts())

# The flags are only safe to drop if they carry no information, i.e. each of
# them holds a single value in the rows that are left. Check that in code
# instead of relying on a visual inspection of the printout above.
non_constant = [c for c in measured_flag_cols if sub_df[c].nunique() > 1]
assert not non_constant, (
    'flag columns still vary and should not be dropped blindly: %s' % non_constant
)

df = df.drop(columns=measured_flag_cols)

t    2643
Name: TSH_measured, dtype: int64
t    2643
Name: T3_measured, dtype: int64
t    2643
Name: TT4_measured, dtype: int64
t    2643
Name: T4U_measured, dtype: int64
t    2643
Name: FTI_measured, dtype: int64


In [225]:
# Remove every row of the 'secondary_hypothyroid' class
# (presumably because it is too rare to learn from -- TODO confirm).
df = df.drop(df[df.Class == 'secondary_hypothyroid'].index)

# Encode the binary string columns: f/t -> 0/1 and F/M -> 0/1.
# infer_objects() makes the object -> numeric conversion explicit instead of
# relying on replace() silently downcasting, which newer pandas deprecates.
df = df.replace({'f': 0, 't': 1, 'F': 0, 'M': 1}).infer_objects()

# Encode the target as float codes ordered by class frequency (most frequent
# class -> 0.0). Only the 'Class' column is touched, and the result is cast to
# float so the labels are a numeric array rather than an object array.
class_codes = {
    label: float(code)
    for code, label in enumerate(df['Class'].value_counts().index)
}
df['Class'] = df['Class'].map(class_codes).astype(float)

# One-hot encode 'referral_source' into 'category_<value>' columns appended
# after the remaining columns; the original column is removed. dtype=float
# keeps the whole frame numeric (bool dummies would turn df.values into an
# object array).
df = pd.get_dummies(df, columns=['referral_source'], prefix='category', dtype=float)

In [231]:
# Target vector: the encoded class label of every row.
y = df['Class'].values
# Feature matrix: all remaining columns, in frame order.
x = df.drop(columns='Class').values


In [249]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import tree
from sklearn import neighbors



# Hold out half of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=42)

# One pipeline per candidate model. Every pipeline standardises the features
# first, so the scaler is fitted on the training split only.
pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('clf', LogisticRegression(random_state=42))])

pipe_svm = Pipeline([('scl', StandardScaler()),
                     ('clf', svm.SVC(random_state=42))])

pipe_dt = Pipeline([('scl', StandardScaler()),
                    ('clf', tree.DecisionTreeClassifier(random_state=42))])

pipe_n = Pipeline([('scl', StandardScaler()),
                   ('clf', neighbors.KNeighborsClassifier(n_neighbors=5))])

pipelines = [pipe_lr, pipe_svm, pipe_dt, pipe_n]

# Display names, keyed by the pipeline's position in `pipelines`.
pipe_dict = {
    0: 'Logistic Regression',
    1: 'Support Vector Machine',
    2: 'Decision Tree',
    3: 'Nearest Neighbors Classification'
}

for pipe in pipelines:
    pipe.fit(X_train, y_train)

# Score every fitted pipeline exactly once and reuse the numbers for both the
# report and the model selection (scoring means predicting the whole test set).
test_scores = [pipe.score(X_test, y_test) for pipe in pipelines]

for idx, acc in enumerate(test_scores):
    print('%s pipeline test accuracy: %.3f' % (pipe_dict[idx], acc))

# Pick the highest test accuracy; max() returns the first maximum, so ties go
# to the pipeline that appears earlier in `pipelines`.
# NOTE(review): the winner is selected on the same test split that is reported
# above, so its accuracy is an optimistic estimate.
best_clf = max(range(len(test_scores)), key=lambda i: test_scores[i])
best_acc = test_scores[best_clf]
best_pipe = pipelines[best_clf]
print('Classifier with best accuracy: %s' % pipe_dict[best_clf])

Logistic Regression pipeline test accuracy: 0.945
Support Vector Machine pipeline test accuracy: 0.939
Decision Tree pipeline test accuracy: 0.992
Nearest Neighbors Classification pipeline test accuracy: 0.920
Classifier with best accuracy: Decision Tree
