In [72]:
import autosklearn.classification
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import sklearn.metrics

In [73]:
# Read the thyroid records into a DataFrame; the path is resolved relative to
# the current working directory.
dataset = pd.read_csv(filepath_or_buffer='../Datasets/Thyroid/thyroidDF.csv')

In [74]:
# Collapse the single-character diagnosis codes into three coarse classes.
diagnoses = {'-': 'negative'}
diagnoses.update(dict.fromkeys(['A', 'C', 'B', 'D'], 'hyperthyroid'))
diagnoses.update(dict.fromkeys(['E', 'F', 'G', 'H'], 'hypothyroid'))

# Any target value that is not a key of the mapping becomes NaN under .map();
# those rows are removed by the dropna that follows.
dataset['target'] = dataset['target'].map(diagnoses)
dataset.dropna(subset=['target'], inplace=True)

In [75]:
# Columns removed from the frame before any further cleaning.
unused_columns = [
    'TSH_measured',
    'T3_measured',
    'TT4_measured',
    'T4U_measured',
    'FTI_measured',
    'TBG_measured',
    'patient_id',
    'referral_source',
    'TBG',
]
dataset.drop(columns=unused_columns, inplace=True)

In [76]:
# Remove rows whose recorded age is greater than 100.
# The boolean mask is built once and reused. The bare filter expression that
# used to sit above the drop was not the cell's last statement, so its result
# was discarded; it has been removed.
age_over_100 = dataset['age'] > 100
dataset.drop(index=dataset[age_over_100].index, inplace=True)

In [77]:
# Where sex is missing but the pregnant flag is 't', fill sex with 'F';
# every other row keeps its existing sex value (including remaining NaNs).
sex_missing_and_pregnant = dataset['sex'].isnull() & (dataset['pregnant'] == 't')
dataset['sex'] = np.where(sex_missing_and_pregnant, 'F', dataset['sex'])

In [78]:
# Remove the T3 column from the frame.
dataset.drop(columns=['T3'], inplace=True)

In [79]:
# Discard every row that still has at least one missing value.
dataset.dropna(axis=0, how='any', inplace=True)

In [81]:
# Encode the 'f'/'t' and 'M'/'F' markers as 0/1 across the whole frame in a
# single pass. Matching is on whole cell values, so longer strings that merely
# contain these letters are left alone.
binary_codes = {'f': 0, 't': 1, 'M': 0, 'F': 1}

# replace() on object columns no longer guarantees an integer dtype (the
# implicit downcast is deprecated in pandas 2.2), so the conversion is made
# explicit: infer_objects() turns object columns that now hold only integers
# into a numeric dtype and leaves genuinely string-valued columns as they are.
dataset = dataset.replace(binary_codes).infer_objects()

In [83]:
# Encode the three diagnosis classes as integer labels.
diagnoses = {'negative': 0, 'hypothyroid': 1, 'hyperthyroid': 2}
dataset['target'] = dataset['target'].map(diagnoses)

# Separate the label from the features; both are copies, so later changes to
# X or y do not write back into `dataset`.
y = dataset['target'].copy()
X = dataset.drop(columns='target').copy()

# Stratified split with a fixed seed; the test fraction is left at the
# library default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [85]:
# Build an auto-sklearn classifier with its default configuration, fit it on
# the training split, then predict labels for the held-out split.
automl = autosklearn.classification.AutoSklearnClassifier()
automl.fit(X=X_train, y=y_train)
y_hat = automl.predict(X=X_test)

In [86]:
# Fraction of held-out rows whose predicted label equals the true label.
sklearn.metrics.accuracy_score(y_true=y_test, y_pred=y_hat)

0.9839743589743589

In [87]:
import pickle

# Serialize the `automl` estimator to a pickle file in the working directory.
with open('thyroid_automl.pkl', mode='wb') as model_file:
    pickle.dump(automl, model_file)