In [1]:
import numpy as np
import pandas as pd


In [2]:
# Read the hypothyroid CSV, treating '?' entries as missing values, and
# discard the 'TBG' and 'TBG_measured' columns in the same chain.
path = '../res/dataset_57_hypothyroid.csv'
df = (
    pd.read_csv(path, na_values='?')
    .drop(['TBG', 'TBG_measured'], axis=1)
)
# Print the per-column dtype / non-null summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 28 columns):
age                          3771 non-null float64
sex                          3622 non-null object
on_thyroxine                 3772 non-null object
query_on_thyroxine           3772 non-null object
on_antithyroid_medication    3772 non-null object
sick                         3772 non-null object
pregnant                     3772 non-null object
thyroid_surgery              3772 non-null object
I131_treatment               3772 non-null object
query_hypothyroid            3772 non-null object
query_hyperthyroid           3772 non-null object
lithium                      3772 non-null object
goitre                       3772 non-null object
tumor                        3772 non-null object
hypopituitary                3772 non-null object
psych                        3772 non-null object
TSH_measured                 3772 non-null object
TSH                          3403 non-null

In [3]:
# Keep only rows that are neither labelled 'secondary_hypothyroid' nor have
# hypopituitary == 't', then display the remaining class distribution.
is_secondary = df['Class'] == 'secondary_hypothyroid'
not_hypopituitary = df['hypopituitary'] != 't'
df = df[~is_secondary & not_hypopituitary]
df['Class'].value_counts()

negative                   3480
compensated_hypothyroid     194
primary_hypothyroid          95
Name: Class, dtype: int64

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# Numeric columns: fill missing values with the column median, then
# standardise with StandardScaler.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: replace missing values with the literal category
# 'missing', then one-hot encode.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    # handle_unknown='ignore' encodes a category that was not present when the
    # encoder was fitted as an all-zero row instead of raising. Without it, a
    # cross-validation fold or the test split that contains a rare category
    # absent from the fitting data aborts the whole run.
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [5]:
# Columns stored as int64 / float64 are treated as numeric features.
num_features = df.select_dtypes(['int64', 'float64']).columns

# Object-typed columns are treated as categorical features; the target column
# 'Class' is removed from that list.
object_columns = df.select_dtypes(['object']).columns
categorical_features = object_columns.drop('Class')

from sklearn.compose import ColumnTransformer
# Send each column group through its own sub-pipeline:
#   'num' -> numeric_transformer on num_features
#   'cat' -> categorical_transformer on categorical_features
column_steps = [
    ('num', numeric_transformer, num_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)



In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline 

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier


# Baseline pipelines: the shared preprocessor followed by a classifier.
# Both pipelines hold the same `preprocessor` object.

# Logistic regression with the liblinear solver.
pipe_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', multi_class='auto')),
])

# k-nearest neighbours with k = 5.
pipe_n = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=5)),
])

In [7]:
from sklearn.model_selection import train_test_split

# Separate the target column 'Class' from the predictor columns.
y = df['Class']
x = df.drop('Class', axis=1)

# Hold out 30% of the rows for testing; the split is stratified on the target
# and seeded so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [8]:
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score


# Fit both baseline pipelines on the training split (logistic regression
# first, then k-NN).
for fitted_pipeline in (pipe_lr, pipe_n):
    fitted_pipeline.fit(x_train, y_train)

# 5-fold cross-validation scores on the training split.
scores_pipe_lr = cross_val_score(pipe_lr, x_train, y_train, cv=5)
scores_pipe_n = cross_val_score(pipe_n, x_train, y_train, cv=5)

# Held-out test reports: k-NN first, then logistic regression, each followed
# by its cross-validation scores.
knn_report = classification_report(y_test, pipe_n.predict(x_test))
lr_report = classification_report(y_test, pipe_lr.predict(x_test))
print(knn_report)
print(scores_pipe_n)
print('\n\n')
print(lr_report)
print(scores_pipe_lr)

                         precision    recall  f1-score   support

compensated_hypothyroid       0.33      0.07      0.11        58
               negative       0.95      1.00      0.97      1044
    primary_hypothyroid       0.85      0.59      0.69        29

               accuracy                           0.94      1131
              macro avg       0.71      0.55      0.59      1131
           weighted avg       0.91      0.94      0.92      1131

[0.92641509 0.94686907 0.94686907 0.943074   0.94117647]



                         precision    recall  f1-score   support

compensated_hypothyroid       0.78      0.12      0.21        58
               negative       0.95      1.00      0.97      1044
    primary_hypothyroid       0.85      0.76      0.80        29

               accuracy                           0.95      1131
              macro avg       0.86      0.63      0.66      1131
           weighted avg       0.94      0.95      0.93      1131

[0.9490566  0.94497154 0

### Under-sampling

In [9]:
from imblearn.under_sampling import ClusterCentroids
from imblearn.pipeline import Pipeline

# Under-sample every class except the minority one with ClusterCentroids.
# random_state is fixed (same seed as the train/test split) so the resampled
# training set, and therefore the reported scores, are repeatable between runs.
under_sampling = ClusterCentroids(sampling_strategy='not minority',
                                  random_state=42)

# Same classifiers as the baseline, with the sampler inserted between
# preprocessing and the classifier. `Pipeline` here is imblearn's Pipeline,
# imported just above this block.
pipe_lr = Pipeline(steps = [('preprocessor', preprocessor),
                            ('under_sampling', under_sampling),
                            ('classifier', LogisticRegression(multi_class = 'auto', 
                                                              solver = 'liblinear'))])


pipe_n = Pipeline(steps = [('preprocessor', preprocessor),
                           ('under_sampling', under_sampling),
                           ('classifier', KNeighborsClassifier(n_neighbors=5))])

from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score


# Train the under-sampling pipelines on the training split.
pipe_lr.fit(X=x_train, y=y_train)
pipe_n.fit(X=x_train, y=y_train)

# 5-fold cross-validation scores on the training split.
scores_pipe_lr = cross_val_score(estimator=pipe_lr, X=x_train, y=y_train, cv=5)
scores_pipe_n = cross_val_score(estimator=pipe_n, X=x_train, y=y_train, cv=5)

# Test-set report and CV scores for k-NN, a blank separator, then the same
# for logistic regression.
knn_predictions = pipe_n.predict(x_test)
print(classification_report(y_test, knn_predictions))
print(scores_pipe_n)
print('\n\n')
lr_predictions = pipe_lr.predict(x_test)
print(classification_report(y_test, lr_predictions))
print(scores_pipe_lr)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


                         precision    recall  f1-score   support

compensated_hypothyroid       0.15      0.50      0.23        58
               negative       0.97      0.84      0.90      1044
    primary_hypothyroid       0.67      0.76      0.71        29

               accuracy                           0.82      1131
              macro avg       0.59      0.70      0.61      1131
           weighted avg       0.92      0.82      0.86      1131

[0.83207547 0.81783681 0.81214421 0.82542694 0.80075901]



                         precision    recall  f1-score   support

compensated_hypothyroid       0.23      0.64      0.34        58
               negative       0.98      0.87      0.92      1044
    primary_hypothyroid       0.48      0.83      0.61        29

               accuracy                           0.86      1131
              macro avg       0.57      0.78      0.62      1131
           weighted avg       0.93      0.86      0.88      1131

[0.84528302 0.84250474 0

### Over-sampling

In [10]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Over-sample every class except the majority one with SMOTE.
# random_state is fixed (same seed as the train/test split) so the synthetic
# samples, and therefore the reported scores, are repeatable between runs.
over_sampling = SMOTE(sampling_strategy='not majority', random_state=42)

# The sampler step is named 'over_sampling' to match what it holds; it was
# previously labelled 'under_sampling'.
pipe_lr = Pipeline(steps = [('preprocessor', preprocessor),
                            ('over_sampling', over_sampling),
                            ('classifier', LogisticRegression(multi_class = 'auto', 
                                                              solver = 'liblinear'))])


pipe_n = Pipeline(steps = [('preprocessor', preprocessor),
                           ('over_sampling', over_sampling),
                           ('classifier', KNeighborsClassifier(n_neighbors=5))])

from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score


# Train the over-sampling pipelines on the training split.
pipe_lr.fit(x_train, y_train)
pipe_n.fit(x_train, y_train)

# 5-fold cross-validation scores on the training split.
cv_folds = 5
scores_pipe_lr = cross_val_score(pipe_lr, x_train, y_train, cv=cv_folds)
scores_pipe_n = cross_val_score(pipe_n, x_train, y_train, cv=cv_folds)

# Emit the k-NN report and scores, a blank separator, then the logistic
# regression report and scores, as newline-separated output.
knn_report = classification_report(y_test, pipe_n.predict(x_test))
lr_report = classification_report(y_test, pipe_lr.predict(x_test))
print(knn_report, scores_pipe_n, '\n\n', lr_report, scores_pipe_lr, sep='\n')

                         precision    recall  f1-score   support

compensated_hypothyroid       0.19      0.48      0.27        58
               negative       0.97      0.89      0.92      1044
    primary_hypothyroid       0.72      0.72      0.72        29

               accuracy                           0.86      1131
              macro avg       0.63      0.70      0.64      1131
           weighted avg       0.92      0.86      0.89      1131

[0.83584906 0.84440228 0.85578748 0.86527514 0.83491461]



                         precision    recall  f1-score   support

compensated_hypothyroid       0.67      0.91      0.77        58
               negative       1.00      0.97      0.98      1044
    primary_hypothyroid       0.69      0.86      0.77        29

               accuracy                           0.96      1131
              macro avg       0.79      0.92      0.84      1131
           weighted avg       0.97      0.96      0.97      1131

[0.9509434  0.93927894 0

### Feature engineering

In [13]:
from sklearn.preprocessing import PolynomialFeatures
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# SMOTE over-sampling of every class except the majority one.
# random_state is fixed (same seed as the train/test split) so the synthetic
# samples, and therefore the reported scores, are repeatable between runs.
over_sampling = SMOTE(sampling_strategy='not majority', random_state=42)

# Degree-2 polynomial expansion applied to the preprocessed, resampled matrix.
# The name 'feauters' (sic) is kept because a later cell reuses it.
feauters = PolynomialFeatures(degree=2)

# The sampler step is named 'over_sampling' to match what it holds; it was
# previously labelled 'under_sampling'.
pipe_lr = Pipeline(steps = [('preprocessor', preprocessor),
                            ('over_sampling', over_sampling),
                            ('feauters', feauters),
                            ('classifier', LogisticRegression(multi_class = 'auto', 
                                                              solver = 'liblinear'))])


pipe_n = Pipeline(steps = [('preprocessor', preprocessor),
                           ('over_sampling', over_sampling),
                           ('feauters', feauters),
                           ('classifier', KNeighborsClassifier(n_neighbors=5))])

from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score


# Train the polynomial-feature pipelines on the training split (logistic
# regression first, then k-NN).
pipe_lr.fit(x_train, y_train)
pipe_n.fit(x_train, y_train)

# 5-fold cross-validation scores on the training split, computed in the same
# order as the fits above.
scores_pipe_lr, scores_pipe_n = (
    cross_val_score(candidate, x_train, y_train, cv=5)
    for candidate in (pipe_lr, pipe_n)
)

# Test-set report followed by CV scores: k-NN, blank separator, logistic
# regression.
print(classification_report(y_true=y_test, y_pred=pipe_n.predict(x_test)))
print(scores_pipe_n)
print('\n\n')
print(classification_report(y_true=y_test, y_pred=pipe_lr.predict(x_test)))
print(scores_pipe_lr)

                         precision    recall  f1-score   support

compensated_hypothyroid       0.18      0.48      0.27        58
               negative       0.97      0.88      0.92      1044
    primary_hypothyroid       0.74      0.69      0.71        29

               accuracy                           0.86      1131
              macro avg       0.63      0.68      0.63      1131
           weighted avg       0.92      0.86      0.88      1131

[0.8245283  0.84060721 0.85958254 0.84819734 0.85009488]



                         precision    recall  f1-score   support

compensated_hypothyroid       0.71      0.83      0.76        58
               negative       0.99      0.98      0.99      1044
    primary_hypothyroid       0.80      0.69      0.74        29

               accuracy                           0.97      1131
              macro avg       0.83      0.83      0.83      1131
           weighted avg       0.97      0.97      0.97      1131

[0.97358491 0.98102467 0

### Feature importance and hyperparameter tuning

In [14]:
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV



# Search space, addressed by pipeline step name:
#   - classifier__C: four values log-spaced between 1e-4 and 1e4
#   - preprocessor__num__imputer__strategy: mean vs. median imputation
#   - classifier__penalty: L1 vs. L2 penalty
param_grid = {
    'classifier__C': np.logspace(-4, 4, 4),
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'classifier__penalty': ["l1","l2"],
}

# cv is pinned to 5 so the search score is comparable with the
# cross_val_score(..., cv=5) results elsewhere in the notebook and does not
# depend on the installed scikit-learn's default (3 folds before 0.22, 5 after).
search = GridSearchCV(pipe_lr, param_grid, cv=5)
search.fit(x_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)








Best parameter (CV score=0.977):
{'classifier__C': 21.54434690031882, 'classifier__penalty': 'l1', 'preprocessor__num__imputer__strategy': 'mean'}




In [12]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

# The grid-search cell runs `from sklearn.pipeline import Pipeline`, which
# rebinds `Pipeline` to scikit-learn's class. That class rejects sampler steps
# such as `under_sampling`, so this cell fails on a top-to-bottom run.
# Re-import imblearn's Pipeline here so the cell works in any execution order.
from imblearn.pipeline import Pipeline

feauters = PolynomialFeatures(degree=2)

# Logistic regression with the C / penalty values printed by the grid search.
# NOTE(review): the grid search tuned a pipeline that used `over_sampling`
# (SMOTE), while this pipeline uses `under_sampling` (ClusterCentroids).
# Confirm this is intentional.
pipe_lr = Pipeline(steps = [('preprocessor', preprocessor),
                            ('under_sampling', under_sampling),
                            ('feauters', feauters),
                            ('classifier', LogisticRegression(multi_class = 'auto', 
                                                              solver = 'liblinear',
                                                             C =  21.54434690031882, penalty =  'l1'))])

# Train the tuned logistic-regression pipeline on the training split.
pipe_lr.fit(X=x_train, y=y_train)
# NOTE(review): pipe_n is refit here but not evaluated in this cell.
pipe_n.fit(X=x_train, y=y_train)

# 5-fold cross-validation scores on the training split.
scores_pipe_lr = cross_val_score(estimator=pipe_lr, X=x_train, y=y_train, cv=5)

# Held-out test report followed by the cross-validation scores.
test_predictions = pipe_lr.predict(x_test)
print(classification_report(y_test, test_predictions))
print(scores_pipe_lr)

                         precision    recall  f1-score   support

compensated_hypothyroid       0.59      0.93      0.72        58
               negative       1.00      0.96      0.98      1044
    primary_hypothyroid       0.69      0.83      0.75        29

               accuracy                           0.95      1131
              macro avg       0.76      0.91      0.82      1131
           weighted avg       0.97      0.95      0.96      1131

[0.96226415 0.92789374 0.9544592  0.9544592  0.95256167]
