In [1]:
import numpy as np
import pandas as pd


In [2]:
# Read the raw CSV (treating '?' as NaN) and discard the two TBG columns
# in a single chained expression, then print a column/dtype summary.
path = '../res/dataset_57_hypothyroid.csv'
df = (
    pd.read_csv(path, na_values='?')
      .drop(columns=['TBG', 'TBG_measured'])
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 28 columns):
age                          3771 non-null float64
sex                          3622 non-null object
on_thyroxine                 3772 non-null object
query_on_thyroxine           3772 non-null object
on_antithyroid_medication    3772 non-null object
sick                         3772 non-null object
pregnant                     3772 non-null object
thyroid_surgery              3772 non-null object
I131_treatment               3772 non-null object
query_hypothyroid            3772 non-null object
query_hyperthyroid           3772 non-null object
lithium                      3772 non-null object
goitre                       3772 non-null object
tumor                        3772 non-null object
hypopituitary                3772 non-null object
psych                        3772 non-null object
TSH_measured                 3772 non-null object
TSH                          3403 non-null

In [3]:
# Keep only rows that are neither labelled 'secondary_hypothyroid'
# nor flagged hypopituitary == 't', then show the remaining class balance.
rows_to_keep = (df['Class'] != 'secondary_hypothyroid') & (df['hypopituitary'] != 't')
df = df[rows_to_keep]
df['Class'].value_counts()

negative                   3480
compensated_hypothyroid     194
primary_hypothyroid          95
Name: Class, dtype: int64

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# Numeric columns: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: replace missing values with the literal category
# 'missing', then one-hot encode.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a category
# that was not present when it was fitted (e.g. a rare value that is absent
# from one cross-validation training fold) instead of raising at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [5]:
from sklearn.compose import ColumnTransformer

# Split the columns by dtype: int64/float64 go through the numeric pipeline,
# object columns (minus the 'Class' target) go through the categorical one.
num_features = df.select_dtypes(include=['int64', 'float64']).columns
categorical_features = df.select_dtypes(include=['object']).columns.drop('Class')

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_features),
    ('cat', categorical_transformer, categorical_features),
])



In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier


# Two candidate models that share the same preprocessing step.
# NOTE(review): both pipelines hold a reference to the *same* `preprocessor`
# object, so fitting either pipeline refits that object in place. Consider
# `sklearn.base.clone(preprocessor)` if the pipelines are ever fitted on
# different data.
pipe_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(multi_class='auto', solver='liblinear')),
])

pipe_n = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=5)),
])

In [7]:
from sklearn.model_selection import train_test_split

# Separate the target from the features, then hold out 30% of the rows,
# stratifying on the target so the class proportions are kept in both splits.
y = df['Class']
x = df.drop(columns=['Class'])

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [8]:
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score


def _show_results(fitted_pipe, cv_scores):
    """Print the test-split classification report of a fitted pipeline, then its CV scores."""
    print(classification_report(y_test, fitted_pipe.predict(x_test)))
    print(cv_scores)


# Fit both pipelines on the training split.
pipe_lr.fit(x_train, y_train)
pipe_n.fit(x_train, y_train)

# 5-fold cross-validation scores computed on the training split only.
scores_pipe_lr = cross_val_score(pipe_lr, x_train, y_train, cv=5)
scores_pipe_n = cross_val_score(pipe_n, x_train, y_train, cv=5)

# k-NN results first, then logistic regression, separated by blank lines.
_show_results(pipe_n, scores_pipe_n)
print('\n\n')
_show_results(pipe_lr, scores_pipe_lr)

                         precision    recall  f1-score   support

compensated_hypothyroid       0.33      0.07      0.11        58
               negative       0.95      1.00      0.97      1044
    primary_hypothyroid       0.85      0.59      0.69        29

               accuracy                           0.94      1131
              macro avg       0.71      0.55      0.59      1131
           weighted avg       0.91      0.94      0.92      1131

[0.92641509 0.94686907 0.94686907 0.943074   0.94117647]



                         precision    recall  f1-score   support

compensated_hypothyroid       0.78      0.12      0.21        58
               negative       0.95      1.00      0.97      1044
    primary_hypothyroid       0.85      0.76      0.80        29

               accuracy                           0.95      1131
              macro avg       0.86      0.63      0.66      1131
           weighted avg       0.94      0.95      0.93      1131

[0.9490566  0.94497154 0

### Over-/under-sampling