In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
df = pd.read_csv('prepocessed_hypothyroid.csv')
df

Unnamed: 0,age,sex,on thyroxine,TSH,T3 measured,T3,TT4,binaryClass
0,41.0,1.0,0,1.300000,1,2.5000,125.000000,0
1,23.0,1.0,0,4.100000,1,2.0000,102.000000,0
2,46.0,0.0,0,0.980000,0,2.0135,109.000000,0
3,70.0,1.0,1,0.160000,1,1.9000,175.000000,0
4,70.0,1.0,0,0.720000,1,1.2000,61.000000,0
...,...,...,...,...,...,...,...,...
3767,30.0,1.0,0,5.086766,0,2.0135,108.319345,0
3768,68.0,1.0,0,1.000000,1,2.1000,124.000000,0
3769,74.0,1.0,0,5.100000,1,1.8000,112.000000,0
3770,72.0,0.0,0,0.700000,1,2.0000,82.000000,0


In [5]:
df.isnull().sum()

age             0
sex             0
on thyroxine    0
TSH             0
T3 measured     0
T3              0
TT4             0
binaryClass     0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   age           3772 non-null   float64
 1   sex           3772 non-null   float64
 2   on thyroxine  3772 non-null   int64  
 3   TSH           3772 non-null   float64
 4   T3 measured   3772 non-null   int64  
 5   T3            3772 non-null   float64
 6   TT4           3772 non-null   float64
 7   binaryClass   3772 non-null   int64  
dtypes: float64(5), int64(3)
memory usage: 235.9 KB


In [7]:
df.describe()

Unnamed: 0,age,sex,on thyroxine,TSH,T3 measured,T3,TT4,binaryClass
count,3772.0,3772.0,3772.0,3772.0,3772.0,3772.0,3772.0,3772.0
mean,51.735879,0.684705,0.123012,5.086766,0.796129,2.0135,108.319345,0.077147
std,20.082295,0.455362,0.328494,23.290853,0.402927,0.738262,34.496511,0.266861
min,1.0,0.0,0.0,0.005,0.0,0.05,2.0,0.0
25%,36.0,0.0,0.0,0.6,1.0,1.7,89.0,0.0
50%,54.0,1.0,0.0,1.6,1.0,2.0135,106.0,0.0
75%,67.0,1.0,0.0,3.8,1.0,2.2,123.0,0.0
max,455.0,1.0,1.0,530.0,1.0,10.6,430.0,1.0


In [8]:
## split the data into training and test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='binaryClass'),
                                                   df['binaryClass'],
                                                   test_size=0.20,
                                                   random_state=42)

In [9]:
## Scale the datasets
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score,roc_curve 

In [11]:
models={
    "Logisitic Regression":LogisticRegression(),
    "Decision Tree":DecisionTreeClassifier(),
    "Random Forest":RandomForestClassifier(),
    "Gradient Boost":GradientBoostingClassifier(),
    "Adaboost":AdaBoostClassifier()
}


In [12]:
for i in range(len(models)):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)


# Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
    model_train_precision = precision_score(y_train, y_train_pred) # Calculate Precision
    model_train_recall = recall_score(y_train, y_train_pred) # Calculate Recall
    model_train_rocauc_score = roc_auc_score(y_train, y_train_pred)


    # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
    model_test_precision = precision_score(y_test, y_test_pred) # Calculate Precision
    model_test_recall = recall_score(y_test, y_test_pred) # Calculate Recall
    model_test_rocauc_score = roc_auc_score(y_test, y_test_pred) #Calculate Roc


    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))
    
    print('- Precision: {:.4f}'.format(model_train_precision))
    print('- Recall: {:.4f}'.format(model_train_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))

    
    
    print('----------------------------------')
    
    print('Model performance for Test set')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))

    
    print('='*35)
    print('\n')

Logisitic Regression
Model performance for Training set
- Accuracy: 0.9546
- F1 score: 0.9476
- Precision: 0.9068
- Recall: 0.4592
- Roc Auc Score: 0.7276
----------------------------------
Model performance for Test set
- Accuracy: 0.9576
- F1 score: 0.9513
- Precision: 0.9333
- Recall: 0.4828
- Roc Auc Score: 0.7399


Decision Tree
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.9907
- F1 score: 0.9907
- Precision: 0.9474
- Recall: 0.9310
- Roc Auc Score: 0.9634


Random Forest
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.9960
- F1 score: 0.9960
- Precision: 0.9661
- Recall: 0.9828
- Roc Auc Score: 0.9899


Gradient Boost
Model performance for Training se



Adaboost
Model performance for Training set
- Accuracy: 0.9940
- F1 score: 0.9941
- Precision: 0.9498
- Recall: 0.9742
- Roc Auc Score: 0.9850
----------------------------------
Model performance for Test set
- Accuracy: 0.9960
- F1 score: 0.9960
- Precision: 0.9661
- Recall: 0.9828
- Roc Auc Score: 0.9899




In [13]:
## Hyperparameter Training
rf_params = {"max_depth": [5, 8, 15, None, 10],
             "max_features": [5, 7, "auto", 8],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 200, 500, 1000]}
adaboost_param={
    "n_estimators":[50,60,70,80,90],
    "algorithm":['SAMME','SAMME.R']
}

In [14]:
# Models list for Hyperparameter tuning
randomcv_models = [
                   ("RF", RandomForestClassifier(), rf_params),
    ("AB", AdaBoostClassifier(), adaboost_param)
                   
                   ]

In [15]:
randomcv_models

[('RF',
  RandomForestClassifier(),
  {'max_depth': [5, 8, 15, None, 10],
   'max_features': [5, 7, 'auto', 8],
   'min_samples_split': [2, 8, 15, 20],
   'n_estimators': [100, 200, 500, 1000]}),
 ('AB',
  AdaBoostClassifier(),
  {'n_estimators': [50, 60, 70, 80, 90], 'algorithm': ['SAMME', 'SAMME.R']})]

In [16]:
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                                   param_distributions=params,
                                   n_iter=100,
                                   cv=3,
                                   verbose=2,
                                   n_jobs=-1)
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])

Fitting 3 folds for each of 100 candidates, totalling 300 fits


84 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\kamal\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\kamal\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\kamal\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\kamal\AppData\Local\Programs\Python\Python310\lib

Fitting 3 folds for each of 10 candidates, totalling 30 fits
---------------- Best Params for RF -------------------
{'n_estimators': 100, 'min_samples_split': 8, 'max_features': 8, 'max_depth': None}
---------------- Best Params for AB -------------------
{'n_estimators': 50, 'algorithm': 'SAMME'}


In [None]:
from sklearn.metrics import accuracy_score, classification_report

In [20]:
## Retraining the models with best parameters
models = {
    "Random Forest ": RandomForestClassifier(n_estimators=100, min_samples_split=8, max_features=8, max_depth=None, 
                                                     n_jobs=-1),
    "Adaboost":AdaBoostClassifier(n_estimators=50,algorithm='SAMME')
    
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    
    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(accuracy_score(y_train_pred, y_train)))
    print("- Classification report: {:.4f}".format(classification_report(y_train_pred, y_train)))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Accuracy: {:.4f}".format(accuracy_score(y_test_pred, y_test)))
    print("- Classification report: {:.4f}".format(classification_report(y_test_pred, y_test)))
    print('='*35)
    print('\n')

InvalidParameterError: The 'max_features' parameter of RandomForestClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got '8' instead.