In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import joblib
import pickle
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

#preprocess
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
import warnings
warnings.filterwarnings("ignore")


In [2]:
data=pd.read_csv('preprocessed_data.csv', index_col=0)
data.head()


Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,referral_source,classes
0,41.0,0,0,0,0,0,0,0,0,0,...,0,0,0,1.3,2.5,125.0,1.14,109.0,1,negative
1,23.0,0,0,0,0,0,0,0,0,0,...,0,0,0,4.1,2.0,102.0,0.98,107.0,4,negative
2,46.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0.98,2.0,109.0,0.91,120.0,4,negative
3,70.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0.16,1.9,175.0,0.98,107.0,4,negative
4,70.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0.72,1.2,61.0,0.87,70.0,3,negative


## Splitting dataset into train set and test set

In [3]:
# Load label_encoder
label_target=joblib.load('label_encoder.joblib')


In [4]:
X=data.drop(['classes'],axis=1)
y=label_target


In [5]:
y


array([1, 1, 1, ..., 1, 1, 1])

In [6]:
data['classes'].unique()


array(['negative', 'compensated hypothyroid', 'primary hypothyroid',
       'secondary hypothyroid'], dtype=object)

In [7]:
ros=RandomOverSampler()
X_sampled,y_sampled = ros.fit_resample(X,y)

print(X_sampled.shape)

X_sampled=pd.DataFrame(data=X_sampled,columns=X.columns)

X_sampled.head()


(10320, 22)


Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,referral_source
0,41.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1.3,2.5,125.0,1.14,109.0,1
1,23.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,4.1,2.0,102.0,0.98,107.0,4
2,46.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0.98,2.0,109.0,0.91,120.0,4
3,70.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0.16,1.9,175.0,0.98,107.0,4
4,70.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.72,1.2,61.0,0.87,70.0,3


In [8]:
print(y_sampled.shape)
y_sampled


(10320,)


array([1, 1, 1, ..., 3, 3, 3])

In [9]:
# train test set
X_train,X_test,y_train,y_test=train_test_split(X_sampled,y_sampled,test_size=0.2,random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(8256, 22)
(2064, 22)
(8256,)
(2064,)


# Models

In [10]:
# RandomForestClassifier
rfc=RandomForestClassifier(criterion='entropy',random_state=0)
rfc.fit(X_train,y_train)
y_pred=rfc.predict(X_test)
cm=confusion_matrix(y_test,y_pred)
cr = classification_report(y_test,y_pred)
print("Random Forest:\n")
print(f'training accuracy:{rfc.score(X_train,y_train)}')
print(f'Testing accuracy:{rfc.score(X_test,y_test)}')
print(f'confusion matrix:\n{cm}')
print(f'classification report:\n{cr}')


Random Forest:

training accuracy:1.0
Testing accuracy:0.998062015503876
confusion matrix:
[[525   0   0   0]
 [  2 510   2   0]
 [  0   0 517   0]
 [  0   0   0 508]]
classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       525
           1       1.00      0.99      1.00       514
           2       1.00      1.00      1.00       517
           3       1.00      1.00      1.00       508

    accuracy                           1.00      2064
   macro avg       1.00      1.00      1.00      2064
weighted avg       1.00      1.00      1.00      2064



In [11]:
# SVC
from sklearn.svm import SVC
svc=SVC(gamma='auto')
svc.fit(X_train,y_train)
y_pred2=svc.predict(X_test)
cm2=confusion_matrix(y_test,y_pred2)
cr2 = classification_report(y_test,y_pred2)
print("Support Vector Classifier:\n")
print(f'training accuracy:{svc.score(X_train,y_train)}')
print(f'Testing accuracy:{svc.score(X_test,y_test)}')
print(f'confusion matrix:\n{cm2}')
print(f'classification report:\n{cr2}')


Support Vector Classifier:

training accuracy:0.9998788759689923
Testing accuracy:1.0
confusion matrix:
[[525   0   0   0]
 [  0 514   0   0]
 [  0   0 517   0]
 [  0   0   0 508]]
classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       525
           1       1.00      1.00      1.00       514
           2       1.00      1.00      1.00       517
           3       1.00      1.00      1.00       508

    accuracy                           1.00      2064
   macro avg       1.00      1.00      1.00      2064
weighted avg       1.00      1.00      1.00      2064



In [12]:
# Decision Tree
dt = DecisionTreeClassifier(max_depth = 4, criterion = 'entropy')
dt.fit(X_train, y_train)
y_pred3=dt.predict(X_test)
cm3=confusion_matrix(y_test,y_pred3)
cr3 = classification_report(y_test,y_pred3)
print("Decision Tree Classifier:\n")
print(f'training accuracy:{dt.score(X_train,y_train)}')
print(f'Testing accuracy:{dt.score(X_test,y_test)}')
print(f'confusion matrix:\n{cm3}')
print(f'classification report:\n{cr3}')


Decision Tree Classifier:

training accuracy:0.9894622093023255
Testing accuracy:0.9917635658914729
confusion matrix:
[[521   0   4   0]
 [  6 506   2   0]
 [  5   0 512   0]
 [  0   0   0 508]]
classification report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       525
           1       1.00      0.98      0.99       514
           2       0.99      0.99      0.99       517
           3       1.00      1.00      1.00       508

    accuracy                           0.99      2064
   macro avg       0.99      0.99      0.99      2064
weighted avg       0.99      0.99      0.99      2064



In [13]:
# xgb
from xgboost import XGBClassifier
xgb=XGBClassifier()
xgb.fit(X_train,y_train)
y_pred4=xgb.predict(X_test)
cm4=confusion_matrix(y_test,y_pred4)
cr4= classification_report(y_test,y_pred4)
print("XGB Classifier:\n")
print(f'training accuracy:{xgb.score(X_train,y_train)}')
print(f'Testing accuracy:{xgb.score(X_test,y_test)}')
print(f'confusion matrix:\n{cm4}')
print(f'classification report:\n{cr4}')


XGB Classifier:

training accuracy:1.0
Testing accuracy:0.998062015503876
confusion matrix:
[[525   0   0   0]
 [  1 510   3   0]
 [  0   0 517   0]
 [  0   0   0 508]]
classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       525
           1       1.00      0.99      1.00       514
           2       0.99      1.00      1.00       517
           3       1.00      1.00      1.00       508

    accuracy                           1.00      2064
   macro avg       1.00      1.00      1.00      2064
weighted avg       1.00      1.00      1.00      2064



In [14]:
# Hypertunning random
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score (estimator = svc, X=X_train,y=y_train,cv=10)
print(accuracies.mean())


0.9993943796316678


# Save model

In [15]:
import pickle
filename = ('thyroid_model.pkl')
pickle.dump(svc,open(filename,'wb'))
