In [32]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import seaborn as sns


In [33]:
df = pd.read_csv('/content/hypothyroid.csv')
df.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,referral source,binaryClass
0,41,F,f,f,f,f,f,f,f,f,...,t,125,t,1.14,t,109,f,?,SVHC,P
1,23,F,f,f,f,f,f,f,f,f,...,t,102,f,?,f,?,f,?,other,P
2,46,M,f,f,f,f,f,f,f,f,...,t,109,t,0.91,t,120,f,?,other,P
3,70,F,t,f,f,f,f,f,f,f,...,t,175,f,?,f,?,f,?,other,P
4,70,F,f,f,f,f,f,f,f,f,...,t,61,t,0.87,t,70,f,?,SVI,P


In [34]:
df.describe().T #To describe the values in column wise

Unnamed: 0,count,unique,top,freq
age,3772,94,59,95
sex,3772,3,F,2480
on thyroxine,3772,2,f,3308
query on thyroxine,3772,2,f,3722
on antithyroid medication,3772,2,f,3729
sick,3772,2,f,3625
pregnant,3772,2,f,3719
thyroid surgery,3772,2,f,3719
I131 treatment,3772,2,f,3713
query hypothyroid,3772,2,f,3538


In [35]:
df.shape

(3772, 30)

In [36]:
#Removing all string values
df.replace(
    {
        'f': 0,
        't': 1,
        '?': np.nan,
        'P': 0,
        'N': 1,
        'F': 0,
        'M': 1
    },
    inplace=True
)

  df.replace(


In [37]:
df=df.drop(['referral source'],axis=1)
df

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,T3,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,binaryClass
0,41,0.0,0,0,0,0,0,0,0,0,...,2.5,1,125,1,1.14,1,109,0,,0
1,23,0.0,0,0,0,0,0,0,0,0,...,2,1,102,0,,0,,0,,0
2,46,1.0,0,0,0,0,0,0,0,0,...,,1,109,1,0.91,1,120,0,,0
3,70,0.0,1,0,0,0,0,0,0,0,...,1.9,1,175,0,,0,,0,,0
4,70,0.0,0,0,0,0,0,0,0,0,...,1.2,1,61,1,0.87,1,70,0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,30,0.0,0,0,0,0,0,0,0,0,...,,0,,0,,0,,0,,0
3768,68,0.0,0,0,0,0,0,0,0,0,...,2.1,1,124,1,1.08,1,114,0,,0
3769,74,0.0,0,0,0,0,0,0,0,0,...,1.8,1,112,1,1.07,1,105,0,,0
3770,72,1.0,0,0,0,0,0,0,0,0,...,2,1,82,1,0.94,1,87,0,,0


In [38]:
df.isnull().sum().sum()

6064

In [39]:
df.dtypes

Unnamed: 0,0
age,object
sex,float64
on thyroxine,int64
query on thyroxine,int64
on antithyroid medication,int64
sick,int64
pregnant,int64
thyroid surgery,int64
I131 treatment,int64
query hypothyroid,int64


In [40]:
#To transform object types into integer or float and ignoe=re errors

cols_to_replace = ['age','TSH', 'T3', 'TT4', 'T4U', 'FTI']
df[cols_to_replace] = df[cols_to_replace].apply(pd.to_numeric, errors='coerce')

In [41]:
df['age'].unique()

array([ 41.,  23.,  46.,  70.,  18.,  59.,  80.,  66.,  68.,  84.,  67.,
        71.,  28.,  65.,  42.,  63.,  51.,  81.,  54.,  55.,  60.,  25.,
        73.,  34.,  78.,  37.,  85.,  26.,  58.,  64.,  44.,  48.,  61.,
        35.,  83.,  21.,  87.,  53.,  77.,  27.,  69.,  74.,  38.,  76.,
        45.,  36.,  22.,  43.,  72.,  82.,  31.,  39.,  49.,  62.,  57.,
         1.,  50.,  30.,  29.,  75.,  19.,   7.,  79.,  17.,  24.,  15.,
        32.,  47.,  16.,  52.,  33.,  13.,  10.,  89.,  56.,  20.,  90.,
        40.,  88.,  14.,  86.,  94.,  12.,   4.,  11.,   8.,   5., 455.,
         2.,  91.,   6.,  nan,  93.,  92.])

In [42]:
df['age'].fillna(df['age'].mean(),inplace =True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].mean(),inplace =True)


In [43]:
df.dtypes

Unnamed: 0,0
age,float64
sex,float64
on thyroxine,int64
query on thyroxine,int64
on antithyroid medication,int64
sick,int64
pregnant,int64
thyroid surgery,int64
I131 treatment,int64
query hypothyroid,int64


In [44]:
del df['TBG']

In [45]:
df.fillna(df.mean(),inplace=True)
df.isnull().sum().sum()

0

In [46]:
df

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,T3 measured,T3,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,binaryClass
0,41.0,0.0,0,0,0,0,0,0,0,0,...,1,2.5000,1,125.000000,1,1.140,1,109.000000,0,0
1,23.0,0.0,0,0,0,0,0,0,0,0,...,1,2.0000,1,102.000000,0,0.995,0,110.469649,0,0
2,46.0,1.0,0,0,0,0,0,0,0,0,...,0,2.0135,1,109.000000,1,0.910,1,120.000000,0,0
3,70.0,0.0,1,0,0,0,0,0,0,0,...,1,1.9000,1,175.000000,0,0.995,0,110.469649,0,0
4,70.0,0.0,0,0,0,0,0,0,0,0,...,1,1.2000,1,61.000000,1,0.870,1,70.000000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,30.0,0.0,0,0,0,0,0,0,0,0,...,0,2.0135,0,108.319345,0,0.995,0,110.469649,0,0
3768,68.0,0.0,0,0,0,0,0,0,0,0,...,1,2.1000,1,124.000000,1,1.080,1,114.000000,0,0
3769,74.0,0.0,0,0,0,0,0,0,0,0,...,1,1.8000,1,112.000000,1,1.070,1,105.000000,0,0
3770,72.0,1.0,0,0,0,0,0,0,0,0,...,1,2.0000,1,82.000000,1,0.940,1,87.000000,0,0


In [47]:
df.to_csv('processed_hyperthyroid.csv')

#Building the Model


In [48]:
x = df.drop('binaryClass',axis=1)
y=df['binaryClass']

In [49]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=2)

In [50]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((2829, 27), (943, 27), (2829,), (943,))

In [51]:
x_train = x_train.drop(['FTI', 'FTI measured', 'T4U measured', 'TT4 measured','query on thyroxine','on antithyroid medication','sick', 'pregnant','thyroid surgery','I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary','psych' , 'TSH measured', 'T4U', 'TBG measured'],axis=1)
x_test = x_test.drop(['FTI', 'FTI measured', 'T4U measured', 'TT4 measured','query on thyroxine','on antithyroid medication','sick', 'pregnant','thyroid surgery','I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary','psych' , 'TSH measured', 'T4U', 'TBG measured'],axis=1)

In [52]:
x_train

Unnamed: 0,age,sex,on thyroxine,TSH,T3 measured,T3,TT4
3656,53.0,0.0,0,0.090000,1,3.9000,216.000000
1619,43.0,0.0,0,2.200000,0,2.0135,98.000000
2787,58.0,1.0,0,5.086766,1,1.6000,108.319345
3600,22.0,1.0,0,1.700000,1,2.3000,127.000000
1809,22.0,1.0,0,3.700000,1,2.1000,83.000000
...,...,...,...,...,...,...,...
3606,39.0,0.0,0,5.086766,0,2.0135,108.319345
1608,59.0,0.0,0,6.800000,1,0.7000,84.000000
2541,74.0,0.0,0,0.025000,1,2.2000,108.000000
2575,57.0,1.0,0,0.900000,1,1.5000,91.000000


In [53]:
lr = LogisticRegression()
lr.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [54]:
x_train_prediction = lr.predict(x_train)
training_data_accuracy = accuracy_score(x_train_prediction,y_train)
print("The training accuracy: ",training_data_accuracy)


The training accuracy:  0.9607635206786851


In [55]:
x_test_prediction = lr.predict(x_test)
test_data_accuracy = accuracy_score(x_test_prediction,y_test)
print("The test accuracy: ",test_data_accuracy)

The test accuracy:  0.9597030752916225


In [56]:
input_data = [22.0,1.0,0,3.7,1,2.1,83]

inp_as_nparray = np.asarray(input_data)

inp_reshaped = inp_as_nparray.reshape(1,-1)

prediction = lr.predict(inp_reshaped)
if prediction[0]==0:
  print("The person is not suffering from hypothyroid")
else:
  print("The person is suffering from hypothyroid")

The person is not suffering from hypothyroid




#Into the pickle file

In [57]:
import pickle
file_name = "Thyroid_model.sav"
pickle.dump(lr,open(file_name,'wb'))

In [58]:
loaded_model = pickle.load(open('Thyroid_model.sav','rb'))

In [59]:
input_data = [22.0,1.0,0,3.7,1,2.1,83]

inp_as_nparray = np.asarray(input_data)

inp_reshaped = inp_as_nparray.reshape(1,-1)

prediction = loaded_model.predict(inp_reshaped)
if prediction[0]==0:
  print("The person is not suffering from hypothyroid")
else:
  print("The person is suffering from hypothyroid")

The person is not suffering from hypothyroid


