In [22]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
#preprocess
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTENC,RandomOverSampler,KMeansSMOTE
import warnings
warnings.filterwarnings("ignore")

In [23]:
# Load the raw thyroid dataset from a path relative to the working directory.
df = pd.read_csv(r"../notebook/thyroid.csv")

In [24]:
# Count the missing values in each column.
df.isnull().sum()

Unnamed: 0                      0
age                             1
sex                           110
on_thyroxine                    0
query_on_thyroxine              0
on_antithyroid_medication       0
sick                            0
pregnant                        0
thyroid_surgery                 0
I131_treatment                  0
query_hypothyroid               0
query_hyperthyroid              0
lithium                         0
goitre                          0
tumor                           0
hypopituitary                   0
psych                           0
TSH_measured                    0
TSH                           284
T3_measured                     0
T3                            585
TT4_measured                    0
TT4                           184
T4U_measured                    0
T4U                           297
FTI_measured                    0
FTI                           295
TBG_measured                    0
TBG                          2800
referral_sourc

In [25]:
# List every distinct age above 100; the following cell treats these as invalid.
for age in df.loc[df['age'] > 100, 'age'].unique():
    print(age)

455.0


In [26]:
# Blank out ages above 100 so they are handled as missing values later on.
df.loc[df['age'] > 100, 'age'] = np.nan

In [27]:
# Re-count missing ages: the one original null plus the outlier blanked above.
df['age'].isnull().sum()

2

In [28]:
# Drop columns that are not used as features:
#   - 'Unnamed: 0' (presumably a leftover row index from an earlier export),
#   - the *_measured flags, which only contain t/f values,
#   - TBG, which is null in all 2800 rows.
df = df.drop(
    columns=[
        'Unnamed: 0',
        'TSH_measured',
        'T3_measured',
        'TT4_measured',
        'T4U_measured',
        'FTI_measured',
        'TBG_measured',
        'TBG',
    ]
)


In [29]:
# Impute missing numeric values with the median of each column.
# select_dtypes returns a sub-frame; iterating over a DataFrame yields its
# column names, so `i` is a column label in the loop below.
numerical_columns = df.select_dtypes(include=np.number)
for i in numerical_columns:
    print(i)
    # Assign the filled Series back to the frame. Calling
    # df[i].fillna(..., inplace=True) operates on a temporary column selection:
    # it is deprecated in pandas 2.x and has no effect under Copy-on-Write,
    # which would silently leave the NaNs in place.
    df[i] = df[i].fillna(df[i].median())

age
TSH
T3
TT4
T4U
FTI


In [30]:
# Fill missing 'sex' values by carrying the previous row's value forward.
# Series.ffill() replaces the deprecated fillna(method='ffill'), and the result
# is assigned back because an in-place fill on a column selection does not
# reliably update the parent frame (no effect under pandas Copy-on-Write).
# NOTE(review): a forward fill cannot fill a missing value in the very first row.
df['sex'] = df['sex'].ffill()

In [31]:
# Collect the non-numeric columns and show the distinct values found in each,
# as a quick sanity check of the categorical levels before encoding.
categorical_features = df.select_dtypes(exclude='number')
for feature in categorical_features.columns:
    print(f"{feature}:{categorical_features[feature].unique()}")

sex:['F' 'M']
on_thyroxine:['f' 't']
query_on_thyroxine:['f' 't']
on_antithyroid_medication:['f' 't']
sick:['f' 't']
pregnant:['f' 't']
thyroid_surgery:['f' 't']
I131_treatment:['f' 't']
query_hypothyroid:['f' 't']
query_hyperthyroid:['f' 't']
lithium:['f' 't']
goitre:['f' 't']
tumor:['f' 't']
hypopituitary:['f' 't']
psych:['f' 't']
referral_source:['SVHC' 'other' 'SVI' 'STMW' 'SVHD']
classes:['negative' 'compensated hypothyroid' 'primary hypothyroid'
 'secondary hypothyroid']


In [32]:
# Preview the first five rows of the cleaned frame.
df.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,referral_source,classes
0,41.0,F,f,f,f,f,f,f,f,f,...,f,f,f,1.3,2.5,125.0,1.14,109.0,SVHC,negative
1,23.0,F,f,f,f,f,f,f,f,f,...,f,f,f,4.1,2.0,102.0,0.98,107.0,other,negative
2,46.0,M,f,f,f,f,f,f,f,f,...,f,f,f,0.98,2.0,109.0,0.91,120.0,other,negative
3,70.0,F,t,f,f,f,f,f,f,f,...,f,f,f,0.16,1.9,175.0,0.98,107.0,other,negative
4,70.0,F,f,f,f,f,f,f,f,f,...,f,f,f,0.72,1.2,61.0,0.87,70.0,SVI,negative


In [33]:
# Persist the cleaned data. The row index is written as the first CSV column
# (to_csv default), which the reload below reads back with index_col=0.
df.to_csv("df_clean.csv")

In [34]:
# Start of the encoding stage (marked "New notebook" in the original):
# reload the cleaned data, using the first CSV column as the row index.
data = pd.read_csv('df_clean.csv',index_col=0)
data

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,referral_source,classes
0,41.0,F,f,f,f,f,f,f,f,f,...,f,f,f,1.30,2.5,125.0,1.14,109.0,SVHC,negative
1,23.0,F,f,f,f,f,f,f,f,f,...,f,f,f,4.10,2.0,102.0,0.98,107.0,other,negative
2,46.0,M,f,f,f,f,f,f,f,f,...,f,f,f,0.98,2.0,109.0,0.91,120.0,other,negative
3,70.0,F,t,f,f,f,f,f,f,f,...,f,f,f,0.16,1.9,175.0,0.98,107.0,other,negative
4,70.0,F,f,f,f,f,f,f,f,f,...,f,f,f,0.72,1.2,61.0,0.87,70.0,SVI,negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2795,70.0,M,f,f,f,f,f,f,f,f,...,f,f,f,2.70,2.0,155.0,1.05,148.0,SVI,negative
2796,73.0,M,f,t,f,f,f,f,f,f,...,f,f,f,1.40,0.7,63.0,0.88,72.0,other,negative
2797,75.0,M,f,f,f,f,f,f,f,f,...,f,f,f,1.40,2.0,147.0,0.80,183.0,other,negative
2798,60.0,F,f,f,f,f,f,f,f,f,...,f,f,f,1.40,2.0,100.0,0.83,121.0,other,negative


In [35]:
# Sub-frame of the categorical feature columns to be label-encoded.
# The target column 'classes' is deliberately not part of this selection.
categorical_features = data[
    [
        'sex',
        'on_thyroxine',
        'query_on_thyroxine',
        'on_antithyroid_medication',
        'sick',
        'pregnant',
        'thyroid_surgery',
        'I131_treatment',
        'query_hypothyroid',
        'query_hyperthyroid',
        'lithium',
        'goitre',
        'tumor',
        'hypopituitary',
        'psych',
        'referral_source',
    ]
]

In [36]:
# Replace each categorical feature column with integer codes.
# LabelEncoder is refit for every column, so codes are assigned per column
# (in the displayed result 'f' -> 0 and 't' -> 1).
# NOTE(review): `le` only keeps the fit for the last column processed, so the
# per-column mappings are not retained for later inverse transforms.
le = LabelEncoder()
for i in categorical_features:
    try:
        data[i] = le.fit_transform(data[i])
    except Exception as err:
        # Keep the best-effort behaviour (carry on with the remaining columns)
        # but report the failure instead of swallowing it: a silently skipped
        # column would stay un-encoded in the saved output. Catching Exception
        # rather than using a bare `except` also lets KeyboardInterrupt through.
        print(f"Could not label-encode column {i!r}: {err}")

In [37]:
# Inspect the frame after label-encoding the categorical feature columns.
data

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,referral_source,classes
0,41.0,0,0,0,0,0,0,0,0,0,...,0,0,0,1.30,2.5,125.0,1.14,109.0,1,negative
1,23.0,0,0,0,0,0,0,0,0,0,...,0,0,0,4.10,2.0,102.0,0.98,107.0,4,negative
2,46.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0.98,2.0,109.0,0.91,120.0,4,negative
3,70.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0.16,1.9,175.0,0.98,107.0,4,negative
4,70.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0.72,1.2,61.0,0.87,70.0,3,negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2795,70.0,1,0,0,0,0,0,0,0,0,...,0,0,0,2.70,2.0,155.0,1.05,148.0,3,negative
2796,73.0,1,0,1,0,0,0,0,0,0,...,0,0,0,1.40,0.7,63.0,0.88,72.0,4,negative
2797,75.0,1,0,0,0,0,0,0,0,0,...,0,0,0,1.40,2.0,147.0,0.80,183.0,4,negative
2798,60.0,0,0,0,0,0,0,0,0,0,...,0,0,0,1.40,2.0,100.0,0.83,121.0,4,negative


In [38]:
# Pull out the target column (the class label of each row) and check its length.
target = data['classes']
target.shape

(2800,)

In [40]:
# Encode the target class names as integers and persist the fitted encoder.

import joblib
label_encoder = LabelEncoder()

# Fit on the class names and convert them to integer codes. The two prints
# show the original class names next to the codes they were mapped to.
target_encoded = label_encoder.fit_transform(target)
print(np.unique(target))
print(np.unique(target_encoded))

# Save the fitted encoder itself, not the encoded array: only the encoder
# carries the class-name <-> code mapping needed to inverse_transform model
# predictions back to class names when it is loaded later.
joblib.dump(label_encoder, 'label_encoder.joblib')

['compensated hypothyroid' 'negative' 'primary hypothyroid'
 'secondary hypothyroid']
[0 1 2 3]


['label_encoder.joblib']

In [41]:
# Preview the encoded frame; the 'classes' column still holds the original string labels.
data.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,referral_source,classes
0,41.0,0,0,0,0,0,0,0,0,0,...,0,0,0,1.3,2.5,125.0,1.14,109.0,1,negative
1,23.0,0,0,0,0,0,0,0,0,0,...,0,0,0,4.1,2.0,102.0,0.98,107.0,4,negative
2,46.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0.98,2.0,109.0,0.91,120.0,4,negative
3,70.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0.16,1.9,175.0,0.98,107.0,4,negative
4,70.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0.72,1.2,61.0,0.87,70.0,3,negative


In [42]:
# Display the integer-encoded target array.
target_encoded

array([1, 1, 1, ..., 1, 1, 1])

In [43]:
# Save the frame with label-encoded features. The row index is written as the
# first CSV column (to_csv default), so readers should load it with index_col=0.
# NOTE(review): 'classes' is saved as the original strings; target_encoded is
# not written to this file — confirm that downstream code expects this.
data.to_csv("preprocessed_data.csv")