<h1 style="background-color:orange; text-align: center; border-radius: 15px 50px; height: 32px">Libraries, Utilities and Load dataset</h1>

In [None]:
!pip3 install crepes

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sc

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from sklearn.feature_selection import f_classif

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import KMeansSMOTE

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

from crepes import WrapClassifier

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Location of the raw CSV on the Kaggle input mount.
DATA_PATH = '/kaggle/input/differentiated-thyroid-cancer-recurrence-dataset/Thyroid_Diff.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows (five is head()'s default).
df.head()

In [None]:
# Print the column names, non-null counts and dtypes of the raw frame.
df.info()

<h1 style="background-color:orange; text-align: center; border-radius: 15px 50px; height: 32px">Preprocessing</h1>

In [None]:
# Remove exact duplicate rows and renumber the index from 0 without keeping the old one.
df_cleaned = df.drop_duplicates().reset_index(drop=True)

In [None]:
# Number of duplicate rows that were dropped.
len(df) - len(df_cleaned)

<h1 style="background-color:orange; text-align: center; border-radius: 15px 50px; height: 32px">Exploratory Data Analysis</h1>

In [None]:
# Summary statistics of the Age column after de-duplication.
ages = df_cleaned['Age']
mean_age, std_age, median_age = ages.mean(), ages.std(), ages.median()

print(f'Average of Ages: {mean_age:.0f}')
print(f'Stand. Deviation of Ages: {std_age:.0f}')
print(f'Median of Ages: {median_age:.0f}')

In [None]:
# Share of records per Recurred value. groupby returns its keys sorted, so the
# hard-coded labels assume the column holds only 'No' and 'Yes' at this point.
recurrence_counts = df_cleaned.groupby('Recurred')['Age'].count().to_numpy()
plt.pie(recurrence_counts, labels=['No', 'Yes'], autopct='%0.f%%')
plt.show()

In [None]:
# Age distribution with a kernel-density overlay.
sns.displot(data=df_cleaned, x='Age', kde=True)
plt.show()

In [None]:
# Non-null counts of every column for each (Recurred, Thyroid Function) pair.
by_recurred_function = df_cleaned.groupby(['Recurred', 'Thyroid Function'])
by_recurred_function.count()

In [None]:
# Non-null counts of every column for each (Recurred, Smoking) pair.
by_recurred_smoking = df_cleaned.groupby(['Recurred', 'Smoking'])
by_recurred_smoking.count()

In [None]:
# Non-null counts of every column for each (Recurred, Response, Risk) triple.
by_recurred_response_risk = df_cleaned.groupby(['Recurred', 'Response', 'Risk'])
by_recurred_response_risk.count()

In [None]:
# Record counts per Gender value.
sns.histplot(data=df_cleaned, x='Gender')
plt.show()

In [None]:
# Record counts per T value.
sns.histplot(data=df_cleaned, x='T')
plt.show()

In [None]:
# Record counts per N value.
sns.histplot(data=df_cleaned, x='N')
plt.show()

In [None]:
# Record counts per M value.
sns.histplot(data=df_cleaned, x='M')
plt.show()

In [None]:
# Record counts per Smoking value, split by Gender.
sns.histplot(data=df_cleaned, x='Smoking', hue='Gender')
plt.show()

In [None]:
# Mean age within each Smoking group.
df_cleaned.groupby('Smoking')['Age'].mean()

In [None]:
# Mean age within each Response group.
df_cleaned.groupby('Response')['Age'].mean()

In [None]:
# Mean age within each Recurred group.
df_cleaned.groupby('Recurred')['Age'].mean()

In [None]:
# Record counts per Risk value.
sns.histplot(data=df_cleaned, x='Risk')
plt.show()

The study population comprises 293 females (80%) and 71 males (20%), and most non-smokers are women. The average age is 41 years, with a standard deviation of 15 years. Most of the population is aged between 20 and 45 years.

Most of the population has a low risk of thyroid cancer. In addition, people who don't smoke have an average age of 39 years, while people who do smoke have an average age of 53 years. This could be related to the differences in average age seen across the response to cancer and the recurrence of thyroid cancer: the average age of people with recurrent cancer is 47 years, while the average age of people without recurrence is 39 years. In addition, people with an excellent response have an average age of 38 years, while each of the remaining response groups has an average age of over 42 years.


70% of the population has no recurrence of thyroid cancer, while 30% have a recurrence. Of those who have not had a recurrence, 84% are euthyroid and 7% have clinical hyperthyroidism. In addition, of those who did not have a recurrence, 240 people did not smoke, and 67% had an excellent response with a low risk. On the other hand, of the cases that did recur, 47% had a structural incomplete response and an intermediate risk. In addition, 69% of these cases are non-smokers, and most of them are euthyroid.

<h1 style="background-color:orange; text-align: center; border-radius: 15px 50px; height: 32px">Feature Engineering</h1>

In [None]:
# Binary columns: the listed label becomes 0 and every other value becomes 1.
# Note that 'Recurred' maps 'Yes' to 0, the reverse of the 'No' -> 0 flags.
# Gotcha: this cell is not re-runnable. On a second run no value equals its
# string label any more, so every binary column would collapse to 1.
zero_label_by_column = {
    'Recurred': 'Yes',
    'Gender': 'F',
    'Smoking': 'No',
    'Hx Smoking': 'No',
    'Hx Radiothreapy': 'No',
    'Adenopathy': 'No',
    'Focality': 'Uni-Focal',
    'M': 'M0',
}
for flag_col, zero_label in zero_label_by_column.items():
    df_cleaned[flag_col] = np.where(df_cleaned[flag_col] == zero_label, 0, 1)

# Multi-valued columns: each listed string label is overwritten in place with
# its integer code. Values not listed here are left untouched.
codes_by_column = {
    'Thyroid Function': {
        'Euthyroid': 1,
        'Clinical Hyperthyroidism': 2,
        'Clinical Hypothyroidism': 3,
        'Subclinical Hyperthyroidism': 4,
        'Subclinical Hypothyroidism': 5,
    },
    'Physical Examination': {
        'Single nodular goiter-left': 1,
        'Multinodular goiter': 2,
        'Single nodular goiter-right': 3,
        'Normal': 4,
        'Diffuse goiter': 5,
    },
    'Pathology': {
        'Micropapillary': 1,
        'Papillary': 2,
        'Follicular': 3,
        'Hurthel cell': 4,
    },
    'Risk': {
        'Low': 0,
        'Intermediate': 1,
        'High': 2,
    },
    'T': {
        'T1a': 1,
        'T1b': 2,
        'T2': 3,
        'T3a': 4,
        'T3b': 5,
        'T4a': 6,
        'T4b': 7,
    },
    'N': {
        'N0': 0,
        'N1b': 1,
        'N1a': 2,
    },
    'Response': {
        'Indeterminate': 1,
        'Excellent': 2,
        'Structural Incomplete': 3,
        'Biochemical Incomplete': 4,
    },
    'Stage': {
        'I': 1,
        'II': 2,
        'III': 3,
        'IVA': 4,
        'IVB': 5,
    },
}
for coded_col, label_codes in codes_by_column.items():
    for raw_label, int_code in label_codes.items():
        df_cleaned.loc[df_cleaned[coded_col] == raw_label, coded_col] = int_code


In [None]:
# Cast the label-encoded columns to int64, one column at a time.
for recoded_col in (
    'Thyroid Function',
    'Physical Examination',
    'Pathology',
    'Risk',
    'T',
    'N',
    'Response',
    'Stage',
):
    df_cleaned[recoded_col] = df_cleaned[recoded_col].astype('int64')

<h1 style="background-color:orange; text-align: center; border-radius: 15px 50px; height: 32px">Feature Selection</h1>

In [None]:
# Predictors are every column except the last one; the target is 'Recurred'.
# Assumes 'Recurred' is the final column of the frame — TODO confirm.
# NOTE(review): the selectors in the following cells are fit on this full X/y,
# i.e. before the hold-out sample and the train/test split are taken.
X = df_cleaned[df_cleaned.columns[:-1]]
y = df_cleaned.loc[:, 'Recurred']

In [None]:
# Univariate selection: keep the 10 features with the highest f_classif score.
anova = SelectKBest(score_func=f_classif, k=10).fit(X, y)

# Positions of the kept features, then their column names.
index_anova = anova.get_support(indices=True)
X.columns[index_anova]

In [None]:
# Recursive feature elimination down to 10 features, ranked by a random forest.
# random_state pins the forest so the selected set is the same on every run.
rfe = RFE(RandomForestClassifier(random_state=42), n_features_to_select=10)
rfe.fit(X,y)

# Positions of the kept features, then their column names.
index_rfe = rfe.get_support(indices=True)
X.iloc[:,index_rfe].columns

In [None]:
# Importance-based selection with a random forest, capped at 10 features.
# random_state pins the forest so the selected set is the same on every run.
# NOTE(review): with the default threshold, max_features is only an upper bound,
# so fewer than 10 columns may be kept — confirm that is intended.
rf = SelectFromModel(RandomForestClassifier(random_state=42), max_features=10)
rf.fit(X,y)

# Positions of the kept features, then their column names.
index_rf = rf.get_support(indices=True)
X.iloc[:,index_rf].columns

<h1 style="background-color:orange; text-align: center; border-radius: 15px 50px; height: 32px">Sample Dataset</h1>

In [None]:
# Set 5% of the rows aside as the "unseen" hold-out; the rest is the learning set.
# random_state makes the hold-out identical on every run.
unseen_data = df_cleaned.sample(frac=0.05, random_state=42)
learning_data = df_cleaned.drop(unseen_data.index)

In [None]:
# Feature matrix (10 chosen columns) and target for the held-out rows.
unseen_feature_cols = [
    'Gender', 'Smoking', 'Adenopathy', 'Focality', 'Risk',
    'T', 'N', 'M', 'Stage', 'Response',
]
X_unseen = unseen_data[unseen_feature_cols]
y_unseen = unseen_data['Recurred']

In [None]:
# Feature matrix (10 chosen columns) and target for the learning rows.
learning_feature_cols = [
    'Gender', 'Smoking', 'Adenopathy', 'Focality', 'Risk',
    'T', 'N', 'M', 'Stage', 'Response',
]
X = learning_data[learning_feature_cols]
y = learning_data['Recurred']

<h1 style="background-color:orange; text-align: center; border-radius: 15px 50px; height: 32px">Split Dataset</h1>

In [None]:
# Stratified 70/30 split of the learning data; random_state fixes the shuffle
# so the same rows land in train and test on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, shuffle=True, stratify=y, random_state=42)

In [None]:
# Split 15% of the training portion off as the calibration set (X_cal, y_cal),
# stratified on the target; random_state fixes the shuffle across runs.
X_prop_train, X_cal, y_prop_train, y_cal = train_test_split(X_train, y_train, test_size=.15, shuffle=True, stratify=y_train, random_state=42)

<h1 style="background-color:orange; text-align: center; border-radius: 15px 50px; height: 32px">Oversampling</h1>

In [None]:
# Oversample the proper-training set with SMOTE; seeded so the synthetic rows
# are the same on every run.
sm = SMOTE(random_state=42)
X_prop_train_res, y_prop_train_res = sm.fit_resample(X_prop_train, y_prop_train)

In [None]:
# Oversample the proper-training set with ADASYN; seeded so the synthetic rows
# are the same on every run.
adasyn = ADASYN(random_state=42)
X_prop_train_res, y_prop_train_res = adasyn.fit_resample(X_prop_train, y_prop_train)

In [None]:
# Oversample the proper-training set with SVMSMOTE; seeded so the synthetic
# rows are the same on every run.
svm_sm = SVMSMOTE(random_state=42)
X_prop_train_res, y_prop_train_res = svm_sm.fit_resample(X_prop_train, y_prop_train)

In [None]:
# Oversample the proper-training set with BorderlineSMOTE (the class name must
# match the import at the top of the notebook); seeded for reproducibility.
bd_sm = BorderlineSMOTE(random_state=42)
X_prop_train_res, y_prop_train_res = bd_sm.fit_resample(X_prop_train, y_prop_train)

In [None]:
# Oversample the proper-training set with KMeansSMOTE (the class name must
# match the import at the top of the notebook); seeded for reproducibility.
ksm = KMeansSMOTE(random_state=42)
X_prop_train_res, y_prop_train_res = ksm.fit_resample(X_prop_train, y_prop_train)

In [None]:
# Class balance of the resampled training target.
# np.where(cond) returns a tuple of index arrays, so len() of it is the number
# of dimensions (always 1 here), not the number of matches. Count the True
# entries of each mask instead.
x_axis = int(np.count_nonzero(y_prop_train_res == 1))
y_axis = int(np.count_nonzero(y_prop_train_res == 0))

# 'Recurred' was encoded as 'Yes' -> 0 and anything else -> 1, so 1 is 'No'.
plt.pie([x_axis, y_axis], labels=['No', 'Yes'], autopct='%0.f%%')
plt.show()

<h1 style="background-color:orange; text-align: center; border-radius: 15px 50px; height: 32px">Train</h1>

In [None]:
# rf = WrapClassifier(RandomForestClassifier())
# rf.fit(X_prop_train_res,y_prop_train_res)

# display(rf)

In [None]:
# rf.calibrate(X_cal,y_cal)

# display(rf)

In [None]:
# predictions_rf = rf.predict_set(X_test)

# display(predictions_rf)

<h1 style="background-color:orange; text-align: center; border-radius: 15px 50px; height: 32px">Test in Unseen Data</h1>