# Import Libraries 

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgb
import xgboost as xgb
from xgboost import XGBClassifier
import pickle
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Getting Data

In [None]:
# Read the thyroid cancer CSV from the Kaggle input directory into a DataFrame.
DATASET_PATH = "/kaggle/input/thyroid-cancer-dataset/dataset.csv"
df = pd.read_csv(DATASET_PATH)

# EDA : Exploratory Data Analysis

In [None]:
df.head()

In [None]:
df.sample(10)

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df.shape

In [None]:
obj_cols = df.select_dtypes(include = 'object').columns

# Preprocessing

## Label Encoding

In [None]:
le = LabelEncoder()

In [None]:
for cols in obj_cols:
    df[cols] = le.fit_transform(df[cols])

In [None]:
df.info()

In [None]:
df.sample(10)

## Plotting Graphs and Charts

In [None]:
plt.figure(figsize = (10,6))
sns.heatmap(df.corr(), annot = True)
plt.show()

In [None]:
for col in df.columns:
    plt.figure(figsize = (11, 8))
    sns.countplot(df, x = col, palette = 'pastel')
    plt.title(f'{col} - Count Plot')
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.grid(axis = 'y', alpha = 0.7)
    plt.tight_layout()
    plt.show()

## Finding Correlation

In [None]:
df.corr()

In [None]:
df['Recurred'].value_counts()

# Checking data imbalance

In [None]:
outcome_counts = df["Recurred"].value_counts()
plt.pie(outcome_counts, labels = outcome_counts.index, autopct = "%1.1f%%")
plt.show()

# Train Test Split

In [None]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [None]:
X_train.head()

In [None]:
y_train.head()

# Trying Out Different Models

## Logistic Regression

### With class weight

In [None]:
lr = LogisticRegression(class_weight = 'balanced', max_iter = 280)

In [None]:
lr.fit(X_train, y_train)

In [None]:
y_pred_lr = lr.predict(X_test)

In [None]:
accuracy_lr = accuracy_score(y_test, y_pred_lr)

In [None]:
print(f"Accuracy of Logistic Regression with class weight : {accuracy_lr: .2f}")

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred_lr)
print("Confusion Matrix:\n", conf_matrix)

print("Classification Report:\n", classification_report(y_test, y_pred_lr))

### Without class weight

In [None]:
lr2 = LogisticRegression(max_iter = 280)

In [None]:
lr2.fit(X_train, y_train)

In [None]:
y_pred_simple_lr = lr2.predict(X_test)

In [None]:
accuracy_simple_lr = accuracy_score(y_test, y_pred_simple_lr)
print(f"Accuracy of Logistic Regression without class weight : {accuracy_simple_lr: .2f}")

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred_simple_lr)
print("Confusion Matrix:\n", conf_matrix)

print("Classification Report:\n", classification_report(y_test, y_pred_simple_lr))

## Decision Tree 

### With class weight 

In [None]:
dt = DecisionTreeClassifier(class_weight = 'balanced', random_state = 42)

In [None]:
dt.fit(X_train, y_train)

In [None]:
y_pred_dt = dt.predict(X_test)

In [None]:
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"Accuracy of Decision Tree with class weight : {accuracy_dt: .2f}")

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred_dt)
print("Confusion Matrix:\n", conf_matrix)

print("Classification Report:\n", classification_report(y_test, y_pred_dt))

### Without class weight

In [None]:
dt2 = DecisionTreeClassifier()
dt2.fit(X_train, y_train)

In [None]:
y_pred_simple_dt = dt2.predict(X_test)

In [None]:
accuracy_simple_dt = accuracy_score(y_test, y_pred_simple_dt)
print(f"Accuracy of Decision Tree without class weight : {accuracy_simple_dt: .2f}")

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred_simple_dt)
print("Confusion Matrix:\n", conf_matrix)

print("Classification Report:\n", classification_report(y_test, y_pred_simple_dt))

## Support Vector Machine

### With class weight

In [None]:
svm = SVC(class_weight = 'balanced', random_state = 42)

In [None]:
svm.fit(X_train, y_train)

In [None]:
y_pred_svm = svm.predict(X_test)

In [None]:
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"Accuracy of SVM with class weight : {accuracy_svm: .2f}")

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred_svm)
print("Confusion Matrix:\n", conf_matrix)

print("Classification Report:\n", classification_report(y_test, y_pred_svm))

### Without class weight

In [None]:
svm2 = SVC()

In [None]:
svm2.fit(X_train, y_train)

In [None]:
y_pred_simple_svm = svm2.predict(X_test)

In [None]:
accuracy_simple_svm = accuracy_score(y_test, y_pred_simple_svm)
print(f"Accuracy of SVM without class weight : {accuracy_simple_svm: .2f}")

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred_simple_svm)
print("Confusion Matrix:\n", conf_matrix)

print("Classification Report:\n", classification_report(y_test, y_pred_simple_svm))

## Naive Bayes

### With class weight

In [None]:
sample_weights = compute_sample_weight(class_weight = 'balanced', y = y_train)

nb = GaussianNB()
nb.fit(X_train, y_train, sample_weight = sample_weights)

In [None]:
y_pred_nb = nb.predict(X_test)

In [None]:
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print(f"Accuracy of naive bayes with class weight : {accuracy_nb: .2f}")

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred_nb)
print("Confusion Matrix:\n", conf_matrix)

print("Classification Report:\n", classification_report(y_test, y_pred_nb))

### Without class weight

In [None]:
nb2 = GaussianNB()

In [None]:
nb2.fit(X_train, y_train)

In [None]:
y_pred_simple_nb = nb2.predict(X_test)

In [None]:
accuracy_simple_nb = accuracy_score(y_test, y_pred_simple_nb)
print(f"Accuracy of naive bayes without class weight : {accuracy_simple_nb: .2f}")

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred_simple_nb)
print("Confusion Matrix:\n", conf_matrix)

print("Classification Report:\n", classification_report(y_test, y_pred_simple_nb))

## Random Forest 

### With class weight

In [None]:
rf = RandomForestClassifier(class_weight = 'balanced', random_state = 42)

In [None]:
rf.fit(X_train, y_train)

In [None]:
y_pred_rf = rf.predict(X_test)

In [None]:
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy of random forest with class weight : {accuracy_rf: .2f}")

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix:\n", conf_matrix)

print("Classification Report:\n", classification_report(y_test, y_pred_rf))

### Without class weight

In [None]:
rf2 = RandomForestClassifier()
rf2.fit(X_train, y_train)

In [None]:
y_pred_simple_rf = rf2.predict(X_test)

In [None]:
accuracy_simple_rf = accuracy_score(y_test, y_pred_simple_rf)
print(f"Accuracy of random forest without class weight : {accuracy_simple_rf: .2f}")

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred_simple_rf)
print("Confusion Matrix:\n", conf_matrix)

print("Classification Report:\n", classification_report(y_test, y_pred_simple_rf))

## LightGBM

### With class weight

In [None]:
lgb = LGBMClassifier(class_weight = 'balanced', random_state = 42, force_col_wise = True)
lgb.fit(X_train, y_train)

In [None]:
y_pred_gbm = lgb.predict(X_test)

In [None]:
accuracy_gbm = accuracy_score(y_test, y_pred_gbm)
print(f"Accuracy of light gbm with class weight : {accuracy_gbm: .2f}")

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred_gbm)
print("Confusion Matrix:\n", conf_matrix)

print("Classification Report:\n", classification_report(y_test, y_pred_gbm))

### Without class weight

In [None]:
lgb2 = LGBMClassifier()

In [None]:
lgb2.fit(X_train, y_train)

In [None]:
y_pred_simple_gbm = lgb2.predict(X_test)

In [None]:
accuracy_simple_gbm = accuracy_score(y_test, y_pred_simple_gbm)
print(f"Accuracy of light gbm without class weight : {accuracy_simple_gbm: .2f}")

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred_simple_gbm)
print("Confusion Matrix:\n", conf_matrix)

print("Classification Report:\n", classification_report(y_test, y_pred_simple_gbm))

## XGBoost

### With class weight

In [None]:
xgb = XGBClassifier(scale_pos_weight = 275 / 108, random_state = 42)

In [None]:
xgb.fit(X_train, y_train)

In [None]:
y_pred_xgb = xgb.predict(X_test)

In [None]:
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"Accuracy of xgboost with class weight : {accuracy_xgb: .2f}")

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred_xgb)
print("Confusion Matrix:\n", conf_matrix)

print("Classification Report:\n", classification_report(y_test, y_pred_xgb))

### Without class weight

In [None]:
xgb2 = XGBClassifier()

In [None]:
xgb2.fit(X_train, y_train)

In [None]:
y_pred_simple_xgm = xgb2.predict(X_test)

In [None]:
accuracy_simple_xgm = accuracy_score(y_test, y_pred_simple_xgm)
print(f"Accuracy of xg boost without class weight : {accuracy_simple_xgm: .2f}")

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred_simple_xgm)
print("Confusion Matrix:\n", conf_matrix)

print("Classification Report:\n", classification_report(y_test, y_pred_simple_xgm))

# Selecting Best Model

## Random Forest with class weight

In [None]:
y_pred_rf = rf.predict(X_test)

In [None]:
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy of Random Forest with class weight: {accuracy_rf:.2f}")

## Random Forest without class weight

In [None]:
y_pred_simple_rf = rf2.predict(X_test)

In [None]:
accuracy_simple_rf = accuracy_score(y_test, y_pred_simple_rf)
print(f"Accuracy of Random Forest with class weight: {accuracy_simple_rf:.2f}")

# Saving Best Model (Random Forest with class weight)

## Save Pickle Model

In [None]:
with open("model_rf.pkl", "wb") as f:
    pickle.dump(rf, f)

## Load Pickle Model

In [None]:
with open("model_rf.pkl", "rb") as f:
    loaded_model = pickle.load(f)

## Check Loaded Model

In [None]:
y_pred_rf = loaded_model.predict(X_test)
acc = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy Score: {acc:.2f}")