##Import the dataset and libraries

In [1]:
from google.colab import files
uploaded=files.upload()

Saving merged_cancer_symptom_dataset.csv to merged_cancer_symptom_dataset.csv


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

##Loading the dataset and performing data exploration

In [3]:
df=pd.read_csv('merged_cancer_symptom_dataset.csv')

In [4]:
df.head()

Unnamed: 0,Age,Gender,BMI,Smoking,Alcohol,Family_History,Fatigue,Weight_Loss,Cough,Chest_Pain,Bleeding,Mouth_Pain,Ulcers,Abnormal_Bleeding,Night_Sweats,Fever,Shortness_of_Breath,Loss_of_Appetite,Cancer_Type
0,69,Female,29.8,Yes,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,No,Cervical Cancer
1,32,Female,25.6,No,No,No,No,No,No,Yes,No,No,No,No,No,No,No,No,No Cancer
2,89,Male,19.5,Yes,No,No,No,Yes,No,No,No,No,No,No,No,No,Yes,No,No Cancer
3,78,Female,25.4,No,No,No,No,Yes,Yes,No,No,No,No,No,No,No,Yes,Yes,Lung Cancer
4,38,Male,20.9,No,Yes,No,No,No,No,Yes,No,No,No,No,No,No,No,No,Lung Cancer


In [5]:

print(df.describe())
print(df.columns)
print(df.shape)

                Age           BMI
count  40000.000000  40000.000000
mean      53.435575     24.994575
std       20.783089      4.983979
min       18.000000      4.500000
25%       35.000000     21.600000
50%       53.000000     25.000000
75%       71.000000     28.400000
max       89.000000     43.900000
Index(['Age', 'Gender', 'BMI', 'Smoking', 'Alcohol', 'Family_History',
       'Fatigue', 'Weight_Loss', 'Cough', 'Chest_Pain', 'Bleeding',
       'Mouth_Pain', 'Ulcers', 'Abnormal_Bleeding', 'Night_Sweats', 'Fever',
       'Shortness_of_Breath', 'Loss_of_Appetite', 'Cancer_Type'],
      dtype='object')
(40000, 19)


As you can see only 2 numerical features are Age and BMI, We have 40k rows and 19 columns

In [6]:
df.isnull().sum()

Unnamed: 0,0
Age,0
Gender,0
BMI,0
Smoking,0
Alcohol,0
Family_History,0
Fatigue,0
Weight_Loss,0
Cough,0
Chest_Pain,0


Our dataset is clean(no missing values) So no need for data cleaning

Here fatigue and fever looks similar, lets check if they are reduntant or not

In [7]:
print(df['Fever'].value_counts())
print(df['Fatigue'].value_counts())

Fever
No     32071
Yes     7929
Name: count, dtype: int64
Fatigue
No     24044
Yes    15956
Name: count, dtype: int64


They are clearly different so we can conclude that there are no duplicates. or there is an inbuilt function to check.

In [8]:
df.T.duplicated()

Unnamed: 0,0
Age,False
Gender,False
BMI,False
Smoking,False
Alcohol,False
Family_History,False
Fatigue,False
Weight_Loss,False
Cough,False
Chest_Pain,False


So no duplicate columns

In [9]:
df.dtypes

Unnamed: 0,0
Age,int64
Gender,object
BMI,float64
Smoking,object
Alcohol,object
Family_History,object
Fatigue,object
Weight_Loss,object
Cough,object
Chest_Pain,object


##Feature Selection

Our DataSet - 19 columns - 18 features(2 numerical + 16 categorical) + 1 target(categorical)

For checking the linear correlation between numerical and categorical variables we use ANOVA f-test

In [10]:
X_num=df[['Age','BMI']]
y_cat=df['Cancer_Type']

In [11]:
from sklearn.feature_selection import f_classif
f_values,p_values=f_classif(X_num,y_cat)
print(f_values)
print(p_values)


[0.3033649  0.54697383]
[0.91111687 0.74077902]


Here if p-value < 0.05 , It rejects null hypothesis. but our p-values are very large, So both features dont seem useful for predicting Cancer_Type, **linearly**,So lets check non linear correlation

For checking non linear correlation between numerical feature and categorical target - We use mutual information test

In [12]:
from sklearn.feature_selection import mutual_info_classif
mutual_info=mutual_info_classif(X_num,y_cat,discrete_features=False)
print(mutual_info)

[0.0020252 0.0041882]


MI is extremely close to 0 so we are dropping these 2 features for now

In [13]:
df_1=df.drop(['Age','BMI'],axis=1)
df_1

Unnamed: 0,Gender,Smoking,Alcohol,Family_History,Fatigue,Weight_Loss,Cough,Chest_Pain,Bleeding,Mouth_Pain,Ulcers,Abnormal_Bleeding,Night_Sweats,Fever,Shortness_of_Breath,Loss_of_Appetite,Cancer_Type
0,Female,Yes,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,No,Cervical Cancer
1,Female,No,No,No,No,No,No,Yes,No,No,No,No,No,No,No,No,No Cancer
2,Male,Yes,No,No,No,Yes,No,No,No,No,No,No,No,No,Yes,No,No Cancer
3,Female,No,No,No,No,Yes,Yes,No,No,No,No,No,No,No,Yes,Yes,Lung Cancer
4,Male,No,Yes,No,No,No,No,Yes,No,No,No,No,No,No,No,No,Lung Cancer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,Female,No,No,No,Yes,Yes,No,No,Yes,No,No,No,No,No,Yes,Yes,No Cancer
39996,Male,No,No,Yes,No,No,Yes,Yes,No,No,No,No,Yes,No,No,No,Leukemia
39997,Female,No,No,Yes,Yes,No,No,No,No,Yes,No,No,Yes,No,No,No,Leukemia
39998,Female,Yes,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No Cancer


Now, Lets check the linear correlation between 2 categorical variables- chi-square test

In [14]:
X_num_2=df[[col for col in df.columns]]
y_cat_2=df['Cancer_Type']

In [15]:
X_num_2=X_num_2.drop(columns=['Age','BMI','Cancer_Type'])

In [16]:
from sklearn.preprocessing import LabelEncoder
for col in X_num_2.columns:
  if X_num_2[col].dtype=='object':
    X_num_2[col]=LabelEncoder().fit_transform(X_num_2[col])

In [17]:
X_num_2.head()

Unnamed: 0,Gender,Smoking,Alcohol,Family_History,Fatigue,Weight_Loss,Cough,Chest_Pain,Bleeding,Mouth_Pain,Ulcers,Abnormal_Bleeding,Night_Sweats,Fever,Shortness_of_Breath,Loss_of_Appetite
0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0
1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,1
4,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0


In [18]:
X_num_2

Unnamed: 0,Gender,Smoking,Alcohol,Family_History,Fatigue,Weight_Loss,Cough,Chest_Pain,Bleeding,Mouth_Pain,Ulcers,Abnormal_Bleeding,Night_Sweats,Fever,Shortness_of_Breath,Loss_of_Appetite
0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0
1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,1
4,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,1
39996,1,0,0,1,0,0,1,1,0,0,0,0,1,0,0,0
39997,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0
39998,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [19]:
y_cat_2

Unnamed: 0,Cancer_Type
0,Cervical Cancer
1,No Cancer
2,No Cancer
3,Lung Cancer
4,Lung Cancer
...,...
39995,No Cancer
39996,Leukemia
39997,Leukemia
39998,No Cancer


Now chi square test

In [20]:
from sklearn.feature_selection import chi2
chi_values,p_values=chi2(X_num_2,y_cat_2)
chi_results=pd.DataFrame({'chi2':chi_values,'P_Values':p_values},index=X_num_2.columns)

In [21]:
print(chi_results)

                            chi2       P_Values
Gender                  2.433413   7.864881e-01
Smoking               564.898582  7.743019e-120
Alcohol              1015.272196  2.967694e-217
Family_History        711.084390  1.970235e-151
Fatigue                 3.113020   6.825664e-01
Weight_Loss           681.670931  4.509506e-145
Cough                 416.817618   7.031764e-88
Chest_Pain            393.886936   6.161952e-83
Bleeding              772.160603  1.217649e-164
Mouth_Pain            938.495408  1.239270e-200
Ulcers               1153.633053  3.242013e-247
Abnormal_Bleeding    4465.326779   0.000000e+00
Night_Sweats          792.078718  5.982834e-169
Fever                1050.833778  5.925207e-225
Shortness_of_Breath   365.040243   1.010212e-76
Loss_of_Appetite      464.978500   2.883824e-98


Here generally alpha = 0.05(type 1 error)

In [22]:
selected_features=chi_results[chi_results['P_Values']<0.05].index
print(selected_features)

Index(['Smoking', 'Alcohol', 'Family_History', 'Weight_Loss', 'Cough',
       'Chest_Pain', 'Bleeding', 'Mouth_Pain', 'Ulcers', 'Abnormal_Bleeding',
       'Night_Sweats', 'Fever', 'Shortness_of_Breath', 'Loss_of_Appetite'],
      dtype='object')


From our features 14 are selected and remaing 2 are pushed to check non linear correlation with mutual information

Our selected features are - 'Smoking', 'Alcohol', 'Family_History', 'Weight_Loss', 'Cough',
       'Chest_Pain', 'Bleeding', 'Mouth_Pain', 'Ulcers', 'Abnormal_Bleeding',
       'Night_Sweats', 'Fever', 'Shortness_of_Breath', 'Loss_of_Appetite'

In [23]:
X_rem=X_num_2.drop(columns=selected_features)
X_rem.columns

Index(['Gender', 'Fatigue'], dtype='object')

Our remaining features are gender and fatigue

Lets do mutual information test

In [24]:
from sklearn.feature_selection import mutual_info_classif
mutual_info=mutual_info_classif(X_rem,y_cat_2,discrete_features=False)
print(mutual_info)

[0.00506923 0.00161027]


As they are approx 0, MI test conforms they dont give extra information to our target prediction so safely drop them

In [25]:
df_1=df_1.drop(columns=['Gender','Fatigue'])

In [26]:
df_1.head()

Unnamed: 0,Smoking,Alcohol,Family_History,Weight_Loss,Cough,Chest_Pain,Bleeding,Mouth_Pain,Ulcers,Abnormal_Bleeding,Night_Sweats,Fever,Shortness_of_Breath,Loss_of_Appetite,Cancer_Type
0,Yes,No,No,No,No,Yes,No,No,No,Yes,No,Yes,No,No,Cervical Cancer
1,No,No,No,No,No,Yes,No,No,No,No,No,No,No,No,No Cancer
2,Yes,No,No,Yes,No,No,No,No,No,No,No,No,Yes,No,No Cancer
3,No,No,No,Yes,Yes,No,No,No,No,No,No,No,Yes,Yes,Lung Cancer
4,No,Yes,No,No,No,Yes,No,No,No,No,No,No,No,No,Lung Cancer


Since i have all categorical data i decided to use catboost classification - Since i am using tree based model here checking multicollinearity is no need

##Some Basic EDA And Model Selection Thoughts

In [27]:
df_1['Cancer_Type'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Cancer_Type,Unnamed: 1_level_1
No Cancer,0.609475
Lung Cancer,0.18255
Leukemia,0.083575
Breast Cancer,0.06185
Oral Cancer,0.049175
Cervical Cancer,0.013375


Here our target is highly imbalanced

Catboost handles skewed categorical featuresm automatically so our main concern is target imbalance

To fix this we can use several methods - SMOTE,Assigning weights to balance them etc.

I am using class weights to balance

In [28]:
from sklearn.utils.class_weight import compute_class_weight
Y=df_1['Cancer_Type']
classes=np.unique(Y)
weights = compute_class_weight('balanced', classes=classes, y=Y)
class_weights_dict = dict(zip(classes, weights))
print("Class weights: ",class_weights_dict)

Class weights:  {'Breast Cancer': np.float64(2.6946914578280787), 'Cervical Cancer': np.float64(12.461059190031152), 'Leukemia': np.float64(1.9942167713630472), 'Lung Cancer': np.float64(0.9129918743723181), 'No Cancer': np.float64(0.2734593981158647), 'Oral Cancer': np.float64(3.3892560582952043)}


For cancer detection, recall is the most critical metric because missing an actual cancer case is far riskier than raising false alarms.
Since the dataset is highly imbalanced, accuracy and weighted averages are misleading, as they are dominated by the majority "No Cancer" class.
Instead, we should focus on macro recall or per-class recall, which treats each cancer type equally and ensures minority cancers are not ignored.
Using macro F1 or recall on cancer classes gives a fairer evaluation and aligns with the real-world goal of correctly identifying as many true cancer patients as possible.

##Model 1 building

In [29]:
from sklearn.model_selection import train_test_split
X=df_1.drop('Cancer_Type',axis=1)
y=df_1['Cancer_Type']
X_train1,X_test1,y_train1,y_test1=train_test_split(X,y,test_size=0.2,stratify=y,random_state=42)

Now our training set and testing set is ready lets go into building the model

In [30]:
cat_features=[i for i,col in enumerate(X_train1.columns) if X_train1[col].dtype=='object']

In [31]:
print(cat_features)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]


In [32]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [33]:
from catboost import CatBoostClassifier

model=CatBoostClassifier(iterations=100,learning_rate=0.001,depth=10,eval_metric='TotalF1',custom_metric='Recall',random_seed=42,class_weights=list(weights),verbose=100)
model.fit(X_train1,y_train1,cat_features=cat_features,eval_set=(X_test1,y_test1),early_stopping_rounds=50)

0:	learn: 0.3642388	test: 0.3567771	best: 0.3567771 (0)	total: 115ms	remaining: 11.4s
99:	learn: 0.4892445	test: 0.4634902	best: 0.4636063 (92)	total: 16.6s	remaining: 0us

bestTest = 0.4636063395
bestIteration = 92

Shrink model to first 93 iterations.


<catboost.core.CatBoostClassifier at 0x7c17cdf8cd40>

## Model 1 Evaluation

In [34]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,recall_score,precision_score,f1_score
y_pred=model.predict(X_test1)
print(accuracy_score(y_test1,y_pred))
print(classification_report(y_test1,y_pred))

0.25225
                 precision    recall  f1-score   support

  Breast Cancer       0.14      0.39      0.21       495
Cervical Cancer       0.14      0.96      0.24       107
       Leukemia       0.20      0.65      0.30       669
    Lung Cancer       0.34      0.13      0.19      1460
      No Cancer       0.82      0.16      0.27      4876
    Oral Cancer       0.14      0.78      0.24       393

       accuracy                           0.25      8000
      macro avg       0.30      0.51      0.24      8000
   weighted avg       0.60      0.25      0.25      8000



##Model 2 building

In [35]:
X1=df.drop('Cancer_Type',axis=1)
y1=df['Cancer_Type']

In [36]:
from sklearn.model_selection import train_test_split
X_train2,X_test2,y_train2,y_test2=train_test_split(X1,y1,test_size=0.2,stratify=y1,random_state=42)

In [38]:
from sklearn.preprocessing import LabelEncoder

X_train_enc = X_train2.copy()
X_test_enc = X_test2.copy()

for col in X_train2.columns:
  if X_train2[col].dtype == 'object':
    le = LabelEncoder()
    X_train_enc[col] = le.fit_transform(X_train_enc[col])
    X_test_enc[col] = le.transform(X_test_enc[col])

In [39]:
X_train_enc.head()

Unnamed: 0,Age,Gender,BMI,Smoking,Alcohol,Family_History,Fatigue,Weight_Loss,Cough,Chest_Pain,Bleeding,Mouth_Pain,Ulcers,Abnormal_Bleeding,Night_Sweats,Fever,Shortness_of_Breath,Loss_of_Appetite
16308,29,1,21.0,0,1,1,1,1,0,0,0,0,0,0,1,1,0,0
39678,83,1,26.5,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0
24830,64,1,19.1,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0
26901,38,0,20.4,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0
3830,84,0,27.7,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0


In [40]:
X_test_enc.head()

Unnamed: 0,Age,Gender,BMI,Smoking,Alcohol,Family_History,Fatigue,Weight_Loss,Cough,Chest_Pain,Bleeding,Mouth_Pain,Ulcers,Abnormal_Bleeding,Night_Sweats,Fever,Shortness_of_Breath,Loss_of_Appetite
19932,27,1,24.4,0,1,0,1,0,1,1,0,0,1,0,1,0,0,1
21642,25,1,19.2,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0
37765,64,0,25.1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0
30486,60,0,27.2,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1
8783,52,1,26.2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [41]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42,class_weight='balanced')
rf_model.fit(X_train_enc, y_train2)

##Model 2 evaluation

In [42]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

y_pred_rf = rf_model.predict(X_test_enc)
print(accuracy_score(y_test2,y_pred_rf))
print(classification_report(y_test2,y_pred_rf))

0.563375
                 precision    recall  f1-score   support

  Breast Cancer       0.10      0.02      0.04       495
Cervical Cancer       0.15      0.04      0.06       107
       Leukemia       0.23      0.07      0.10       669
    Lung Cancer       0.26      0.12      0.16      1460
      No Cancer       0.61      0.87      0.72      4876
    Oral Cancer       0.13      0.02      0.04       393

       accuracy                           0.56      8000
      macro avg       0.25      0.19      0.19      8000
   weighted avg       0.46      0.56      0.48      8000



Model 1 is better because macro avg of model 1 is 0.51 > 0.19 for model 2

##Model 3 building

In [43]:
X2=X1.copy()
y2=y1.copy()

In [44]:

age_bins = [0, 20, 40, 60, 80, 120]
age_labels = ['0-20', '21-40', '41-60', '61-80', '81+']
X2['Age_binned'] = pd.cut(X2['Age'], bins=age_bins, labels=age_labels)

bmi_bins = [0, 18.5, 25, 30, 100]
bmi_labels = ['Underweight', 'Normal', 'Overweight', 'Obese']
X2['BMI_binned'] = pd.cut(X2['BMI'], bins=bmi_bins, labels=bmi_labels)


In [45]:
X2.head()

Unnamed: 0,Age,Gender,BMI,Smoking,Alcohol,Family_History,Fatigue,Weight_Loss,Cough,Chest_Pain,Bleeding,Mouth_Pain,Ulcers,Abnormal_Bleeding,Night_Sweats,Fever,Shortness_of_Breath,Loss_of_Appetite,Age_binned,BMI_binned
0,69,Female,29.8,Yes,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,No,61-80,Overweight
1,32,Female,25.6,No,No,No,No,No,No,Yes,No,No,No,No,No,No,No,No,21-40,Overweight
2,89,Male,19.5,Yes,No,No,No,Yes,No,No,No,No,No,No,No,No,Yes,No,81+,Normal
3,78,Female,25.4,No,No,No,No,Yes,Yes,No,No,No,No,No,No,No,Yes,Yes,61-80,Overweight
4,38,Male,20.9,No,Yes,No,No,No,No,Yes,No,No,No,No,No,No,No,No,21-40,Normal


In [46]:
X2=X2.drop(['Age','BMI'],axis=1)
display(X2.head())

Unnamed: 0,Gender,Smoking,Alcohol,Family_History,Fatigue,Weight_Loss,Cough,Chest_Pain,Bleeding,Mouth_Pain,Ulcers,Abnormal_Bleeding,Night_Sweats,Fever,Shortness_of_Breath,Loss_of_Appetite,Age_binned,BMI_binned
0,Female,Yes,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,No,61-80,Overweight
1,Female,No,No,No,No,No,No,Yes,No,No,No,No,No,No,No,No,21-40,Overweight
2,Male,Yes,No,No,No,Yes,No,No,No,No,No,No,No,No,Yes,No,81+,Normal
3,Female,No,No,No,No,Yes,Yes,No,No,No,No,No,No,No,Yes,Yes,61-80,Overweight
4,Male,No,Yes,No,No,No,No,Yes,No,No,No,No,No,No,No,No,21-40,Normal


In [47]:
y2.value_counts()

Unnamed: 0_level_0,count
Cancer_Type,Unnamed: 1_level_1
No Cancer,24379
Lung Cancer,7302
Leukemia,3343
Breast Cancer,2474
Oral Cancer,1967
Cervical Cancer,535


We are Converting Age and BMI into categories to see if we can get more information from them

In [48]:
X_train3,X_test3,y_train3,y_test3=train_test_split(X2,y2,test_size=0.2,stratify=y,random_state=42)

In [49]:
cat_features3=list(X2.columns)

In [50]:
print(cat_features3)

['Gender', 'Smoking', 'Alcohol', 'Family_History', 'Fatigue', 'Weight_Loss', 'Cough', 'Chest_Pain', 'Bleeding', 'Mouth_Pain', 'Ulcers', 'Abnormal_Bleeding', 'Night_Sweats', 'Fever', 'Shortness_of_Breath', 'Loss_of_Appetite', 'Age_binned', 'BMI_binned']


In [51]:
model3=CatBoostClassifier(iterations=500,learning_rate=0.05,depth=4,eval_metric='TotalF1',custom_metric='Recall',random_seed=42,class_weights=list(weights),verbose=100,l2_leaf_reg= 1,border_count= 32)
model3.fit(X_train3,y_train3,cat_features=cat_features3,eval_set=(X_test3,y_test3),early_stopping_rounds=50)

0:	learn: 0.3286415	test: 0.3373604	best: 0.3373604 (0)	total: 141ms	remaining: 1m 10s
100:	learn: 0.4735971	test: 0.4746019	best: 0.4750723 (93)	total: 13.8s	remaining: 54.3s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.4773262463
bestIteration = 133

Shrink model to first 134 iterations.


<catboost.core.CatBoostClassifier at 0x7c17c940aae0>

##Model 3 evaluation

In [52]:
y_pred=model3.predict(X_test3)
print(accuracy_score(y_test3,y_pred))
print(classification_report(y_test3,y_pred))


0.253375
                 precision    recall  f1-score   support

  Breast Cancer       0.14      0.39      0.20       495
Cervical Cancer       0.14      1.00      0.25       107
       Leukemia       0.20      0.61      0.30       669
    Lung Cancer       0.33      0.24      0.28      1460
      No Cancer       0.85      0.14      0.24      4876
    Oral Cancer       0.15      0.73      0.24       393

       accuracy                           0.25      8000
      macro avg       0.30      0.52      0.25      8000
   weighted avg       0.62      0.25      0.25      8000



Model 3 is better than model 1

##Model 4 building

In [53]:
X3=X2.copy()
y3=y2.copy()
X3=X3.drop(['Gender','Fatigue'],axis=1)

X3

Unnamed: 0,Smoking,Alcohol,Family_History,Weight_Loss,Cough,Chest_Pain,Bleeding,Mouth_Pain,Ulcers,Abnormal_Bleeding,Night_Sweats,Fever,Shortness_of_Breath,Loss_of_Appetite,Age_binned,BMI_binned
0,Yes,No,No,No,No,Yes,No,No,No,Yes,No,Yes,No,No,61-80,Overweight
1,No,No,No,No,No,Yes,No,No,No,No,No,No,No,No,21-40,Overweight
2,Yes,No,No,Yes,No,No,No,No,No,No,No,No,Yes,No,81+,Normal
3,No,No,No,Yes,Yes,No,No,No,No,No,No,No,Yes,Yes,61-80,Overweight
4,No,Yes,No,No,No,Yes,No,No,No,No,No,No,No,No,21-40,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,No,No,No,Yes,No,No,Yes,No,No,No,No,No,Yes,Yes,21-40,Normal
39996,No,No,Yes,No,Yes,Yes,No,No,No,No,Yes,No,No,No,61-80,Normal
39997,No,No,Yes,No,No,No,No,Yes,No,No,Yes,No,No,No,61-80,Overweight
39998,Yes,No,No,No,No,No,No,No,No,No,No,No,No,No,41-60,Underweight


In [54]:
X_train4,X_test4,y_train4,y_test4=train_test_split(X3,y3,test_size=0.2,stratify=y,random_state=42)

In [55]:
cat_features4=list(X3.columns)

In [56]:
model4=CatBoostClassifier(iterations=100,learning_rate=0.001,depth=10,eval_metric='TotalF1',custom_metric='Recall',random_seed=42,class_weights=list(weights),verbose=100)
model4.fit(X_train4,y_train4,cat_features=cat_features4,eval_set=(X_test4,y_test4),early_stopping_rounds=50)

0:	learn: 0.4864164	test: 0.4415600	best: 0.4415600 (0)	total: 289ms	remaining: 28.6s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.4600726694
bestIteration = 5

Shrink model to first 6 iterations.


<catboost.core.CatBoostClassifier at 0x7c17c93d7b60>

##Model 4 evaluation

In [57]:
y_pred=model4.predict(X_test4)
print(accuracy_score(y_test4,y_pred))
print(classification_report(y_test4,y_pred))


0.25375
                 precision    recall  f1-score   support

  Breast Cancer       0.14      0.40      0.21       495
Cervical Cancer       0.14      0.92      0.25       107
       Leukemia       0.19      0.61      0.29       669
    Lung Cancer       0.35      0.14      0.20      1460
      No Cancer       0.84      0.17      0.28      4876
    Oral Cancer       0.14      0.78      0.24       393

       accuracy                           0.25      8000
      macro avg       0.30      0.50      0.24      8000
   weighted avg       0.61      0.25      0.26      8000



Model 3 is still better so i am finalising X,y as X2,y2 and try xgboost and look for better recall

##Model 5 building

In [58]:
!pip install xgboost



In [59]:
import xgboost as xgb

class_labels = sorted(y_train2.unique())
weights_list = [class_weights_dict[label] for label in class_labels]

xgb_model = xgb.XGBClassifier(objective='multi:softprob',
                              num_class=len(class_labels),
                              eval_metric='merror',
                              use_label_encoder=False,
                              random_state=42,
                              n_estimators=100,
                              learning_rate=0.1,
                              scale_pos_weight=weights_list
                              )


from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train2_encoded = le.fit_transform(y_train2)
y_test2_encoded = le.transform(y_test2)

xgb_model.fit(X_train_enc, y_train2_encoded)

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


##Model 5 evaluation

In [60]:
y_pred_xgb = rf_model.predict(X_test_enc)
print(accuracy_score(y_test2,y_pred_xgb))
print(classification_report(y_test2,y_pred_xgb))

0.563375
                 precision    recall  f1-score   support

  Breast Cancer       0.10      0.02      0.04       495
Cervical Cancer       0.15      0.04      0.06       107
       Leukemia       0.23      0.07      0.10       669
    Lung Cancer       0.26      0.12      0.16      1460
      No Cancer       0.61      0.87      0.72      4876
    Oral Cancer       0.13      0.02      0.04       393

       accuracy                           0.56      8000
      macro avg       0.25      0.19      0.19      8000
   weighted avg       0.46      0.56      0.48      8000



model 3 is better than 5 but 5 is better than 2


I am selecting model 3,5 to optimise recall even further

##Model 3 recall optimisation

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import recall_score,make_scorer


params = {
    'depth': [4, 6],
    'learning_rate': [0.01, 0.05],
    'l2_leaf_reg': [1, 3],
    'border_count': [32, 64]
}

macro_recall_scorer = make_scorer(recall_score, average='macro')

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(estimator=model3, param_grid=params, cv=skf, scoring='recall_macro', n_jobs=-1)
grid.fit(X_train3, y_train3,**{'cat_features': cat_features3})
print(grid.best_params_)
print(grid.best_score_)

AS you saw i already updated my params in model 3 and got recall 0.52