In [116]:
import numpy as np
import pandas as pd

In [117]:
df = pd.read_csv("dataset.csv")

In [118]:
df.shape

(3000, 16)

In [119]:
df.sample(5)

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC_DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL_CONSUMING,COUGHING,SHORTNESS_OF_BREATH,SWALLOWING_DIFFICULTY,CHEST_PAIN,LUNG_CANCER
1345,F,53,1,2,2,1,2,2,1,2,2,2,1,1,1,NO
2215,F,30,2,2,2,1,2,2,2,2,2,2,1,2,1,NO
2926,M,58,2,2,2,2,1,1,2,2,1,1,1,1,2,NO
1577,M,64,2,1,1,2,1,1,1,1,2,1,1,2,1,NO
1478,M,71,1,2,2,2,2,1,1,2,1,2,1,2,1,NO


In [120]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   GENDER                 3000 non-null   str  
 1   AGE                    3000 non-null   int64
 2   SMOKING                3000 non-null   int64
 3   YELLOW_FINGERS         3000 non-null   int64
 4   ANXIETY                3000 non-null   int64
 5   PEER_PRESSURE          3000 non-null   int64
 6   CHRONIC_DISEASE        3000 non-null   int64
 7   FATIGUE                3000 non-null   int64
 8   ALLERGY                3000 non-null   int64
 9   WHEEZING               3000 non-null   int64
 10  ALCOHOL_CONSUMING      3000 non-null   int64
 11  COUGHING               3000 non-null   int64
 12  SHORTNESS_OF_BREATH    3000 non-null   int64
 13  SWALLOWING_DIFFICULTY  3000 non-null   int64
 14  CHEST_PAIN             3000 non-null   int64
 15  LUNG_CANCER            3000 non-null   str  
dtyp

In [121]:
df['LUNG_CANCER'].value_counts()

LUNG_CANCER
YES    1518
NO     1482
Name: count, dtype: int64

In [122]:
df['LUNG_CANCER'] = df['LUNG_CANCER'].map({'YES':1, 'NO':0})

In [123]:
x = df.drop('LUNG_CANCER',axis =1)
y  = df['LUNG_CANCER']

In [124]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=42,
)

In [125]:
categorical_features = ['GENDER']
numeric_features = [col for col in x.columns if col !='GENDER']

In [126]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

In [127]:
preprocessor = ColumnTransformer(transformers=[('num',StandardScaler(),numeric_features),
('cat',OneHotEncoder(drop='first'),categorical_features)])

In [128]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,roc_auc_score

In [129]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(probability=True),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

In [132]:
results =[]

for name, model in models.items():
    pipe = Pipeline([
        ('preprocessor',preprocessor),
        ('model',model)
    ])
    pipe.fit(x_train,y_train)
    y_pred = pipe.predict(x_test)
    y_prob = pipe.predict_proba(x_test)[:,1]
    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "ROC-AUC": roc_auc_score(y_test, y_prob)
    })
results_df = pd.DataFrame(results).sort_values(by ='ROC-AUC',ascending=False)
results_df


Unnamed: 0,Model,Accuracy,Precision,Recall,ROC-AUC
0,Logistic Regression,0.521667,0.516717,0.57047,0.527001
2,SVM,0.486667,0.484277,0.516779,0.504567
1,Random Forest,0.503333,0.5,0.530201,0.502017
3,XGBoost,0.488333,0.486068,0.526846,0.487944


In [133]:
y.value_counts(normalize=True)

LUNG_CANCER
1    0.506
0    0.494
Name: proportion, dtype: float64

In [135]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
import numpy as np

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(max_iter=1000))
])

roc_scores = cross_val_score(
    pipe,
    x,
    y,
    cv=cv,
    scoring='roc_auc'
)

print("ROC scores:", roc_scores)
print("Mean ROC-AUC:", np.mean(roc_scores))

ROC scores: [0.50685069 0.49927215 0.49145404 0.49252089 0.54773071]
Mean ROC-AUC: 0.5075656942126565


In [138]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import pandas as pd

pipe_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42))
])

pipe_rf.fit(x, y)

# Get feature names after preprocessing
ohe = pipe_rf.named_steps['preprocessor'].named_transformers_['cat']
encoded_cat_features = ohe.get_feature_names_out(['GENDER'])

all_features = numeric_features + list(encoded_cat_features)

importances = pipe_rf.named_steps['model'].feature_importances_

feat_imp = pd.DataFrame({
    'Feature': all_features,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

feat_imp.head(10)

Unnamed: 0,Feature,Importance
0,AGE,0.311794
2,YELLOW_FINGERS,0.054441
1,SMOKING,0.053333
13,CHEST_PAIN,0.053301
7,ALLERGY,0.05312
3,ANXIETY,0.051563
11,SHORTNESS_OF_BREATH,0.05141
12,SWALLOWING_DIFFICULTY,0.051296
14,GENDER_M,0.05039
6,FATIGUE,0.049978
