Importing Necessary Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.combine import SMOTEENN
from imblearn.metrics import geometric_mean_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder



Loading the Dataset

In [None]:
df = pd.read_csv("Lung_Cancer_Dataset.csv")

In [None]:
df.head()

In [None]:
df.info()

In [None]:
le = LabelEncoder()
df["LUNG_CANCER"] = le.fit_transform(df["LUNG_CANCER"])

In [None]:
le = LabelEncoder()
df["GENDER"] = le.fit_transform(df["GENDER"])

In [None]:
df.head()

Checking Whether the Data Is Imbalanced

In [None]:
sns.barplot (x=df['LUNG_CANCER'].value_counts().index, y=df['LUNG_CANCER'].value_counts().values)
class_counts = df['LUNG_CANCER'].value_counts()
class_percentages = (class_counts / len(df)) * 100
print(class_percentages)

In [None]:
X = df.drop("LUNG_CANCER", axis=1)
y = df["LUNG_CANCER"]

In [None]:
def evaluate_model(y_true, y_pred, y_proba):
    """Print precision, recall, F1, ROC AUC and G-mean for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Hard class predictions; used for precision, recall, F1 and G-mean.
        y_proba: Scores passed to ``roc_auc_score`` (callers in this notebook
            pass the probability of class 1).

    Returns:
        None. The metrics are written to stdout only.
    """
    print(f"Precision = {precision_score(y_true, y_pred)} ")
    print(f"Recall = {recall_score(y_true, y_pred)} ")
    print(f"F1-score = {f1_score(y_true, y_pred)} ")
    # ROC AUC is the only metric computed from scores rather than hard labels.
    print(f"ROC AUC = {roc_auc_score(y_true, y_proba)} ")
    print(f"G-Mean = {geometric_mean_score(y_true, y_pred, average='binary')} ")


In [None]:
def plot_confusion_matrix(cf_matrix):
    """Draw an annotated heatmap of a 2x2 confusion matrix.

    Each cell is annotated with its role (TN/FP/FN/TP), its raw count and its
    share of all samples.

    Args:
        cf_matrix: Array with ``flatten()`` support holding the confusion
            matrix; the annotations are reshaped to (2, 2), so a binary
            problem is assumed.
    """
    cell_values = cf_matrix.flatten()
    cell_shares = cell_values / np.sum(cf_matrix)
    cell_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']

    # One three-line annotation per cell: name, count, percentage of total.
    annotations = []
    for cell_name, count, share in zip(cell_names, cell_values, cell_shares):
        annotations.append(
            "\n".join([cell_name, "{0:0.0f}".format(count), "{0:.2%}".format(share)])
        )
    annotations = np.asarray(annotations).reshape(2, 2)

    plt.figure(figsize=(5, 4))
    ax = sns.heatmap(cf_matrix, annot=annotations, fmt='', cmap='Blues', cbar=False)
    ax.set_title("Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.show()


## **Oversampling**

In [None]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [None]:
print(y.value_counts())
print(y_resampled.value_counts())

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
clf = LogisticRegression(random_state=101)

In [None]:
clf.fit(X_train_scaled, y_train)

In [None]:
y_pred = clf.predict(X_test_scaled)
y_probs = clf.predict_proba(X_test_scaled)[:, 1]

In [None]:
evaluate_model(y_test, y_pred, y_probs)

In [None]:
cf_matrix = confusion_matrix(y_test, y_pred)

In [None]:
plot_confusion_matrix(cf_matrix)

## **Undersampling**

In [None]:
enn = EditedNearestNeighbours()
X_enn, y_enn = enn.fit_resample(X, y)

In [None]:
print(y.value_counts())
print(y_enn.value_counts())

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_enn, y_enn, test_size=0.3, random_state=42)

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
clf1 = LogisticRegression(random_state=101)
clf1.fit(X_train_scaled, y_train)

In [None]:
y_pred = clf1.predict(X_test_scaled)
y_probs = clf1.predict_proba(X_test_scaled)[:, 1]

In [None]:
evaluate_model(y_test, y_pred, y_probs)

In [None]:
cf_matrix = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cf_matrix)

## **Ensemble Method**

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
ada = AdaBoostClassifier(random_state=42)
ada.fit(X_train_scaled, y_train)

In [None]:
ada_pred = ada.predict(X_test_scaled)
ada_probs = ada.predict_proba(X_test_scaled)[:, 1]

In [None]:
evaluate_model(y_test, ada_pred, ada_probs)

In [None]:
cf_ada = confusion_matrix(y_test, ada_pred)
plot_confusion_matrix(cf_ada)

## **Threshold Moving**

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
clf3 = LogisticRegression(random_state=101)
clf3.fit(X_train_scaled, y_train)


In [None]:
def find_best_threshold(y_test, y_probs, n_thresholds=1000):
    """Grid-search the probability cut-off that maximises F1 for class 1.

    A sample is predicted positive when ``y_probs >= threshold``. Precision and
    recall are computed directly from NumPy counts, which avoids two scikit-learn
    metric calls (with full input validation) per candidate threshold and also
    lets plain Python lists be passed in.

    Args:
        y_test: True binary labels; the value ``1`` is treated as the positive
            class. Despite the parameter name, pass labels of a *validation*
            set here: tuning the threshold on the data later used for reporting
            gives optimistic scores.
        y_probs: Predicted scores/probabilities for the positive class, same
            length as ``y_test``.
        n_thresholds: Number of evenly spaced candidates in ``[0, 1]``
            (endpoints included).

    Returns:
        tuple: ``(best_threshold, best_f1)``. On ties the lowest threshold wins,
        because only a strictly larger F1 replaces the current best.

    Raises:
        ValueError: If ``n_thresholds < 1`` or the inputs differ in length.
    """
    if n_thresholds < 1:
        raise ValueError("n_thresholds must be at least 1")

    is_positive = np.asarray(y_test).ravel() == 1
    scores = np.asarray(y_probs, dtype=float).ravel()
    if is_positive.shape != scores.shape:
        raise ValueError(
            f"y_test and y_probs must have the same length, "
            f"got {is_positive.shape[0]} and {scores.shape[0]}"
        )

    # Loop-invariant: the number of actual positives does not depend on the threshold.
    n_positive = np.count_nonzero(is_positive)

    best_f1 = -1.0
    best_threshold = 0.0

    for thresh in np.linspace(0, 1, n_thresholds):
        predicted_positive = scores >= thresh
        n_predicted = np.count_nonzero(predicted_positive)
        true_positives = np.count_nonzero(predicted_positive & is_positive)

        # An undefined ratio (no predicted / no actual positives) counts as 0.
        precision = true_positives / n_predicted if n_predicted else 0.0
        recall = true_positives / n_positive if n_positive else 0.0

        if (precision + recall) == 0:
            f1 = 0.0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)

        if f1 > best_f1:
            best_f1 = f1
            best_threshold = thresh

    return float(best_threshold), float(best_f1)

In [None]:
y_probs = clf3.predict_proba(X_test_scaled)[:,1]

In [None]:
best_thrsh = find_best_threshold(y_test, y_probs)

In [None]:
y_pred_final = (y_probs >= best_thrsh[0]).astype(int)

In [None]:
evaluate_model(y_test, y_pred_final, y_probs)

In [None]:
cf_matrix = confusion_matrix(y_test, y_pred_final)
plot_confusion_matrix(cf_matrix)

In [None]:
find_best_threshold(y_test, y_probs)

## **Combining Sampling and Threshold Moving**

In [None]:
smote_enn = SMOTEENN(random_state=42)
X_senn, y_senn = smote_enn.fit_resample(X, y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_senn, y_senn, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
clf4 = LogisticRegression(random_state=101)
clf4.fit(X_train_scaled, y_train)

In [None]:
y_probs1 = clf4.predict_proba(X_test_scaled)[:,1]

In [None]:
best_thresh = find_best_threshold(y_test, y_probs1)

In [None]:
y_pred = (y_probs1 >= best_thresh[0]).astype(int)

In [None]:
evaluate_model(y_test, y_pred, y_probs1)

In [None]:
cf_matrix = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cf_matrix)

## **Baseline Model**

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
clf5 = LogisticRegression(random_state=101)
clf5.fit(X_train, y_train)

In [None]:
y_pred = clf5.predict(X_test)
y_probs = clf5.predict_proba(X_test_scaled)[:, 1]

In [None]:
evaluate_model(y_test, y_pred, y_probs)

In [None]:
from sklearn.metrics import confusion_matrix
cf_matrix = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cf_matrix)

In [None]:
# NOTE(review): these G-mean values are hard-coded, presumably copied by hand
# from the evaluate_model outputs above. Re-check them whenever any section is
# changed or re-run.
results = {
    "Method": ["Baseline", "Oversampling", "Undersampling", "Threshold Moving", "AdaBoost", "Combination"],
    "G": [ 0.845, 0.956, 0.892, 0.845, 0.965, 0.993]
}

# Use a dedicated name: assigning to `df` would overwrite the loaded dataset and
# break any earlier cell that is re-executed afterwards.
results_df = pd.DataFrame(results)

plt.figure(figsize=(7, 5))

sns.lineplot(x="Method", y="G", data=results_df, marker="o", color="b", linewidth=2, markersize=8)

plt.xticks(rotation=30, ha='right', fontsize=10)
plt.title("G-mean Comparison")
plt.xlabel("Method")
plt.ylabel("G-mean")
plt.ylim(0.82, 1.03)

plt.grid(True ,linestyle='--')
plt.tight_layout()
plt.show()