<a href="https://colab.research.google.com/github/MrZuberi/Lung-Cancer-Diagnostic-Tool/blob/main/Lung_Cancer_Predictive_Accuracy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Importing Dataset***

In [150]:
import kagglehub
khwaishsaxena_lung_cancer_dataset_path = kagglehub.dataset_download('khwaishsaxena/lung-cancer-dataset')

print('Data source import complete.')

Data source import complete.


# ***Initial Data Exploration***

In [152]:
import pandas as pd

df = pd.read_csv("/kaggle/input/lung-cancer-dataset/Lung Cancer.csv")

df.columns

Index(['id', 'age', 'gender', 'country', 'diagnosis_date', 'cancer_stage',
       'family_history', 'smoking_status', 'bmi', 'cholesterol_level',
       'hypertension', 'asthma', 'cirrhosis', 'other_cancer', 'treatment_type',
       'end_treatment_date', 'survived'],
      dtype='object')

In [153]:
df.head()

Unnamed: 0,id,age,gender,country,diagnosis_date,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
0,1,64.0,Male,Sweden,2016-04-05,Stage I,Yes,Passive Smoker,29.4,199,0,0,1,0,Chemotherapy,2017-09-10,0
1,2,50.0,Female,Netherlands,2023-04-20,Stage III,Yes,Passive Smoker,41.2,280,1,1,0,0,Surgery,2024-06-17,1
2,3,65.0,Female,Hungary,2023-04-05,Stage III,Yes,Former Smoker,44.0,268,1,1,0,0,Combined,2024-04-09,0
3,4,51.0,Female,Belgium,2016-02-05,Stage I,No,Passive Smoker,43.0,241,1,1,0,0,Chemotherapy,2017-04-23,0
4,5,37.0,Male,Luxembourg,2023-11-29,Stage I,No,Passive Smoker,19.7,178,0,0,0,0,Combined,2025-01-08,0


## ***To calculate the number of days treatment was given we subtracted the end date by the diagnosis date***

In [154]:
df['diagnosis_date'] = pd.to_datetime(df['diagnosis_date'])
df['end_treatment_date'] = pd.to_datetime(df['end_treatment_date'])

df['treatment_duration_days'] = (df['end_treatment_date'] - df['diagnosis_date']).dt.days

df.drop(['diagnosis_date', 'end_treatment_date'], axis=1, inplace=True)

In [155]:
columns_to_encode = ['gender', 'country', 'family_history', 'smoking_status', 'treatment_type']

df = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)

In [156]:
stage_order = ['Stage I', 'Stage II', 'Stage III', 'Stage IV']

df['cancer_stage'] = pd.Categorical(df['cancer_stage'], categories=stage_order, ordered=True).codes

In [157]:
df = df.astype({col: 'int' for col in df.select_dtypes('bool').columns})

# ***Select Ensemble Model Type***

In [158]:
from sklearn.model_selection import train_test_split

X = df.drop(['id','survived'],axis=1)
y = df['survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=4)

In [159]:
def preprocess_input(user_input_dict, training_columns):
    """
    Takes a dictionary of raw user input and preprocesses it
    to match the format required by the trained model.
    """
    # Convert to DataFrame
    df = pd.DataFrame([user_input_dict])

    if 'diagnosis_date' in df.columns and 'end_treatment_date' in df.columns:
        df['diagnosis_date'] = pd.to_datetime(df['diagnosis_date'])
        df['end_treatment_date'] = pd.to_datetime(df['end_treatment_date'])
        df['treatment_duration_days'] = (df['end_treatment_date'] - df['diagnosis_date']).dt.days
        df.drop(['diagnosis_date', 'end_treatment_date'], axis=1, inplace=True)


    stage_map = {'Stage I': 0, 'Stage II': 1, 'Stage III': 2, 'Stage IV': 3}
    df['cancer_stage'] = df['cancer_stage'].map(stage_map)

    # OneHot Encoding
    columns_to_encode = ['gender', 'country', 'family_history', 'smoking_status', 'treatment_type']
    df = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)


    final_df = df.reindex(columns=training_columns, fill_value=0)

    return final_df

In [160]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [161]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='binary', zero_division=0))
print("Recall:", recall_score(y_test, y_pred, average='binary'))

Accuracy: 0.7795880149812734
Precision: 0.18421052631578946
Recall: 0.00010817326266013507


In [162]:
from sklearn.metrics import roc_curve, auc
y_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

## ***The Model Outputs below Average Results***

In [163]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

y_pred_lr = log_reg.predict(X_test)
y_proba_lr = log_reg.predict_proba(X_test)[:, 1]

In [164]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Logistic Regression Precision:", precision_score(y_test, y_pred_lr, zero_division=0))
print("Logistic Regression Recall:", recall_score(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.7796697310180456
Logistic Regression Precision: 0.0
Logistic Regression Recall: 0.0


# ***Resampling training data***




**Explanation**:
Apply SMOTE to the training data to balance the classes.



In [165]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Original dataset shape %s" % y_train.value_counts())
print("Resampled dataset shape %s" % y_train_resampled.value_counts())

Original dataset shape survived
0    465007
1    131293
Name: count, dtype: int64
Resampled dataset shape survived
0    465007
1    465007
Name: count, dtype: int64


## ***Re-instantiate the XGBoost model and fit it to the resampled training data***




In [166]:
model_resampled = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model_resampled.fit(X_train_resampled, y_train_resampled)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


# ***Evaluate Retrained Model***



**Reasoning**:
Predict labels and probabilities on the test set using the retrained model and calculate evaluation metrics.



In [167]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_curve, auc
import matplotlib.pyplot as plt

y_pred_resampled = model_resampled.predict(X_test)
y_proba_resampled = model_resampled.predict_proba(X_test)[:, 1]

accuracy_resampled = accuracy_score(y_test, y_pred_resampled)
precision_resampled = precision_score(y_test, y_pred_resampled, average='binary', zero_division=0)
recall_resampled = recall_score(y_test, y_pred_resampled, average='binary')
fpr_resampled, tpr_resampled, _ = roc_curve(y_test, y_proba_resampled)
roc_auc_resampled = auc(fpr_resampled, tpr_resampled)

print("Resampled Model Accuracy:", accuracy_resampled)
print("Resampled Model Precision:", precision_resampled)
print("Resampled Model Recall:", recall_resampled)
print("Resampled Model AUC-ROC:", roc_auc_resampled)

Resampled Model Accuracy: 0.7502349336057201
Resampled Model Precision: 0.21560628988749259
Resampled Model Recall: 0.05064054024818037
Resampled Model AUC-ROC: 0.5018205243202877


# ***Comparing Retrained Model***

Compare the evaluation metrics of the retrained model with the original model to assess the impact of resampling.


In [170]:
print("Original Model Metrics:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='binary', zero_division=0))
print("Recall:", recall_score(y_test, y_pred, average='binary'))
print("AUC-ROC:", roc_auc)

print("\nResampled Model Metrics:")
print("Accuracy:", accuracy_resampled)
print("Precision:", precision_resampled)
print("Recall:", recall_resampled)
print("AUC-ROC:", roc_auc_resampled)

Original Model Metrics:
Accuracy: 0.7795880149812734
Precision: 0.18421052631578946
Recall: 0.00010817326266013507
AUC-ROC: 0.4986273721566256

Resampled Model Metrics:
Accuracy: 0.7502349336057201
Precision: 0.21560628988749259
Recall: 0.05064054024818037
AUC-ROC: 0.5018205243202877


# ***Test Prompt***



**Explanation**:
The model using resampled data shows an increase in precision and recall



In [179]:

user_input = {
    'age': 18,
    'gender': 'Male',
    'country': 'United Kingdom',
    'cancer_stage': 'Stage I',
    'cholesterol_level': 197,
    'bmi': 22.5,
    'hypertension': 0,
    'asthma': 0,
    'cirrhosis': 0,
    'other_cancer': 0, # Added missing feature
    'family_history': 'No',
    'smoking_status': 'Never Smoked', # Use the original category name
    'treatment_type': 'Surgery', # Use the original category name
    'diagnosis_date': '2018-04-28',
    'end_treatment_date': '2025-08-23'
}

# Fetching training set columns
training_columns = X_train.columns.tolist()

# Preprocess the input using the function
preprocessed_user_df = preprocess_input(user_input, training_columns)

# Make the prediction
prediction = model_resampled.predict(preprocessed_user_df)
prediction_probability = model_resampled.predict_proba(preprocessed_user_df)[:, 1]

print(f"Predicted Survival Outcome: {prediction[0]}")
print(f"Predicted Survival Probability: {prediction_probability[0]:.2f}")

Predicted Survival Outcome: 1
Predicted Survival Probability: 0.98
