In [4]:
import pandas as pd
import numpy as np

In [14]:
# Import and Preprocess Dataset

df = pd.read_csv('Cleaned_Lung_Cancer_dataset.csv')

df = df.drop('Days from Diagnosis to Treatment', axis=1)
df = df[(df['Cancer Cell Type'] != 'Not Lung Cancer') &
        (df['Cause of Death'] != 'Not Applicable') & 
        (df['Cause of Death'] != 'Unknown')]
df = df.reset_index(drop=True)
df

Unnamed: 0,Age,Tumor Location,Cancer Cell Type,Tumor Extent at Diagnosis,Regional Lymph Node Involvement,Metastatic Spread,Tumor Laterality,Extent of Regional Lymph Node Surgery,Treatment Plan,Cause of Death
0,50-79 years,Upper lobe,"NSCLC-Adenocarcinoma, NOS",Pleural Involvement/Lung Collapse,Ipsilateral Nodes Only,No Distant Metastasis,Right,4+ Nodes Removed,Chemotherapy after Surgery,Alive or Not Cancer-related
1,50-79 years,Upper lobe,"NSCLC-Adenocarcinoma, NOS",Localized/Early disease,No Nodal Involvement,No Distant Metastasis,Right,Unknown/Inapplicable,Unknown Treatment,Alive or Not Cancer-related
2,85+ years,Lower lobe,"NSCLC-Adenocarcinoma, NOS",Localized/Early disease,No Nodal Involvement,No Distant Metastasis,Left,4+ Nodes Removed,Surgery,Alive or Not Cancer-related
3,50-79 years,Upper lobe,"NSCLC-Squamous, NOS",Spread within Lung,No Nodal Involvement,No Distant Metastasis,Right,Regional Biopsy/aspiration only,Radiotherapy after Surgery,Alive or Not Cancer-related
4,50-79 years,Not Specified,Other / Rare,Unknown,Unknown Node Status,Distant Lymph Node,Unspecified,Unknown/Inapplicable,Chemotherapy,Cancer
...,...,...,...,...,...,...,...,...,...,...
190933,50-79 years,Not Specified,"Malignant Tumor Cells, NOS",Unknown,Unknown Node Status,Unknown,Unspecified,Unknown/Inapplicable,Unknown Treatment,Cancer
190934,80-84 years,Not Specified,"Malignant Tumor Cells, NOS",Unknown,Unknown Node Status,Unknown,Unspecified,Unknown/Inapplicable,Unknown Treatment,Cancer
190935,85+ years,Not Specified,"Malignant Tumor Cells, NOS",Unknown,Unknown Node Status,Unknown,Unspecified,Unknown/Inapplicable,Unknown Treatment,Cancer
190936,50-79 years,Not Specified,"Malignant Tumor Cells, NOS",Unknown,Unknown Node Status,Unknown,Unspecified,Unknown/Inapplicable,Unknown Treatment,Cancer


In [16]:
# Implement Naive Bayes for Predicting Survival Rates and Apply Cross-Validation Testing

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score, confusion_matrix

# 1. Separate features and target
X = df.drop(columns='Cause of Death')
y = df['Cause of Death']

# 2. Encode all categorical features to integers
X_encoded = X.copy()
for col in X_encoded.columns:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))

# 3. Encode the target as well
y_encoded = LabelEncoder().fit_transform(y.astype(str))

# 4. Set up Naive Bayes
model = CategoricalNB()

# 5. Set up k-fold cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracies = []
fold = 1

for train_index, test_index in kf.split(X_encoded, y_encoded):
    X_train, X_test = X_encoded.iloc[train_index], X_encoded.iloc[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]

    # Train the model
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_test)

    # Evaluate
    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)

    print(f"Fold {fold} accuracy: {acc:.4f}")
    fold += 1

print(f"\nMean accuracy over {kf.n_splits} folds: {sum(accuracies)/kf.n_splits:.4f}")
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

Fold 1 accuracy: 0.7181
Fold 2 accuracy: 0.7187
Fold 3 accuracy: 0.7228
Fold 4 accuracy: 0.7195
Fold 5 accuracy: 0.7224

Mean accuracy over 5 folds: 0.7203
Confusion matrix:
 [[14951  6274]
 [ 4325 12637]]


In [17]:
# Implement Logistic Regression for Predicting Survival Rates and Apply Cross-Validation Testing

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# 1. Separate features and target
X = df.drop(columns='Cause of Death')
y = df['Cause of Death']

# 2. One-hot encode categorical features
X_encoded = pd.get_dummies(X.astype(str), drop_first=True)

# 3. Encode target as integers
y_encoded = pd.factorize(y.astype(str))[0]

# 4. Set up logistic regression
model = LogisticRegression(max_iter=1000, solver='lbfgs', multi_class='multinomial')

# 5. Set up k-fold cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracies = []
fold = 1

for train_index, test_index in kf.split(X_encoded, y_encoded):
    X_train, X_test = X_encoded.iloc[train_index], X_encoded.iloc[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]

    # Train the model
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_test)

    # Evaluate
    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)

    print(f"Fold {fold} accuracy: {acc:.4f}")
    fold += 1

print(f"\nMean accuracy over {kf.n_splits} folds: {sum(accuracies)/kf.n_splits:.4f}")
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

Fold 1 accuracy: 0.7351
Fold 2 accuracy: 0.7359
Fold 3 accuracy: 0.7383
Fold 4 accuracy: 0.7362
Fold 5 accuracy: 0.7375

Mean accuracy over 5 folds: 0.7366
Confusion matrix:
 [[16089  5136]
 [ 4888 12074]]
