<a href="https://colab.research.google.com/github/MrZuberi/Lung-Cancer-Diagnostic-Tool/blob/main/lungcancerdemo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Uploading The Dataset as CSV**

In [None]:
from google.colab import files

# Prompt for a file upload through google.colab's `files` helper and keep
# the value it returns in `uploaded`.
uploaded = files.upload()

# Expected upload: "Lung Cancer.csv" -- the loading cell below reads the file by this exact name.

# **Loading and Previewing the Lung Cancer Data**

In [9]:
import pandas as pd
import io

# Read the uploaded dataset from the working directory into a DataFrame
df = pd.read_csv('Lung Cancer.csv')

# Preview the first five records to sanity-check the columns and values
display(df.head(n=5))

Unnamed: 0,id,age,gender,country,diagnosis_date,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
0,1,64.0,Male,Sweden,2016-04-05,Stage I,Yes,Passive Smoker,29.4,199.0,0.0,0.0,1.0,0.0,Chemotherapy,2017-09-10,0.0
1,2,50.0,Female,Netherlands,2023-04-20,Stage III,Yes,Passive Smoker,41.2,280.0,1.0,1.0,0.0,0.0,Surgery,2024-06-17,1.0
2,3,65.0,Female,Hungary,2023-04-05,Stage III,Yes,Former Smoker,44.0,268.0,1.0,1.0,0.0,0.0,Combined,2024-04-09,0.0
3,4,51.0,Female,Belgium,2016-02-05,Stage I,No,Passive Smoker,43.0,241.0,1.0,1.0,0.0,0.0,Chemotherapy,2017-04-23,0.0
4,5,37.0,Male,Luxembourg,2023-11-29,Stage I,No,Passive Smoker,19.7,178.0,0.0,0.0,0.0,0.0,Combined,2025-01-08,0.0


# **Pre-processing Data For Training**


In [16]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import numpy as np
import pandas as pd # Importing pandas

# Feature groups used by the column transformer below.
# 'cancer_stage' is ordinal and is encoded separately (see cancer_stage_order).
categorical_features = ['gender', 'country', 'family_history', 'smoking_status', 'treatment_type']
numerical_features = ['age', 'bmi', 'cholesterol_level', 'hypertension', 'asthma', 'cirrhosis', 'other_cancer']
target_column = 'survived'

# Coerce the numeric features and the target to numeric dtypes; any entry that
# cannot be parsed becomes NaN so it can be handled explicitly below.
for column in numerical_features + [target_column]:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# A label must never be imputed: a mean-filled target is a fractional value,
# not a valid 0/1 class. Rows without a usable label are dropped instead.
df = df.dropna(subset=[target_column])

# Fill remaining gaps in the numeric columns (target excluded) with the column mean.
feature_means = df.mean(numeric_only=True).drop(labels=[target_column])
df = df.fillna(feature_means)

# Ordinal encoding for 'cancer_stage': Stage I -> 0 ... Stage IV -> 3.
# Values outside this list map to NaN and are mean-filled after preprocessing.
cancer_stage_order = ['Stage I', 'Stage II', 'Stage III', 'Stage IV']
stage_to_rank = {stage: rank for rank, stage in enumerate(cancer_stage_order)}
df['cancer_stage_encoded'] = df['cancer_stage'].map(stage_to_rank)

# One-hot encode the categorical columns and standardise the numeric ones.
# NOTE(review): the encoder and scaler are fit on the full dataset here, before
# the train/test split performed later in the notebook, so test-set statistics
# leak into the scaling. Consider fitting on the training rows only.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('scaler', StandardScaler(), numerical_features)
    ],
    # Keep other columns (like 'id', 'survived', 'cancer_stage_encoded',
    # diagnosis_date, end_treatment_date); later cells refer to them by their
    # 'remainder__' names.
    remainder='passthrough',
    # Always return a dense array: pd.DataFrame(...) below cannot be built from
    # a scipy sparse matrix, and the default threshold only yields dense output
    # when the stacked result happens to be dense enough.
    sparse_threshold=0
)

# Apply the preprocessing
preprocessed_data = preprocessor.fit_transform(df)

# Column names of the transformed output, taken from the fitted preprocessor
all_preprocessed_feature_names = preprocessor.get_feature_names_out()

# Create a new dataframe from the preprocessed data
preprocessed_df = pd.DataFrame(preprocessed_data, columns=all_preprocessed_feature_names)

# Ensure all columns in the preprocessed DataFrame are numeric.
# The passthrough text columns (dates, raw 'cancer_stage') become all-NaN here.
preprocessed_df = preprocessed_df.apply(pd.to_numeric, errors='coerce')

# Mean-fill NaNs introduced by the coercion. Columns that are entirely NaN have
# no mean and therefore stay NaN; they are excluded from the features later.
preprocessed_df = preprocessed_df.fillna(preprocessed_df.mean(numeric_only=True))

display(preprocessed_df.head())

Unnamed: 0,onehot__gender_Female,onehot__gender_Male,onehot__country_Austria,onehot__country_Belgium,onehot__country_Bulgaria,onehot__country_Croatia,onehot__country_Cyprus,onehot__country_Czech Republic,onehot__country_Denmark,onehot__country_Estonia,...,scaler__hypertension,scaler__asthma,scaler__cirrhosis,scaler__other_cancer,remainder__id,remainder__diagnosis_date,remainder__cancer_stage,remainder__end_treatment_date,remainder__survived,remainder__cancer_stage_encoded
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.735067,-0.941979,1.852068,-0.311069,1,,,,0.0,0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.576347,1.061596,-0.539938,-0.311069,2,,,,1.0,2
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.576347,1.061596,-0.539938,-0.311069,3,,,,0.0,2
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.576347,1.061596,-0.539938,-0.311069,4,,,,0.0,0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.735067,-0.941979,-0.539938,-0.311069,5,,,,0.0,0


# **Developing a CNN With TensorFlow**

In [12]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
import pandas as pd
import numpy as np

# Assumes preprocessed_df is the all-numeric DataFrame produced by the preprocessing cell.
# Separate features (X) and target (y).
# Only the label, the row identifier and the passthrough text columns (all-NaN
# after numeric coercion) are excluded. The ordinal 'remainder__cancer_stage_encoded'
# column is intentionally kept: it is an engineered feature, not a target or identifier.
non_feature_columns = [
    'remainder__survived',
    'remainder__id',
    'remainder__cancer_stage',
    'remainder__diagnosis_date',
    'remainder__end_treatment_date',
]
X = preprocessed_df.drop(columns=non_feature_columns)
y = preprocessed_df['remainder__survived']

# Convert to float32 numpy arrays for TensorFlow
X_np = X.to_numpy(dtype=np.float32)
y_np = y.to_numpy(dtype=np.float32)

# Reshape X for Conv1D: (samples, steps, channels).
# Each tabular feature is treated as one step with a single channel.
X_reshaped = X_np.reshape(X_np.shape[0], X_np.shape[1], 1)

# Define the CNN model.
# The input shape is declared with an explicit Input layer rather than an
# `input_shape=` argument on the first Conv1D, which Keras warns against for
# Sequential models.
model = Sequential([
    tf.keras.Input(shape=(X_reshaped.shape[1], 1)),
    # First convolution + pooling stage
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    # Second convolution + pooling stage
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    # Flatten the convolutional output for the dense head
    Flatten(),
    Dense(100, activation='relu'),
    # Dropout for regularization
    Dropout(0.5),
    # Single sigmoid unit, since this is a binary task
    Dense(1, activation='sigmoid'),
])

# Compile with binary cross-entropy and track accuracy
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


# **Splitting Data Into Training and Test Sets**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_reshaped,
    y_np,
    test_size=0.2,
    random_state=42,
)

# **Training Model and Post Training Analysis**



In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Optimizer for the training run; compiling again below replaces the optimizer
# that was set when the model was first compiled.
optimizer = Adam(learning_rate=0.02)

# Learning-rate scheduler: when val_loss has not improved for 2 epochs,
# multiply the learning rate by 0.1, never going below 1e-6.
LRS = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, min_lr=1e-6)

# Compile the model (the original passed the misspelled name `optimzer`,
# which raised a NameError before training could start)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Train, holding out 20% of the training data for validation so the
# scheduler has a val_loss to monitor
history = model.fit(
    X_train, y_train, epochs=10, batch_size=32,
    validation_split=0.2,
    callbacks=[LRS]
)

# Evaluate on the held-out test set, which is not seen during training
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test loss: {test_loss:.4f} | Test accuracy: {test_accuracy:.4f}")

# TODO: plot the training/validation curves from history.history with Matplotlib