<a href="https://colab.research.google.com/github/MrZuberi/Lung-Cancer-Diagnostic-Tool/blob/main/lungcancerdemo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Uploading The Dataset as CSV**

In [None]:
# Colab-only helper module; this cell is not expected to run outside Google Colab.
from google.colab import files

# Ask the user to upload the dataset file and keep the call's return value in `uploaded`.
# The loading cell below reads 'Lung Cancer.csv', so the uploaded file must carry that name.
uploaded = files.upload()

# **Loading and Exploring the Lung Cancer Data**

In [61]:
import pandas as pd

# Load the uploaded dataset; the file name must match the file provided in the upload cell.
df = pd.read_csv('Lung Cancer.csv')

# Display the first 5 rows
display(df.head())

# Column names, non-null counts and dtypes.
# info() prints its report itself and returns None, so it is called directly:
# wrapping it in display() only added a stray "None" to the cell output.
df.info()

# Display descriptive statistics for numerical columns
display(df.describe())

Unnamed: 0,id,age,gender,country,diagnosis_date,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
0,1,64.0,Male,Sweden,2016-04-05,Stage I,Yes,Passive Smoker,29.4,199,0,0,1,0,Chemotherapy,2017-09-10,0
1,2,50.0,Female,Netherlands,2023-04-20,Stage III,Yes,Passive Smoker,41.2,280,1,1,0,0,Surgery,2024-06-17,1
2,3,65.0,Female,Hungary,2023-04-05,Stage III,Yes,Former Smoker,44.0,268,1,1,0,0,Combined,2024-04-09,0
3,4,51.0,Female,Belgium,2016-02-05,Stage I,No,Passive Smoker,43.0,241,1,1,0,0,Chemotherapy,2017-04-23,0
4,5,37.0,Male,Luxembourg,2023-11-29,Stage I,No,Passive Smoker,19.7,178,0,0,0,0,Combined,2025-01-08,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 890000 entries, 0 to 889999
Data columns (total 17 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  890000 non-null  int64  
 1   age                 890000 non-null  float64
 2   gender              890000 non-null  object 
 3   country             890000 non-null  object 
 4   diagnosis_date      890000 non-null  object 
 5   cancer_stage        890000 non-null  object 
 6   family_history      890000 non-null  object 
 7   smoking_status      890000 non-null  object 
 8   bmi                 890000 non-null  float64
 9   cholesterol_level   890000 non-null  int64  
 10  hypertension        890000 non-null  int64  
 11  asthma              890000 non-null  int64  
 12  cirrhosis           890000 non-null  int64  
 13  other_cancer        890000 non-null  int64  
 14  treatment_type      890000 non-null  object 
 15  end_treatment_date  890000 non-nul

None

Unnamed: 0,id,age,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,survived
count,890000.0,890000.0,890000.0,890000.0,890000.0,890000.0,890000.0,890000.0,890000.0
mean,445000.5,55.007008,30.494172,233.633916,0.750024,0.46974,0.225956,0.088157,0.220229
std,256921.014128,9.994485,8.368539,43.432278,0.432999,0.499084,0.418211,0.283524,0.414401
min,1.0,4.0,16.0,150.0,0.0,0.0,0.0,0.0,0.0
25%,222500.75,48.0,23.3,196.0,1.0,0.0,0.0,0.0,0.0
50%,445000.5,55.0,30.5,242.0,1.0,0.0,0.0,0.0,0.0
75%,667500.25,62.0,37.7,271.0,1.0,1.0,0.0,0.0,0.0
max,890000.0,104.0,45.0,300.0,1.0,1.0,1.0,1.0,1.0


# **Pre-processing Data For Training**


In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Define categorical and numerical features
categorical_features = ['gender', 'country', 'family_history', 'smoking_status', 'treatment_type']
# 'cancer_stage' is ordinal, will be handled separately
numerical_features = ['age', 'bmi', 'cholesterol_level', 'treatment_duration', 'diagnosis_year', 'diagnosis_month', 'hypertension', 'asthma', 'cirrhosis', 'other_cancer']

# Ordinal encoding for 'cancer_stage': Stage I -> 0 ... Stage IV -> 3.
# Any value outside this list maps to NaN.
cancer_stage_order = ['Stage I', 'Stage II', 'Stage III', 'Stage IV']
df['cancer_stage_encoded'] = df['cancer_stage'].map({stage: i for i, stage in enumerate(cancer_stage_order)})

# Derive the date-based features listed in `numerical_features`.
# The CSV only provides the raw 'diagnosis_date' and 'end_treatment_date' strings, so without
# this step the cell depends on columns that no cell creates and fails on a fresh kernel.
diagnosis_dates = pd.to_datetime(df['diagnosis_date'])
end_treatment_dates = pd.to_datetime(df['end_treatment_date'])

# Build the model input in a separate frame: the raw date strings are replaced by the derived
# numeric features, and re-running this cell always starts from the same `df`.
features_df = (
    df
    .drop(columns=['diagnosis_date', 'end_treatment_date'])
    .assign(
        treatment_duration=(end_treatment_dates - diagnosis_dates).dt.days,  # whole days
        diagnosis_year=diagnosis_dates.dt.year,
        diagnosis_month=diagnosis_dates.dt.month,
    )
)

# Create a column transformer for one-hot encoding and scaling
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('scaler', StandardScaler(), numerical_features)
    ],
    remainder='passthrough',  # Keep other columns ('id', 'cancer_stage', 'survived', 'cancer_stage_encoded')
    sparse_threshold=0,  # Always return a dense array so it can be wrapped in a DataFrame below
    verbose_feature_names_out=False  # Plain names ('gender_Male', 'age', 'id') without step prefixes
)

# Apply the preprocessing
preprocessed_data = preprocessor.fit_transform(features_df)

# Ask the fitted transformer for the output column names instead of rebuilding them by hand;
# this keeps names and columns aligned even if the input column order changes.
all_feature_names = list(preprocessor.get_feature_names_out())

# Convert the preprocessed data back to a DataFrame (helpful for inspection)
preprocessed_df = pd.DataFrame(preprocessed_data, columns=all_feature_names)

display(preprocessed_df.head())

Unnamed: 0,gender_Female,gender_Male,country_Austria,country_Belgium,country_Bulgaria,country_Croatia,country_Cyprus,country_Czech Republic,country_Denmark,country_Estonia,...,diagnosis_year,diagnosis_month,hypertension,asthma,cirrhosis,other_cancer,id,cancer_stage,survived,cancer_stage_encoded
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.001276,-0.731335,-1.73216,-0.941206,1.850848,-0.310935,1,Stage I,0,0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.401182,-0.731335,0.577314,1.062467,-0.540293,-0.310935,2,Stage III,1,2
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.401182,-0.731335,0.577314,1.062467,-0.540293,-0.310935,3,Stage III,0,2
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.001276,-1.311131,0.577314,1.062467,-0.540293,-0.310935,4,Stage I,0,0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.401182,1.29795,-1.73216,-0.941206,-0.540293,-0.310935,5,Stage I,0,0


# **Building the CNN Model**

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Separate features (X) and target (y).
# Besides the target, two passthrough columns must not reach the network:
#   - 'id' is a row identifier, not a predictor;
#   - 'cancer_stage' is the raw string label (its numeric form is 'cancer_stage_encoded').
# errors='ignore' keeps the cell working if those columns were already removed upstream.
non_feature_columns = ['survived', 'id', 'cancer_stage']
X = preprocessed_df.drop(columns=non_feature_columns, errors='ignore')
y = preprocessed_df['survived']

# Convert to float32 numpy arrays for TensorFlow.
# preprocessed_df is built from a mixed-type array, so a plain `.values` would hand
# TensorFlow an object array that it cannot turn into a tensor.
X_np = X.to_numpy(dtype='float32')
y_np = y.to_numpy(dtype='float32')

# Reshape X for Conv1D (samples, time steps, features)
# We can treat each feature as a time step for a 1D convolution
X_reshaped = X_np.reshape(X_np.shape[0], X_np.shape[1], 1)

# Define the CNN model
model = Sequential()

# Declare the input shape with an explicit Input layer; passing `input_shape=` to the
# first Conv1D layer is what triggers the Keras warning on construction.
model.add(tf.keras.Input(shape=(X_reshaped.shape[1], 1)))

# First convolution + pooling stage
model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))

# Second convolution + pooling stage
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))

# Flatten the output of the convolutional layers
model.add(Flatten())

# Dense layer followed by Dropout for regularization
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.5))

# Output layer (since this is a binary classification task)
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_reshaped,
    y_np,
    random_state=42,
    test_size=0.2,
)

# Fit the network for 10 epochs with batches of 32 samples.
# validation_split=0.2 reserves a fifth of the training data for validation metrics.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=10,
)

Evaluate the AI's performance

In [None]:
# Imported here because the notebook's shared import cell sits further down; without this
# line the cell raises NameError on a fresh "Restart & Run All".
from matplotlib import pyplot as plt

# Training vs. validation accuracy per epoch, as recorded by model.fit().
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='accuracy')
ax.plot(history.history['val_accuracy'], label='val_accuracy')
ax.set_title('Training and validation accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.set_ylim([0.5, 1])
ax.legend(loc='lower right')
plt.show()

# Score the model on the held-out split produced by train_test_split.
# The previous arguments (test_images / test_labels) are not defined anywhere in this notebook.
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)

Printing the Test Accuracy

In [None]:
# Show the accuracy value returned by model.evaluate() in the evaluation cell.
print(f"{test_acc}")

# Uploading the Image Data As A CSV

In [None]:
# Colab-only helper module (already imported by the first upload cell; repeated so this
# cell can run on its own).
from google.colab import files

# Second upload prompt, intended for the image data CSV named in the heading.
# NOTE: this rebinds `uploaded`, replacing the value stored by the first upload cell.
uploaded = files.upload()

Import the additional libraries that are needed

In [None]:
import math, json, random
import numpy as np
import PIL
import tensorflow as tf

from matplotlib import pyplot as plt
from base64 import b64encode

from google.colab import files
from io import BytesIO
from PIL import Image

Side note: I'll be doing and figuring out the rest tomorrow --AsiaFaraway...