In [None]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [None]:
!pip install shap

In [None]:
pip install xgboost

In [None]:
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
import shap
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.inspection import permutation_importance
from sklearn.utils.validation import check_is_fitted

In [None]:
# Path of the raw dataset. Defaults to the original author's location; set the
# LUNG_CANCER_CSV environment variable to run the notebook on another machine.
DATA_PATH = os.environ.get("LUNG_CANCER_CSV", "C:\\Users\\jithi\\Desktop\\lung_cancer_data.csv")
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Column labels of the dataset.
df.columns

In [None]:
# (rows, columns) of the dataset.
df.shape

In [None]:
# Columns earmarked as unwanted.
# NOTE(review): this list is not referenced again in the visible part of the
# notebook — confirm whether the columns should be dropped or the list removed.
unwanted_columns = ['Patient_ID', 'Occupation', 'Residential_Area']

In [None]:
# Dtype and non-null overview of every column.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Hand-written list of categorical column names.
# NOTE(review): the encoding cell that follows selects its columns by object
# dtype instead of using this list — confirm whether the list is still needed.
categorical_cols = ['Gender', 'Smoking_History', 'Occupation', 'Residential_Area',
                    'Physical_Activity_Level', 'Dietary_Habits', 'Comorbidities',
                    'Stage_of_Cancer', 'Treatment_Type', 'Medication_Response', 'Symptom_Progression']

In [None]:
# Integer-encode every object-dtype column of df in place and keep the fitted
# encoder of each column in label_encoders, keyed by column name.
categorical_columns = df.select_dtypes(include=['object']).columns
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

In [None]:
# First ten rows after the label-encoding cell above.
df.head(10)

In [None]:
# Set Seaborn style for better aesthetics
sns.set(style="whitegrid")

In [None]:
# Encode any object-dtype columns that are still un-encoded.
# The earlier encoding cell already converted the object columns, so this
# selection is normally empty. The encoder dict is therefore extended rather
# than re-created: resetting it to {} here threw away every fitted encoder and
# with it the mapping from integer codes back to the original labels.
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [None]:
# 2. Gender Proportion - Pie Chart
# Share of each Gender value, drawn on an explicit 6x6 figure/axes pair.
gender_counts = df['Gender'].value_counts()
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    gender_counts,
    labels=gender_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=['#ff9999', '#66b3ff'],
)
ax.set_title("Gender Proportion")
ax.axis('equal')  # keep the pie circular
plt.show()


In [None]:
# 3. Smoking History Impact - Bar Chart (Count per category)
# Bars are ordered from the most to the least frequent category.
smoking_order = df['Smoking_History'].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Smoking_History', order=smoking_order, palette="viridis", ax=ax)
ax.set_title("Smoking History Count")
ax.set_xlabel("Smoking History")
ax.set_ylabel("Count")
plt.show()

In [None]:
# 4. Cancer Stage Distribution - Bar Chart
# Bars are ordered from the most to the least frequent stage.
stage_order = df['Stage_of_Cancer'].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Stage_of_Cancer', order=stage_order, palette="magma", ax=ax)
ax.set_title("Cancer Stage Distribution")
ax.set_xlabel("Stage of Cancer")
ax.set_ylabel("Count")
plt.show()

In [None]:
# 5. Tumor Size vs. Survival Years - Scatter Plot
# One semi-transparent point per row, coloured by cancer stage.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='Tumor_Size_cm',
    y='Survival_Years',
    hue='Stage_of_Cancer',
    palette="deep",
    alpha=0.6,
    ax=ax,
)
ax.set_title("Tumor Size vs. Survival Years")
ax.set_xlabel("Tumor Size (cm)")
ax.set_ylabel("Survival Years")
ax.legend(title="Stage of Cancer")
plt.show()

In [None]:
# 6. Metastasis Status vs. Stage of Cancer - Stacked Bar Chart
# Prepare data for stacked bar chart
# Rows: cancer stage, columns: metastasis status, cells: row counts (0 where a
# combination never occurs). Stages are then ordered by total count, largest first.
stage_by_metastasis = (
    df.groupby(['Stage_of_Cancer', 'Metastasis_Status'])
    .size()
    .unstack(fill_value=0)
)
stages_by_total = stage_by_metastasis.sum(axis=1).sort_values(ascending=False).index
stack_data = stage_by_metastasis.loc[stages_by_total]

In [None]:
# Stacked bars: one bar per cancer stage, split by metastasis status.
ax = stack_data.plot(kind='bar', stacked=True, figsize=(10, 6), colormap='Set2')
ax.set_title("Metastasis Status vs. Stage of Cancer")
ax.set_xlabel("Stage of Cancer")
ax.set_ylabel("Count")
# Legend entries are relabelled by position, in the column order of stack_data.
ax.legend(title="Metastasis Status", labels=["False", "True"])
plt.show()

In [None]:
# Display initial info about the dataset
print("Initial Data Information:")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
df.info()
print("\nMissing Values per Column:")
print(df.isnull().sum())

In [None]:
 #Data Type Conversions
# Ensure boolean columns are of bool type
# Names of the columns that the following cell casts to bool.
bool_columns = ['Family_History_Cancer', 'Exposure_to_Toxins', 'Chest_Pain_Symptoms',
                'Shortness_of_Breath', 'Chronic_Cough', 'Weight_Loss',
                'Previous_Cancer_Diagnosis', 'Metastasis_Status']

In [None]:
# Cast every column listed in bool_columns to bool in a single assignment.
df[bool_columns] = df[bool_columns].astype(bool)

In [None]:
#Handling Outliers (Example: Tumor_Size_cm)
# You can apply various methods to handle outliers. Here's a simple example using the IQR method:
# First and third quartile of Tumor_Size_cm and the inter-quartile range between them.
Q1 = df['Tumor_Size_cm'].quantile(0.25)
Q3 = df['Tumor_Size_cm'].quantile(0.75)
IQR = Q3 - Q1

In [None]:
# Define a threshold for outliers
# Values more than 1.5 * IQR below Q1 or above Q3 fall outside these bounds.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [None]:
# Option: Cap outliers at the threshold
# Values below lower_bound / above upper_bound are replaced by the bound itself;
# everything in between is left untouched.
df['Tumor_Size_cm'] = df['Tumor_Size_cm'].clip(lower=lower_bound, upper=upper_bound)


In [None]:
#Feature Engineering (Example: Calculate BMI category)
def bmi_category(bmi):
    """Map a BMI value to its weight-status label.

    Parameters
    ----------
    bmi : float
        Body-mass index of one patient.

    Returns
    -------
    str or float
        'Underweight' (bmi < 18.5), 'Normal' (bmi < 25), 'Overweight'
        (bmi < 30) or 'Obese' (bmi >= 30). A missing BMI (None / NaN) yields
        NaN: NaN fails every `<` comparison, so without this guard it would
        fall through to the final branch and be labelled 'Obese'.
    """
    if pd.isna(bmi):
        return np.nan
    if bmi < 18.5:
        return 'Underweight'
    elif bmi < 25:
        return 'Normal'
    elif bmi < 30:
        return 'Overweight'
    else:
        return 'Obese'

In [None]:
# Derive the BMI label of every row and store it as a categorical column.
df['BMI_Category'] = df['BMI'].map(bmi_category).astype('category')

In [None]:
#Encoding Categorical Variables (Optional: If needed for modeling)
# For example, using one-hot encoding for 'Stage_of_Cancer'
# The result is stored in a separate frame; df itself is not reassigned here.
# NOTE(review): df_encoded is not used again in the visible part of the
# notebook — confirm it is still needed.
df_encoded = pd.get_dummies(df, columns=['Stage_of_Cancer'], drop_first=True)

In [None]:
# Display cleaned data info
print("\nCleaned Data Information:")
# DataFrame.info() prints on its own and returns None; print(df.info()) added a
# stray "None" line after the report.
df.info()
print("\nSample of Cleaned Data:")
print(df.head())

In [None]:
# Optionally, save the cleaned dataset to a new CSV file
# Written relative to the current working directory, without the index column.
df.to_csv("lung_cancer_data_cleaned.csv", index=False)


In [None]:
#encoding

In [None]:
# List of categorical columns to encode
# NOTE(review): this rebinds categorical_columns, which earlier held the
# object-dtype column index — confirm the overwrite is intended.
categorical_columns = [
    'Gender', 'Smoking_History', 'Occupation', 'Residential_Area',
    'Physical_Activity_Level', 'Dietary_Habits', 'Comorbidities',
    'Stage_of_Cancer', 'Treatment_Type', 'Medication_Response', 'Symptom_Progression'
]

In [None]:
# --- Option 1: Label Encoding ---
# This method is useful if the categorical variable is ordinal or if you prefer a single column representation.
# NOTE(review): neither df_label_encoded nor this fresh encoder is used again
# in the visible cells — looks like an unfinished step; confirm.
df_label_encoded = df.copy()  # make a copy for label encoding
le = LabelEncoder()

In [None]:
# First ten rows of the cleaned frame.
df.head(10)

In [None]:
# Summary statistics, transposed so that each column of df becomes one row.
df.describe().transpose()

In [None]:
# Keep only the int64 / float64 columns for the correlation analysis.
df_numeric = df.select_dtypes(include=['int64', 'float64'])
numeric_columns = df_numeric.columns


In [None]:
# Pairwise correlation between the numeric columns (pandas' default method).
corr_matrix = df_numeric.corr()


In [None]:
# Display the correlation matrix
print("Correlation Matrix:")
print(corr_matrix)


In [None]:
# Annotated heat-map of the correlation matrix on a 12x10 figure.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", square=True, ax=ax)
ax.set_title("Correlation Matrix Heatmap")
plt.show()

In [None]:
# Dtype / non-null overview before modelling.
df.info()

In [None]:
#Prepare data for predictive modeling

In [None]:
# Same column list as the earlier data-type conversion cell.
bool_columns = [
    'Family_History_Cancer', 'Exposure_to_Toxins', 'Chest_Pain_Symptoms',
    'Shortness_of_Breath', 'Chronic_Cough', 'Weight_Loss',
    'Previous_Cancer_Diagnosis', 'Metastasis_Status'
]

In [None]:
# Re-apply the bool cast to the listed columns.
# NOTE(review): these columns were already cast to bool earlier in the
# notebook, so this looks redundant — confirm.
for col in bool_columns:
    df[col] = df[col].astype(bool)

In [None]:
# Remove non-predictive columns 

In [None]:
# New frame without the identifier and diagnosis-year columns; df is unchanged.
df_model = df.drop(columns=['Patient_ID', 'Year_of_Diagnosis'])

In [None]:
# Specify the target variable (predicting Survival_Years)
target = 'Survival_Years'
X = df_model.drop(columns=[target])
y = df_model[target]

In [None]:
# 70/30 train/test split with a fixed seed.
# NOTE(review): target, X, y and the split are all re-assigned a few cells
# below for Metastasis_Status, so this Survival_Years setup does not feed any
# model in the visible notebook — confirm it is still wanted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# (rows, columns) of the test features.
X_test.shape

In [None]:
# Switch the prediction target to the metastasis flag.
target = 'Metastasis_Status'
# Drop columns that are not features (e.g., identifiers)


In [None]:
# Features: df without the identifier, the diagnosis year and the target.
# Note that df is used here, not the df_model frame built above.
X = df.drop(columns=['Patient_ID', 'Year_of_Diagnosis', target])
y = df[target]

In [None]:
# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Bare expression: builds an encoder only to display it; the object is not stored.
OneHotEncoder(drop='first', sparse_output=False)


In [None]:
# Columns routed to the 'num' (StandardScaler) branch of the preprocessor.
# NOTE(review): Survival_Years is listed as a predictor of Metastasis_Status;
# if it is only known after the outcome this may leak information — confirm.
numeric_features = ['Age', 'Years_Smoked', 'Pack_Years', 'BMI', 
                    'Lung_Function_Test_Result', 'Air_Quality_Index', 
                    'Tumor_Size_cm', 'Survival_Years', 'Follow_Up_Visits']


In [None]:
# Columns routed to the 'cat' (OneHotEncoder) branch of the preprocessor.
categorical_features = ['Gender', 'Smoking_History', 'Occupation', 
                        'Residential_Area', 'Physical_Activity_Level', 
                        'Dietary_Habits', 'Comorbidities', 'Stage_of_Cancer', 
                        'Treatment_Type', 'Medication_Response', 'Symptom_Progression']



In [None]:
# Column-wise preprocessing shared by the model pipelines:
#   * numeric_features are standardised;
#   * categorical_features are one-hot encoded (first level dropped).
# handle_unknown='ignore' lets a category that appears only at predict time
# (e.g. in a cross-validation fold or the test split) be encoded as all zeros
# instead of raising an error and failing the fit/score.
# NOTE(review): ColumnTransformer defaults to remainder='drop', so every column
# of X that is in neither list is excluded from the models — confirm that is
# intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), categorical_features)
    ]
)


In [None]:
# Random-forest estimator with a fixed seed.
# NOTE(review): `estimator` is not referenced by the pipeline below, which
# hard-codes LogisticRegression — confirm which model is intended.
estimator = RandomForestClassifier(random_state=42)  # Change to LogisticRegression(max_iter=1000) if desired

In [None]:
# Preprocessing followed by logistic regression. The step name 'logreg' is the
# prefix used by the 'logreg__*' keys of the grid-search parameters.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('logreg', LogisticRegression(max_iter=1000))
])


In [None]:
# Fit the pipeline on the training split.
pipeline.fit(X_train, y_train)

# Make predictions and evaluate the model.
y_pred = pipeline.predict(X_test)
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

In [None]:
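# Hyper-parameter tuning of the logistic-regression pipeline (the grid below uses 'logreg__*' keys; no RandomForestClassifier is fitted in this section)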

In [None]:
# Features and target for Metastasis_Status (same definition as the earlier X / y cell).
X = df.drop(columns=['Patient_ID', 'Year_of_Diagnosis', 'Metastasis_Status'])
y = df['Metastasis_Status']

In [None]:
# 4. Split the data into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Search space for the logistic-regression step; the 'logreg__' prefix routes
# each key to the pipeline step named 'logreg'.
param_grid = {
    'logreg__C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'logreg__penalty': ['l2'],            # For lbfgs solver, only 'l2' is allowed
    # You can add more parameters if needed
}

In [None]:


# Exhaustive search over param_grid with 3-fold cross-validation and verbose
# progress output.
grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, verbose=2)
grid.fit(X_train, y_train)

In [None]:
print("Best Parameters:", grid.best_params_)

In [None]:
y_pred = grid.predict(X_test)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
#Decision Tree Classifier

In [None]:
# One-hot encode the feature frame, dropping the first level of each encoded column.
# NOTE(review): get_dummies only expands object/string/category columns by
# default; columns that were integer-encoded earlier presumably stay as single
# integer columns — confirm this is the intended encoding.
X_encoded = pd.get_dummies(X, drop_first=True)


In [None]:
# Re-split using the encoded features; this rebinds X_train / X_test / y_train /
# y_test for every cell that follows.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)


In [None]:
# Decision tree with a fixed seed, plus the hyper-parameter grid to search over it.
# Note: param_grid is rebound here, replacing the logistic-regression grid defined earlier.
dtc = DecisionTreeClassifier(random_state=42)
param_grid = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 7, 11],
    'max_depth': [None, 5, 10]
}

In [None]:
grid_dtc = GridSearchCV(dtc, param_grid, cv=3, verbose=2)
grid_dtc.fit(X_train, y_train)

In [None]:
# Now, the best estimator is available:
best_dtc = grid_dtc.best_estimator_
print("Best Parameters:", grid_dtc.best_params_)

In [None]:
# Make predictions on the test set.
y_predict_dtc = best_dtc.predict(X_test)
print("Confusion Matrix:", confusion_matrix(y_test, y_predict_dtc), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_predict_dtc), sep="\n")

In [None]:
#K-Nearest Neighbor

In [None]:
# KNN is distance based, so the features are standardised first; on raw values
# the columns with the largest numeric range dominate the neighbour search.
# This also matches the model-comparison loop further down, where KNN runs
# behind a preprocessor that scales the numeric features.
# `knn` is now a Pipeline; it exposes the same fit / predict interface.
knn = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=9))
])
knn.fit(X_train, y_train)

In [None]:
# Predicted labels for the test split.
# NOTE(review): the bare expression below echoes the prediction array in the
# cell output — consider showing only a slice.
y_pred_knn = knn.predict(X_test)
y_pred_knn

In [None]:
# Accuracy (as a percentage), per-class report and confusion matrix of the KNN predictions.
knn_accuracy  = accuracy_score(y_test, y_pred_knn)
print(f"accuracy of knn model:{knn_accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred_knn))
confusion_matrix(y_test, y_pred_knn)

In [None]:
# Define models
# Display name -> unfitted classifier; each one is wrapped behind the shared
# preprocessor by the comparison loop in the next cell.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=9)
}


In [None]:

# Train and evaluate models
# Every candidate is placed behind the shared preprocessor, fitted on the
# training split and scored by accuracy (in percent) on the test split.
# `pipeline` and `y_pred` are rebound on each iteration.
accuracy_scores = {}

for model_name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores[model_name] = accuracy * 100


In [None]:
# Print accuracy scores
# One line per model, accuracy shown with two decimals.
print("Accuracy Scores of Each Model:")
for model, acc in accuracy_scores.items():
    print("{}: {:.2f}%".format(model, acc))