# Lung Cancer Analysis

This notebook analyzes the lung cancer dataset to understand risk factors and symptoms, and to develop a predictive model of cancer level.

## 1. Setup and Data Loading

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Set style for plots.
# The bundled 'seaborn' style sheet was renamed to 'seaborn-v0_8' in
# matplotlib 3.6 and the old name was later removed, so pick whichever
# name this matplotlib installation actually provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette('husl')

In [None]:
# Read the patient records from the CSV file (path is relative to the
# notebook's working directory) and preview the first rows.
dataset_path = '../data/LungCancer/cancer patient data sets.csv'
df = pd.read_csv(dataset_path)
print("Dataset shape:", df.shape)
df.head()

## 2. Data Overview

In [None]:
# Basic information about the dataset
print("\nDataset Info:")
# DataFrame.info() writes its report directly to stdout and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
df.info()

# Count of missing values per column
print("\nMissing Values:")
print(df.isnull().sum())

# Summary statistics for the numeric columns
print("\nBasic Statistics:")
print(df.describe())

## 3. Exploratory Data Analysis

In [None]:
# Bar chart showing how many records fall into each value of 'Level'
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Level', ax=ax)
ax.set_title('Distribution of Lung Cancer Levels')
plt.show()

In [None]:
# Correlation analysis
plt.figure(figsize=(20, 16))
# Restrict to numeric/boolean columns before correlating: since pandas 2.0
# DataFrame.corr() no longer silently drops non-numeric columns and raises
# on string columns instead.
numeric_df = df.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Stacked histograms of selected features, coloured by 'Level',
# laid out on a 2x2 grid (one panel per feature).
key_features = ['Age', 'Smoking', 'Air Pollution', 'Genetic Risk']
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))
axes = axes.ravel()

for ax, feature in zip(axes, key_features):
    sns.histplot(data=df, x=feature, hue='Level', multiple='stack', ax=ax)
    ax.set_title(f'Distribution of {feature}')

fig.tight_layout()
plt.show()

## 4. Feature Engineering

In [None]:
# Risk score: element-wise sum of four exposure/risk columns.
# Series.add is used so the arithmetic (including NaN propagation)
# is the same as chaining the + operator.
df['Risk_Score'] = (
    df['Smoking']
    .add(df['Air Pollution'])
    .add(df['Genetic Risk'])
    .add(df['Obesity'])
)

# Symptom score: row-wise total over all symptom columns.
symptom_columns = [
    'Chest Pain', 'Coughing of Blood', 'Fatigue', 'Weight Loss',
    'Shortness of Breath', 'Wheezing', 'Swallowing Difficulty',
    'Clubbing of Finger Nails', 'Frequent Cold', 'Dry Cough', 'Snoring',
]
symptom_values = df[symptom_columns]
df['Symptom_Score'] = symptom_values.sum(axis=1)

# Summarise the two derived columns
print("New features created:")
engineered = df[['Risk_Score', 'Symptom_Score']]
print(engineered.describe())

## 5. Model Development

In [None]:
# Prepare features and target: every column except the target ('Level')
# and the identifier ('Patient Id') is used as a predictor.
X = df.drop(['Level', 'Patient Id'], axis=1)
y = df['Level']

# Split the data: 80% train / 20% test, fixed seed for reproducibility.
# NOTE(review): the split is not stratified on y; consider stratify=y if the
# class proportions must match between train and test - confirm with the data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features. The scaler is fitted on the training split only and
# then applied to the test split, so no test statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = rf_model.predict(X_test_scaled)

# Print model performance
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Plot confusion matrix.
# The label order is made explicit (sorted union of true and predicted
# labels) and reused for the tick labels, so each row/column of the heatmap
# is annotated with its class name instead of a bare positional index.
class_labels = np.union1d(y_test, y_pred)
plt.figure(figsize=(10, 8))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

## 6. Feature Importance Analysis

In [None]:
# Pair each predictor column with the importance reported by the fitted
# random forest, ordered from most to least important.
feature_importance = (
    pd.DataFrame({
        'feature': X.columns,
        'importance': rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)

# Horizontal bar chart of the ten highest-ranked features
top_features = feature_importance.head(10)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=top_features, x='importance', y='feature', ax=ax)
ax.set_title('Top 10 Most Important Features')
plt.show()

## 7. Risk Factor Analysis

In [None]:
# Box plots of each risk factor grouped by 'Level', placed on a 2x3 grid
# (five panels are used; the sixth slot stays empty).
risk_factors = ['Smoking', 'Air Pollution', 'Genetic Risk', 'Obesity', 'Alcohol use']

plt.figure(figsize=(15, 10))
for position, factor in enumerate(risk_factors, start=1):
    ax = plt.subplot(2, 3, position)
    sns.boxplot(data=df, x='Level', y=factor, ax=ax)
    ax.set_title(f'{factor} by Cancer Level')

plt.tight_layout()
plt.show()

## 8. Symptom Analysis

In [None]:
# Count plots of every symptom column split by 'Level', on a 3x4 grid
# (eleven panels are used; the twelfth slot stays empty).
symptom_columns = [
    'Chest Pain', 'Coughing of Blood', 'Fatigue', 'Weight Loss',
    'Shortness of Breath', 'Wheezing', 'Swallowing Difficulty',
    'Clubbing of Finger Nails', 'Frequent Cold', 'Dry Cough', 'Snoring',
]

plt.figure(figsize=(15, 10))
for position, symptom in enumerate(symptom_columns, start=1):
    ax = plt.subplot(3, 4, position)
    sns.countplot(data=df, x=symptom, hue='Level', ax=ax)
    ax.set_title(f'{symptom} Distribution')
    # Rotate the x tick labels of the panel just drawn (the current axes)
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

## 9. Conclusions and Recommendations

Based on our analysis, we can draw the following conclusions:

1. Key Risk Factors:
   - Smoking
   - Air Pollution
   - Genetic Risk
   - Obesity
   - Alcohol use

2. Important Symptoms:
   - Chest Pain
   - Coughing of Blood
   - Shortness of Breath
   - Weight Loss

3. Model Performance:
   - The Random Forest model shows good performance in predicting lung cancer levels on the held-out 20% test set (see the classification report and confusion matrix in Section 5)
   - Feature importance analysis helps identify the most significant factors

Recommendations:
1. Regular screening for high-risk individuals
2. Focus on preventive measures for modifiable risk factors
3. Early detection through symptom monitoring
4. Public health initiatives targeting smoking and air pollution