<h1 align=center>Pancreatic Cancer Prediction</h1>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# candidate classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

In [3]:
# Read the sample dataset into a DataFrame. The path is relative, so it is
# resolved against the directory the notebook kernel is running in.
csv_path = "pancreatic_cancer_prediction_sample.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows (head() returns 5 rows by default).
df.head()

In [None]:
# Print a structural summary of the frame: column names, dtypes,
# non-null counts and memory usage.
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max). With default
# arguments pandas restricts this to the numeric columns when the frame has any.
df.describe()

In [None]:
# Collect the object-dtype (string) columns and, for each one, print its
# distinct values followed by how often each value occurs.
# `categorical_columns` is reused later when one-hot encoding the frame.
categorical_columns = df.select_dtypes(include="object").columns
for column_name in categorical_columns:
    column = df[column_name]
    print(f"Unique values in {column_name}: {column.unique()}")
    print(column.value_counts())
    print("\n")

In [9]:
# Apply seaborn's "whitegrid" style globally; it affects every figure drawn
# after this cell.
sns.set(style="whitegrid")

In [None]:
# Histogram of patient age (30 bins) with a kernel-density curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Age'], bins=30, kde=True, ax=ax)
ax.set(title='Distribution of Age', xlabel='Age', ylabel='Frequency')
plt.show()

In [None]:
# Bar chart of how many rows fall into each Survival_Status value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='Survival_Status', ax=ax)
ax.set(title='Count of Survival Status', xlabel='Survival Status', ylabel='Count')
plt.show()

In [None]:
# Row counts per Gender, with one bar per Survival_Status value inside each group.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='Gender', hue='Survival_Status', ax=ax)
ax.set(title='Survival Status by Gender', xlabel='Gender', ylabel='Count')
plt.show()

In [None]:
# Row counts per Stage_at_Diagnosis, split by Survival_Status.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Stage_at_Diagnosis', hue='Survival_Status', ax=ax)
ax.set(
    title='Survival Status by Stage at Diagnosis',
    xlabel='Stage at Diagnosis',
    ylabel='Count',
)
plt.show()

In [None]:
# Row counts per Treatment_Type, split by Survival_Status.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Treatment_Type', hue='Survival_Status', ax=ax)
ax.set(
    title='Survival Status by Treatment Type',
    xlabel='Treatment Type',
    ylabel='Count',
)
plt.show()

In [None]:
# Row counts per Economic_Status, split by Survival_Status.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Economic_Status', hue='Survival_Status', ax=ax)
ax.set(
    title='Survival Status by Economic Status',
    xlabel='Economic Status',
    ylabel='Count',
)
plt.show()

In [None]:
# One-hot encode the frame with pandas' default column selection, dropping the
# first level of each encoded column, so that every column can take part in
# the correlation matrix.
df_encoded = pd.get_dummies(df, drop_first=True)

# Pairwise correlations between all columns of the encoded frame.
corr_matrix_encoded = df_encoded.corr()

# Annotated heatmap; the large canvas keeps the per-cell numbers legible.
fig, ax = plt.subplots(figsize=(20, 16))
sns.heatmap(corr_matrix_encoded, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap (Including Encoded Categorical Columns)')
plt.show()

In [17]:
# One-hot encode exactly the object-dtype columns collected earlier, dropping
# the first level of each so the dummy columns are not perfectly collinear.
# This rebinds `df_encoded`, the frame used for modelling below.
df_encoded = pd.get_dummies(
    data=df,
    drop_first=True,
    columns=categorical_columns,
)

In [None]:
# Display the encoded frame to check the new dummy columns.
df_encoded

In [19]:
# Separate the predictors from the target column.
X = df_encoded.drop(columns=['Survival_Status'])
y = df_encoded['Survival_Status']

# Standardise every feature to zero mean / unit variance and wrap the result
# back into a DataFrame so the column names are kept.
# NOTE: the scaler is fit on all rows of X, i.e. before any train/test split.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [20]:
# Hold out 20% of the rows for evaluation.
#
# The split is made on the *unscaled* features and the scaler is then fit on
# the training rows only. Fitting the scaler on every row before splitting
# would let the test rows influence the mean/variance used for standardisation
# (data leakage), which makes the evaluation optimistic.
# The same random_state and test_size are used, so the rows assigned to each
# side are the same as when splitting an already-scaled copy of X.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling parameters from the training rows, then apply them to both sides.
scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(
    scaler.transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

In [None]:
# Candidate classifiers, all trained and scored on the same train/test split.
# The two tree ensembles are seeded so repeated runs report the same numbers.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(
        use_label_encoder=False, eval_metric="logloss", random_state=42
    ),
}

for name, model in models.items():
    model.fit(X_train, y_train)

    # Hard class labels drive accuracy and the classification report.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # ROC-AUC measures how well the model *ranks* cases, so it needs continuous
    # scores. Feeding it hard 0/1 predictions collapses the curve to a single
    # threshold and misreports the AUC.
    # Column 1 of predict_proba is the probability of the second (positive)
    # class; this assumes the target is binary.
    y_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_proba)

    print(f"📌 {name}")
    print(f"✅ Accuracy: {accuracy:.4f}")
    print(f"✅ ROC-AUC Score: {auc:.4f}")
    print(classification_report(y_test, y_pred))
    print("-" * 50)