In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC


In [None]:
# Location of GDSC_DATASET.csv. Set the GDSC_DATASET_PATH environment variable
# to point at your own copy of the file; when it is unset, the original
# author's local path is used as the fallback so existing setups keep working.
file_path = os.environ.get(
    'GDSC_DATASET_PATH',
    r'C:\Users\milto\OneDrive\Desktop\Health Informatics\Artificial Intelligence in Health Care\Projects\Dataset\GDSC_DATASET.csv',
)
data = pd.read_csv(file_path)

# Preview the first rows (rendered as the cell output).
data.head()


In [None]:
# Show basic info and null values.
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called bare; wrapping it in print() would append a stray "None" line.
data.info()

# Number of missing values per column.
print(data.isnull().sum())

# Visualize null values: one heatmap cell per (row, column) entry of the
# boolean isnull() mask.
plt.figure(figsize=(12,6))
sns.heatmap(data.isnull(), cbar=False, cmap='viridis')
plt.title("Missing Data Heatmap")
plt.show()


In [None]:
# Columns removed before modelling. Names that are not present in the frame
# are skipped instead of raising.
drop_columns = ['COSMIC_ID', 'CELL_LINE_NAME', 'TCGA_DESC']  # Adjust based on your dataset
data = data.drop(columns=drop_columns, errors='ignore')

In [None]:
# Convert Z_SCORE to binary class: 0 = Sensitive, 1 = Resistant, then drop Z_SCORE.
#
# Rows whose Z_SCORE is missing are removed *before* labelling: `NaN > 0`
# evaluates to False, so without this step they would silently be labelled
# 0 (Sensitive), and because Z_SCORE is dropped here a later dropna() could
# no longer detect them.
data = (
    data
    .dropna(subset=['Z_SCORE'])
    .assign(Drug_Sensitivity=lambda frame: (frame['Z_SCORE'] > 0).astype(int))
    .drop(columns=['Z_SCORE'])
)

In [None]:
# Discard every row that still contains at least one missing value,
# then report the resulting (rows, columns) shape.
data = data.dropna(axis='index', how='any')
print(f"Data shape after dropping NA: {data.shape}")

In [None]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each encoded column, leaving k-1 indicator columns for k levels.
data_encoded = pd.get_dummies(data=data, drop_first=True)


In [None]:
# Separate the binary target from the feature matrix.
# NOTE(review): if the dataset still holds other columns derived from the same
# response measurement as Z_SCORE (e.g. an IC50 or AUC column), they would leak
# the label into X — confirm against the CSV schema.
y = data_encoded['Drug_Sensitivity']
X = data_encoded.drop(columns='Drug_Sensitivity')

In [None]:
# Heatmap of the pairwise correlations between all feature columns.
fig, ax = plt.subplots(figsize=(16, 10))
sns.heatmap(X.corr(), linewidths=0.5, cmap='coolwarm', ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y so both
# partitions keep the same class proportions, and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f"Training size: {X_train.shape}, Test size: {X_test.shape}")

In [None]:
# Train Logistic Regression.
# The features are standardized inside a pipeline before the classifier sees
# them: the solver converges poorly on features with very different scales,
# and max_iter=100 (the library default) left it prone to stopping early.
# Keeping the scaler in the pipeline means it is fitted on the training split
# only and applied automatically at predict time, so `log_model` still accepts
# the raw X_train / X_test frames. The fitted LogisticRegression itself is the
# last pipeline step: log_model[-1].
log_model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000),
)
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)

# Evaluate performance. Precision/recall/F1 are reported for label 1,
# scikit-learn's default positive class.
print("🔍 Logistic Regression Metrics")
print("Accuracy:", accuracy_score(y_test, log_pred))
print("Precision:", precision_score(y_test, log_pred))
print("Recall:", recall_score(y_test, log_pred))
print("F1 Score:", f1_score(y_test, log_pred))
print("\nClassification Report:\n", classification_report(y_test, log_pred))

# Confusion Matrix (rows = actual class, columns = predicted class)
cm_log = confusion_matrix(y_test, log_pred)
sns.heatmap(cm_log, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix - Logistic Regression")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# Train Random Forest.
# random_state is fixed (same seed as the train/test split) so the forest's
# bootstrap samples and per-split feature subsets, and therefore the metrics
# printed below, are reproducible from one run to the next.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Evaluate performance. Precision/recall/F1 are reported for label 1,
# scikit-learn's default positive class.
print("🔍 Random Forest Metrics")
print("Accuracy:", accuracy_score(y_test, rf_pred))
print("Precision:", precision_score(y_test, rf_pred))
print("Recall:", recall_score(y_test, rf_pred))
print("F1 Score:", f1_score(y_test, rf_pred))
print("\nClassification Report:\n", classification_report(y_test, rf_pred))

# Confusion Matrix (rows = actual class, columns = predicted class)
cm_rf = confusion_matrix(y_test, rf_pred)
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Greens')
plt.title("Confusion Matrix - Random Forest")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# Fit a gradient-boosted tree classifier and score it on the held-out split.
# NOTE(review): `use_label_encoder` appears to be deprecated/ignored in recent
# xgboost releases — confirm against the installed version before removing it.
xgb_model = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

# Headline metrics, printed in a fixed order, followed by the per-class report.
print("🔍 XGBoost Metrics")
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, xgb_pred))
print("\nClassification Report:\n", classification_report(y_test, xgb_pred))

# Confusion matrix heatmap (rows = actual class, columns = predicted class).
cm_xgb = confusion_matrix(y_test, xgb_pred)
ax = sns.heatmap(cm_xgb, annot=True, fmt='d', cmap='Oranges')
ax.set_title("Confusion Matrix - XGBoost")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


In [None]:
# Train KNN.
# KNN classifies by distance between rows, so features on larger numeric
# scales would dominate the neighbour search. The features are therefore
# standardized inside a pipeline: the scaler is fitted on the training split
# only and applied automatically at predict time, so `knn_model` still accepts
# the raw X_train / X_test frames. The fitted KNeighborsClassifier itself is
# the last pipeline step: knn_model[-1].
knn_model = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(),
)
knn_model.fit(X_train, y_train)
knn_pred = knn_model.predict(X_test)

# Evaluate performance. Precision/recall/F1 are reported for label 1,
# scikit-learn's default positive class.
print("🔍 KNN Metrics")
print("Accuracy:", accuracy_score(y_test, knn_pred))
print("Precision:", precision_score(y_test, knn_pred))
print("Recall:", recall_score(y_test, knn_pred))
print("F1 Score:", f1_score(y_test, knn_pred))
print("\nClassification Report:\n", classification_report(y_test, knn_pred))

# Confusion Matrix (rows = actual class, columns = predicted class)
cm_knn = confusion_matrix(y_test, knn_pred)
sns.heatmap(cm_knn, annot=True, fmt='d', cmap='Reds')
plt.title("Confusion Matrix - KNN")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()