In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,precision_recall_curve, roc_curve
from sklearn.metrics import roc_auc_score, PrecisionRecallDisplay, RocCurveDisplay
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import learning_curve, validation_curve, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, FunctionTransformer, TargetEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from scipy.stats import loguniform,rankdata
import seaborn as sns
import matplotlib.pyplot as plt
import time
import shap



In [None]:
# Load the preprocessed Titanic data and take a first look at it.
data = pd.read_csv('../datasets/titanic_preprocessed.csv')
print(data.head())
data.info()  # info() prints its own report and returns None; wrapping it in print() adds a stray "None"
print(data.describe())

X = data.drop('Survived', axis=1)  # Features: every column except the target
y = data['Survived']  # Target variable

# Hold out 20% of the rows for testing (80% train); the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Base estimator that both hyperparameter searches tune
model = LogisticRegression(max_iter=5000, tol=1e-3, random_state=42)

# Stratified 5-fold CV: every fold keeps the class proportions of the full training set
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive grid for Grid Search
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    solver=['liblinear', 'saga'],  # both solvers accept l1 as well as l2 penalties
    penalty=['l1', 'l2'],
)

# Sampling distributions for Randomized Search (C is drawn on a log scale)
param_distributions = dict(
    C=loguniform(1e-3, 1e3),
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
)

# Settings the two searches have in common
shared_search_kwargs = dict(
    estimator=model,
    cv=skf,
    scoring='accuracy',
    n_jobs=-1,  # use all available cores
    verbose=1,  # print progress messages
)

grid_search = GridSearchCV(param_grid=param_grid, **shared_search_kwargs)

random_search = RandomizedSearchCV(
    param_distributions=param_distributions,
    n_iter=50,  # number of parameter settings sampled
    random_state=42,
    **shared_search_kwargs,
)

# Fit both searches and report the wall-clock time of each
start_time = time.time()
grid_search.fit(X_train, y_train)
grid_elapsed = time.time() - start_time
print(f"Grid Search took {grid_elapsed:.2f} seconds")

start_time = time.time()
random_search.fit(X_train, y_train)
random_elapsed = time.time() - start_time
print(f"Randomized Search took {random_elapsed:.2f} seconds")

# Best refitted estimator of each search, and its predictions on the held-out test set
grid_model, random_model = grid_search.best_estimator_, random_search.best_estimator_
y_pred_grid, y_pred_random = grid_model.predict(X_test), random_model.predict(X_test)

# --- Grid Search results ---
print("Grid Search Model Evaluation:")
print("Best Hyperparameters:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred_grid))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_grid))

# --- Randomized Search results ---
print("\nRandomized Search Model Evaluation:")
print("Best Hyperparameters:", random_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred_random))

# Confusion matrix: actual vs predicted counts (TN, FP / FN, TP)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_random))

# Classification report metrics:
#   Accuracy  - share of all predictions that were correct
#   Precision - of the predicted positives, the share that were truly positive
#   Recall    - of the actual positives, the share that were found
#   F1-Score  - harmonic mean of precision and recall
print("Classification Report:\n", classification_report(y_test, y_pred_random))


In [None]:
# Heatmap of the randomized-search model's confusion matrix on the test set
conf_matrix = confusion_matrix(y_test, y_pred_random)

fig_cm, ax_cm = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Reds', ax=ax_cm)
ax_cm.set_xlabel('Predicted')
ax_cm.set_ylabel('Actual')
ax_cm.set_title('Confusion Matrix of Logistic Regression Classifier')
plt.show()


In [None]:
# Feature Importance: coefficients of the logistic regression model.
# The feature labels are read from the fitted estimator itself rather than from the
# global `X`, which a later cell rebinds to the raw dataset; that keeps labels and
# coefficients aligned even when this cell is re-run after that cell.
coefficients = pd.DataFrame({
    'Feature': random_model.feature_names_in_,
    'Coefficient': random_model.coef_[0]  # binary problem: coef_ has a single row
}).sort_values(by='Coefficient', ascending=False)

plt.figure(figsize=(10,6))
sns.barplot(data=coefficients, x='Coefficient', y='Feature')
plt.title("Feature Importance (Logistic Regression Coefficients)")
plt.show()

In [None]:
# Learning curve: cross-validated accuracy as the training subset grows from 10% to 100%
learning_curves = learning_curve(
    random_model,
    X_train,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=skf,
    scoring='accuracy',
    n_jobs=-1,
)
train_sizes, train_scores, test_scores = learning_curves

# Average across the CV folds (axis 1) to get one score per training-set size
train_scores_mean = train_scores.mean(axis=1)
test_scores_mean = test_scores.mean(axis=1)

fig_lc, ax_lc = plt.subplots(figsize=(10, 6))
ax_lc.plot(train_sizes, train_scores_mean, label='Training Accuracy')
ax_lc.plot(train_sizes, test_scores_mean, label='Validation Accuracy')
ax_lc.set_xlabel('Training Set Size')
ax_lc.set_ylabel('Accuracy')
ax_lc.set_title('Learning Curve')
ax_lc.legend()
plt.show()

In [None]:
# Validation curve: cross-validated accuracy for C swept over 1e-3 ... 1e3 (one value per decade)
param_range = np.logspace(-3, 3, 7)
validation_curves = validation_curve(
    random_model,
    X_train,
    y_train,
    param_name='C',
    param_range=param_range,
    cv=skf,
    scoring='accuracy',
    n_jobs=-1,
)
train_scores, test_scores = validation_curves

# Average across the CV folds (axis 1) to get one score per value of C
train_scores_mean = train_scores.mean(axis=1)
test_scores_mean = test_scores.mean(axis=1)

fig_vc, ax_vc = plt.subplots(figsize=(10, 6))
ax_vc.semilogx(param_range, train_scores_mean, label='Training Accuracy')
ax_vc.semilogx(param_range, test_scores_mean, label='Validation Accuracy')
ax_vc.set_xlabel('C (Inverse of Regularization Strength)')
ax_vc.set_ylabel('Accuracy')
ax_vc.set_title('Validation Curve')
ax_vc.legend()
plt.show()

In [None]:
# Error analysis (visual only): ROC and precision-recall curves for the tuned model.
# A positive-class score is taken from the first source the model offers:
#   1. predict_proba       -> probability of class 1
#   2. decision_function   -> margins, rank-normalised into [0, 1]
#   3. neither             -> the hard 0/1 predictions cast to float
if hasattr(random_model, "predict_proba"):
    y_proba = random_model.predict_proba(X_test)[:, 1]
elif hasattr(random_model, "decision_function"):
    scores = random_model.decision_function(X_test)
    ranks = pd.Series(scores, index=y_test.index).rank(method="average")
    lowest_rank = ranks.min()
    y_proba = ((ranks - lowest_rank) / (ranks.max() - lowest_rank)).to_numpy()
else:
    y_proba = y_pred_random.astype(float)

auc = roc_auc_score(y_test, y_proba)
fpr, tpr, _ = roc_curve(y_test, y_proba)
prec, rec, _ = precision_recall_curve(y_test, y_proba)

fig, axes = plt.subplots(1, 2, figsize=(10, 4))
roc_ax, pr_ax = axes

# Left panel: ROC curve
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auc, estimator_name="LogReg")
roc_display.plot(ax=roc_ax)
roc_ax.set_title(f"ROC Curve (AUC={auc:.3f})")

# Right panel: precision-recall curve
pr_display = PrecisionRecallDisplay(precision=prec, recall=rec, estimator_name="LogReg")
pr_display.plot(ax=pr_ax)
pr_ax.set_title("Precision–Recall Curve")

plt.tight_layout()
plt.show()

In [None]:
# SHAP explanations: global importance as the mean absolute SHAP value per feature
sample_size = 200  # upper bound on rows used; lower it for speed, raise it for detail
n_background = min(sample_size, len(X_train))
n_eval = min(sample_size, len(X_test))

# Background rows come from the training set, explained rows from the test set
background = X_train.sample(n_background, random_state=42)
eval_sample = X_test.sample(n_eval, random_state=42)

explainer = shap.LinearExplainer(random_model, background)
shap_values = explainer.shap_values(eval_sample)

# show=False keeps the figure open so a title can be added before rendering
shap.summary_plot(shap_values, eval_sample, plot_type='bar', show=False)
plt.title('Mean Absolute SHAP Value (Global Importance)')
plt.tight_layout()
plt.show()

In [None]:
# SHAP on misclassifications (FP/FN): which features drive the mistakes?


def _mean_over_rows(frame, row_mask):
    """Column means of `frame` over the rows flagged by `row_mask`; all zeros if no row is flagged."""
    if row_mask.any():
        return frame[row_mask.values].mean()
    return pd.Series(0, index=frame.columns)


sample_size = 200
background = X_train.sample(min(sample_size, len(X_train)), random_state=42)
explainer = shap.LinearExplainer(random_model, background)

# Explain the whole test set (sample it here instead if this becomes slow)
eval_df = X_test.copy()
y_true = y_test.loc[eval_df.index]
y_pred = random_model.predict(eval_df)

# SHAP values for the evaluation rows
shap_vals = explainer.shap_values(eval_df)
if isinstance(shap_vals, list):
    # Some SHAP versions return one array per class: prefer the second entry, else the only one
    if len(shap_vals) > 1:
        shap_vals = shap_vals[1]
    else:
        shap_vals = shap_vals[0]

shap_df = pd.DataFrame(shap_vals, index=eval_df.index, columns=eval_df.columns)
abs_shap_df = shap_df.abs()

# Row masks for each outcome type
fp_mask = (y_true == 0) & (y_pred == 1)
fn_mask = (y_true == 1) & (y_pred == 0)
correct_mask = ~(fp_mask | fn_mask)

# Mean absolute contribution of each feature within each outcome group
mean_abs_fp = _mean_over_rows(abs_shap_df, fp_mask)
mean_abs_fn = _mean_over_rows(abs_shap_df, fn_mask)
mean_abs_correct = _mean_over_rows(abs_shap_df, correct_mask)

# Mean signed contribution of each feature within each outcome group
signed_fp = _mean_over_rows(shap_df, fp_mask)
signed_fn = _mean_over_rows(shap_df, fn_mask)
signed_correct = _mean_over_rows(shap_df, correct_mask)

# A feature whose absolute impact is larger on errors than on correct predictions is suspect
error_overuse_score = ((mean_abs_fp + mean_abs_fn)/2 - mean_abs_correct).sort_values(ascending=False)

# Directionality: how far errors are pushed relative to correct predictions
delta_fp = signed_fp - signed_correct
delta_fn = signed_fn - signed_correct

# Rank features by the larger of the two absolute deltas to surface the strongest misdirection
misdirection = pd.concat({'FP': delta_fp, 'FN': delta_fn}, axis=1)
misdirection['max_abs'] = misdirection.abs().max(axis=1)
misdirection_sorted = misdirection.sort_values('max_abs', ascending=False).drop(columns=['max_abs'])

# Two panels, top-k features each (k is at most 5)
k = min(5, len(error_overuse_score))
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
overuse_ax, direction_ax = axes

error_overuse_score.head(k).plot(kind='barh', ax=overuse_ax, color='tomato')
overuse_ax.invert_yaxis()
overuse_ax.set_title('Features over-emphasized in errors (|SHAP| error − |SHAP| correct)')
overuse_ax.set_xlabel('Delta mean |SHAP|')

misdirection_sorted.head(k).plot(kind='barh', ax=direction_ax)
direction_ax.invert_yaxis()
direction_ax.set_title('Signed SHAP delta (error − correct): FP and FN')
direction_ax.set_xlabel('Mean SHAP difference')

plt.tight_layout()
plt.show()

print('Top potentially harmful-by-overuse features:')
print(error_overuse_score.head(5))

In [None]:
# Data Analysis and Visualization on the raw Titanic dataset

new_data = pd.read_csv('../datasets/Titanic-Dataset.csv')
new_data.info()  # info() prints its own report and returns None; wrapping it in print() adds a stray "None"
print(new_data.isna().sum())  # missing values per column

fig, axes = plt.subplots(1, 2, figsize=(18, 6))

# Visualizing Survival Count by Embarkation Point
sns.countplot(data=new_data, x='Survived', hue='Embarked', ax=axes[0])
axes[0].set_title('Survival Count by Embarkation Point')
axes[0].set_xlabel('Survived')
axes[0].set_ylabel('Count')

# Visualizing Survival Count by Sex
sns.countplot(data=new_data, x='Survived', hue='Sex', ax=axes[1])
axes[1].set_title('Survival Count by Sex')
axes[1].set_xlabel('Survived')
axes[1].set_ylabel('Count')
axes[1].legend(title='Sex')
plt.tight_layout()
plt.show()

# Columns included in the correlation matrix (the target is included on purpose)
numerical_features = ['Age', 'SibSp', 'Parch', 'Fare', 'Survived', 'Pclass']

# Heatmap of Correlation Matrix
plt.figure(figsize=(12,8))
corr_matrix = new_data[numerical_features].corr()
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix Heatmap')
plt.show()

In [None]:
# Feature Engineering Functions (for FunctionTransformer)

def drop_and_impute(X):
    """Return a copy of X without identifier columns and with Age, Cabin and Embarked filled.

    - Name, Ticket and PassengerId are removed when present.
    - Missing Age becomes the median Age of the frame passed in.
    - Missing Cabin becomes the string 'Unknown'.
    - Missing Embarked becomes the most frequent port of the frame passed in,
      or 'S' when the column has no non-missing value at all.
    """
    cleaned = X.copy()
    cleaned = cleaned.drop(['Name', 'Ticket', 'PassengerId'], axis=1, errors='ignore')

    median_age = cleaned['Age'].median()
    cleaned['Age'] = cleaned['Age'].fillna(median_age)

    cleaned['Cabin'] = cleaned['Cabin'].fillna('Unknown')

    # mode() is empty when every Embarked value is missing
    embarked_modes = cleaned['Embarked'].mode()
    if embarked_modes.empty:
        fallback_port = 'S'
    else:
        fallback_port = embarked_modes[0]
    cleaned['Embarked'] = cleaned['Embarked'].fillna(fallback_port)

    return cleaned

def create_features(X):
    """Derive additional model features from a cleaned passenger frame.

    Reads the columns SibSp, Parch, Age, Fare, Sex, Cabin and Pclass and returns
    a copy of X in which:

    - FamilySize, IsAlone, AgeGroup, FareGroup and FarePerPerson are added;
    - Sex is replaced by 0 ('male') / 1 ('female');
    - Cabin is replaced by 0 (the placeholder 'Unknown') / 1 (anything else);
    - the interaction columns Sex_Pclass, Age_Pclass, Fare_Pclass and the
      log1p columns Log_Fare, Log_Age, Log_FarePerPerson are added.

    FareGroup is the quartile of Fare *within the frame passed in*. When the
    quartile edges coincide (small batches, many tied fares) the duplicate
    edges are merged, so fewer than four groups are produced instead of a
    "Bin edges must be unique" ValueError; a frame with a single distinct fare
    gets group 0 throughout. Whenever the four edges are distinct the codes
    are the usual 0-3.

    The input frame is not modified.
    """
    X = X.copy()

    # Family features
    X['FamilySize'] = X['SibSp'] + X['Parch'] + 1  # +1 counts the passenger
    X['IsAlone'] = (X['FamilySize'] == 1).astype(int)

    # Age grouping: (-1,12], (12,20], (20,40], (40,60], (60,100] -> 0..4
    X['AgeGroup'] = pd.cut(X['Age'], bins=[-1, 12, 20, 40, 60, 100], labels=[0,1,2,3,4]).astype(int)

    # Fare grouping by quartile, tolerant of coinciding quartile edges
    if X['Fare'].nunique() < 2:
        # Fewer than two distinct fares: there is nothing to split
        X['FareGroup'] = 0
    else:
        # labels=False returns integer bin codes; duplicates='drop' merges equal edges
        X['FareGroup'] = pd.qcut(X['Fare'], 4, labels=False, duplicates='drop').astype(int)

    # Fare per person
    X['FarePerPerson'] = X['Fare'] / X['FamilySize']
    X['FarePerPerson'] = X['FarePerPerson'].fillna(X['Fare'])

    # Encode Sex and Cabin
    X['Sex'] = X['Sex'].map({'male': 0, 'female': 1})
    X['Cabin'] = (X['Cabin'] != 'Unknown').astype(int)

    # Interaction features
    X['Sex_Pclass'] = X['Sex'] * X['Pclass']
    X['Age_Pclass'] = X['Age'] * X['Pclass']
    X['Fare_Pclass'] = X['Fare'] * X['Pclass']

    # Log transformations (log1p keeps zero values finite)
    X['Log_Fare'] = np.log1p(X['Fare'])
    X['Log_Age'] = np.log1p(X['Age'])
    X['Log_FarePerPerson'] = np.log1p(X['FarePerPerson'])

    return X

In [None]:
# Full ML pipeline on the raw data: cleaning, feature creation, encoding, scaling and model in one estimator.
# NOTE(review): the encoder, scaler and model are fitted on training folds only, but the two
# FunctionTransformer steps are stateless - drop_and_impute and create_features recompute the Age
# median, the Embarked mode and the Fare quartiles from whichever frame they are handed, including
# the test frame at predict time. Confirm whether that is acceptable before calling this leak-free.

X = new_data.drop(columns='Survived')
y = new_data['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode Embarked and pass every other column through unchanged.
# verbose_feature_names_out=False keeps the original column names, which makes features easy to identify.
embarked_encoder = ColumnTransformer(
    [
        ('embarked', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), ['Embarked']),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

pipeline = Pipeline(steps=[
    ('drop_impute', FunctionTransformer(drop_and_impute)),      # 1. drop identifier columns, fill missing values
    ('create_features', FunctionTransformer(create_features)),  # 2. derive the engineered features
    ('encoder', embarked_encoder),                              # 3. one-hot encode Embarked
    ('scaler', StandardScaler()),                               # 4. standardise all columns
    ('model', LogisticRegression(max_iter=5000, tol=1e-3, random_state=42)),  # 5. classifier
])

# Hyperparameter tuning of the classifier step (the model__ prefix addresses it inside the pipeline)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

param_distributions_pipeline = dict(
    model__C=loguniform(1e-3, 1e3),
    model__penalty=['l1', 'l2'],
    model__solver=['liblinear', 'saga'],
)

random_search_eng = RandomizedSearchCV(
    pipeline,
    param_distributions_pipeline,
    n_iter=50,
    scoring='accuracy',
    cv=skf,
    n_jobs=-1,
    verbose=0,
    random_state=42,
)

start_time = time.time()
random_search_eng.fit(X_train, y_train)
search_elapsed = time.time() - start_time
print(f"Randomized Search took {search_elapsed:.2f} seconds")

# Best refitted pipeline and its predictions on the held-out test set
model_eng = random_search_eng.best_estimator_
y_pred_eng = model_eng.predict(X_test)
y_proba_eng = model_eng.predict_proba(X_test)[:, 1]  # probability of class 1

print("\nEnhanced Model with Feature Engineering Pipeline:")
print("Best Hyperparameters:", random_search_eng.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred_eng))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_eng))
print("Classification Report:\n", classification_report(y_test, y_pred_eng))

In [None]:
# Comprehensive Model Comparison

# Baseline model (from cell 3).
# `X_test` / `y_test` now refer to the raw-data split made for the pipeline, so the
# baseline is scored on its own hold-out set instead: it is re-derived from the
# preprocessed frame with the same split settings that were used to train it.
# This gives the same numbers as before when both CSVs hold the same rows in the
# same order, and stays correct when they do not.
X_base = data.drop('Survived', axis=1)
y_base = data['Survived']
_, X_test_base, _, y_test_base = train_test_split(X_base, y_base, test_size=0.2, random_state=42)
y_pred_base = random_model.predict(X_test_base)
y_proba_base = random_model.predict_proba(X_test_base)[:, 1]

baseline_acc = accuracy_score(y_test_base, y_pred_base)
baseline_auc = roc_auc_score(y_test_base, y_proba_base)

# Enhanced model (scored on the raw-data hold-out set it was evaluated on above)
enhanced_acc = accuracy_score(y_test, y_pred_eng)
enhanced_auc = roc_auc_score(y_test, y_proba_eng)

# Performance comparison: 2 x 3 grid of panels
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# ROC Curves
fpr_base, tpr_base, _ = roc_curve(y_test_base, y_proba_base)
fpr_enh, tpr_enh, _ = roc_curve(y_test, y_proba_eng)
axes[0,0].plot(fpr_base, tpr_base, label=f'Baseline (AUC={baseline_auc:.3f})', linewidth=2)
axes[0,0].plot(fpr_enh, tpr_enh, label=f'Enhanced (AUC={enhanced_auc:.3f})', linewidth=2)
axes[0,0].plot([0,1], [0,1], 'k--', alpha=0.3)  # chance diagonal
axes[0,0].set_xlabel('False Positive Rate')
axes[0,0].set_ylabel('True Positive Rate')
axes[0,0].set_title('ROC Curve Comparison')
axes[0,0].legend()
axes[0,0].grid(alpha=0.3)

# Accuracy comparison
models = ['Baseline', 'Enhanced']
accuracies = [baseline_acc, enhanced_acc]
colors = ['blue', 'red']
axes[0,1].bar(models, accuracies, color=colors, alpha=0.7)
axes[0,1].set_ylabel('Accuracy')
axes[0,1].set_title('Accuracy Comparison')
axes[0,1].set_ylim([0.7, 0.95])
for i, v in enumerate(accuracies):
    # value label just above each bar
    axes[0,1].text(i, v+0.01, f'{v:.4f}', ha='center', fontweight='bold')

# Validation Curve for Enhanced Model (using pipeline)
param_range = np.logspace(-3, 3, 7)
vc_eng = validation_curve(
    estimator=model_eng,
    X=X_train,
    y=y_train,
    param_name='model__C',
    param_range=param_range,
    cv=skf,
    scoring='accuracy',
    n_jobs=-1
)

train_scores_eng, test_scores_eng = vc_eng
train_mean_eng = np.mean(train_scores_eng, axis=1)
test_mean_eng = np.mean(test_scores_eng, axis=1)
axes[0,2].semilogx(param_range, train_mean_eng, label='Training', linewidth=2)
axes[0,2].semilogx(param_range, test_mean_eng, label='Validation', linewidth=2)
axes[0,2].set_xlabel('C (Inverse of Regularization Strength)')
axes[0,2].set_ylabel('Accuracy')
axes[0,2].set_title('Validation Curve (Enhanced Model)')
axes[0,2].legend()
axes[0,2].grid(alpha=0.3)

# Learning curve for enhanced model (using pipeline)
lc_eng = learning_curve(model_eng, X_train, y_train, cv=skf,
                         scoring='accuracy', n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 10))
train_sizes_eng, train_scores_eng, test_scores_eng = lc_eng
train_mean_eng = np.mean(train_scores_eng, axis=1)
test_mean_eng = np.mean(test_scores_eng, axis=1)

axes[1,0].plot(train_sizes_eng, train_mean_eng, label='Training', linewidth=2)
axes[1,0].plot(train_sizes_eng, test_mean_eng, label='Validation', linewidth=2)
axes[1,0].set_xlabel('Training Set Size')
axes[1,0].set_ylabel('Accuracy')
axes[1,0].set_title('Learning Curve (Enhanced Model)')
axes[1,0].legend()
axes[1,0].grid(alpha=0.3)

# Feature importance (top 15)
final_model = model_eng.named_steps['model']

# Ask the fitted encoder for its output column names instead of re-running the
# stateless feature functions on a 10-row sample: the names then always match what
# the model was trained on, and the tiny sample cannot trip the quartile binning.
# The scaler keeps column order, so these names line up with the model coefficients.
encoder = model_eng.named_steps['encoder']
feature_names = list(encoder.get_feature_names_out())

coef_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': final_model.coef_[0]
}).sort_values(by='Coefficient', key=abs, ascending=False).head(15)  # largest magnitude first

axes[1,1].barh(coef_df['Feature'], coef_df['Coefficient'], color='#2ecc71', alpha=0.7)
axes[1,1].set_xlabel('Coefficient')
axes[1,1].set_title('Top 15 Feature Importances')
axes[1,1].invert_yaxis()
axes[1,1].grid(axis='x', alpha=0.3)

# Confusion Matrix Delta (Enhanced - Baseline)
# NOTE(review): the cell-wise difference is only meaningful when both hold-out sets
# contain the same passengers - confirm the two CSVs are row-aligned.
cm_base = confusion_matrix(y_test_base, y_pred_base)
cm_enh = confusion_matrix(y_test, y_pred_eng)
cm_delta = cm_enh - cm_base
sns.heatmap(cm_delta, annot=True, fmt='d', cmap='RdYlGn', center=0, ax=axes[1,2], cbar_kws={'label': 'Change'})
axes[1,2].set_xlabel('Predicted')
axes[1,2].set_ylabel('Actual')
axes[1,2].set_title('Confusion Matrix Delta\n(Enhanced - Baseline)')

plt.tight_layout()
plt.show()