#--------------------------------------------------------------------
# Exercises 2.

# Exercise 2.1.

# Titanic passengers data – 1310 observations and 15 variables:
# passenger_id – Unique passenger id
# pclass – Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
# survived – Survival (0 = No, 1 = Yes)
# name – Name and surname
# sex – Sex (0 = Male, 1 = Female)
# age – Age in years
# sibsp – # of siblings / spouses aboard the Titanic
# parch – # of parents / children aboard the Titanic
# ticket – Ticket number
# fare – Passenger fare
# cabin – Cabin number
# embarked – Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)
# boat – Lifeboat (if survived)
# body – Body number (if did not survive and body was recovered)
# home.dest – Home/Destination
#
# Re-run your best models for all algorithms for 5-fold CV.
# Check the stability of results for repeated K-fold.
# Check in repeated K-fold CV whether adding stratification changes your results (stability).
# Check that you didn't overfit your models. Check if you can improve your validation score.

In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score, RepeatedKFold, RepeatedStratifiedKFold, train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the passenger table and prepare it for modelling in one chained pass.
# `boat` and `body` are only known once the outcome is known, so they would
# leak the target; the identifier / free-text columns (passenger_id, name,
# ticket, cabin, home.dest) are excluded as well because they may leak too.
df = (
    pd.read_csv('/mnt/titanic.csv')
    .drop(
        columns=[
            'passenger_id',
            'name',
            'ticket',
            'cabin',
            'boat',
            'body',
            'home.dest',
        ]
    )
    # Keep complete cases only: any row with a remaining missing value goes.
    .dropna()
    # One-hot encode the categoricals, dropping the first level of each.
    .pipe(pd.get_dummies, columns=['sex', 'embarked'], drop_first=True)
)

In [None]:
# Target vector: the `survived` flag. Feature matrix: every other column.
y = df['survived']
X = df.drop('survived', axis=1)

In [None]:
# Standardise the feature columns and keep the result as a DataFrame.
# NOTE(review): the scaler is fitted on the full data set before any
# cross-validation or train/test split, so held-out rows contribute to the
# scaling statistics. Putting the scaler and model in a Pipeline would avoid
# that; left as-is here so the recorded results stay comparable.
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    # Preserve X's row labels. Rows were removed by dropna(), so a fresh
    # 0..n-1 index would no longer line up with y in any label-aligned
    # pandas operation (boolean masks, joins, concat).
    index=X.index,
)

In [None]:
# Candidate classifiers, both seeded so repeated runs are reproducible.
log_reg = LogisticRegression(random_state=42, max_iter=1000)
rf_clf = RandomForestClassifier(random_state=42, n_estimators=100)

# Display name -> estimator; insertion order fixes the order of the reports.
models = {"Logistic Regression": log_reg, "Random Forest": rf_clf}

In [None]:
# Compare 5-fold CV on the rows in file order against 5-fold CV on shuffled rows.
# The splitter is identical for every model, so it is built once up front.
kf_shuffled = KFold(n_splits=5, random_state=42, shuffle=True)

for name, model in models.items():
    print(f"{name}")

    # An integer `cv` with a classifier means StratifiedKFold without
    # shuffling, i.e. folds are contiguous slices of the file order.
    scores = cross_val_score(
        estimator=model, X=X_scaled, y=y, scoring='accuracy', cv=5
    )
    print(f"Normal 5-Fold Accuracy: {scores.mean():.4f} (std: {scores.std():.4f})")

    # Same evaluation, but rows are shuffled before being cut into folds.
    scores_shuffled = cross_val_score(
        estimator=model, X=X_scaled, y=y, scoring='accuracy', cv=kf_shuffled
    )
    print(f"Shuffled 5-Fold Accuracy: {scores_shuffled.mean():.4f} (std: {scores_shuffled.std():.4f})")
    print("")

Logistic Regression
Normal 5-Fold Accuracy: 0.7248 (std: 0.1142)
Shuffled 5-Fold Accuracy: 0.7814 (std: 0.0233)

Random Forest
Normal 5-Fold Accuracy: 0.6376 (std: 0.0827)
Shuffled 5-Fold Accuracy: 0.7747 (std: 0.0376)



In [None]:
# Stability check: 5 folds repeated 10 times gives 50 accuracy scores per
# model, summarised by their mean and standard deviation.
rkf = RepeatedKFold(n_repeats=10, n_splits=5, random_state=42)

for name, model in models.items():
    scores = cross_val_score(
        estimator=model, X=X_scaled, y=y, scoring='accuracy', cv=rkf
    )
    print(f"{name} - Repeated K-Fold Accuracy: {scores.mean():.4f} (Std: {scores.std():.4f})")

Logistic Regression - Repeated K-Fold Accuracy: 0.7844 (Std: 0.0274)
Random Forest - Repeated K-Fold Accuracy: 0.7772 (Std: 0.0272)


In [None]:
# Same repeated 5x10 scheme as the plain repeated K-fold, but with stratified
# folds, to see whether stratification changes the mean or the spread.
rskf = RepeatedStratifiedKFold(n_repeats=10, n_splits=5, random_state=42)

for name, model in models.items():
    scores = cross_val_score(
        estimator=model, X=X_scaled, y=y, scoring='accuracy', cv=rskf
    )
    print(f"{name} - Repeated Stratified Accuracy: {scores.mean():.4f} (Std: {scores.std():.4f})")

Logistic Regression - Repeated Stratified Accuracy: 0.7852 (Std: 0.0232)
Random Forest - Repeated Stratified Accuracy: 0.7759 (Std: 0.0246)


In [None]:
# Overfitting check on a single 80/20 hold-out split (not stratified).
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, random_state=42, test_size=0.2
)

for name, model in models.items():
    print(f"\nChecking {name}...")

    # fit() trains the estimator stored in `models` in place and returns it,
    # so the training-set score can be read off the same call chain.
    train_acc = model.fit(X_train, y_train).score(X_train, y_train)
    test_acc = model.score(X_test, y_test)

    print(f"Train Accuracy: {train_acc:.4f}")
    print(f"Test Accuracy:  {test_acc:.4f}")

    # Heuristic: a train/test accuracy gap above 0.05 is flagged as
    # possible overfitting.
    print(
        "Could be overfitting"
        if train_acc > test_acc + 0.05
        else "No significant overfitting"
    )


Checking Logistic Regression...
Train Accuracy: 0.7998
Test Accuracy:  0.7751
No significant overfitting

Checking Random Forest...
Train Accuracy: 0.9832
Test Accuracy:  0.7799
Could be overfitting
