In [None]:
import pandas as pd
import numpy as np
from scipy.stats import mode
from Models.ensemble import train_model as train_ensemble

# End-to-end submission pipeline: train the three ensemble members, predict on
# the test set, combine the predictions by majority vote, validate the result
# and write it to submission.csv.

# Number of rows the submission file is required to have.
EXPECTED_SUBMISSION_ROWS = 418

# --- Load train data ---------------------------------------------------------
X_train = pd.read_csv("Preprocessed_data/X_train_preprocessed.csv")
# squeeze() turns the single-column target file into a Series.
y_train = pd.read_csv("Preprocessed_data/y_train.csv").squeeze()

# --- Train ensemble models (tuple of 3 models) -------------------------------
models = train_ensemble(X_train, y_train)
lr, rf, gb = models

# --- Load test data and PassengerId ------------------------------------------
X_test = pd.read_csv("Preprocessed_data/X_test_preprocessed.csv")
passenger_ids = pd.read_csv("Preprocessed_data/test_passenger_ids.csv")

# Fail early (before running the models) if the ID file cannot be paired
# row-for-row with the test features.
assert len(passenger_ids) == len(X_test), (
    "PassengerId file and test features must have the same number of rows"
)

# Pick the ID column by name when it is available. A blind squeeze() only
# yields a Series when the file has exactly one column; with any extra column
# it would leave a DataFrame behind.
if "PassengerId" in passenger_ids.columns:
    passenger_id_values = passenger_ids["PassengerId"]
else:
    passenger_id_values = passenger_ids.squeeze()
assert isinstance(passenger_id_values, pd.Series), (
    "Expected a single column of passenger IDs"
)

# --- Get predictions from each model -----------------------------------------
lr_pred = lr.predict(X_test)
rf_pred = rf.predict(X_test)
gb_pred = gb.predict(X_test)

# --- Majority vote ensemble prediction ---------------------------------------
# Stack the three prediction vectors as rows and take the most common value in
# each column. flatten() normalises the result to 1-D whether or not mode()
# keeps the reduced axis.
stacked_preds = np.vstack([lr_pred, rf_pred, gb_pred])
ensemble_pred = mode(stacked_preds, axis=0)[0].flatten()

# --- Create submission DataFrame ---------------------------------------------
submission_df = pd.DataFrame({
    "PassengerId": passenger_id_values,
    "Survived": ensemble_pred.astype(int)    # make sure values are ints 0/1
})

# --- Validate submission shape, columns & values -----------------------------
assert submission_df.shape[0] == EXPECTED_SUBMISSION_ROWS, f"Submission must have {EXPECTED_SUBMISSION_ROWS} rows"
assert list(submission_df.columns) == ["PassengerId", "Survived"], "Submission must have exactly 2 columns: PassengerId, Survived"
# astype(int) converts without complaint, so confirm the labels really are 0/1.
assert submission_df["Survived"].isin([0, 1]).all(), "Survived must contain only 0/1"

# --- Save to csv without index -----------------------------------------------
submission_df.to_csv("submission.csv", index=False)

print("Submission file created: submission.csv")


Fitting 5 folds for each of 648 candidates, totalling 3240 fits
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Submission file created: submission.csv


In [None]:
Training Logistic Regression...
Best LR CV score: 0.8482
Training Random Forest...
Best RF CV score: 0.8590
Training Gradient Boosting...
Best GB CV score: 0.8564
Submission file created: submission.csv