<a href="https://colab.research.google.com/github/Joel-Williams-Mathew/SupervisedModel/blob/main/Titanic_Prediction_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Titanic Prediction Model (Supervised Model: Logistic Regression vs. Random Forest)

In [1]:
# Colab-only step: open the browser upload widget so that train.csv and
# test.csv land in the working directory, where the next cell reads them.
from google.colab import files
uploaded = files.upload()

Saving test.csv to test.csv
Saving train.csv to train.csv


In [3]:
# titanic_pipeline.py
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# ────────────────────────────────────────────────────────────
# 1. LOAD DATA
# ────────────────────────────────────────────────────────────
# Read both splits from the working directory in one pass.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

# Set the test-set passenger IDs aside now; the submission frame is keyed on them.
test_passenger_ids = test_df["PassengerId"]

# ────────────────────────────────────────────────────────────
# 2. FEATURE ENGINEERING
#    – Create new columns, keep it simple & robust
# ────────────────────────────────────────────────────────────
def add_basic_features(df):
    """Return a copy of ``df`` with the engineered Titanic features added.

    The input frame is not modified. Columns read: ``Name``, ``SibSp``,
    ``Parch``, ``Ticket`` and ``Cabin``.

    Columns added:
        Title       -- honorific parsed from ``Name``; uncommon ones are
                       collapsed to ``'Rare'`` and French/variant forms are
                       folded into ``'Miss'`` / ``'Mrs'``.
        FamilySize  -- ``SibSp + Parch + 1`` (the passenger counts too).
        IsAlone     -- 1 when ``FamilySize == 1``, else 0.
        TicketGroup -- number of rows *in this frame* sharing the ticket.
        HasCabin    -- 1 when ``Cabin`` is present, else 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra columns appended.
    """
    df = df.copy()

    # Title from Name  (Mr, Mrs, Miss, etc.)
    # Names look like "Surname, Title. Given names". An optional leading
    # "the " is skipped so that "the Countess" is captured as "Countess" and
    # therefore matches the rare-title list below. expand=False makes
    # str.extract return a Series rather than a one-column DataFrame.
    df["Title"] = df["Name"].str.extract(
        r",\s*(?:the\s+)?([^.]+?)\s*\.", expand=False
    )

    # Simplify rare titles
    rare_titles = [
        'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev',
        'Sir', 'Jonkheer', 'Dona'
    ]
    df["Title"] = df["Title"].replace(rare_titles, 'Rare')
    df["Title"] = df["Title"].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

    # Family size / isolation
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    df["IsAlone"] = (df["FamilySize"] == 1).astype(int)

    # Ticket group size (people with same ticket number).
    # Counts are taken within the frame passed in, so train and test are
    # counted independently of each other.
    ticket_counts = df["Ticket"].value_counts()
    df["TicketGroup"] = df["Ticket"].map(ticket_counts)

    # Cabin missing indicator (Cabin itself is dropped later)
    df["HasCabin"] = df["Cabin"].notnull().astype(int)

    return df

# Run the identical feature-engineering step over both splits.
train_df, test_df = map(add_basic_features, (train_df, test_df))

# ────────────────────────────────────────────────────────────
# 3. DEFINE FEATURES & TARGET
# ────────────────────────────────────────────────────────────
# Model inputs: drop the label plus identifier / free-text columns.
X = train_df.drop(columns=["Survived", "PassengerId", "Name", "Ticket", "Cabin"])
X_test = test_df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

# Label to predict.
y = train_df["Survived"]

# Column groups routed to the two preprocessing branches.
categorical_features = [
    "Pclass",
    "Sex",
    "Embarked",
    "Title",
    "IsAlone",
    "HasCabin",
]
numeric_features = [
    "Age",
    "Fare",
    "FamilySize",
    "TicketGroup",
]

# ────────────────────────────────────────────────────────────
# 4. PREPROCESSING PIPELINE
# ────────────────────────────────────────────────────────────
# Numeric branch: fill missing values with the column median.
numeric_transformer = Pipeline([("imputer", SimpleImputer(strategy="median"))])

# Categorical branch: fill missing values with the mode, then one-hot encode.
# Categories not seen during fit are encoded as all zeros instead of raising.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its branch.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# ────────────────────────────────────────────────────────────
# 5. MODELS
# ────────────────────────────────────────────────────────────
# Candidate 1: logistic regression on the preprocessed features.
logreg_clf = Pipeline(
    [
        ("preprocess", preprocessor),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)

# Candidate 2: random forest with fully grown trees and a fixed seed.
rf_clf = Pipeline(
    [
        ("preprocess", preprocessor),
        (
            "model",
            RandomForestClassifier(
                n_estimators=300,
                max_depth=None,
                min_samples_split=2,
                random_state=42,
                n_jobs=-1,
            ),
        ),
    ]
)

# Display name -> pipeline, in the order they are evaluated.
models = {"Logistic Regression": logreg_clf, "Random Forest": rf_clf}

# Helper to evaluate with cross‑validation
def cv_score(model_pipe, X, y, folds=5):
    """Return the mean accuracy of ``model_pipe`` across ``folds`` CV folds.

    ``X`` and ``y`` are handed to ``cross_val_score`` unchanged; folds are
    evaluated in parallel on all available cores (``n_jobs=-1``).
    """
    fold_accuracies = cross_val_score(
        model_pipe,
        X,
        y,
        cv=folds,
        scoring="accuracy",
        n_jobs=-1,
    )
    return fold_accuracies.mean()

print("📊 5‑Fold Cross‑Validation Accuracy")

# Running winner; a candidate replaces it only on a strictly higher score,
# so the first model wins ties.
best_model_name = None
best_score = 0
best_pipe = None

for name, pipe in models.items():
    score = cv_score(pipe, X, y)
    print(f"- {name:<20}: {score:.4f}")
    # Written as "not greater" so a NaN score is skipped as well.
    if not score > best_score:
        continue
    best_model_name = name
    best_score = score
    best_pipe = pipe

print(f"\n✅ Best model: {best_model_name} ({best_score:.4f})")

# ────────────────────────────────────────────────────────────
# 6. TRAIN FINAL MODEL ON FULL TRAINING SET
# ────────────────────────────────────────────────────────────
# Hold-out diagnostics first: a stratified 80/20 split gives a confusion
# matrix and per-class metrics for the selected pipeline. No full-data fit is
# done before this point because the fit below would simply overwrite it.
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
best_pipe.fit(X_tr, y_tr)
y_pred = best_pipe.predict(X_val)

print("\nConfusion Matrix (20 % hold‑out):")
print(confusion_matrix(y_val, y_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_pred, digits=4))

# Final fit on 100 % of the training data. This must come last: it replaces
# the hold-out fit above and is the model used for the submission.
best_pipe.fit(X, y)

# ────────────────────────────────────────────────────────────
# 7. PREDICT & SAVE SUBMISSION
# ────────────────────────────────────────────────────────────
# Predict survival for the test passengers with the pipeline fitted on the
# full training set.
test_predictions = best_pipe.predict(X_test)

# Two-column submission frame: passenger ID and the 0/1 prediction.
submission = pd.DataFrame({
    "PassengerId": test_passenger_ids,
    "Survived": test_predictions.astype(int)
})

# Write the submission to disk; index=False keeps the row index out of the
# file so that it contains only the two columns above.
submission.to_csv("submission.csv", index=False)

📊 5‑Fold Cross‑Validation Accuracy
- Logistic Regression : 0.8249
- Random Forest       : 0.8047

✅ Best model: Logistic Regression (0.8249)

Confusion Matrix (20 % hold‑out):
[[96 14]
 [17 52]]

Classification Report:
              precision    recall  f1-score   support

           0     0.8496    0.8727    0.8610       110
           1     0.7879    0.7536    0.7704        69

    accuracy                         0.8268       179
   macro avg     0.8187    0.8132    0.8157       179
weighted avg     0.8258    0.8268    0.8261       179

