In [1]:
## Step 1: Mount Google Drive
# Attach Google Drive to the Colab filesystem; every path used later in this
# notebook lives under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
## Create necessary directories
import os

# Project root on the mounted Drive; trained models are stored in its
# `models` subfolder.
project_folder = '/content/drive/MyDrive/Titanic-Survival-Prediction'
models_folder = os.path.join(project_folder, 'models')

# exist_ok=True keeps this cell safe to re-run: an already-existing folder
# is not treated as an error.
os.makedirs(name=models_folder, exist_ok=True)

In [3]:
## Step 2: Import Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

In [4]:
## Step 3: Load Cleaned Data
# Build the CSV location from `project_folder` (defined in the directory-setup
# cell) rather than repeating the absolute Drive path, so the project root is
# configured in exactly one place. The resulting path is unchanged:
# <project_folder>/data/processed/Titanic-Cleaned.csv
file_path = os.path.join(project_folder, 'data', 'processed', 'Titanic-Cleaned.csv')
df = pd.read_csv(file_path)

In [5]:
## Step 4: Encode Categorical Features
# Fit one LabelEncoder per column and keep every fitted encoder. Sharing a
# single encoder across columns overwrites the 'Sex' mapping as soon as
# 'Embarked' is fitted, which makes it impossible to decode the integer codes
# or to apply the same mapping to new data at prediction time.
label_encoders = {}
for column in ('Sex', 'Embarked'):
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le
# `le` is left bound to the 'Embarked' encoder for any later cell that still
# refers to it; prefer label_encoders['Sex'] / label_encoders['Embarked'].

In [6]:
## Step 5: Define Features and Target
# 'Survived' is the target and must exist, so dropping it is allowed to raise
# KeyError when it is missing. 'Name' and 'Ticket' are non-numeric columns that
# may or may not be in the cleaned file; errors='ignore' drops them only when
# present instead of failing the whole cell.
X = (
    df.drop(columns=['Survived'])
      .drop(columns=['Name', 'Ticket'], errors='ignore')
)  # Features
y = df['Survived']  # Target Variable

In [7]:
## Step 6: Split Data into Training and Testing Sets
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
## Step 7: Train and Evaluate Multiple Models

# Candidate classifiers, keyed by the display name used in the printed report
# and, later, in the saved model's file name.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Support Vector Machine": SVC(kernel='linear', random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
}

# Running record of the best-scoring model seen so far.
best_model, best_accuracy, best_model_name = None, 0, ""

# NOTE(review): the winner is chosen by accuracy on the held-out test split,
# so the reported best accuracy is an optimistic estimate - consider
# cross-validation on the training data for model selection.
for name, model in models.items():
    # Fit on the training split, then score on the held-out split.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    print(f'\n{name} Accuracy: {acc:.2f}')
    print('\nClassification Report:\n', classification_report(y_test, y_pred))
    print('\nConfusion Matrix:\n', confusion_matrix(y_test, y_pred))

    # Strict comparison: on a tie the earlier model in `models` is kept.
    if acc > best_accuracy:
        best_accuracy, best_model, best_model_name = acc, model, name


Random Forest Accuracy: 0.83

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.89      0.86       105
           1       0.82      0.74      0.78        74

    accuracy                           0.83       179
   macro avg       0.83      0.81      0.82       179
weighted avg       0.83      0.83      0.83       179


Confusion Matrix:
 [[93 12]
 [19 55]]

Logistic Regression Accuracy: 0.81

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179


Confusion Matrix:
 [[90 15]
 [19 55]]

Support Vector Machine Accuracy: 0.78

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.84

In [9]:
## Step 8: Save the Best Model
# File name is derived from the winning model's display name, with spaces
# replaced by underscores (e.g. "Random Forest" -> best_model_Random_Forest.pkl).
safe_model_name = best_model_name.replace(" ", "_")
best_model_path = os.path.join(models_folder, f'best_model_{safe_model_name}.pkl')

# Persist the fitted estimator into the models folder.
joblib.dump(best_model, best_model_path)
print(f'Best model ({best_model_name}) saved at: {best_model_path}')

Best model (Random Forest) saved at: /content/drive/MyDrive/Titanic-Survival-Prediction/models/best_model_Random_Forest.pkl
