# Task 14: Model Comparison & Best Model Selection

This notebook implements a machine learning pipeline that compares multiple models on the Titanic dataset, selects the best one, and saves it.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

# Apply seaborn's "whitegrid" theme to every figure drawn below
sns.set_theme(style="whitegrid")

## 1. Load and Preprocess Data

In [None]:
# Read the raw passenger data from the CSV file
df = pd.read_csv('Titanic-Dataset.csv')

# Preview the first rows as a rich table
preview = df.head()
display(preview)

# Count the missing entries in each column before any cleaning is applied
missing_counts = df.isnull().sum()
print("\nMissing values before processing:\n", missing_counts)

In [None]:
# Handle missing values.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on df['Age'] / df['Embarked']: that form is chained assignment, which emits
# a FutureWarning in pandas 2.x and leaves df unchanged under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Drop Cabin (too many missing values for a simple baseline) together with
# the features that are irrelevant for prediction.
# errors='ignore' lets the cell be re-run without a KeyError once the
# columns are already gone.
df = df.drop(columns=['Cabin', 'PassengerId', 'Name', 'Ticket'], errors='ignore')

# Encode categorical variables.
# Each column gets its own fitted encoder so the integer codes of both
# columns can still be mapped back to the original labels later.
label_encoders = {}
for col in ['Sex', 'Embarked']:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

# Check info after preprocessing
print("\nData Check after preprocessing:")
df.info()

## 2. Split Dataset

In [None]:
# Separate the feature matrix from the target column
X = df.drop(columns=['Survived'])
y = df['Survived']

# Split BEFORE scaling so the held-out rows never influence the scaler.
# Fitting StandardScaler on the full dataset would leak the test set's
# mean/variance into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply the same
# transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

## 3. Train Models & 4. Predict

In [None]:
# Candidate classifiers, keyed by the display name used in the table and plots
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(probability=True, random_state=42),
}

# Test-set metrics to report, in the column order of the results table
test_metrics = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

# One record (dict) per model; turned into a DataFrame in a later cell
results = []

for name, model in models.items():
    # Fit on the training split
    model.fit(X_train, y_train)

    # Predict on both splits: test for the metrics, train for the overfitting check
    test_pred = model.predict(X_test)
    train_pred = model.predict(X_train)

    # Assemble the record; key insertion order defines the table's column order
    record = {"Model": name}
    for metric_name, metric_fn in test_metrics.items():
        record[metric_name] = metric_fn(y_test, test_pred)
    record["Train Accuracy"] = accuracy_score(y_train, train_pred)

    results.append(record)

## 5. Evaluate & 6. Comparison Table

In [None]:
# One row per model, one column per metric collected in the training loop
results_df = pd.DataFrame(data=results)

# Render the comparison table
display(results_df)

## 7. Comparison Plot

In [None]:
# Reshape to long form (one row per model/metric pair) so seaborn can
# group the bars by metric
melted_df = pd.melt(
    results_df,
    id_vars="Model",
    value_vars=["Accuracy", "Precision", "Recall", "F1 Score"],
    var_name="Metric",
    value_name="Score",
)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=melted_df, x="Model", y="Score", hue="Metric", palette="viridis", ax=ax)
ax.set_title("Model Performance Comparison")
# Leave headroom above 1.0 so full-height bars do not touch the frame
ax.set_ylim(0, 1.1)
# Place the legend outside the axes, to the right of the plot area
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

## 8. Overfitting Check

In [None]:
# Side-by-side bars per model: train accuracy on the left, test on the right
bar_width = 0.35
positions = np.arange(len(results_df))

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(positions - bar_width / 2, results_df["Train Accuracy"], bar_width, label='Train Accuracy')
ax.bar(positions + bar_width / 2, results_df["Accuracy"], bar_width, label='Test Accuracy')

ax.set_xlabel('Model')
ax.set_ylabel('Accuracy')
ax.set_title('Train vs Test Accuracy (Overfitting Check)')
ax.set_xticks(positions)
ax.set_xticklabels(results_df["Model"])
ax.legend()
plt.show()

# Report the train/test gap per model; a gap above 0.05 is flagged
gap_rows = zip(results_df["Model"], results_df["Train Accuracy"], results_df["Accuracy"])
for model_name, train_acc, test_acc in gap_rows:
    gap = train_acc - test_acc
    if gap > 0.05:
        flag = '(Possible Overfitting)'
    else:
        flag = ''
    print(f"{model_name}: Train-Test Difference = {gap:.4f} {flag}")

## 9. Select Best Model & Save

In [None]:
# Select the model with the highest F1 Score (balanced metric).
# A single .loc[row, column] lookup replaces the chained .loc[...][...] form.
# NOTE(review): the F1 Score used here was computed on the test split, so
# the winner's reported score is optimistic; consider cross-validation on
# the training data for the selection itself.
best_idx = results_df['F1 Score'].idxmax()
best_model_name = results_df.loc[best_idx, 'Model']
best_model = models[best_model_name]

print(f"Best Model Selected: {best_model_name}")

# Save the model
joblib.dump(best_model, 'best_titanic_model.pkl')
print("Model saved as 'best_titanic_model.pkl'")

# The models were trained on StandardScaler-transformed features, so the
# fitted scaler is required to prepare inputs at inference time. It is
# saved to its own file so the model artifact itself stays unchanged.
joblib.dump(scaler, 'titanic_scaler.pkl')
print("Scaler saved as 'titanic_scaler.pkl'")

## 10. Explanation

### Approach
1.  **Preprocessing**: Missing values in Age and Embarked were filled with the median and the mode, respectively; Cabin, PassengerId, Name, and Ticket were dropped. Categorical variables (Sex, Embarked) were label encoded. Features were scaled using StandardScaler.
2.  **Models**: Comparison of Logistic Regression, Decision Tree, Random Forest, and SVM.
3.  **Metrics**: Evaluation based on Accuracy, Precision, Recall, and F1-Score.

### Results & Selection
- **Decision Tree** and **Random Forest** often show higher training accuracy compared to test accuracy, indicating potential overfitting.
- **Logistic Regression** and **SVM** usually generalize better with smaller gaps between train and test scores.
- The best model was selected by **F1 Score**, which balances precision and recall and is therefore more informative than accuracy alone when the classes (survived vs. did not survive) are imbalanced.
- The selected model is saved for future inference.