In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, classification_report
import warnings
warnings.filterwarnings('ignore')

# 1. Read the Dataset
# Both CSV files are read from the current working directory; the training
# frame is the one used by the steps below.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# 2. Preprocess the Dataset
# Handle Missing Values
# Age: replace missing entries with the column mean.
# NOTE: the mean is computed over the whole training frame, i.e. before the
# hold-out split performed further down.
imputer = SimpleImputer(strategy='mean')
# fit_transform hands back a 2-D (n_rows, 1) result; flatten it so that a plain
# 1-D column is assigned.
train_df['Age'] = np.ravel(imputer.fit_transform(train_df[['Age']]))
# Embarked: fill gaps with the most frequent value. The result is assigned back
# instead of calling fillna(inplace=True) on the selected column: that chained
# in-place form acts on a temporary object under pandas copy-on-write and
# leaves train_df unchanged (pandas >= 2.2 warns about it, but warnings are
# silenced at the top of this cell).
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])
# Cabin is dropped entirely rather than imputed.
train_df.drop(columns=['Cabin'], inplace=True)

# Remove Duplicate Data
train_df.drop_duplicates(inplace=True)

# Column groups routed through the ColumnTransformer defined below
numerical_features = ['Age', 'Fare', 'SibSp', 'Parch', 'Pclass']
categorical_features = ['Sex', 'Embarked']

# Numeric columns: mean-impute, then standardise
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: one-hot encode; categories not seen during fit are ignored
categorical_transformer = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its own transformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Separate the target from the feature frame (free-text columns are discarded)
y = train_df['Survived']
X = train_df.drop(['Survived', 'Name', 'Ticket'], axis=1)

# Hold out 20% of the rows for evaluation, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit the preprocessing on the training part only, then reuse the fitted
# transformer on the held-out part
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# 3. Model Experimentation

# Define models
# The forest gets a fixed random_state so that the grid search and the metrics
# reported afterwards are reproducible from run to run, in line with the seeded
# train/test split.
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'SVC': SVC()
}

# Define hyperparameters for GridSearch (keyed by the same names as `models`)
param_grid = {
    'RandomForest': {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20]
    },
    'SVC': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf']
    }
}

# Experiment with different models and hyperparameters:
# run a 5-fold, accuracy-scored grid search per model and keep the refitted
# best estimator of each search.
best_models = {}
for model_name, estimator in models.items():
    clf = GridSearchCV(estimator, param_grid[model_name], cv=5, scoring='accuracy')
    clf.fit(X_train, y_train)
    best_models[model_name] = clf.best_estimator_

# 4. Model Evaluation

def _precision_then_recall(name):
    """Ranking key: precision first; recall only breaks exact precision ties."""
    scores = evaluation_results[name]
    return (scores['precision'], scores['recall'])


# Score every tuned model on the held-out split
evaluation_results = {}
for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    evaluation_results[model_name] = dict(
        precision=precision,
        recall=recall,
        classification_report=classification_report(y_test, y_pred),
    )

# Pick the top-ranked model; on a full tie the first one inserted wins
best_model_name = max(evaluation_results, key=_precision_then_recall)
best_model = best_models[best_model_name]

# Print the per-model metrics, one separator-terminated section per model
for model_name, metrics in evaluation_results.items():
    print(
        f"Model: {model_name}",
        f"Precision: {metrics['precision']}",
        f"Recall: {metrics['recall']}",
        metrics['classification_report'],
        "\n" + "-"*50 + "\n",
        sep="\n",
    )

print(f"Best Model: {best_model_name}")

# Discussion: canned commentary keyed by the winning model's name;
# nothing is printed for a name without an entry
discussion_by_model = {
    'RandomForest': "The RandomForest model was chosen as the best model. This is likely because RandomForest can handle the complexity of the data and is robust to overfitting. It also effectively handles feature interactions and missing values, which is beneficial for the Titanic dataset.",
    'SVC': "The SVC model was chosen as the best model. SVC is effective for high-dimensional spaces and when the number of samples is less than the number of features. Its kernel trick allows it to model complex non-linear relationships in the data, which might have led to its better performance on the Titanic dataset.",
}
discussion = discussion_by_model.get(best_model_name)
if discussion is not None:
    print(discussion)


Model: RandomForest
Precision: 0.803030303030303
Recall: 0.7162162162162162
              precision    recall  f1-score   support

           0       0.81      0.88      0.84       105
           1       0.80      0.72      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179


--------------------------------------------------

Model: SVC
Precision: 0.803030303030303
Recall: 0.7162162162162162
              precision    recall  f1-score   support

           0       0.81      0.88      0.84       105
           1       0.80      0.72      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179


--------------------------------------------------

Best Model: RandomForest
The RandomForest model was chosen as the best model. This is likely beca