<a href="https://colab.research.google.com/github/Mujtaba-4T4/titanic-survival/blob/main/titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Titanic - Machine Learning from Disaster
Predict if someone survived the titanic or not

https://www.kaggle.com/competitions/titanic

# Data preparation

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
train_df = pd.read_csv('train.csv')

In [3]:
train_df = train_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

In [4]:
train_df['Sex'] = train_df['Sex'].map({'male': 0, 'female': 1})
train_df['Embarked'] = train_df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})
train_df = train_df.dropna()

In [5]:
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,2.0
1,1,1,1,38.0,1,0,71.2833,0.0
2,1,3,1,26.0,0,0,7.925,2.0
3,1,1,1,35.0,1,0,53.1,2.0
4,0,3,0,35.0,0,0,8.05,2.0


## scaling data

In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train_df[['Age', 'Fare']] = scaler.fit_transform(train_df[['Age', 'Fare']])

## splitting data

In [7]:
X = train_df.drop(['Survived'], axis=1)
y = train_df['Survived']

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000, C=10 ,  solver='lbfgs')
model.fit(X_train, y_train)
lr_pred = model.predict(X_val)

# Other models

## Decision Tree Classifier

In [14]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(max_depth=5, min_samples_split=5)
model.fit(X_train, y_train)
dt_pred = model.predict(X_val)

## Random Forest	Classifier

In [16]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(max_depth=10, min_samples_split=5, n_estimators=50)
model.fit(X_train, y_train)
rf_pred = model.predict(X_val)

## Gradient Boosting Classifier

In [17]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(learning_rate=0.01, max_depth=5, n_estimators=100)
model.fit(X_train, y_train)
gb_pred = model.predict(X_val)

## KNN Classifier

In [18]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(metric='manhattan', n_neighbors=9, weights='uniform')
model.fit(X_train, y_train)
knn_pred = model.predict(X_val)

## SVM Classifier

In [19]:
from sklearn.svm import SVC

model = SVC(C=1, gamma='auto', kernel='rbf')
model.fit(X_train, y_train)
svm_pred = model.predict(X_val)

## XGBoost	Classifier

In [20]:
from xgboost import XGBClassifier

model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', learning_rate=0.1, max_depth=5, n_estimators=50)
model.fit(X_train, y_train)
xgb_pred = model.predict(X_val)

# Hyperparameter Tuning

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(probability=True),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

In [10]:
param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10],
        'solver': ['liblinear', 'lbfgs']
    },
    'Decision Tree': {
        'max_depth': [3, 5, 10, None],
        'min_samples_split': [2, 5, 10]
    },
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, None],
        'min_samples_split': [2, 5]
    },
    'Gradient Boosting': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5]
    },
    'KNN': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto']
    },
    'XGBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    }
}

In [12]:
from sklearn.model_selection import GridSearchCV

best_models = {}

for name in models:
    grid = GridSearchCV(models[name], param_grids[name], cv=5, scoring='accuracy')
    grid.fit(X_train, y_train)
    best_models[name] = grid.best_estimator_
    print(f"{name} best score: {grid.best_score_:.4f}")
    print(f"Best Params: {grid.best_params_}")

Logistic Regression best score: 0.8032
Best Params: {'C': 10, 'solver': 'lbfgs'}
Decision Tree best score: 0.8067
Best Params: {'max_depth': 5, 'min_samples_split': 5}
Random Forest best score: 0.8173
Best Params: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 50}
Gradient Boosting best score: 0.8155
Best Params: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}
KNN best score: 0.8032
Best Params: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'uniform'}
SVM best score: 0.8172
Best Params: {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}
XGBoost best score: 0.8260
Best Params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50}


## BEST Parameters

Logistic Regression best score: 0.8032
Best Params: {'C': 10, 'solver': 'lbfgs'}

Decision Tree best score: 0.8067
Best Params: {'max_depth': 5, 'min_samples_split': 5}

Random Forest best score: 0.8173
Best Params: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 50}

Gradient Boosting best score: 0.8155
Best Params: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}

KNN best score: 0.8032
Best Params: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'uniform'}

SVM best score: 0.8172
Best Params: {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}

XGBoost best score: 0.8260
Best Params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50}

# Results

In [22]:
from sklearn.metrics import accuracy_score

predictions = {
    "Logistic Regression": lr_pred,
    "Decision Tree": dt_pred,
    "Random Forest": rf_pred,
    "Gradient Boosting": gb_pred,
    "KNN": knn_pred,
    "SVM": svm_pred,
    "XGBoost": xgb_pred
}

for name, pred in predictions.items():
    print(f"\n=== {name} ===")
    print("Accuracy:", accuracy_score(y_val, pred))


=== Logistic Regression ===
Accuracy: 0.7972027972027972

=== Decision Tree ===
Accuracy: 0.7202797202797203

=== Random Forest ===
Accuracy: 0.8041958041958042

=== Gradient Boosting ===
Accuracy: 0.7552447552447552

=== KNN ===
Accuracy: 0.7972027972027972

=== SVM ===
Accuracy: 0.8321678321678322

=== XGBoost ===
Accuracy: 0.7902097902097902
