# Titanic survival prediction

This notebook builds a simple machine-learning pipeline that predicts which passengers survived the Titanic disaster, using the provided `Titanic-Dataset.csv` file. **Inputs:** passenger features (`Pclass`, `Sex`, `Age`, `SibSp`, `Parch`, `Fare`, `Embarked`, and a `Title` extracted from `Name`). **Outputs:** accuracy, classification reports and confusion matrices for Logistic Regression and Random Forest, Random Forest ROC-AUC and feature importances, plus saved model and scaler files (`titanic_rf.pkl`, `titanic_lr.pkl`, `titanic_scaler.pkl`).

In [12]:
# 1) Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import joblib

# Reproducibility: one seed, passed as `random_state` to the train/test split
# and to both classifiers further down in this notebook.
RANDOM_STATE = 42

In [13]:
# 2) Load dataset & quick EDA
# Read the passenger table, then report its size, its column names and the
# number of missing entries per column before previewing the first rows.
df = pd.read_csv('Titanic-Dataset.csv')

print('Dataset shape:', df.shape)
print('Columns:', df.columns.tolist())
print('\nMissing values:\n', df.isna().sum())

df.head(5)

Dataset shape: (891, 12)
Columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

Missing values:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# 2b) Extract title from Name column
def extract_title(name):
    """Extract a coarse honorific category from a passenger name.

    Names are expected in the form ``"Surname, Title. Given names"``; the
    title is the text between the first comma and the next period.

    Parameters
    ----------
    name : str or missing value
        Raw passenger name.

    Returns
    -------
    str
        ``'Unknown'`` when the name is missing, has no comma, or yields an
        empty title; ``'Rare'`` for uncommon honorifics; ``'Miss'`` / ``'Mrs'``
        for their French / alternative spellings; otherwise the title as found
        (e.g. ``'Mr'``, ``'Master'``).
    """
    if pd.isna(name):
        return 'Unknown'

    segments = name.split(',')
    if len(segments) < 2:
        # No "Surname, ..." separator: there is nothing to index into, so
        # treat it like a missing name instead of raising IndexError.
        return 'Unknown'

    title = segments[1].split('.')[0].strip()

    # A leading article (as in "the Countess") would otherwise stop the title
    # from matching the 'Countess' entry below and create a one-off category.
    if title.lower().startswith('the '):
        title = title[4:].strip()

    if not title:
        return 'Unknown'

    # Group rare titles into categories
    if title in ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']:
        return 'Rare'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title == 'Mme':
        return 'Mrs'
    return title

# Derive the Title column element-wise from Name, then show how many
# passengers fall into each title category.
df['Title'] = df['Name'].map(extract_title)

print('Title distribution:')
print(df['Title'].value_counts())

In [None]:
# 3) Preprocessing (feature selection, imputation, encoding)
# Model inputs: raw passenger columns plus the engineered Title column.
features = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title',
]
y = df['Survived'].copy()

# Build the feature matrix in one chain:
#   - Age gaps      -> column median
#   - Embarked gaps -> most frequent port
#   - Sex           -> 0 (male) / 1 (female)
#   - Embarked and Title -> one-hot columns, first level dropped
# NOTE(review): the median/mode are computed on the full dataset, i.e. before
# the train/test split performed in the next step, so test rows influence the
# fill values.
X = (
    df[features]
    .assign(
        Age=lambda frame: frame['Age'].fillna(frame['Age'].median()),
        Embarked=lambda frame: frame['Embarked'].fillna(frame['Embarked'].mode()[0]),
        Sex=lambda frame: frame['Sex'].map({'male': 0, 'female': 1}),
    )
    .pipe(pd.get_dummies, columns=['Embarked', 'Title'], drop_first=True)
)

print('Features after preprocessing:')
X.head()

Features after preprocessing:


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,3,0,22.0,1,0,7.25,False,True
1,1,1,38.0,1,0,71.2833,False,False
2,3,1,26.0,0,0,7.925,False,True
3,1,1,35.0,1,0,53.1,False,True
4,3,0,35.0,0,0,8.05,False,True


In [15]:
# 4) Train / test split and scaling
# Hold out 20% of the rows, stratified on the target so both parts keep the
# same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

# The scaler learns its statistics from the training rows only and is then
# applied to both parts (used by Logistic Regression).
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [16]:
# 5) Train models (Logistic Regression and Random Forest)
# Logistic Regression is fitted on the standardised features.
lr = LogisticRegression(max_iter=200, random_state=RANDOM_STATE).fit(
    X_train_scaled, y_train
)

# The Random Forest is fitted on the original, unscaled features.
rf = RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=100).fit(
    X_train, y_train
)

print('Models trained: Logistic Regression and Random Forest')

Models trained: Logistic Regression and Random Forest


In [17]:
# 6a) Evaluate Logistic Regression
# `lr` was fitted on scaled features, so it is evaluated on the scaled test matrix.
y_pred_lr = lr.predict(X_test_scaled)
# Probability of the second class (column index 1), needed for ROC-AUC so that
# Logistic Regression is reported on the same metrics as the Random Forest.
y_pred_lr_proba = lr.predict_proba(X_test_scaled)[:, 1]
print('Logistic Regression')
print('Accuracy:', accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred_lr))
print('ROC-AUC (LR):', roc_auc_score(y_test, y_pred_lr_proba))

Logistic Regression
Accuracy: 0.8044692737430168
              precision    recall  f1-score   support

           0       0.81      0.89      0.85       110
           1       0.79      0.67      0.72        69

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179

Confusion Matrix:
 [[98 12]
 [23 46]]


In [18]:
# 6b) Evaluate Random Forest
# Hard class labels feed accuracy, the classification report and the confusion
# matrix; the second column of predict_proba feeds ROC-AUC.
y_pred_rf = rf.predict(X_test)
y_pred_rf_proba = rf.predict_proba(X_test)[:, 1]

print('Random Forest')
print('Accuracy:', accuracy_score(y_true=y_test, y_pred=y_pred_rf))
print(classification_report(y_true=y_test, y_pred=y_pred_rf))
print('Confusion Matrix:\n', confusion_matrix(y_true=y_test, y_pred=y_pred_rf))
print('ROC-AUC (RF):', roc_auc_score(y_true=y_test, y_score=y_pred_rf_proba))

Random Forest
Accuracy: 0.8044692737430168
              precision    recall  f1-score   support

           0       0.82      0.88      0.85       110
           1       0.78      0.68      0.73        69

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179

Confusion Matrix:
 [[97 13]
 [22 47]]
ROC-AUC (RF): 0.8347826086956522


In [19]:
# 6c) Feature importances (Random Forest)
# Label each importance score with its feature column and list them from most
# to least important.
importances = pd.Series(
    data=rf.feature_importances_,
    index=X.columns,
)
importances = importances.sort_values(ascending=False)

print('Feature importances (Random Forest):')
print(importances)

Feature importances (Random Forest):
Fare          0.274959
Sex           0.264552
Age           0.251037
Pclass        0.085327
SibSp         0.049806
Parch         0.038882
Embarked_S    0.023228
Embarked_Q    0.012210
dtype: float64


In [20]:
# 7) Save models and scaler
# Persist both fitted classifiers and the fitted scaler, one file per object,
# in the current working directory.
for fitted_object, target_file in (
    (rf, 'titanic_rf.pkl'),
    (lr, 'titanic_lr.pkl'),
    (scaler, 'titanic_scaler.pkl'),
):
    joblib.dump(fitted_object, target_file)

print('Saved models: titanic_rf.pkl, titanic_lr.pkl, titanic_scaler.pkl')

Saved models: titanic_rf.pkl, titanic_lr.pkl, titanic_scaler.pkl
