# Titanic Survival Prediction with XGBoost

## Download and Load Dataset

In [76]:
!python -m pip install --upgrade pip
!pip install xgboost pandas scikit-learn matplotlib seaborn kagglehub



In [77]:
import kagglehub

# Download the latest version of the dataset and keep the location that
# kagglehub reports for the downloaded files.
DATASET_HANDLE = "yasserh/titanic-dataset"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: C:\Users\nikhi\.cache\kagglehub\datasets\yasserh\titanic-dataset\versions\1


In [78]:
import pandas as pd
import numpy as np
import os

# Read the passenger table from the CSV inside the downloaded dataset folder.
csv_name = 'Titanic-Dataset.csv'
data_path = os.path.join(path, csv_name)
df = pd.read_csv(data_path)

# Report the dimensions, then preview the first rows as the cell output.
print(f"Dataset shape: {df.shape}")
df.head()

Dataset shape: (891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Explore the Data

In [79]:
# Summarise each column's dtype and non-null count to spot fields with gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [80]:
# Count the missing entries in every column.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## Data Preprocessing

In [81]:
from sklearn.impute import SimpleImputer

# Work on a copy so the raw frame loaded earlier stays untouched.
df_clean = df.copy()

# Create HasCabin feature: 1 when a Cabin value is present, 0 when it is missing.
df_clean['HasCabin'] = df_clean['Cabin'].notna().astype(int)

# Extract Title from Name: the run of letters that follows a space and is
# immediately followed by a period. The pattern is a raw string so that `\.`
# reaches the regex engine as an escaped dot instead of being treated as an
# (invalid) string escape sequence.
df_clean['Title'] = df_clean['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
title_mapping = {
    'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
    'Dr': 'Rare', 'Rev': 'Rare', 'Col': 'Rare', 'Major': 'Rare',
    'Mlle': 'Miss', 'Countess': 'Rare', 'Ms': 'Miss', 'Lady': 'Rare',
    'Jonkheer': 'Rare', 'Don': 'Rare', 'Dona': 'Rare', 'Mme': 'Mrs',
    'Capt': 'Rare', 'Sir': 'Rare'
}
# Titles absent from the mapping (and names where nothing matched) become 'Rare'.
df_clean['Title'] = df_clean['Title'].map(title_mapping).fillna('Rare')

# Family Size features: SibSp + Parch + 1 (the +1 presumably counts the
# passenger themselves), plus a flag for a family size of exactly one.
df_clean['FamilySize'] = df_clean['SibSp'] + df_clean['Parch'] + 1
df_clean['IsAlone'] = (df_clean['FamilySize'] == 1).astype(int)

# Fare per person
df_clean['FarePerPerson'] = df_clean['Fare'] / df_clean['FamilySize']
# Assign the filled Series back to the column: calling fillna(inplace=True) on
# `df_clean[col]` operates on an intermediate object, triggers a pandas
# chained-assignment warning and will not update the frame in pandas 3.0.
# NOTE(review): the fill value is the median of Fare over the full dataset,
# not of FarePerPerson -- confirm this is intended.
df_clean['FarePerPerson'] = df_clean['FarePerPerson'].fillna(df_clean['Fare'].median())

# Drop unnecessary columns (reassignment instead of inplace mutation).
df_clean = df_clean.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch']
)

# Encode categorical variables: Sex as 0/1, the rest as one-hot columns with
# the first level of each dropped.
df_clean['Sex'] = df_clean['Sex'].map({'male': 0, 'female': 1})
df_clean = pd.get_dummies(df_clean, columns=['Embarked', 'Title', 'Pclass'], drop_first=True)

print(f"Dataset shape: {df_clean.shape}")
df_clean.head()

Dataset shape: (891, 16)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean['FarePerPerson'].fillna(df_clean['Fare'].median(), inplace=True)


Unnamed: 0,Survived,Sex,Age,Fare,HasCabin,FamilySize,IsAlone,FarePerPerson,Embarked_Q,Embarked_S,Title_Miss,Title_Mr,Title_Mrs,Title_Rare,Pclass_2,Pclass_3
0,0,0,22.0,7.25,0,2,0,3.625,False,True,False,True,False,False,False,True
1,1,1,38.0,71.2833,1,2,0,35.64165,False,False,False,False,True,False,False,False
2,1,1,26.0,7.925,0,1,1,7.925,False,True,True,False,False,False,False,True
3,1,1,35.0,53.1,1,2,0,26.55,False,True,False,False,True,False,False,False
4,0,0,35.0,8.05,0,1,1,8.05,False,True,False,True,False,False,False,True


## Split Data

In [82]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Separate the target column from the feature columns.
y = df_clean['Survived']
X = df_clean.drop('Survived', axis=1)

# Hold out 20% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Impute missing values using training data only: the imputer is fitted on the
# training rows and merely applied to the test rows.
imputer = SimpleImputer(strategy='median')
train_filled = imputer.fit_transform(X_train)
test_filled = imputer.transform(X_test)

# Wrap the imputed values back into DataFrames that keep the original column
# names and row index.
X_train = pd.DataFrame(train_filled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(test_filled, columns=X_test.columns, index=X_test.index)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

Training set: (712, 15)
Test set: (179, 15)


## Train Model

In [83]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from scipy.stats import uniform, randint

# Distributions the randomized search samples each hyperparameter from.
param_dist = dict(
    max_depth=randint(3, 10),
    learning_rate=uniform(0.01, 0.3),
    n_estimators=randint(100, 500),
    min_child_weight=randint(1, 7),
    subsample=uniform(0.6, 0.4),
    colsample_bytree=uniform(0.6, 0.4),
    gamma=uniform(0, 0.5),
)

xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)

# 50 sampled candidates, each scored by 5-fold cross-validated accuracy,
# evaluated in parallel with a fixed seed.
search_options = {
    'n_iter': 50,
    'cv': 5,
    'scoring': 'accuracy',
    'n_jobs': -1,
    'random_state': 42,
    'verbose': 0,
}
random_search = RandomizedSearchCV(xgb_model, param_dist, **search_options)
random_search.fit(X_train, y_train)

# Keep the best estimator found by the search as the XGBoost model used later.
model = random_search.best_estimator_

print(f"Best CV Score: {random_search.best_score_:.4f}")

Best CV Score: 0.8343


In [84]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Train additional models for ensemble.

# The random forest is fitted on the unscaled features.
rf_model = RandomForestClassifier(
    n_estimators=300, max_depth=7, random_state=42
).fit(X_train, y_train)

# Logistic regression is fitted on standardised features; the scaler is fitted
# on the training split and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_scaled, y_train
)

print("Models trained")

Models trained


## Make Predictions

In [85]:
# Ensemble predictions: collect class probabilities from every member.
# Logistic regression receives the scaled test features, the others the
# unscaled ones.
lr_proba = lr_model.predict_proba(X_test_scaled)
rf_proba = rf_model.predict_proba(X_test)
xgb_proba = model.predict_proba(X_test)

# Unweighted average of the three probability matrices.
ensemble_proba = (xgb_proba + rf_proba + lr_proba) / 3

# Threshold column 1 of the averaged probabilities at 0.5 for the hard label.
positive_proba = ensemble_proba[:, 1]
y_pred = (positive_proba > 0.5).astype(int)

# Side-by-side view of true labels, predicted labels and the averaged
# column-1 probability.
predictions_df = pd.DataFrame({
    'Actual': y_test.values,
    'Predicted': y_pred,
    'Confidence': positive_proba,
})

print(predictions_df.head(15).to_string(index=False))
print(f"\nTotal predictions: {len(y_pred)}")

 Actual  Predicted  Confidence
      0          0    0.051581
      0          0    0.055585
      1          0    0.127185
      0          0    0.046069
      1          1    0.667263
      1          0    0.418357
      1          1    0.756317
      0          1    0.698762
      0          0    0.429852
      0          0    0.129303
      0          0    0.163656
      0          0    0.141153
      1          0    0.460901
      0          0    0.154480
      0          0    0.215813

Total predictions: 179


## Model Accuracy

In [86]:
from sklearn.metrics import accuracy_score

# Headline numbers for the held-out test set.
total = len(y_test)
matches = y_test == y_pred
correct = matches.sum()
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Correct Predictions: {correct}/{total}")
print(f"Predicted Survivors: {(y_pred == 1).sum()}/{total}")

Accuracy: 0.8212 (82.12%)
Correct Predictions: 147/179
Predicted Survivors: 59/179


## Error Analysis

In [87]:
# Boolean masks for the actual and predicted labels, reused for every cell of
# the confusion matrix.
actual_pos = y_test == 1
actual_neg = y_test == 0
pred_pos = y_pred == 1
pred_neg = y_pred == 0

true_negatives = (actual_neg & pred_neg).sum()
true_positives = (actual_pos & pred_pos).sum()
false_positives = (actual_neg & pred_pos).sum()
false_negatives = (actual_pos & pred_neg).sum()

print(f"True Negatives: {true_negatives}")
print(f"True Positives: {true_positives}")
print(f"False Positives: {false_positives}")
print(f"False Negatives: {false_negatives}")

# Precision and recall fall back to 0 when their denominator is empty.
predicted_positive_count = true_positives + false_positives
actual_positive_count = true_positives + false_negatives

if predicted_positive_count > 0:
    precision = true_positives / predicted_positive_count
else:
    precision = 0

if actual_positive_count > 0:
    recall = true_positives / actual_positive_count
else:
    recall = 0

print(f"\nPrecision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

True Negatives: 99
True Positives: 48
False Positives: 11
False Negatives: 21

Precision: 0.8136
Recall: 0.6957
