Import necessary libraries

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

Load the dataset

In [5]:
# Read the passenger records from the CSV file (path is relative to the working directory).
csv_path = 'Titanic-Dataset.csv'
data = pd.read_csv(csv_path)

In [6]:
# Show the loaded frame; pandas abbreviates long frames to the first and last rows.
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


 Data Exploration and Cleaning

In [7]:
# Count the missing entries in every column before any cleaning is applied.
missing_per_column = data.isna().sum()
print(missing_per_column)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [8]:
# Drop 'Cabin' (687 missing values) and 'Ticket' (not used as a feature below).
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
data = data.drop(columns=['Cabin', 'Ticket'], errors='ignore')

# Fill missing values in 'Age' (median) and 'Embarked' (most frequent value).
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that chained in-place form is deprecated and is a no-op under
# pandas Copy-on-Write, which would silently leave the NaNs in place.
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

Feature Engineering

In [9]:
# Replace the text categories in 'Sex' and 'Embarked' with integer codes.
#   Sex:      0 = female, 1 = male
#   Embarked: 0 = C, 1 = Q, 2 = S
# The same encoder object is refitted for each column in turn, so after this
# cell `le` holds the classes of the last column only ('Embarked').
le = LabelEncoder()
for categorical_column in ('Sex', 'Embarked'):
    data[categorical_column] = le.fit_transform(data[categorical_column])

In [10]:
# Re-count missing values per column to confirm the cleaning left none behind.
remaining_nulls = data.isnull().sum()
print(remaining_nulls)

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64


Data Pre-processing

In [11]:
# Columns used as model inputs; 'Survived' is the prediction target.
features = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]
X = data.loc[:, features]
y = data.loc[:, 'Survived']

In [13]:
# Standardize every feature column (zero mean, unit variance).
# The scaler's statistics are learned from all rows of X in this cell.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

Split the Data into training and testing sets

In [14]:
# Split the *unscaled* features first, then learn the scaling statistics from
# the training rows only. Fitting the scaler on the full dataset before
# splitting lets test-set means/variances leak into preprocessing.
# The row partition is unchanged: same row count, test_size and random_state.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

Model Selection and Training (Random Forest Classifier)

In [15]:
# Random forest with 100 trees; the fixed seed keeps training repeatable.
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

Evaluate the Model

In [16]:
# Predict on the held-out rows and compute the evaluation metrics.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print each metric after its label.
metric_report = (
    ("Accuracy:", accuracy),
    ("Confusion Matrix:\n", conf_matrix),
    ("Classification Report:\n", class_report),
)
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)

Accuracy: 0.8156424581005587
Confusion Matrix:
 [[91 14]
 [19 55]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.87      0.85       105
           1       0.80      0.74      0.77        74

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.81       179
weighted avg       0.81      0.82      0.81       179



Inspecting actual survival outcomes in the dataset (these cells query the recorded labels; they do not use the trained model)

In [17]:
# Look up the recorded outcome for one row of the dataset.
# passenger_index is a positional (0-based) row number, not a PassengerId.
passenger_index = 5
passenger_survived = data['Survived'].iloc[passenger_index] == 1
outcome_message = (
    f"Passenger {passenger_index} survived."
    if passenger_survived
    else f"Passenger {passenger_index} did not survive."
)
print(outcome_message)

Passenger 5 did not survive.


In [18]:
# Keep only the rows whose recorded outcome is Survived == 1.
survived_mask = data['Survived'].eq(1)
survived_passengers = data.loc[survived_mask]
print("Passengers who survived:")
print(survived_passengers)

Passengers who survived:
     PassengerId  Survived  Pclass  \
1              2         1       1   
2              3         1       3   
3              4         1       1   
8              9         1       3   
9             10         1       2   
..           ...       ...     ...   
875          876         1       3   
879          880         1       1   
880          881         1       2   
887          888         1       1   
889          890         1       1   

                                                  Name  Sex   Age  SibSp  \
1    Cumings, Mrs. John Bradley (Florence Briggs Th...    0  38.0      1   
2                               Heikkinen, Miss. Laina    0  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)    0  35.0      1   
8    Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)    0  27.0      0   
9                  Nasser, Mrs. Nicholas (Adele Achem)    0  14.0      1   
..                                                 ...  ...   

In [19]:
# Keep only the rows whose recorded outcome is Survived == 0.
print("Passengers who did not survive:")
did_not_survive_passengers = data.loc[data['Survived'].eq(0)]
print(did_not_survive_passengers)

Passengers who did not survive:
     PassengerId  Survived  Pclass                                      Name  \
0              1         0       3                   Braund, Mr. Owen Harris   
4              5         0       3                  Allen, Mr. William Henry   
5              6         0       3                          Moran, Mr. James   
6              7         0       1                   McCarthy, Mr. Timothy J   
7              8         0       3            Palsson, Master. Gosta Leonard   
..           ...       ...     ...                                       ...   
884          885         0       3                    Sutehall, Mr. Henry Jr   
885          886         0       3      Rice, Mrs. William (Margaret Norton)   
886          887         0       2                     Montvila, Rev. Juozas   
888          889         0       3  Johnston, Miss. Catherine Helen "Carrie"   
890          891         0       3                       Dooley, Mr. Patrick   

     Se

In [22]:
# Look up the recorded outcome for a passenger by exact match on 'Name'.
passenger_name = "Braund, Mr. Owen Harris"
name_matches = data['Name'] == passenger_name
passenger = data[name_matches]

# Only the first matching row is inspected if several rows share the name.
if passenger.empty:
    print(f"No passenger found with the name {passenger_name}.")
elif passenger['Survived'].values[0] == 1:
    print(f"{passenger_name} survived.")
else:
    print(f"{passenger_name} did not survive.")

Braund, Mr. Owen Harris did not survive.


In [23]:
# Mean of 'Survived' within each 'Sex' code, i.e. the fraction that survived.
survived_column = data['Survived']
survival_by_gender = survived_column.groupby(data['Sex']).mean()
print("Survival rates by gender:")
print(survival_by_gender)

Survival rates by gender:
Sex
0    0.742038
1    0.188908
Name: Survived, dtype: float64
