In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Source CSV: the Titanic passenger table from the Data Science Dojo datasets repository.
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'

# Read the remote file straight into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=url)
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Exploring Data

In [10]:
# Print, one item per line: the (rows, columns) shape, a blank separator line,
# a heading, and the number of missing values in each column.
print(df.shape, "", "Null values list:", df.isna().sum(), sep="\n")

(891, 11)

Null values list:
Survived      0
Pclass        0
Sex           0
Age           0
Parch         0
Fare          0
Title         0
FamilySize    0
IsAlone       0
Embarked_Q    0
Embarked_S    0
dtype: int64


In [None]:
# Pull the honorific out of each passenger name.
# The pattern r' ([A-Za-z]+)\.' reads as:
#   ' '        - a literal space immediately before the title
#   [A-Za-z]+  - one or more letters: the title itself (e.g. Mr, Mrs, Dr, Miss)
#   \.         - a literal dot, so only dotted forms such as "Mr." or "Dr." match
# The pattern is a raw string so that `\.` reaches the regex engine untouched;
# in a regular string literal `\.` is an invalid escape sequence and Python warns about it.
# expand=False tells pandas to return a Series instead of a DataFrame.
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Group the listed honorifics under a single 'Rare' label.
df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

# Fold spelling variants into the label they are equivalent to.
df['Title'] = df['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# Encode the titles as integers; a title missing from the map becomes NaN and is then set to 0.
title_map = {"Mr" : 1,"Miss": 2,"Mrs" : 3, "Master" : 4, "Rare" : 5}
df['Title'] = df['Title'].map(title_map)
df['Title'] = df['Title'].fillna(0)  # safe fill

# Both SibSp and Parch columns are weak when compared individually, but combined they are more useful.
# The +1 presumably counts the passenger themselves.
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

# IsAlone starts at 0 for every row and is set to 1 where FamilySize is exactly 1.
df['IsAlone'] = 0
df.loc[df['FamilySize'] == 1, 'IsAlone'] = 1

  df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


In [4]:
# Display the first five rows of df.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamilySize,IsAlone
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1,2,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,3,2,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,2,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,3,2,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1,1,1


## Data Cleaning

In [5]:
# Impute and encode in one pass:
#   Age, Fare -> missing values replaced by the column median
#   Embarked  -> missing values replaced by the most frequent value
#   Sex       -> mapped to 1 for 'female' and 0 for 'male'
df = df.assign(
    Age=df['Age'].fillna(df['Age'].median()),
    Embarked=df['Embarked'].fillna(df['Embarked'].mode()[0]),
    Fare=df['Fare'].fillna(df['Fare'].median()),
    Sex=df['Sex'].map({'female' : 1,'male' : 0}),
)

# One-hot encode Embarked (first level dropped), then remove the columns listed below.
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
df = pd.get_dummies(df, columns=['Embarked'], drop_first=True).drop(columns=drop_elements)

df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,Fare,Title,FamilySize,IsAlone,Embarked_Q,Embarked_S
0,0,3,0,22.0,0,7.25,1,2,0,False,True
1,1,1,1,38.0,0,71.2833,3,2,0,False,False
2,1,3,1,26.0,0,7.925,2,1,1,False,True
3,1,1,1,35.0,0,53.1,3,2,0,False,True
4,0,3,0,35.0,0,8.05,1,1,1,False,True


## Splitting Data

In [6]:
from sklearn.model_selection import train_test_split

# Target is the Survived column; the features are every other column of df.
y = df["Survived"]
X = df.drop(columns="Survived")

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Training Model

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Build a 100-tree random forest with a fixed seed and fit it on the training split.
# fit() returns the estimator itself, so rf_model is the fitted classifier.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predict on the held-out rows and report accuracy as a percentage.
new_preds = rf_model.predict(X_test)
print(f"\nAdvanced Model Accuracy: {accuracy_score(y_test, new_preds) * 100:.2f}%")


Advanced Model Accuracy: 83.80%


## Saving Model

In [8]:
import joblib

# Serialize the fitted classifier to titanic_model.pkl in the working directory.
# NOTE: only load this file from a trusted source; pickle-based files can execute code on load.
joblib.dump(value=rf_model, filename='titanic_model.pkl')

['titanic_model.pkl']