In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
# Load dataset 
url = "https://raw.githubusercontent.com/MaharLeika18/Data-Mining---Python/refs/heads/main/Titanic-Dataset.csv"
data = pd.read_csv(url)

In [15]:
# Clean and preprocess the data
data = data.drop_duplicates()   # Drop dupes
data['Age'] = data['Age'].fillna(data['Age'].median())  # Fill missing cells in Age w/ median
data['Fare'] = data['Fare'].fillna(data['Fare'].median()) # Same but in Fare
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0]) # Same but in Embarked and fill w/ mode

# Drop columns that won't be used
data = data.drop(['PassengerId', 'Ticket', 'Cabin'], axis=1)

# Create new columns for:
data['Family'] = data['SibSp'] + data['Parch'] + 1  # Total family members on board
data['IsAlone'] = (data['Family'] == 1).astype(int) # Whether they are travelling alone or not

# Convert text values into measurable numerical values
data['Sex'] = data['Sex'].map({'male':0, 'female':1})   # Male = 0, Female = 1
data = pd.get_dummies(data, columns=['Embarked'], drop_first=True)  # one-hot encode (S, C, Q)


In [16]:
# Preview the cleaned data, see what columns can be used for analysis
print(data.describe()) # Summary
data.sample(5)  # Random 5 rows


       Survived  Pclass     Sex     Age   SibSp   Parch    Fare  Family  IsAlone
count    891.00  891.00  891.00  891.00  891.00  891.00  891.00  891.00   891.00
mean       0.38    2.31    0.35   29.36    0.52    0.38   32.20    1.90     0.60
std        0.49    0.84    0.48   13.02    1.10    0.81   49.69    1.61     0.49
min        0.00    1.00    0.00    0.42    0.00    0.00    0.00    1.00     0.00
25%        0.00    2.00    0.00   22.00    0.00    0.00    7.91    1.00     0.00
50%        0.00    3.00    0.00   28.00    0.00    0.00   14.45    1.00     1.00
75%        1.00    3.00    1.00   35.00    1.00    0.00   31.00    2.00     1.00
max        1.00    3.00    1.00   80.00    8.00    6.00  512.33   11.00     1.00


Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Family,IsAlone,Embarked_Q,Embarked_S
206,0,3,"Backstrom, Mr. Karl Alfred",0,32.0,1,0,15.85,2,0,False,True
135,0,2,"Richard, Mr. Emile",0,23.0,0,0,15.05,1,1,False,False
161,1,2,"Watt, Mrs. James (Elizabeth ""Bessie"" Inglis Mi...",1,40.0,0,0,15.75,1,1,False,True
327,1,2,"Ball, Mrs. (Ada E Hall)",1,36.0,0,0,13.0,1,1,False,True
423,0,3,"Danbom, Mrs. Ernst Gilbert (Anna Sigrid Maria ...",1,28.0,1,1,14.4,3,0,False,True
