In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Fetch the Titanic passenger CSV straight from GitHub into a DataFrame
url = (
    "https://raw.githubusercontent.com/"
    "MaharLeika18/Data-Mining---Python/"
    "refs/heads/main/Titanic-Dataset.csv"
)
data = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Clean and preprocess the data in a single pipeline:
#   1. remove duplicate rows
#   2. impute missing values (median for Age/Fare, most frequent value for Embarked)
#   3. discard columns that are not used in the analysis
#   4. derive family-size features and encode text columns as numbers
data = (
    data
    .drop_duplicates()
    .assign(
        Age=lambda df: df['Age'].fillna(df['Age'].median()),
        Fare=lambda df: df['Fare'].fillna(df['Fare'].median()),
        Embarked=lambda df: df['Embarked'].fillna(df['Embarked'].mode()[0]),
    )
    .drop(columns=['PassengerId', 'Ticket', 'Cabin', 'Name'])
    .assign(
        # Passenger plus siblings/spouses plus parents/children on board
        Family=lambda df: df['SibSp'] + df['Parch'] + 1,
        # 1 when the passenger has no relatives on board, else 0
        IsAlone=lambda df: (df['Family'] == 1).astype(int),
        # Male = 0, Female = 1
        Sex=lambda df: df['Sex'].map({'male':0, 'female':1}),
        # S = 0, C = 1, Q = 2
        Embarked=lambda df: df['Embarked'].map({'S':0, 'C':1, 'Q':2}),
        # Treat the survival flag as a label rather than a number
        Survived=lambda df: df['Survived'].astype('category'),
    )
)


In [None]:
# Preview the cleaned data, see what columns can be used for analysis
print(data.describe())  # Summary statistics (describe() covers the numeric columns by default)
# 10 random rows; the fixed seed keeps the preview identical across
# "Restart Kernel -> Run All" so the displayed rows match any discussion of them.
data.sample(10, random_state=42)


In [None]:
# Bar chart: number of passengers in each category, for several features at once
categories = ["Sex", "Pclass", "IsAlone", "Survived", "Embarked"]
# Reshape wide -> long so every row is one (Feature, Category) observation
converted = pd.melt(data[categories], var_name="Feature", value_name="Category")

ax = sns.countplot(x="Feature", hue="Category", data=converted)
ax.set_title("Counts of Categories per Feature")
ax.set_ylabel("Count")
ax.set_xlabel("Feature")
ax.set_ylim(top=750)  # extra headroom above the tallest bar
ax.legend(["0 (Male/No/S)", "1 (Female/Yes/C)", "2 (Q)", "3"])

plt.show()



In [None]:
# Bar chart: survivor / non-survivor counts, split by whether the passenger travelled alone
ax = sns.countplot(data=data, x="Survived", hue="IsAlone")

ax.set_title("Survival Count by Traveling Alone vs With Family")
ax.set_xlabel("Survival")
ax.set_ylabel("Count")
# Replace the raw 0/1 tick labels with readable names
ax.set_xticks([0, 1])
ax.set_xticklabels(["Did Not Survive", "Survived"])
ax.legend(["With Family", "Alone"])

plt.show()

In [None]:
# Bar chart: survivor / non-survivor counts, split by port of embarkation
ax = sns.countplot(data=data, x="Survived", hue="Embarked")

ax.set_title("Survival Count of Passengers by Where They Embarked From")
ax.set_xlabel("Survival")
ax.set_ylabel("Count")
# Replace the raw 0/1 tick labels with readable names
ax.set_xticks([0, 1])
ax.set_xticklabels(["Did Not Survive", "Survived"])
# Embarked codes: 0 = S, 1 = C, 2 = Q
ax.legend(["Southampton", "Cherbourg", "Queenstown"])

plt.show()


In [None]:
# TODO (not implemented yet): Compare fare of passengers that survived and those that didn't - Scatter chart

In [None]:
# TODO (not implemented yet): Display the survival rate of passengers based on their age and gender - Scatter


In [None]:
# Compare age and family sizes of passengers that survived and those that didn't - Strip plot
ax = sns.stripplot(x="Family", y="Age", hue="Survived", data=data)

ax.set_title("Age and Family Size of Survivors and Non-survivors")
ax.set_xlabel("Family Size")
ax.set_ylabel("Age")

# Relabel the legend using the hue handles seaborn registered on the axes.
# Passing only `labels=` to legend() would pair the labels with the first
# artists on the axes (the point collections of the first family-size columns),
# which gives swatches whose colours do not correspond to the hue levels.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=["No", "Yes"], title="Survived")

plt.show()

In [None]:
# Compare survival of passengers based on passenger class and fare - Strip plot
# The y-axis is split into two stacked panels (a "broken axis"): the small top
# panel shows the 500-530 fare band, the larger bottom panel shows 0-300.
lowerHalf = (0, 300)    # y-limits of the bottom panel
upperHalf = (500, 530)  # y-limits of the top panel
height_ratios = (1, 3)  # top panel : bottom panel

fig, (ax1, ax2) = plt.subplots(
    2, 1, sharex=True, figsize=(8,6), gridspec_kw={'height_ratios': list(height_ratios)}
)

# Draw the same data on both panels; only the top one keeps seaborn's legend
sns.stripplot(x="Pclass", y="Fare", hue="Survived", data=data, ax=ax1, jitter=True, dodge=True)
sns.stripplot(x="Pclass", y="Fare", hue="Survived", data=data, ax=ax2, legend=False, jitter=True, dodge=True)

ax1.set_ylim(upperHalf)
ax2.set_ylim(lowerHalf)

# Hide the facing spines and the top panel's x ticks so the panels read as one axis
ax1.spines['bottom'].set_visible(False)
ax2.spines['top'].set_visible(False)
ax1.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)

# Diagonal "break" marks at the cut. Offsets are in axes-fraction units, so the
# vertical offset on the (shorter) top panel is scaled by the height ratio to
# make the marks on both panels the same physical size and slope.
d = .015
d_top = d * height_ratios[1] / height_ratios[0]
kwargs = dict(transform=ax1.transAxes, color='k', clip_on=False)
ax1.plot((-d,+d), (-d_top,+d_top), **kwargs)
ax1.plot((1-d,1+d), (-d_top,+d_top), **kwargs)

kwargs.update(transform=ax2.transAxes)
ax2.plot((-d,+d), (1-d,1+d), **kwargs)
ax2.plot((1-d,1+d), (1-d,1+d), **kwargs)

ax2.set_xlabel("Passenger Class")
ax1.set_ylabel("Fare")
ax2.set_ylabel("Fare")

# Relabel the legend with the hue handles seaborn registered on ax1. Calling
# legend(labels=[...]) without handles pairs the labels with whatever artists
# come first on the axes, which is why the colours came out wrong.
handles, _ = ax1.get_legend_handles_labels()
ax1.legend(handles=handles, labels=["Non-Survivor", "Survivor"], title="Survival")
plt.suptitle("Passenger Class and Fare of Survivors and Non-survivors")

plt.show()

In [None]:
# Boxplot version: fare distribution per passenger class, split by survival.
# hue="Survived" is what makes the plot match its title; outliers are hidden
# (showfliers=False) so the boxes are not squashed by the few very high fares.
ax = sns.boxplot(x="Pclass", y="Fare", hue="Survived", data=data, showfliers=False)

ax.set_title("Passenger Class and Fare of Survivors and Non-survivors")
ax.set_xlabel("Passenger Class")
ax.set_ylabel("Fare")

# Reuse seaborn's hue handles so the readable labels keep the right colours
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=["Non-Survivor", "Survivor"], title="Survival")

plt.show()


In [None]:
# TODO (not implemented yet): Decision Tree
