In [None]:
"""
🧹 Titanic Dataset Data Cleaning Script
Author: Jemar John J. Lumingkit
Description: Loads Titanic dataset, performs cleaning, and saves cleaned version.
"""

In [None]:
# 1. Import Libraries
import pandas as pd
import numpy as np
import re

In [None]:
# 2. Load Dataset
# Read the raw CSV into `df` and report its (rows, columns) shape before
# any cleaning step runs. The path is relative to the working directory.
print("=== Loading Dataset ===")
raw_csv_path = "../data/raw_dataset.csv"
df = pd.read_csv(raw_csv_path)
print("Original Shape:", df.shape)

In [None]:
# 3. Initial Exploration
# df.info() writes its summary (column names, non-null counts, dtypes)
# straight to stdout and returns None, so it is called without print().
print("\n=== Dataset Info ===")
df.info()

In [None]:
# Summary statistics for every column; include="all" makes describe()
# report on non-numeric columns as well as numeric ones.
print("\n=== Dataset Description ===")
summary_stats = df.describe(include="all")
print(summary_stats)

In [None]:
# Count the missing (NaN/None) entries in each column.
print("\n=== Missing Values Per Column ===")
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Flag every row that is an exact copy of an earlier row (the first
# occurrence is not flagged), then count the flagged rows.
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(f"\n=== Duplicate Rows Found: {duplicates} ===")

In [None]:
# ================================================================
# 4. Handle Missing Values
# ================================================================
# Section banner only; the Age, Embarked and Cabin imputation steps
# are performed in the cells that follow.
print("\n=== Handling Missing Values ===")

In [None]:
# Age: impute missing entries with the column median, then cast to int.
# The integer cast truncates any fractional age toward zero.
median_age = df['Age'].median()
age_filled = df['Age'].fillna(median_age)
df['Age'] = age_filled.astype(int)

In [None]:
# Embarked: impute missing entries with the most frequent value.
# mode() returns a Series of the most frequent value(s); entry 0 is used.
embarked_mode = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(embarked_mode)

In [None]:
# Cabin: label missing entries with the placeholder string 'Unknown'.
cabin_filled = df['Cabin'].fillna('Unknown')
df['Cabin'] = cabin_filled

In [None]:
# Drop rows that are entirely empty.
# Age, Embarked and Cabin have already been imputed by the time this cell
# runs, so a row that was completely blank in the raw file is no longer
# all-NaN and a plain dropna(how="all") could never match it. Test only the
# columns that have not been filled yet, so such rows are actually removed
# (a surviving blank row would otherwise carry a NaN Name into the name
# clean-up steps).
already_imputed = {'Age', 'Embarked', 'Cabin'}
unfilled_cols = [col for col in df.columns if col not in already_imputed]
# Guard: with no unfilled columns there is nothing left to test against.
if unfilled_cols:
    df = df.dropna(how="all", subset=unfilled_cols)

In [None]:
# ================================================================
# 5. Clean Name Column
# ================================================================
# Section banner only; title removal, parenthesis removal and name
# reordering are performed in the cells that follow.
print("\n=== Cleaning Name Column ===")

In [None]:
# Remove honorific titles (Mr., Mrs., Miss., Master., Don., Dr., Rev., etc.)
#   * \b keeps a title from matching at the tail of a longer word
#     (e.g. the "Don." inside "Gordon.").
#   * The alternation also lists further honorifics (Ms., Mme., Major., ...)
#     so the "etc." above is actually honoured.
#   * The trailing \s* swallows the whitespace after the title, so no
#     double space is left behind in the name.
# Missing names (NaN) pass through the .str accessor unchanged.
df['Name'] = df['Name'].str.replace(
    r"\b(?:Mr|Mrs|Miss|Ms|Master|Don|Dona|Dr|Rev|Mme|Mlle|Major|Col|Capt|Sir|Lady|Jonkheer)\.\s*",
    "",
    regex=True,
)

In [None]:
# Delete parenthesised text from each name, then trim surrounding whitespace.
# The pattern is greedy: it removes everything from the first "(" to the
# last ")" in the string.
names_without_parens = df['Name'].str.replace(r"\(.*\)", "", regex=True)
df['Name'] = names_without_parens.str.strip()

In [None]:
# Reorder names: "Surname, Firstname" → "Firstname Surname"
def reorder_name(name):
    """Turn "Surname, Firstname" into "Firstname Surname".

    Parameters
    ----------
    name : object
        A single value from the Name column. Usually a string, but missing
        values arrive as float NaN (or None).

    Returns
    -------
    object
        The reordered, whitespace-trimmed string when `name` is a string
        containing a comma; otherwise `name` exactly as it was passed in.
    """
    # `"," in name` raises TypeError on NaN/None, so non-strings are passed
    # through untouched instead of aborting the whole column.
    if not isinstance(name, str) or "," not in name:
        return name
    # A string containing a comma always splits into at least two fields;
    # only the first two are used.
    parts = [p.strip() for p in name.split(",")]
    # Outer strip() avoids a stray space when either field is empty ("Smith,").
    return (parts[1] + " " + parts[0]).strip()

In [None]:
# Run reorder_name on every value of the Name column and store the result back.
df['Name'] = df['Name'].apply(reorder_name)

In [None]:
# ================================================================
# 6. Ticket Column Cleaning
# ================================================================
# Section banner only; digit extraction, de-duplication and numeric
# conversion of Ticket are performed in the cells that follow.
print("\n=== Cleaning Ticket Column ===")

In [None]:
# Reduce each ticket to its digits: cast to str, then delete every
# non-digit character. Digits that belong to a lettered prefix are kept too
# (e.g. "A/5 21171" becomes "521171"), and a ticket with no digits at all
# becomes the empty string.
ticket_text = df['Ticket'].astype(str)
df['Ticket'] = ticket_text.str.replace(r"\D", "", regex=True)

In [None]:
# Keep only the first row for each distinct Ticket value.
# NOTE(review): every later row that shares a Ticket value with an earlier
# row is discarded, whatever its other columns contain — confirm that
# dropping those records is intended.
df = df.drop_duplicates(subset=['Ticket'], keep='first')

In [None]:
# Convert Ticket to a numeric dtype; errors='coerce' turns any value that
# cannot be parsed as a number into NaN instead of raising.
ticket_numbers = pd.to_numeric(df['Ticket'], errors='coerce')
df['Ticket'] = ticket_numbers

In [None]:
# ================================================================
# 7. Standardize Sex Column
# ================================================================
# Trim surrounding whitespace first, then lower-case, so variants such as
# " Male" and "male" collapse to one label.
sex_trimmed = df['Sex'].str.strip()
df['Sex'] = sex_trimmed.str.lower()

In [None]:
# ================================================================
# 8. Fare Column
# ================================================================
# Impute missing fares with the column median, then round to 2 decimals.
print("\n=== Cleaning Fare Column ===")
fare_median = df['Fare'].median()
df['Fare'] = df['Fare'].fillna(fare_median).round(2)

In [None]:
# ================================================================
# 9. Cabin Column
# ================================================================
# Reduce each cabin value to its first character, leaving the 'Unknown'
# placeholder as is.
# Vectorised with the .str accessor instead of a per-element lambda:
#   * surrounding whitespace is trimmed before the first character is taken;
#   * an empty string or a missing value yields NaN from .str[0] rather than
#     raising IndexError/TypeError, and is then labelled 'Unknown'.
print("\n=== Simplifying Cabin Column ===")
cabin = df['Cabin']
first_char = cabin.str.strip().str[0]
# where() keeps `cabin` where the condition holds and takes `first_char` elsewhere.
df['Cabin'] = cabin.where(cabin == 'Unknown', first_char).fillna('Unknown')

In [None]:
# ================================================================
# 10. Remove Duplicate Rows
# ================================================================
# Drop rows that match an earlier row in every column (after all the
# cleaning above); the first occurrence of each row is kept.
df = df.drop_duplicates()

In [None]:
# ================================================================
# 11. Save Cleaned Dataset
# ================================================================
# Report the final (rows, columns) shape, then write the cleaned frame to
# CSV. index=False leaves the DataFrame index out of the file.
cleaned_shape = df.shape
print("\n=== Saving Cleaned Dataset ===")
print("Cleaned Shape:", cleaned_shape)
df.to_csv(
    "../data/cleaned_dataset.csv",
    index=False,
)
print("✅ Cleaned dataset saved at ../data/cleaned_dataset.csv")