In [None]:
# 1. Import libraries
import pandas as pd

In [None]:
# 2. Load raw dataset
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv("../data/raw_dataset.csv")

In [None]:
# 3. Exploratory checks
# DataFrame.info() writes its summary (columns, non-null counts, dtypes) straight
# to stdout and returns None, so it is called bare rather than inside print().
print("=== Dataset Info ===")
df.info()

In [None]:
# Summary statistics for every column (include="all" covers non-numeric ones too).
# A single print call emits the banner and the table, one per line.
print("\n=== Dataset Description ===", df.describe(include="all"), sep="\n")

In [None]:
# 4. Missing values check
# isna() is the same test as isnull(); summing the boolean mask column-wise
# yields the number of missing cells in each column.
print("\n=== Missing Values Per Column ===", df.isna().sum(), sep="\n")

In [None]:
# 5. Duplicate rows check
# duplicated() flags each row that repeats an earlier one (the first occurrence
# is not flagged), so the sum is the number of redundant rows.
duplicates = df.duplicated().sum()
print(f"\n=== Duplicate Rows Found: {duplicates} ===")

🧹 Data Cleaning Notebook for the Titanic Dataset

In [None]:
## 1. Import Libraries
import pandas as pd
import numpy as np
import re

In [None]:
## 2. Load Dataset
# Read the raw CSV (relative path, resolved against the notebook's working
# directory), report its (rows, columns) shape, then preview the first rows.
df = pd.read_csv("../data/raw_dataset.csv")
print("Original Shape:", df.shape)
df.head()

In [None]:
## 3. Initial Exploration
# info() prints dtypes and non-null counts to stdout; describe(include="all") is
# the cell's last expression, so its summary table is what the cell displays.
df.info()
df.describe(include="all")

================================================================
4. Handle Missing Values
================================================================

In [None]:
# Age → impute gaps with the column median, then cast the column to int
# (the integer cast truncates any fractional ages toward zero)
df = df.assign(
    Age=lambda frame: frame['Age'].fillna(frame['Age'].median()).astype(int)
)

In [None]:
# Embarked → impute gaps with the most frequent value
# mode() may return several values on a tie; label 0 picks the first of them.
df = df.assign(
    Embarked=lambda frame: frame['Embarked'].fillna(frame['Embarked'].mode()[0])
)

In [None]:
# Cabin → label missing entries with the placeholder 'Unknown'
df = df.assign(Cabin=df['Cabin'].fillna(value='Unknown'))

In [None]:
# Drop rows that are entirely empty.
# Age, Embarked and Cabin have already been imputed by the cells above, so a
# blank row is no longer all-NaN and a plain dropna(how="all") could never fire.
# Emptiness is therefore judged on the columns that have not been filled yet.
# `or None` falls back to checking every column if no such column exists,
# because an empty subset would make dropna discard every row.
df = df.dropna(
    how="all",
    subset=[col for col in df.columns if col not in ('Age', 'Embarked', 'Cabin')] or None,
)

================================================================
5. Clean Name Column
================================================================

In [None]:
# Remove honorific titles (Mr., Mrs., Miss., Master., Dr., etc.).
# The list goes beyond the original seven titles to other honorifics (Ms., Mme.,
# Mlle., Major., Col., Capt., Sir., Lady., ...) that would otherwise survive into
# the cleaned names. \b stops a title from matching the tail of a longer word,
# and the trailing \s* swallows the gap the removed title leaves behind.
df['Name'] = df['Name'].str.replace(
    r"\b(?:Mr|Mrs|Miss|Ms|Master|Don|Dona|Dr|Rev|Mme|Mlle|Major|Col|Capt|Sir|Lady"
    r"|Jonkheer|(?:the\s+)?Countess)\.\s*",
    "",
    regex=True,
)

In [None]:
# Strip parenthesised text (greedy: from the first "(" through the last ")"),
# then trim leading/trailing whitespace from what remains
df = df.assign(
    Name=lambda frame: frame['Name']
    .str.replace(r"\(.*\)", "", regex=True)
    .str.strip()
)

In [None]:
# Split Surname, Firstname -> reorder to Firstname Surname
def reorder_name(name):
    """Turn "Surname, Given names" into "Given names Surname".

    Only the first comma is treated as the separator, so anything after a
    later comma stays with the given names instead of being discarded.

    Args:
        name: The raw name. Values that are not strings (e.g. NaN for a
            missing name) are returned untouched.

    Returns:
        The reordered name, with surrounding whitespace trimmed from each
        part. The input is returned unchanged when it contains no comma or is
        not a string. If one side of the comma is blank, only the non-blank
        side is returned.
    """
    if not isinstance(name, str):
        return name
    surname, comma, given = name.partition(",")
    if not comma:
        return name
    # Join only the non-empty pieces so "Smith," does not become " Smith".
    return " ".join(part for part in (given.strip(), surname.strip()) if part)

In [None]:
# Run the surname/given-name reordering on every entry of the Name column
df = df.assign(Name=df['Name'].map(reorder_name))

================================================================
6. Ticket Column Cleaning
================================================================

In [None]:
# Reduce each ticket to its digits: cast to text, then delete every non-digit
# character (letters, spaces, dots, slashes, ...)
df = df.assign(
    Ticket=lambda frame: frame['Ticket'].astype(str).str.replace(r"\D", "", regex=True)
)

In [None]:
# Keep only the first row seen for each Ticket value
# NOTE(review): every later row carrying an already-seen ticket value is removed,
# whether or not the rest of the row differs — confirm this is intended.
df = df[~df.duplicated(subset=['Ticket'], keep='first')]

In [None]:
# Cast Ticket to a numeric dtype; entries that cannot be parsed become NaN
df = df.assign(Ticket=pd.to_numeric(df['Ticket'], errors='coerce'))

================================================================
7. Standardize Sex Column
================================================================

In [None]:
# Normalise Sex labels: trim surrounding whitespace, then lower-case
df = df.assign(Sex=df['Sex'].str.strip().str.lower())

================================================================
8. Fare Column
================================================================

In [None]:
# Impute missing fares with the column median
df = df.assign(Fare=lambda frame: frame['Fare'].fillna(frame['Fare'].median()))

In [None]:
# Keep fares to two decimal places
df = df.assign(Fare=df['Fare'].round(decimals=2))

================================================================
9. Cabin Column
================================================================

In [None]:
# Collapse each cabin to its first character (the deck letter); the 'Unknown'
# placeholder is passed through untouched
df = df.assign(
    Cabin=df['Cabin'].map(lambda cabin: 'Unknown' if cabin == 'Unknown' else cabin[0])
)

================================================================
10. Remove Duplicate Rows
================================================================

In [None]:
# Keep the first occurrence of each fully identical row and discard the repeats
df = df[~df.duplicated(keep='first')]

In [None]:
# Report the (rows, columns) shape of the frame after the cleaning cells above.
print("Cleaned Shape:", df.shape)

In [None]:
# ================================================================
# 11. Drop Cabin Column
# ================================================================
# Membership on a DataFrame tests its column labels, so the drop (and its
# message) only happens when the column is actually present.
if 'Cabin' in df:
    df = df.drop('Cabin', axis='columns')
    print("\n=== Dropped Cabin Column ===")

In [None]:
# ================================================================
# 12. Save Cleaned Dataset
# ================================================================
# index=False keeps the DataFrame's row index out of the written CSV.
df.to_csv("../data/cleaned_dataset.csv", index=False)
print("✅ Cleaned dataset saved!")