# Titanic Dataset Preprocessing (Modified Version)

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# ---------------------------------
# Load Dataset
# ---------------------------------
# pandas fetches and parses the CSV straight from this URL, so the cell needs
# network access.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Peek at the raw rows before any cleaning.
# NOTE(review): `display` is not imported in this file; presumably it is
# injected by the notebook runtime -- confirm before running as a plain script.
print("First 5 rows of the dataset:")
display(df.head(n=5))

# ---------------------------------
# Step 1: Handling Missing Values (Revised)
# ---------------------------------
print("\nMissing values before processing:\n", df.isna().sum())

# All cleaning happens on a copy; the raw frame `df` is left as loaded.
df1 = df.copy()


def _fill_with_own_median(ages):
    """Return *ages* with missing entries replaced by the median of *ages*."""
    return ages.fillna(ages.median())


# Age: impute per Sex group, so each gap receives the median age of
# passengers of the same sex rather than one global median.
df1["Age"] = df1.groupby("Sex")["Age"].transform(_fill_with_own_median)

# Embarked: impute with the most frequent value (first entry of the mode).
most_common_port = df1["Embarked"].mode()[0]
df1["Embarked"] = df1["Embarked"].fillna(most_common_port)

# Cabin: removed entirely instead of imputed (too many missing entries).
df1 = df1.drop(columns=["Cabin"])

print("\nMissing values after processing:\n", df1.isna().sum())

# ---------------------------------
# Step 2: Noise Simulation and Smoothing
# ---------------------------------
# Seed the global NumPy RNG so the simulated noise is the same on every run.
np.random.seed(1)

# Additive Gaussian noise: mean 0, standard deviation 3, one draw per row.
fare_noise = np.random.normal(loc=0, scale=3, size=len(df1))
df1["Fare_noisy"] = df1["Fare"] + fare_noise

# Smooth with an exponentially weighted mean (span=10) rather than a plain
# moving average.
ewm_window = df1["Fare_noisy"].ewm(span=10)
df1["Fare_smooth"] = ewm_window.mean()

# Overlay the noisy and smoothed series on a single figure.
plt.figure()
plt.plot(df1["Fare_noisy"], label="Noisy Fare", alpha=0.4)
plt.plot(df1["Fare_smooth"], label="Smoothed Fare", color="red")
plt.title("Noise Reduction using Exponential Smoothing")
plt.legend()
plt.show()

# ---------------------------------
# Step 3: Outlier Detection Using Modified Z-Score
# ---------------------------------
# Modified z-score: 0.6745 * (x - median) / MAD, where MAD is the median
# absolute deviation from the median. Rows with |score| >= 3.5 are treated as
# outliers and removed.
fare = df1["Fare"]

# Series.median() skips NaN. np.median() would return NaN as soon as one fare
# is missing, which turns every score into NaN and makes the filter below
# drop every row.
median_fare = fare.median()
abs_dev = (fare - median_fare).abs()
mad = abs_dev.median()

if mad > 0:
    modified_z = 0.6745 * (fare - median_fare) / mad
else:
    # MAD is 0 whenever at least half of the fares equal the median. Dividing
    # by it yields inf/NaN scores and would discard the whole dataset, so fall
    # back to the mean absolute deviation; 1.253314 (~ sqrt(pi / 2)) rescales
    # it to a standard-deviation-like unit for normally distributed data.
    mean_ad = abs_dev.mean()
    if mean_ad > 0:
        modified_z = (fare - median_fare) / (1.253314 * mean_ad)
    else:
        # Every known fare is identical: all deviations are 0, nothing to flag.
        modified_z = fare - median_fare

# Rows whose score is NaN (missing fare) fail the comparison and are dropped.
df_no_outliers = df1[np.abs(modified_z) < 3.5]

print("\nOriginal shape:", df1.shape)
print("After removing outliers:", df_no_outliers.shape)

# ---------------------------------
# Step 4: Feature Transformation
# ---------------------------------
# Build the transformed frame in one chain:
#   1. assign() works on a copy, so `df_no_outliers` is left unmodified, and
#      appends FamilySize = SibSp + Parch + 1 (the trailing +1 is presumably
#      the passenger themselves).
#   2. get_dummies() one-hot encodes Sex and Embarked; drop_first=True drops
#      the first level of each to avoid a redundant indicator column.
df2 = (
    df_no_outliers
    .assign(FamilySize=lambda frame: frame["SibSp"] + frame["Parch"] + 1)
    .pipe(pd.get_dummies, columns=["Sex", "Embarked"], drop_first=True)
)

print("\nTransformed dataset preview:")
display(df2.head())

# ---------------------------------
# Step 5: Feature Scaling
# ---------------------------------
num_cols = ["Age", "Fare", "FamilySize"]

# Snapshot the unscaled values so that BOTH scalers are fitted on the raw
# features. Chaining them (standardize, then min-max the result in place)
# throws the standardization away: min-max scaling is invariant to a prior
# affine rescale, and the min-max scaler ends up fitted on z-scores instead
# of the original units.
raw_numeric = df2[num_cols].copy()

# Standardization (zero mean, unit variance), kept in its own frame.
std_scaler = StandardScaler()
df2_standardized = pd.DataFrame(
    std_scaler.fit_transform(raw_numeric),
    columns=num_cols,
    index=df2.index,
)

# Min-max normalization to [0, 1]; this is what df2 carries forward, matching
# the values the chained version produced (up to floating-point rounding).
mm_scaler = MinMaxScaler()
df2[num_cols] = mm_scaler.fit_transform(raw_numeric)

print("\nScaled feature sample:")
display(df2[num_cols].head())

print("\nStandardized feature sample (z-scores):")
display(df2_standardized.head())

# ---------------------------------
# Step 6: Distribution Visualization
# ---------------------------------
# Compute one set of bin edges from the original fares and reuse it for both
# histograms. Passing bins=30 to each call separately would give the two
# series different bin widths (their ranges differ after outlier removal), so
# the overlaid bar heights would not be comparable.
fare_bin_edges = np.histogram_bin_edges(df["Fare"], bins=30)

plt.figure()
plt.hist(df["Fare"], bins=fare_bin_edges, alpha=0.5, label="Original Fare")
plt.hist(
    df_no_outliers["Fare"],
    bins=fare_bin_edges,
    alpha=0.5,
    label="After Outlier Removal",
)
plt.legend()
plt.title("Fare Distribution Before and After Cleaning")
plt.show()

print("\nData preprocessing complete.")
