# Titanic Dataset - Data Cleaning & Preprocessing

In [None]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Small in-memory stand-in for the Titanic dataset so the notebook runs
# without any file on disk (in real use: df = pd.read_csv('titanic.csv')).
# 'Age' and 'Embarked' each contain one missing value on purpose, so the
# imputation step has something to fill.
data = dict(
    PassengerId=[1, 2, 3, 4, 5],
    Survived=[0, 1, 1, 1, 0],
    Pclass=[3, 1, 3, 1, 3],
    Name=['Allen', 'Braund', 'Heikkinen', 'Futrelle', 'Cumings'],
    Sex=['male', 'female', 'female', 'female', 'female'],
    Age=[22.0, 38.0, 26.0, 35.0, np.nan],
    SibSp=[1, 1, 0, 1, 1],
    Parch=[0, 0, 0, 0, 0],
    Fare=[7.25, 71.2833, 7.925, 53.1, 8.05],
    Embarked=['S', 'C', 'S', 'S', np.nan],
)
df = pd.DataFrame(data=data)
df.head()


## Step 1: Handle Missing Values

In [None]:

# Impute missing values: median for the numeric 'Age' column, most frequent
# value (first mode) for the categorical 'Embarked' column.
#
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['col']: that form is a chained in-place
# operation on a column selection, which raises a FutureWarning on recent
# pandas 2.x and leaves df unchanged when Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])


## Step 2: Encode Categorical Variables

In [None]:

# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column so the resulting indicators are not redundant.
categorical_cols = ['Sex', 'Embarked']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
df.head()


## Step 3: Standardize Numerical Features

In [None]:

# Fit a StandardScaler on the numeric feature columns and overwrite them
# in place with the scaled values.
numeric_cols = ['Age', 'Fare']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numeric_cols])
df[numeric_cols] = scaled_values
df.head()


## Step 4: Outlier Detection and Removal

In [None]:

# Flag fares outside the 1.5 * IQR fences around the quartiles, then keep
# every row that is not flagged.
fare = df['Fare']
Q1, Q3 = fare.quantile(0.25), fare.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = (fare < lower_fence) | (fare > upper_fence)
# Negating the outlier mask (rather than testing "inside the fences") keeps
# rows whose Fare is NaN, because both comparisons are False for NaN.
df_cleaned = df[~is_outlier]
df_cleaned.head()
