# Task 1: Data Cleaning & Preprocessing (Titanic Dataset)

## Step 1: Import Libraries & Dataset

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Read the raw passenger table; titanic.csv is expected next to this notebook.
csv_path = "titanic.csv"
df = pd.read_csv(csv_path)

# Report (rows, columns) and preview the first few records.
print("Shape:", df.shape)
df.head()


## Step 2: Explore Missing Values

In [None]:

# Print column dtypes and non-null counts for a quick structural overview.
df.info()

# Per-column tally of missing entries (shown as the cell output).
df.isna().sum()


## Step 3: Handle Missing Values

In [None]:

# Impute missing values by assigning the filled column back to the frame.
# Calling fillna(..., inplace=True) on a column selection is chained
# assignment: it warns in pandas 2.x and does not update `df` once
# copy-on-write is enabled, so the NaNs would silently survive.

# Age: fill with the column median.
df['Age'] = df['Age'].fillna(df['Age'].median())

# Embarked: fill with the most frequent value (first entry of mode()).
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Cabin is removed outright rather than imputed.
# NOTE(review): presumably because it is too sparse -- confirm against the
# missing-value counts from the previous step.
df.drop(columns=['Cabin'], inplace=True)

# Re-check the per-column missing counts after imputation.
df.isnull().sum()


## Step 4: Encode Categorical Features

In [None]:

# Integer-encode Sex. LabelEncoder numbers the classes in sorted order, so
# with the labels "female"/"male" this yields Female=0, Male=1.
le = LabelEncoder()
sex_codes = le.fit_transform(df['Sex'])
df['Sex'] = sex_codes

# One-hot encode Embarked; drop_first=True omits the first category's
# indicator column so the remaining ones are not redundant.
df = pd.get_dummies(data=df, columns=['Embarked'], drop_first=True)
df.head()


## Step 5: Drop Irrelevant Columns

In [None]:

# Remove the Name, Ticket and PassengerId columns in place, then preview
# the remaining columns.
df.drop(['Name', 'Ticket', 'PassengerId'], axis=1, inplace=True)
df.head()


## Step 6: Detect Outliers with Boxplots

In [None]:

# Side-by-side boxplots of Age and Fare to eyeball outliers.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Pass the data as x= explicitly: a positional Series is interpreted
# differently across seaborn releases (deprecated positional `x` in 0.11,
# `data` from 0.12 on), which changes orientation and axis labelling.
sns.boxplot(x=df['Age'], ax=axes[0])
axes[0].set_title('Age')
sns.boxplot(x=df['Fare'], ax=axes[1])
axes[1].set_title('Fare')

plt.show()


## Step 7: Handle Outliers using IQR

In [None]:

# Cap Age and Fare at the Tukey fences: values beyond 1.5 * IQR from the
# quartiles are replaced by the nearest fence instead of being dropped.
for col in ['Age', 'Fare']:
    # Fetch both quartiles in one call; the result is ordered 0.25, 0.75.
    Q1, Q3 = df[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    # Clamp into [lower, upper]; NaN entries pass through unchanged.
    df[col] = np.clip(df[col].to_numpy(), lower, upper)
df.head()


## Step 8: Standardize Features (zero mean, unit variance)

In [None]:

# Standardize Age and Fare: the scaler learns each column's mean and
# standard deviation, then rescales both columns with them.
scaler = StandardScaler()
scaler.fit(df[['Age', 'Fare']])
scaled = scaler.transform(df[['Age', 'Fare']])

# Write the scaled values back over the original columns.
df[['Age', 'Fare']] = scaled
df.head()


## Step 9: Save Cleaned Dataset

In [None]:

# Persist the cleaned frame without the index column, then confirm.
output_path = "titanic_cleaned.csv"
df.to_csv(output_path, index=False)
print("✅ Preprocessing Completed & Saved!")
