# In [None]:
# Ques 3: Steps in Data Preprocessing

# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Step 2: Load the Dataset
# The Titanic CSV is read straight from its URL; `url` can be replaced with
# any CSV path or URL that pandas.read_csv accepts.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Step 3: Understand the Dataset
print("First 5 Rows:")
print(df.head())

print("\nData Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
df.info()

print("\nSummary Statistics:")
print(df.describe())

# Step 4: Handle Missing Data
print("\nMissing values before handling:")
print(df.isnull().sum())

# Fill missing Age with mean.
# The filled column is assigned back to df instead of calling
# fillna(inplace=True) on df['Age']: the chained in-place form operates on an
# intermediate object, triggers a FutureWarning on recent pandas versions and
# leaves df unchanged once Copy-on-Write is active.
df['Age'] = df['Age'].fillna(df['Age'].mean())

# Fill missing Embarked with mode (first value if several are tied)
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Drop Cabin column due to too many missing values
df.drop(columns='Cabin', inplace=True)

print("\nMissing values after handling:")
print(df.isnull().sum())

# Step 5: Handle Categorical Data
# Convert 'Sex' and 'Embarked' to integer codes using Label Encoding.
# Each column gets its own fitted encoder, stored in `encoders`, so the
# code -> label mapping of every column stays available afterwards (through
# classes_ / inverse_transform). Refitting a single shared encoder would
# overwrite the 'Sex' mapping as soon as 'Embarked' is fitted.
encoders = {}
for column in ('Sex', 'Embarked'):
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    encoders[column] = le
# After the loop `le` refers to the encoder fitted on 'Embarked'.

# Step 6: Feature Scaling
# Standardize the 'Age' and 'Fare' columns: learn the scaling parameters
# from the two columns first, then overwrite them with the scaled values.
scaler = StandardScaler()
scaler.fit(df[['Age', 'Fare']])
df[['Age', 'Fare']] = scaler.transform(df[['Age', 'Fare']])

# Step 7: Remove Duplicates
# Flag rows that repeat an earlier row, report how many there are, then
# drop them from df in place.
duplicate_mask = df.duplicated()
print("\nNumber of duplicate rows:", duplicate_mask.sum())
df.drop_duplicates(inplace=True)

# Step 8: Outlier Detection (Optional - using IQR for Fare)
# A Fare value is counted as an outlier when it lies more than 1.5 * IQR
# below the first quartile or above the third quartile. The rows are only
# counted here; nothing is removed from df.
fare = df['Fare']
Q1 = fare.quantile(0.25)
Q3 = fare.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = (fare < lower_fence) | (fare > upper_fence)
outliers = df[is_outlier]
print("\nNumber of outliers in Fare:", len(outliers))

# Optional: Save cleaned data
# df.to_csv("cleaned_titanic.csv", index=False)

print("\nPreprocessing completed successfully.")