In [6]:
# Steps in Data Preprocessing

# 1. Data Collection: Gathering raw data from various sources.
# Task 1: Collect data from two different sources and merge them.
# Task 2: Validate the integrity of the collected datasets.
# Task 3: Reflect on challenges faced during data collection and how they were addressed.


# Ques_3.ipynb
# Module 5: Introduction to Data Preprocessing & Cleaning
# Task: Steps in Data Preprocessing

import pandas as pd
import numpy as np

# Step 1: Load Data
data = {
    'Name': ['Alice', 'Bob', 'Charlie', None, 'Eve', 'Frank', 'Grace'],
    'Age': [25, np.nan, 30, 22, 28, 120, 27],
    'City': ['New York', 'Los Angeles', 'New York', 'Chicago', None, 'Miami', 'Boston'],
    'Income': ['50000', '60000', '52000', '48000', 'nan', '1000000', '49000'],
    'Gender': ['F', 'M', 'M', 'F', 'F', 'Male', 'Female']
}

df = pd.DataFrame(data)
print("🔹 Original Dataset:")
print(df)

# Step 2: Handle Missing Values
df['Name'].fillna('Unknown', inplace=True)
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['City'].fillna('Unknown', inplace=True)
df['Income'].replace('nan', np.nan, inplace=True)
df['Income'] = df['Income'].astype(float)
df['Income'].fillna(df['Income'].mean(), inplace=True)

# Step 3: Remove Duplicates (if any)
df.drop_duplicates(inplace=True)

# Step 4: Fix Incorrect Data (e.g., outlier in Age and Income)
# Using IQR for Age
Q1_age = df['Age'].quantile(0.25)
Q3_age = df['Age'].quantile(0.75)
IQR_age = Q3_age - Q1_age
upper_age = Q3_age + 1.5 * IQR_age
df['Age'] = df['Age'].apply(lambda x: df['Age'].median() if x > upper_age else x)

# Using IQR for Income
Q1_inc = df['Income'].quantile(0.25)
Q3_inc = df['Income'].quantile(0.75)
IQR_inc = Q3_inc - Q1_inc
upper_inc = Q3_inc + 1.5 * IQR_inc
df['Income'] = df['Income'].apply(lambda x: df['Income'].median() if x > upper_inc else x)

# Step 5: Standardize Categorical Values
df['Gender'] = df['Gender'].str.upper().replace({'MALE': 'M', 'FEMALE': 'F'})

# Step 6: Encode Categorical Variables (if required)
df_encoded = pd.get_dummies(df, columns=['City', 'Gender'])

# Step 7: Normalize/Scale Numerical Features (optional)
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df_encoded[['Age', 'Income']] = scaler.fit_transform(df_encoded[['Age', 'Income']])

# Final Preprocessed Data
print("\n✅ Preprocessed Dataset:")
print(df_encoded)

# Optional: Save preprocessed data
# df_encoded.to_csv("preprocessed_data.csv", index=False)


🔹 Original Dataset:
      Name    Age         City   Income  Gender
0    Alice   25.0     New York    50000       F
1      Bob    NaN  Los Angeles    60000       M
2  Charlie   30.0     New York    52000       M
3     None   22.0      Chicago    48000       F
4      Eve   28.0         None      nan       F
5    Frank  120.0        Miami  1000000    Male
6    Grace   27.0       Boston    49000  Female

✅ Preprocessed Dataset:
      Name   Age    Income  City_Boston  City_Chicago  City_Los Angeles  \
0    Alice  0.15  0.012358        False         False             False   
1      Bob  1.00  0.074150        False         False              True   
2  Charlie  0.40  0.024717        False         False             False   
3  Unknown  0.00  0.000000        False          True             False   
4      Eve  0.30  1.000000        False         False             False   
5    Frank  0.30  0.024717        False         False             False   
6    Grace  0.25  0.006179         True       

In [7]:
# 2. Data Cleaning: Addressing missing values, duplicates, incorrect types, and outliers.
# Task 1: Clean a given dataset and document the changes made.
# Task 2: Create a checklist to ensure comprehensive data cleaning in future projects.
# Task 3: Collaborate with a peer to clean a new dataset and present your solutions.



In [8]:
# 3. Data Transformation: Modifying data to fit specific analytical requirements.
# Task 1: Transform a date column into separate 'day', 'month', and 'year' columns.
# Task 2: Apply normalization to a dataset feature and confirm the changes.
# Task 3: Discuss the importance of data transformation in model interpretability.




In [9]:
# 4. Feature Scaling: Adjusting data features to a common scale.
# Task 1: Apply Min-Max scaling to a dataset.
# Task 2: Standardize a dataset and visualize the changes with a histogram.
# Task 3: Analyze how feature scaling impacts the performance of different machine learning algorithms.





In [10]:
# 5. Feature Engineering: Creating new features from existing ones to improve model accuracy.
# Task 1: Create a new synthetic feature from existing dataset features.
# Task 2: Evaluate the impact of new features on model accuracy.
# Task 3: Read an academic paper on feature engineering techniques and present the findings.


