# Homework 6: Data Cleaning
This notebook demonstrates data cleaning using functions from `src/cleaning.py`.
We will load a raw dataset, apply cleaning functions, save processed data, and compare original vs cleaned data.


In [1]:
import sys
import os
sys.path.append(os.path.abspath('../src'))


In [3]:
import sys
import os

# Add the src folder (one level up from notebooks) to Python path
sys.path.append(os.path.abspath('../src'))

# Verify
print(sys.path[-1])  # Should print the full path to homework6/src


/Users/mayurakshi/bootcamp_mayurakshi_biswas/homework/homework6/src


In [1]:
import os
import sys
sys.path.append('../src')  # ensure Python can find src folder

from cleaning import fill_missing_median, drop_missing, normalize_data


In [3]:
import pandas as pd
import numpy as np
import os

# Ensure the raw folder exists
raw_dir = '../data/raw'
os.makedirs(raw_dir, exist_ok=True)

# Sample dataset
data = {
    'age': [34, 45, 29, 50, 38, np.nan, 41],
    'income': [55000, np.nan, 42000, 58000, np.nan, np.nan, 49000],
    'score': [0.82, 0.91, np.nan, 0.76, 0.88, 0.65, 0.79],
    'zipcode': ['90210', '10001', '60614', '94103', '73301', '12345', '94105'],
    'city': ['Beverly', 'New York', 'Chicago', 'SF', 'Austin', 'Unknown', 'San Francisco'],
    'extra_data': [np.nan, 42, np.nan, np.nan, np.nan, 5, np.nan]
}

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV
csv_path = os.path.join(raw_dir, 'sample_data.csv')
df.to_csv(csv_path, index=False)
print(f'Sample dataset created and saved to {csv_path}')


Sample dataset created and saved to ../data/raw/sample_data.csv


In [4]:
import os
import sys
import pandas as pd

# Add src folder to Python path so we can import cleaning.py
sys.path.append('../src')

from cleaning import fill_missing_median, drop_missing, normalize_data

# Paths
raw_dir = '../data/raw'
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)

# Load raw dataset
csv_path = os.path.join(raw_dir, 'sample_data.csv')
df = pd.read_csv(csv_path)
print("Original DataFrame:")
display(df.head())

# Step 1: Fill missing values with median for numeric columns
numeric_cols = ['age', 'income', 'score', 'extra_data']
df_filled = fill_missing_median(df.copy(), numeric_cols)

# Step 2: Drop rows with too many missing values (more than 50%)
df_cleaned = drop_missing(df_filled, threshold=0.5)

# Step 3: Normalize numeric columns to 0-1 range
df_cleaned = normalize_data(df_cleaned, ['age', 'income', 'score'])

print("Cleaned DataFrame:")
display(df_cleaned.head())

# Step 4: Save cleaned dataset to processed folder
cleaned_csv_path = os.path.join(processed_dir, 'sample_data_cleaned.csv')
df_cleaned.to_csv(cleaned_csv_path, index=False)
print(f"Cleaned dataset saved to {cleaned_csv_path}")


Original DataFrame:


Unnamed: 0,age,income,score,zipcode,city,extra_data
0,34.0,55000.0,0.82,90210,Beverly,
1,45.0,,0.91,10001,New York,42.0
2,29.0,42000.0,,60614,Chicago,
3,50.0,58000.0,0.76,94103,SF,
4,38.0,,0.88,73301,Austin,


Cleaned DataFrame:


Unnamed: 0,age,income,score,zipcode,city,extra_data
0,0.238095,0.8125,0.653846,90210,Beverly,23.5
1,0.761905,0.625,1.0,10001,New York,42.0
2,0.0,0.0,0.596154,60614,Chicago,23.5
3,1.0,1.0,0.423077,94103,SF,23.5
4,0.428571,0.625,0.884615,73301,Austin,23.5


Cleaned dataset saved to ../data/processed/sample_data_cleaned.csv


In [5]:
print("Original Data Summary:")
display(df.describe(include='all'))

print("Cleaned Data Summary:")
display(df_cleaned.describe(include='all'))


Original Data Summary:


Unnamed: 0,age,income,score,zipcode,city,extra_data
count,6.0,4.0,6.0,7.0,7,2.0
unique,,,,,7,
top,,,,,Beverly,
freq,,,,,1,
mean,39.5,51000.0,0.801667,62097.0,,23.5
std,7.556454,7071.067812,0.092826,36869.63632,,26.162951
min,29.0,42000.0,0.65,10001.0,,5.0
25%,35.0,47250.0,0.7675,36479.5,,14.25
50%,39.5,52000.0,0.805,73301.0,,23.5
75%,44.0,55750.0,0.865,92156.5,,32.75


Cleaned Data Summary:


Unnamed: 0,age,income,score,zipcode,city,extra_data
count,7.0,7.0,7.0,7.0,7,7.0
unique,,,,,7,
top,,,,,Beverly,
freq,,,,,1,
mean,0.5,0.589286,0.585165,62097.0,,23.5
std,0.328479,0.314281,0.325952,36869.63632,,10.68098
min,0.0,0.0,0.0,10001.0,,5.0
25%,0.333333,0.53125,0.480769,36479.5,,23.5
50%,0.5,0.625,0.596154,73301.0,,23.5
75%,0.666667,0.71875,0.769231,92156.5,,23.5
