# Homework Starter — Stage 6: Data Preprocessing
Use this notebook to apply your cleaning functions and save processed data.

In [2]:
import pandas as pd
import os
from src import cleaning

In [3]:
# Write the demo "dirty" CSV on the first run; later runs keep the existing file.

csv_path = 'data/instructor_dirty.csv'
os.makedirs('data', exist_ok=True)

if os.path.exists(csv_path):
    print(f"CSV already exists at {csv_path}")
else:
    # Each column except category_col contains exactly one missing entry.
    demo_columns = {
        'numeric_col': [10, None, 40, 55, 70],
        'category_col': ['A', 'B', 'A', 'B', 'C'],
        'price': ['$100', '$200', '$150', None, '$250'],
        'date_str': ['2025-08-01', '2025-08-02', None, '2025-08-04', '2025-08-05'],
        'category': ['Electronics', 'Furniture', 'Toys', 'Clothing', None],
    }
    df_demo = pd.DataFrame(demo_columns)
    # index=False keeps the positional index out of the file.
    df_demo.to_csv(csv_path, index=False)
    print(f"Demo CSV created at {csv_path}")

Demo CSV created at data/instructor_dirty.csv


In [4]:
# Load the raw demo CSV, clean it, and save the result under data/processed/.
os.makedirs("data/processed", exist_ok=True)

df = pd.read_csv("data/instructor_dirty.csv")
print("raw data：")
print(df)

# Pre-process: strip '$' and ',' from the price strings so they can be cast to
# float. The pattern must be a raw string: in a normal literal '\$' is an
# invalid escape sequence, which warns on current Python versions.
df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

# errors='coerce' turns missing or unparseable dates into NaT instead of raising.
df['date_str'] = pd.to_datetime(df['date_str'], errors='coerce')

# The helpers below live in src/cleaning.py, which is not shown in this notebook.
# NOTE(review): going by their names and the recorded output, they fill NaNs
# with the column median, drop rows/columns above the missing-value threshold,
# and rescale the listed columns — confirm against the module.
df = cleaning.fill_missing_median(df, ['numeric_col', 'price'])
df = cleaning.drop_missing(df, threshold=0.5)
df = cleaning.normalize_data(df, ['numeric_col', 'price'])

print("\nprocessed data：")
print(df)

# index=False keeps the positional index out of the saved file.
df.to_csv("data/processed/instructor_clean.csv", index=False)
print("\nsave the processed data to data/processed/instructor_clean.csv")

raw data：
   numeric_col category_col price    date_str     category
0         10.0            A  $100  2025-08-01  Electronics
1          NaN            B  $200  2025-08-02    Furniture
2         40.0            A  $150         NaN         Toys
3         55.0            B   NaN  2025-08-04     Clothing
4         70.0            C  $250  2025-08-05          NaN

processed data：
   numeric_col category_col     price   date_str     category
0        0.000            A  0.000000 2025-08-01  Electronics
1        0.625            B  0.666667 2025-08-02    Furniture
2        0.500            A  0.333333        NaT         Toys
3        0.750            B  0.500000 2025-08-04     Clothing
4        1.000            C  1.000000 2025-08-05          NaN

save the processed data to data/processed/instructor_clean.csv
