<a href="https://colab.research.google.com/github/Leo-xxx12/bootcamp_Leo_Xu/blob/main/Copy_of_stage06_data_preprocessing_homework_starter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup: Generate Sample Dataset

This cell creates the required folder structure (`../data/raw/` and `../data/processed/`, resolved relative to the notebook's working directory) and generates the sample CSV dataset with missing values.
The dataset is saved to `../data/raw/sample_data.csv` so it is ready for the cleaning functions; if that file already exists it is left untouched.

In [1]:
import os
import pandas as pd
import numpy as np

# Raw inputs and processed outputs live one directory above the notebook
raw_dir = '../data/raw'
processed_dir = '../data/processed'

# Ensure both folders are present; existing folders are left as they are
for folder in (raw_dir, processed_dir):
    os.makedirs(folder, exist_ok=True)

# Column-oriented sample records; np.nan marks the intentionally missing cells
data = {
    'age': [34, 45, 29, 50, 38, np.nan, 41],
    'income': [55000, np.nan, 42000, 58000, np.nan, np.nan, 49000],
    'score': [0.82, 0.91, np.nan, 0.76, 0.88, 0.65, 0.79],
    'zipcode': ['90210', '10001', '60614', '94103', '73301', '12345', '94105'],
    'city': ['Beverly', 'New York', 'Chicago', 'SF', 'Austin', 'Unknown', 'San Francisco'],
    'extra_data': [np.nan, 42, np.nan, np.nan, np.nan, 5, np.nan]
}
df = pd.DataFrame(data)

# Write the CSV only on first run so a hand-edited copy is never overwritten
csv_path = os.path.join(raw_dir, 'sample_data.csv')
if os.path.exists(csv_path):
    print(f'File already exists at {csv_path}. Skipping CSV creation to avoid overwrite.')
else:
    df.to_csv(csv_path, index=False)
    print(f'Sample dataset created and saved to {csv_path}')


Sample dataset created and saved to ../data/raw/sample_data.csv


# Homework Starter — Stage 6: Data Preprocessing
Use this notebook to apply your cleaning functions and save processed data.

In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def fill_missing_median(df: pd.DataFrame, columns=None) -> pd.DataFrame:
    """Fill NaNs in numeric columns with each column's median.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. It is copied, never modified.
    columns : str or iterable of str, optional
        Restrict filling to these columns. Non-numeric names in the selection
        are skipped. ``None`` (the default) fills every numeric column.

    Returns
    -------
    pd.DataFrame
        A new frame with numeric NaNs replaced by column medians. A column
        that is entirely NaN has no median and is therefore left unchanged.

    Raises
    ------
    KeyError
        If ``columns`` names a column that is not present in ``df``.
    """
    out = df.copy()
    if columns is None:
        candidates = out
    else:
        # Accept a bare string so 'age' is not split into characters
        wanted = [columns] if isinstance(columns, str) else list(columns)
        candidates = out[wanted]
    med = candidates.median(numeric_only=True)
    if med.empty:
        # Nothing numeric to fill
        return out
    out[med.index] = out[med.index].fillna(med)
    return out

def drop_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Return only the fully populated rows, renumbered from zero.

    Any row holding at least one NaN is removed; the input frame itself is
    left unchanged.
    """
    has_gap = df.isna().any(axis=1)
    complete_rows = df.loc[~has_gap]
    return complete_rows.reset_index(drop=True)

def normalize_data(df: pd.DataFrame, columns=None) -> pd.DataFrame:
    """Min-max scale numeric columns to [0, 1]; leave everything else unchanged.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. It is copied, never modified.
    columns : str or iterable of str, optional
        Restrict scaling to these columns. Non-numeric names in the selection
        are skipped. ``None`` (the default) scales every numeric column.

    Returns
    -------
    pd.DataFrame
        A new frame whose selected numeric columns are rescaled. The frame is
        returned unscaled when it has no rows or no numeric columns to scale.

    Raises
    ------
    KeyError
        If ``columns`` names a column that is not present in ``df``.
    """
    out = df.copy()
    if columns is None:
        candidates = out
    else:
        # Accept a bare string so 'age' is not split into characters
        wanted = [columns] if isinstance(columns, str) else list(columns)
        candidates = out[wanted]
    num_cols = candidates.select_dtypes(include="number").columns
    # MinMaxScaler refuses zero-row input, so a frame emptied by an earlier
    # step (e.g. every row dropped for missing values) is passed through.
    if len(num_cols) == 0 or len(out) == 0:
        return out
    scaler = MinMaxScaler()
    out[num_cols] = scaler.fit_transform(out[num_cols])
    return out


ModuleNotFoundError: No module named 'src'

## Load Raw Dataset

In [None]:
import os, glob, datetime as dt
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv

# Read variables from a .env file (if one is found) into the environment
load_dotenv()

# Data folders come from the environment, with project-relative fallbacks.
# NOTE(review): the setup cell writes the sample CSV to ../data/raw, which is
# not the fallback below — confirm DATA_DIR_RAW is set so both agree.
RAW = Path(os.getenv("DATA_DIR_RAW", "project/data/raw"))
PROC = Path(os.getenv("DATA_DIR_PROCESSED", "project/data/processed"))
RAW.mkdir(parents=True, exist_ok=True)
PROC.mkdir(parents=True, exist_ok=True)

# Prefer the packaged cleaning helpers; when the `src` package is not
# importable, keep using the functions defined earlier in this notebook.
try:
    from src.cleaning import fill_missing_median, drop_missing, normalize_data
except ModuleNotFoundError:
    _required = ("fill_missing_median", "drop_missing", "normalize_data")
    if any(name not in globals() for name in _required):
        # No notebook-local fallback either: surface the original error
        raise

def ts():
    """Return the current local time as a ``YYYYMMDD-HHMMSS`` string."""
    now = dt.datetime.now()
    return f"{now:%Y%m%d-%H%M%S}"


In [None]:
# Sorted so the same file is picked on every run
raw_files = sorted(glob.glob(str(RAW / "*.csv")))
assert raw_files, f"No CSV files found in {RAW}"
raw_path = Path(raw_files[0])

# zipcode is an identifier, not a quantity: read it as text so it keeps any
# leading zeros and is not median-filled or min-max scaled with the numeric
# columns. The dtype entry has no effect on files without a zipcode column.
df_raw = pd.read_csv(raw_path, dtype={"zipcode": str})  # add parse_dates=['date'] if you have dates
df_raw.head()


In [None]:
# Cleaning pipeline, one stage per line: impute numeric gaps, drop rows that
# are still incomplete, then rescale the numeric columns.
df_clean = (
    df_raw
    .pipe(fill_missing_median)
    .pipe(drop_missing)
    .pipe(normalize_data)
)
df_clean.head()


In [None]:
# Timestamped file name so repeated runs never overwrite earlier results
out_pq = PROC / f"cleaned_{ts()}.parquet"
try:
    df_clean.to_parquet(out_pq, index=False)
except ImportError:
    # pandas raises ImportError when neither pyarrow nor fastparquet is
    # installed; other failures propagate without this install hint.
    print("Parquet engine missing. Install one of:\n  pip install pyarrow\n  # or\n  pip install fastparquet")
    raise
else:
    print(f"Saved cleaned parquet → {out_pq}")


In [None]:
# Reload the sample data for the exercise cells below. zipcode is read as
# text so it stays an identifier instead of being inferred as an integer.
df = pd.read_csv('../data/raw/sample_data.csv', dtype={'zipcode': str})
df.head()

In [None]:
# Keep each metric label next to its value so the two columns stay aligned
summary_stats = {
    "rows": len(df_raw),
    "columns": df_raw.shape[1],
    "rows_after_clean": len(df_clean),
    "num_cols_raw": df_raw.select_dtypes("number").shape[1],
    "num_cols_clean": df_clean.select_dtypes("number").shape[1],
    "cols_with_any_na_raw": df_raw.isna().any().sum(),
    "cols_with_any_na_clean": df_clean.isna().any().sum(),
}

# Two-column before/after overview of the cleaning step
summary = pd.DataFrame({
    "metric": list(summary_stats.keys()),
    "value": list(summary_stats.values()),
})
summary


## Apply Cleaning Functions

In [None]:
# TODO: Apply your functions here
# Example (the cleaning functions defined earlier in this notebook each take
# a DataFrame and return a new one):
# df = fill_missing_median(df)
# df = drop_missing(df)
# df = normalize_data(df)

## Save Cleaned Dataset

In [None]:
# df.to_csv('../data/processed/sample_data_cleaned.csv', index=False)