In [9]:
# 01_preprocessing.ipynb

import pandas as pd
import numpy as np
import datetime
import os
import logging

# Create the output folder up front so the log file can be opened inside it.
os.makedirs('../outputs', exist_ok=True)

# Configure the root logger exactly once. basicConfig() does nothing when the
# root logger already has handlers, so the second, identical call that used to
# follow was dead code and has been dropped.
# The file handler is opened as UTF-8 explicitly because the messages logged by
# this notebook contain emoji, which a non-UTF-8 platform default encoding
# cannot write. delay=True postpones opening the file until the first record is
# emitted, so no file handle is left open if basicConfig() ignores the handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('../outputs/preprocessing.log', encoding='utf-8', delay=True)
    ],
)

# Run identifier (YYYYMMDD_HHMMSS) used in the names of the CSV files written
# by the later cells.
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
logging.info("📦 Starting full preprocessing pipeline")


In [21]:
# Load the raw train, test and store tables from ../data.
# pandas (pd) and logging are already imported in the first cell of the
# notebook, so the duplicate `import pandas as pd` was removed from this cell.
# low_memory=False lets pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
train = pd.read_csv('../data/train.csv', low_memory=False)
test = pd.read_csv('../data/test.csv', low_memory=False)
store = pd.read_csv('../data/store.csv')

# Record the (rows, columns) shape of each table in the log.
logging.info(f"Loaded train: {train.shape}, test: {test.shape}, store: {store.shape}")

In [23]:
# Preprocess function for date features
def preprocess_date(df):
    """Return a copy of ``df`` with ``Date`` parsed and calendar features added.

    The input frame is not modified; callers must use the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        New frame in which ``Date`` is a datetime column and the columns
        ``Year``, ``Month``, ``Day``, ``WeekOfYear`` (ISO week number),
        ``DayOfWeek`` (Monday=0 .. Sunday=6) and ``IsWeekend`` (True for
        Saturday and Sunday) are set. A column that already exists under one
        of these names is overwritten where it stands; the others are appended
        in the order listed.

    Raises
    ------
    KeyError
        If ``df`` has no ``Date`` column.
    """
    dates = pd.to_datetime(df['Date'])
    day_of_week = dates.dt.dayofweek
    # assign() builds a new frame, so the caller's frame stays untouched.
    return df.assign(
        Date=dates,
        Year=dates.dt.year,
        Month=dates.dt.month,
        Day=dates.dt.day,
        WeekOfYear=dates.dt.isocalendar().week,
        DayOfWeek=day_of_week,
        # dayofweek is 0-based from Monday, so 5 and 6 are Saturday and Sunday.
        IsWeekend=day_of_week >= 5,
    )

In [25]:
# Run the date-feature helper on both splits and rebind each name to its result.
train, test = preprocess_date(train), preprocess_date(test)
logging.info("✅ Date features extracted for train and test")


In [27]:
# Fill missing values in train/test.
# Every missing cell, in every column, becomes 0. The result is rebound to the
# same name instead of using inplace=True, which pandas' own guidance
# discourages and which gives no performance benefit.
train = train.fillna(0)
test = test.fillna(0)

In [29]:
# Write the pre-merge train/test frames to ../data, tagged with the run timestamp.
processed_train_path = f'../data/processed_train_{timestamp}.csv'
processed_test_path = f'../data/processed_test_{timestamp}.csv'

# index=False keeps the DataFrame index out of the CSV files.
train.to_csv(processed_train_path, index=False)
test.to_csv(processed_test_path, index=False)

logging.info("Saved processed train/test before merge")

In [31]:
# Preprocess store data
# Step 1: replace every missing value, in every column, with 0. The result is
# rebound instead of using inplace=True.
store = store.fillna(0)

# Step 2: integer-encode the categorical columns that are present.
for col in ['StoreType', 'Assortment', 'PromoInterval']:
    if col not in store.columns:
        continue
    values = store[col]
    if values.dtype == object:
        # NOTE(review): if this column had missing values, it now holds the
        # integer 0 next to strings. pandas cannot sort such mixed categories
        # and falls back to first-appearance order, so the codes would depend
        # on row order. Casting to str first gives sorted, reproducible
        # categories (the filled value becomes the string '0').
        values = values.astype(str)
    store[col] = values.astype('category').cat.codes

# Step 3: persist the encoded store table, tagged with the run timestamp.
store.to_csv(f'../data/processed_store_{timestamp}.csv', index=False)
logging.info("✅ Store preprocessed and saved")


In [33]:
# Merge with store
# Left joins keep every train/test row and attach the matching store columns.
# validate='many_to_one' makes pandas raise MergeError if a 'Store' value occurs
# more than once in the store table, instead of silently multiplying rows.
store_train = pd.merge(train, store, on='Store', how='left', validate='many_to_one')
store_test = pd.merge(test, store, on='Store', how='left', validate='many_to_one')
logging.info("🔗 Merged train/test with store")

In [35]:
# Post-merge fill NAs (just in case).
# A left join leaves missing values in the store columns for any row whose
# 'Store' has no match in the store table; those become 0 here. The result is
# rebound instead of using inplace=True.
store_train = store_train.fillna(0)
store_test = store_test.fillna(0)


In [37]:
# Write the merged train/test frames to ../data, tagged with the run timestamp.
merged_train_path = f'../data/store_train_{timestamp}.csv'
merged_test_path = f'../data/store_test_{timestamp}.csv'

# index=False keeps the DataFrame index out of the CSV files.
store_train.to_csv(merged_train_path, index=False)
store_test.to_csv(merged_test_path, index=False)

logging.info("✅ Final merged datasets saved with timestamp")