# Rossmann Sales — Data Loading and EDA

This notebook loads the raw Rossmann CSVs (`train.csv`, `test.csv`, `store.csv`) from `data/raw/`, performs basic validation and exploratory analysis, merges `train` with `store`, applies light cleaning, and saves a processed dataset to `data/processed/rossmann_processed.csv` for modeling.


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Project-relative locations of the raw inputs and the processed output.
RAW_DIR = Path('data/raw')
PROC_DIR = Path('data/processed')
# Create the output directory (and any missing parents) if it does not exist yet.
PROC_DIR.mkdir(parents=True, exist_ok=True)

train_path = RAW_DIR / 'train.csv'
test_path = RAW_DIR / 'test.csv'
store_path = RAW_DIR / 'store.csv'

# Fail fast with a real exception instead of `assert`: assertions are removed
# when Python runs with -O / PYTHONOPTIMIZE, which would let a missing input
# slip through to a less obvious read_csv error. All missing files are
# reported together so they can be fixed in one pass.
missing_files = [
    str(path) for path in (train_path, test_path, store_path) if not path.is_file()
]
if missing_files:
    raise FileNotFoundError(f"Missing file(s): {', '.join(missing_files)}")

print('Files found:')
print(train_path, test_path, store_path, sep='\n')


In [None]:
# Read the three raw tables. `Date` is handed to pandas for date parsing in
# train and test; store.csv is read without any date parsing.
date_columns = ['Date']
train = pd.read_csv(train_path, parse_dates=date_columns)
test = pd.read_csv(test_path, parse_dates=date_columns)
store = pd.read_csv(store_path)

# Shapes of (train, test, store) on one line, then a preview of train
print(*(frame.shape for frame in (train, test, store)))
train.head()


In [None]:
# Basic validation: stop early if a column used below is absent.
required_train_cols = {'Store','Date','Sales','Open'}
required_test_cols = {'Store','Date','Open'}
assert required_train_cols.issubset(train.columns), 'Train missing required columns'
assert required_test_cols.issubset(test.columns), 'Test missing required columns'

# Build the modeling frame in one non-mutating chain so re-running this cell
# always starts from the untouched `train` / `store` frames.
train_merged = (
    train
    # validate='many_to_one' raises pandas.errors.MergeError if `store` holds
    # duplicate Store keys, instead of silently multiplying training rows.
    .merge(store, on='Store', how='left', validate='many_to_one')
    # Filter closed days and non-positive sales
    .loc[lambda df: (df['Open'] == 1) & (df['Sales'] > 0)]
    .assign(
        # Calendar features derived from Date
        dow=lambda df: df['Date'].dt.dayofweek,
        month=lambda df: df['Date'].dt.month,
        year=lambda df: df['Date'].dt.year,
        # Week number taken from the ISO calendar, cast to a plain int dtype
        week=lambda df: df['Date'].dt.isocalendar().week.astype(int),
        # Log-transform target: log(1 + Sales)
        Sales_log=lambda df: np.log1p(df['Sales']),
    )
    # Chronological order within each store (returns a new frame; no inplace)
    .sort_values(['Store', 'Date'])
)
train_merged.head()


In [None]:
# Quick EDA: overall sales distribution (left) and the sales time series of
# one sampled store (right).
fig, axes = plt.subplots(1, 2, figsize=(12,4))
axes[0].hist(train_merged['Sales'], bins=50)
axes[0].set_title('Sales distribution')
axes[0].set_xlabel('Sales')
axes[0].set_ylabel('Count')

# Draw one row reproducibly (fixed random_state) and take its Store id.
# .iloc[0] extracts the scalar explicitly: calling int() directly on a
# one-element Series is deprecated since pandas 2.1 and slated to raise.
sample_store = int(train_merged['Store'].sample(1, random_state=42).iloc[0])
plot_df = train_merged[train_merged['Store']==sample_store].sort_values('Date')
axes[1].plot(plot_df['Date'], plot_df['Sales'])
axes[1].set_title(f'Store {sample_store} sales over time')
axes[1].set_xlabel('Date')
axes[1].set_ylabel('Sales')
plt.tight_layout()
plt.show()


In [None]:
# Persist the modeling subset of `train_merged` as a CSV without the index.
# NOTE(review): none of the columns kept here look like they come from the
# store merge — confirm that dropping the store attributes is intended.
cols_to_keep = [
    'Store', 'Date', 'Sales', 'Sales_log', 'Open', 'Promo', 'SchoolHoliday',
    'dow', 'month', 'year', 'week',
]
out_path = PROC_DIR / 'rossmann_processed.csv'
train_merged.loc[:, cols_to_keep].to_csv(out_path, index=False)
print('Saved to:', out_path)
