In [None]:
import pandas as pd

# Load the sales records and the per-store metadata.
train_df = pd.read_csv("train.csv")
store_df = pd.read_csv("store.csv")

# Left join: every sales row is kept. Store columns end up NaN for rows whose
# Store has no match in store.csv, and any gaps in store.csv carry through.
df = pd.merge(train_df, store_df, on="Store", how="left")

df["Date"] = pd.to_datetime(df["Date"])

# Create time-based features
df["year"] = df["Date"].dt.year
df["month"] = df["Date"].dt.month
df["week"] = df["Date"].dt.isocalendar().week
df["day"] = df["Date"].dt.day
df["day_of_week"] = df["Date"].dt.dayofweek  # Monday=0 ... Sunday=6
df["is_weekend"] = df["day_of_week"].isin([5, 6]).astype(int)

# Sort by Store and Date so row-based shifts follow time order within a store.
# NOTE(review): lags count rows, not calendar days — if a store has gaps in its
# dates, "lag_7" is not necessarily "7 days ago". Confirm this is acceptable.
df = df.sort_values(by=["Store", "Date"])

# Lag features, shifted within each store so a row never sees its own Sales
# (avoids target leakage) nor another store's Sales.
sales_by_store = df.groupby("Store")["Sales"]
df["lag_1"] = sales_by_store.shift(1)
df["lag_7"] = sales_by_store.shift(7)

# Rolling statistics over the previous 7 rows of the same store. The window is
# applied per store explicitly, so it can never span two stores regardless of
# how min_periods or the shift are tuned later.
lag_1_by_store = df.groupby("Store")["lag_1"]
df["rolling_mean_7"] = lag_1_by_store.transform(lambda s: s.rolling(7).mean())
df["rolling_std_7"] = lag_1_by_store.transform(lambda s: s.rolling(7).std())

# Drop only the rows whose lag/rolling features are undefined (the first rows
# of each store). A blanket dropna() would also discard every row that has a
# missing value in any other column, e.g. store metadata from the left join.
lag_feature_cols = ["lag_1", "lag_7", "rolling_mean_7", "rolling_std_7"]
df = df.dropna(subset=lag_feature_cols)

df.to_csv("rossmann_processed.csv", index=False)
print("rossmann_processed.csv saved!")


In [None]:
from sklearn.preprocessing import MinMaxScaler


# Reload the engineered features written to disk by the previous step.
df = pd.read_csv("rossmann_processed.csv")

# Numeric columns that receive a min-max scaled companion column.
cols_to_normalize = [
    "Sales",
    "lag_1",
    "lag_7",
    "rolling_mean_7",
    "rolling_std_7",
    "CompetitionDistance",
]

# Fit one scaler on all selected columns and transform them in the same pass.
scaler = MinMaxScaler()
normalized_values = scaler.fit_transform(df[cols_to_normalize])

# Keep the original columns untouched; scaled values go next to them under
# "<column>_normalized".
normalized_names = [f"{col}_normalized" for col in cols_to_normalize]
df_scaled = df.copy()
df_scaled[normalized_names] = normalized_values

df_scaled.to_csv("rossmann_normalized.csv", index=False)

