In [1]:
import pandas as pd
import numpy as np
import os


In [2]:
# Load cleaned data
df = pd.read_csv("../data/processed/cleaned_data.csv")
df['Date'] = pd.to_datetime(df['Date'])

In [3]:
# okay, set date as index (time-series needs it)
df = df.sort_values(by=["Material_Name","Date"])
print("Data after sorting:", df.shape)


Data after sorting: (2193, 9)


In [4]:
# Quick check
print(df.head())

         Date Material_Name  Quantity_Consumed  Lead_Time  Sales_Volume  \
1  2023-01-01        Cement                148         14           144   
4  2023-01-02        Cement                143         14           133   
7  2023-01-03        Cement                128         11           129   
10 2023-01-04        Cement                146          6           149   
13 2023-01-05        Cement                142         12           143   

   Vendor_Name   Location  Month  Year  
1     Vendor_B  Ghaziabad      1  2023  
4     Vendor_A      Delhi      1  2023  
7     Vendor_B      Noida      1  2023  
10    Vendor_C  Ghaziabad      1  2023  
13    Vendor_B      Delhi      1  2023  


In [6]:
# Feature Engineering
final_data = []

for mat in df['Material_Name'].unique():
    temp = df[df['Material_Name']==mat].copy()
    temp = temp.set_index('Date')

    # lag features (demand history)
    for lag in [7, 14, 30]:
        temp[f'lag_{lag}'] = temp['Quantity_Consumed'].shift(lag)

    # rolling mean features
    temp['rolling_7'] = temp['Quantity_Consumed'].rolling(window=7).mean()
    temp['rolling_30'] = temp['Quantity_Consumed'].rolling(window=30).mean()

    # month & dayofweek as seasonality proxies
    temp['month'] = temp.index.month
    temp['dayofweek'] = temp.index.dayofweek

    final_data.append(temp)


In [7]:
# Combine back
df_feat = pd.concat(final_data)


In [8]:
# Reset index
df_feat = df_feat.reset_index()

In [9]:
# Outlier Handling: cap at 99th percentile
cap_val = df_feat['Quantity_Consumed'].quantile(0.99)
df_feat['Quantity_Consumed'] = np.where(df_feat['Quantity_Consumed']>cap_val, cap_val, df_feat['Quantity_Consumed'])

In [10]:
# Encode categorical vars
df_feat = pd.get_dummies(df_feat, columns=['Material_Name','Vendor_Name','Location'], drop_first=True)


In [12]:
# Drop NA (from lag/rolling)
df_feat = df_feat.dropna().reset_index(drop=True)

print("Final dataset shape after feature engineering:", df_feat.shape)
print(df_feat.head())

Final dataset shape after feature engineering: (2103, 19)
        Date  Quantity_Consumed  Lead_Time  Sales_Volume  Month  Year  lag_7  \
0 2023-01-31              146.0         14           148      1  2023  160.0   
1 2023-02-01              162.0         13           170      2  2023  157.0   
2 2023-02-02              167.0          9           168      2  2023  158.0   
3 2023-02-03              141.0         14           135      2  2023  146.0   
4 2023-02-04              156.0          9           153      2  2023  157.0   

   lag_14  lag_30   rolling_7  rolling_30  month  dayofweek  \
0   154.0   148.0  153.714286  151.866667      1          1   
1   144.0   143.0  154.428571  152.500000      2          2   
2   159.0   128.0  155.714286  153.800000      2          3   
3   145.0   146.0  155.000000  153.633333      2          4   
4   152.0   142.0  154.857143  154.100000      2          5   

   Material_Name_Sand  Material_Name_TMT_Steel  Vendor_Name_Vendor_B  \
0         

In [13]:
# Save processed dataset
os.makedirs("../data/processed", exist_ok=True)
df_feat.to_csv("../data/processed/feature_data.csv", index=False)
print("✅ Feature-engineered dataset saved to data/processed/feature_data.csv")

✅ Feature-engineered dataset saved to data/processed/feature_data.csv
