# Notebook 3 — Feature Engineering and Minimal Modeling (Sanity Check)

This notebook creates deterministic features (lags, rolling statistics, and cyclical month features), reports how many rows are lost to feature creation, saves the feature-engineered dataset, and runs a minimal sanity-check modeling step (a time-aware train/test split and two quick baseline models).

Notes:
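- Input: the canonical cleaned CSV produced by Notebook 2 (../4_data_analysis/model_datasets/model_ready_dataset_clean.csv). The loader checks for an existing feature-engineered CSV first, so delete model_ready_dataset_fe.csv to force a rebuild from the cleaned data.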
- Saves the feature-engineered CSV to ../4_data_analysis/model_datasets/model_ready_dataset_fe.csv


In [None]:
# Imports and paths
import os
import pandas as pd
import numpy as np
from pprint import pprint

from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

np.random.seed(42)

# Folder shared by the model-ready inputs and the feature-engineered output
out_dir = os.path.join('..', '4_data_analysis', 'model_datasets')
os.makedirs(out_dir, exist_ok=True)
fe_out_path = os.path.join(out_dir, 'model_ready_dataset_fe.csv')

# Candidate inputs in priority order: the first one that exists is loaded
data_in_paths = [
    fe_out_path,
    os.path.join(out_dir, 'model_ready_dataset_clean.csv'),
    os.path.join(out_dir, 'model_ready_dataset.csv'),
    os.path.join('..', '1_datasets', 'Final_dataset', 'final_merged_dataset.csv'),
]

print('Looking for input files (in order):')
pprint(data_in_paths)


Looking for input files (in order):
['..\\4_data_analysis\\model_datasets\\model_ready_dataset_fe.csv',
 '..\\4_data_analysis\\model_datasets\\model_ready_dataset_clean.csv',
 '..\\4_data_analysis\\model_datasets\\model_ready_dataset.csv',
 '..\\1_datasets\\Final_dataset\\final_merged_dataset.csv']


In [2]:
# Input resolution: take the first existing file from data_in_paths
# (FE, then cleaned, then model_ready, then the wide final_merged file)
loaded = next((candidate for candidate in data_in_paths if os.path.exists(candidate)), None)

if loaded is not None:
    print('Loading', loaded)
else:
    # None of the expected locations exist: search recursively below the
    # current working directory as a last resort
    import glob
    matches = sorted({
        *glob.glob('**/model_ready_dataset*.csv', recursive=True),
        *glob.glob('**/final_merged_dataset.csv', recursive=True),
    })
    if not matches:
        raise FileNotFoundError('No input dataset found. Place the cleaned dataset at ../4_data_analysis/model_datasets/model_ready_dataset_clean.csv or the wide file at ../1_datasets/Final_dataset/final_merged_dataset.csv')
    print('Found candidates:', matches)
    # Alphabetically first hit wins
    loaded = matches[0]

df = pd.read_csv(loaded)
print('Loaded shape:', df.shape)
    

Loading ..\4_data_analysis\model_datasets\model_ready_dataset_clean.csv
Loaded shape: (2100, 15)


In [4]:
# Ensure YEAR/Month_Num exist and standardize column names
def ensure_year_month(df):
    """Return a copy of ``df`` with standardized year/month columns and a ``Month_Num`` column.

    Renames applied (only when the target name is not already present):
    ``Year`` or ``year`` -> ``YEAR`` and ``MONTH`` -> ``Month``.

    If ``Month_Num`` is missing it is derived from ``Month``:
    - numeric ``Month`` values are copied unchanged;
    - text values are matched case-insensitively on their first three letters
      (``"JAN"``, ``"Jan"``, ``" january "`` all give 1);
    - text that is not a month name but parses as a number (``"3"``) is
      converted to that number; anything else becomes NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``Month_Num``, ``Month`` or ``MONTH``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is never modified.

    Raises
    ------
    KeyError
        If neither ``Month`` nor ``Month_Num`` is available after renaming.
    """
    month_map = {"JAN": 1, "FEB": 2, "MAR": 3, "APR": 4, "MAY": 5, "JUN": 6,
                 "JUL": 7, "AUG": 8, "SEP": 9, "OCT": 10, "NOV": 11, "DEC": 12}

    # Always work on a copy so the caller's frame is left alone whether or not
    # a rename happens below.
    df = df.copy()

    if 'Year' in df.columns and 'YEAR' not in df.columns:
        df = df.rename(columns={'Year': 'YEAR'})
    if 'MONTH' in df.columns and 'Month' not in df.columns:
        df = df.rename(columns={'MONTH': 'Month'})

    if 'Month_Num' not in df.columns:
        if 'Month' not in df.columns:
            raise KeyError('No Month or Month_Num column found.')
        month = df['Month']
        if pd.api.types.is_numeric_dtype(month):
            df['Month_Num'] = month
        else:
            # Checking "is not numeric" (instead of "dtype == object") also
            # covers string and categorical dtypes.
            text = month.astype(str).str.strip()
            month_num = text.str.upper().str[:3].map(month_map)
            if month_num.isna().any():
                # Unmatched entries may be month numbers stored as text.
                month_num = month_num.fillna(pd.to_numeric(text, errors='coerce'))
            df['Month_Num'] = month_num

    if 'YEAR' not in df.columns and 'year' in df.columns:
        df = df.rename(columns={'year': 'YEAR'})
    return df

def standardize_lag_names(df):
    """Return ``df`` with short lag column names replaced by their canonical form.

    ``Rain_lag_{1,2,3,12}`` becomes ``Rainfall_lag_{...}`` and
    ``Temp_lag_{1,2,3,12}`` becomes ``Temperature_lag_{...}``.

    When a short column and its canonical counterpart are both present, the
    canonical column is kept and the short one is dropped. Renaming in that
    situation would create two columns with the same label, so the drop is
    done before the rename.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A frame with unique canonical lag names. The input frame is not
        modified; it is returned as-is when there is nothing to change.
    """
    rename_map = {
        f'{short}_lag_{lag}': f'{canon}_lag_{lag}'
        for short, canon in (('Rain', 'Rainfall'), ('Temp', 'Temperature'))
        for lag in (1, 2, 3, 12)
    }
    existing = set(df.columns)

    # Short names whose canonical twin already exists are redundant.
    redundant = [short for short, canon in rename_map.items()
                 if short in existing and canon in existing]
    # Remaining short names can be renamed without any label collision.
    to_rename = {short: canon for short, canon in rename_map.items()
                 if short in existing and canon not in existing}

    if redundant:
        df = df.drop(columns=redundant)
    if to_rename:
        df = df.rename(columns=to_rename)
    return df

# Normalize year/month columns first, then the lag column names
df = standardize_lag_names(ensure_year_month(df))
print('Columns after standardization:', list(df.columns))


Columns after standardization: ['REGION', 'YEAR', 'Month', 'Rainfall', 'Temperature', 'Month_Num', 'Time', 'Rainfall_lag_1', 'Rainfall_lag_2', 'Rainfall_lag_3', 'Rainfall_lag_12', 'Temperature_lag_1', 'Temperature_lag_2', 'Temperature_lag_3', 'Temperature_lag_12']


In [7]:
# Feature creation settings
LAGS = [1, 2, 3, 12]
ROLL_WINDOWS = {'roll3': 3, 'roll12': 12}
ROLL_MIN_PERIODS = {'roll3': 3, 'roll12': 12}  # can change to allow partial windows if desired

# shift() and rolling() operate on row position, not on dates, so every region
# must be in chronological order before any lag/rolling feature is computed.
# 'Time' is the chronological key used by the train/test split later on;
# fall back to YEAR + Month_Num when it is absent. The sort is stable, so an
# already ordered frame keeps its exact row order.
time_keys = ['Time'] if 'Time' in df.columns else [c for c in ('YEAR', 'Month_Num') if c in df.columns]
df = df.sort_values(['REGION'] + time_keys, kind='stable').reset_index(drop=True)

# Create lag columns if missing (lag columns already present in the input are kept as-is)
for var in ['Rainfall', 'Temperature']:
    for lag in LAGS:
        cname = f"{var}_lag_{lag}"
        if cname not in df.columns:
            df[cname] = df.groupby('REGION')[var].shift(lag)
            print('Created', cname)

# Create rolling features using prior months only (shift(1) before rolling) to avoid leakage
for var in ['Rainfall', 'Temperature']:
    for label, window in ROLL_WINDOWS.items():
        minp = ROLL_MIN_PERIODS[label]
        colname = f"{var}_{label}"
        if colname not in df.columns:
            # The lambda is evaluated immediately by transform(), so capturing the
            # loop variables window/minp here is safe.
            df[colname] = df.groupby('REGION')[var].transform(
                lambda x: x.shift(1).rolling(window=window, min_periods=minp).mean()
            )
            print('Created', colname)

# Cyclical month features: map month 1..12 onto the unit circle so that
# December and January end up adjacent
df['Month_sin'] = np.sin(2 * np.pi * (df['Month_Num'] / 12))
df['Month_cos'] = np.cos(2 * np.pi * (df['Month_Num'] / 12))

print('\nColumns now include lags and rolls. Total columns:', len(df.columns))



Columns now include lags and rolls. Total columns: 21


In [8]:
# Data-loss report: count rows with a missing engineered feature, then drop them and save
required_for_model = [
    col
    for col in (
        *(f"Rainfall_lag_{i}" for i in LAGS),
        *(f"Temperature_lag_{i}" for i in LAGS),
        *(f"Rainfall_{k}" for k in ROLL_WINDOWS),
        *(f"Temperature_{k}" for k in ROLL_WINDOWS),
        'Month_sin', 'Month_cos', 'YEAR', 'Time',
    )
    if col in df.columns  # silently skip features that were not created
]

print('Number of rows before dropping:', len(df))

# True for every row that has at least one required feature missing
na_mask = df.loc[:, required_for_model].isnull().any(axis='columns')
n_missing_rows = na_mask.sum()
print('Rows with any missing required feature:', n_missing_rows)
print('Percentage lost:', 100.0 * n_missing_rows / len(df))

# Per-region breakdown of the rows about to be removed
if na_mask.any():
    dropped_by_region = df.loc[na_mask].groupby('REGION').size().sort_values(ascending=False)
    print('\nRows dropped per REGION (top):')
    display(dropped_by_region.head(20))

# Keep only the complete rows and persist them as the FE dataset
df_fe = df.loc[~na_mask].reset_index(drop=True)
print('Number of rows after dropping:', len(df_fe))

df_fe.to_csv(fe_out_path, index=False)
print('Saved feature-engineered dataset to', fe_out_path)


Number of rows before dropping: 2100
Rows with any missing required feature: 60
Percentage lost: 2.857142857142857

Rows dropped per REGION (top):


REGION
Central    12
East       12
North      12
South      12
West       12
dtype: int64

Number of rows after dropping: 2040
Saved feature-engineered dataset to ..\4_data_analysis\model_datasets\model_ready_dataset_fe.csv


In [9]:
# Quick time-aware train/test split per region (sanity check)
def train_test_time_split(df, group_col='REGION', time_col='Time', test_periods=24):
    """Split ``df`` chronologically within each group, holding out the last rows as test.

    Each group of ``group_col`` is sorted by ``time_col``; its final
    ``test_periods`` rows go to the test set and all earlier rows to the
    training set.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``group_col`` and ``time_col``.
    group_col : str
        Column identifying the independent series (default ``'REGION'``).
    time_col : str
        Column used for chronological ordering (default ``'Time'``).
    test_periods : int
        Number of trailing rows per group to hold out. ``0`` keeps everything
        in the training set and returns an empty test frame.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``, each with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``test_periods`` is negative, if ``df`` yields no groups, or if any
        group has ``test_periods`` rows or fewer.
    """
    if test_periods < 0:
        raise ValueError(f"test_periods must be non-negative, got {test_periods}")

    train_parts, test_parts = [], []
    for name, g in df.groupby(group_col):
        # Stable sort keeps rows with equal timestamps in input order, so the
        # split is deterministic.
        g_sorted = g.sort_values(time_col, kind='stable').reset_index(drop=True)
        if len(g_sorted) <= test_periods:
            raise ValueError(f"Region {name} has <= {test_periods} rows; reduce test_periods or drop region")
        # Split on an explicit position: slicing with -test_periods breaks for
        # 0, because iloc[:-0] is empty and iloc[-0:] is the whole group.
        split_at = len(g_sorted) - test_periods
        train_parts.append(g_sorted.iloc[:split_at].copy())
        test_parts.append(g_sorted.iloc[split_at:].copy())

    if not train_parts:
        raise ValueError(f"No groups found in column '{group_col}'; cannot split an empty dataset")

    return pd.concat(train_parts).reset_index(drop=True), pd.concat(test_parts).reset_index(drop=True)

# Hold out the last 24 time steps of every region as the test set
test_periods = 24
train_df, test_df = train_test_time_split(
    df_fe,
    test_periods=test_periods,
    time_col='Time',
    group_col='REGION',
)
print('Train shape:', train_df.shape, 'Test shape:', test_df.shape)


Train shape: (1920, 21) Test shape: (120, 21)


In [10]:
# Minimal modeling sanity-check: train a Ridge (linear) and RandomForest for Rainfall (no heavy tuning)
from math import sqrt

def rmse(y_true, y_pred):
    """Root mean squared error: the square root of sklearn's mean_squared_error."""
    mse = mean_squared_error(y_true, y_pred)
    return sqrt(mse)

# Feature set for the quick check: time index, cyclical month encoding,
# all rainfall lags and the one-month temperature lag (only columns that exist)
quick_features = [
    col
    for col in [
        'YEAR', 'Time', 'Month_sin', 'Month_cos',
        *(f'Rainfall_lag_{i}' for i in LAGS),
        'Temperature_lag_1',
    ]
    if col in train_df.columns
]
print('Quick features length:', len(quick_features))

# Design matrices and Rainfall target for both splits
X_tr, y_tr = train_df[quick_features], train_df['Rainfall']
X_te, y_te = test_df[quick_features], test_df['Rainfall']

# Linear baseline: Ridge regression
lr = Ridge(alpha=1.0, random_state=42).fit(X_tr, y_tr)
lr_pred = lr.predict(X_te)
print('Ridge RMSE:', rmse(y_te, lr_pred), 'MAE:', mean_absolute_error(y_te, lr_pred))

# Non-linear baseline: random forest with 200 trees, using all cores
rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1).fit(X_tr, y_tr)
rf_pred = rf.predict(X_te)
print('RF RMSE:', rmse(y_te, rf_pred), 'MAE:', mean_absolute_error(y_te, rf_pred))

print('\nSanity-check modeling completed. These are baseline results — full CV and tuning are in Notebook 4.')


Quick features length: 9
Ridge RMSE: 0.9151413571029801 MAE: 0.5323600950631743
RF RMSE: 0.8627483051848939 MAE: 0.4905724202097507

Sanity-check modeling completed. These are baseline results — full CV and tuning are in Notebook 4.


What we saved and what to do next
- Saved: ../4_data_analysis/model_datasets/model_ready_dataset_fe.csv (feature-engineered dataset with lags, rolls, cyclical month features)
- Notebook 4 (next) performs robust model selection using grouped time-series cross-validation and simple hyperparameter search. Run that notebook once this one finishes successfully.

