In [None]:
# Train RandomForest for Drought Risk Prediction

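This notebook loads the sample time-series data, expands the date column into `year`/`month`/`dayofyear` features, optionally merges soil attributes when a shared key column is present, trains a `RandomForestClassifier` to predict `score_class`, prints a classification report for a 20% validation split, and saves the model to `../model/random_forest_model.joblib` (relative to the notebook's working directory).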


In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

# Project layout: the data and model directories live one level above the
# notebook's working directory.
BASE = '..'
DATA_DIR = os.path.join(BASE, 'data')
MODEL_PATH = os.path.join(BASE, 'model', 'random_forest_model.joblib')

# Input CSVs: the time series is always read, the soil table only if present.
sample_train, sample_soil = (
    os.path.join(DATA_DIR, fname)
    for fname in ('sample_train_timeseries.csv', 'sample_soil_data.csv')
)

print('Loading sample data...')
df_train = pd.read_csv(sample_train)

# A missing soil file is not an error: df_soil is simply left as None.
if os.path.exists(sample_soil):
    df_soil = pd.read_csv(sample_soil)
else:
    df_soil = None

# Pick the first recognised date-like column name, or None if the frame has none.
date_col = next(
    (name for name in ('date', 'timestamp', 'time', 'Date', 'DATE')
     if name in df_train.columns),
    None,
)

if date_col is not None:
    # Unparseable values become NaT, so their derived parts are NaN.
    parsed_dates = pd.to_datetime(df_train[date_col], errors='coerce')
    # Swap the raw date column for three calendar features.
    df_train = df_train.drop(columns=[date_col]).assign(
        year=parsed_dates.dt.year,
        month=parsed_dates.dt.month,
        dayofyear=parsed_dates.dt.dayofyear,
    )

# Merge soil attributes onto the training rows when both frames share one of
# the known key columns (first match in the candidate order wins).
merge_key = None
if df_soil is not None:
    merge_key = next(
        (cand for cand in ('location_id', 'station_id', 'site_id', 'region', 'grid_id')
         if cand in df_train.columns and cand in df_soil.columns),
        None,
    )
    if merge_key is not None:
        # Bring in only columns the training frame does not already have.
        # Otherwise pandas renames both sides with _x/_y suffixes, which would
        # silently rename training columns (including the target).
        soil_cols = [merge_key] + [c for c in df_soil.columns if c not in df_train.columns]
        # Keep one soil record per key so the left merge cannot multiply
        # training rows when the soil table repeats a key.
        soil_unique = df_soil[soil_cols].drop_duplicates(subset=merge_key, keep='first')
        df_train = df_train.merge(
            soil_unique, on=merge_key, how='left', validate='many_to_one'
        )

# Target: separate the label column from the feature columns.
if 'score_class' not in df_train.columns:
    raise ValueError('score_class column not found in training data')

# Rows without a label cannot be used for supervised training and would make
# the stratified split / classifier fit fail on NaN, so drop them up front.
labelled_rows = df_train.dropna(subset=['score_class'])
n_unlabelled = len(df_train) - len(labelled_rows)
if n_unlabelled:
    print(f'Dropping {n_unlabelled} rows with missing score_class')
if labelled_rows.empty:
    raise ValueError('no rows with a non-missing score_class in training data')

# NOTE(review): every remaining column becomes a feature. If the data also
# carries a raw `score` column that score_class was derived from, it would
# leak the target -- confirm against the CSV schema.
y = labelled_rows['score_class']
X = labelled_rows.drop(columns=['score_class'])

# Split / model settings, named so they are easy to find and tune.
RANDOM_STATE = 42
VAL_FRACTION = 0.2

# Encode every non-numeric column with integer category codes. Checking for
# "not numeric" rather than dtype == 'object' also covers pandas string and
# category dtypes, which would otherwise reach the forest as raw strings.
# Missing values in these columns get the code -1.
# NOTE(review): the category -> code mapping is not stored with the model;
# inference code has to reproduce the same encoding -- confirm how the saved
# model is consumed.
for col in X.columns:
    if not pd.api.types.is_numeric_dtype(X[col]):
        X[col] = X[col].astype('category').cat.codes
X = X.replace([np.inf, -np.inf], np.nan)

# Stratification needs at least two samples per class; fall back to a plain
# random split instead of raising when the data has singleton classes.
class_counts = y.value_counts()
stratify_on = y if class_counts.min() >= 2 else None
if stratify_on is None:
    print('Some score_class values occur fewer than twice; splitting without stratification')

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=VAL_FRACTION, random_state=RANDOM_STATE, stratify=stratify_on
)

# Impute with medians learned from the training split only, so validation
# rows do not influence the statistics used to fill them. Columns that are
# entirely missing have no median and fall back to 0.
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians).fillna(0)
X_val = X_val.fillna(train_medians).fillna(0)

clf = RandomForestClassifier(
    n_estimators=300,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    class_weight='balanced_subsample',
)
clf.fit(X_train, y_train)

# Report per-class precision/recall/F1 on the held-out validation split.
y_pred = clf.predict(X_val)
print(classification_report(y_val, y_pred))

# Make sure the target directory exists, then persist the fitted classifier.
model_dir = os.path.dirname(MODEL_PATH)
os.makedirs(model_dir, exist_ok=True)
joblib.dump(clf, MODEL_PATH)
print('Saved model to', MODEL_PATH)
