# Tomorrow Rain Random Forest Forecaster
This notebook retrains a compact `RandomForestClassifier` on `weatherAUS.csv` and saves the model, together with its fitted imputer, scaler and one-hot encoder, to `aussie_rain.joblib` using xz compression.


In [4]:
import inspect
from pathlib import Path

import joblib
import pandas as pd, numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Location of the raw observations file.
DATA_PATH = './data/weatherAUS.csv'

# Load the data and keep only rows with a known target value; the index is
# rebuilt afterwards so it is contiguous again.
raw_df = (
    pd.read_csv(DATA_PATH)
    .dropna(subset=['RainTomorrow'])
    .reset_index(drop=True)
)

# Chronological split by the calendar year of the `Date` column:
# before 2015 -> train, 2015 -> validation, after 2015 -> test.
year = pd.to_datetime(raw_df['Date']).dt.year
train_df = raw_df.loc[year < 2015]
val_df = raw_df.loc[year == 2015]
test_df = raw_df.loc[year > 2015]

import inspect
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# 1) Split into input features and target
# The target is addressed by name; the inputs are taken positionally as every
# column except the first and the last one.
# NOTE(review): presumably the first column is `Date` and the last one is the
# target — confirm against the CSV column order.
target_col = 'RainTomorrow'
input_cols = train_df.columns[1:-1].tolist()

# Work on copies of the selected columns rather than on slices of the split frames.
train_inputs = train_df[input_cols].copy()
val_inputs = val_df[input_cols].copy()
test_inputs = test_df[input_cols].copy()

train_targets = train_df[target_col].copy()
val_targets = val_df[target_col].copy()
test_targets = test_df[target_col].copy()

# 2) Feature types
# Numeric columns are imputed and scaled; every remaining column is treated as
# categorical and one-hot encoded. The categorical group is selected as
# "everything that is not numeric" rather than only `object` columns, so each
# input column lands in exactly one of the two groups and text columns stored
# under another dtype (e.g. `string` or `category`) are not silently left out
# of the feature matrix.
numeric_cols = train_inputs.select_dtypes(include=np.number).columns.tolist()
categorical_cols = train_inputs.select_dtypes(exclude=np.number).columns.tolist()

# 3) Imputation (fit on train, transform on every split)
# Missing numeric values are filled with the per-column mean learned from the
# training split only.
imputer = SimpleImputer(strategy='mean')
imputer.fit(train_inputs[numeric_cols])

# Each transformed result is wrapped back into a DataFrame that keeps the
# numeric column names and the row index of its source split.
train_num, val_num, test_num = (
    pd.DataFrame(
        imputer.transform(split[numeric_cols]),
        columns=numeric_cols,
        index=split.index,
    )
    for split in (train_inputs, val_inputs, test_inputs)
)

# 4) Scaling (fit on train, transform on every split)
# The scaler is fitted on the imputed training frame only and then applied
# to all three imputed frames.
scaler = MinMaxScaler()
scaler.fit(train_num)

# The imputed frames are replaced by their scaled counterparts, keeping the
# same column names and row index.
train_num, val_num, test_num = (
    pd.DataFrame(scaler.transform(imputed), columns=numeric_cols, index=imputed.index)
    for imputed in (train_num, val_num, test_num)
)

# 5) One-hot encoder (compatible with both old and new scikit-learn versions)
# The keyword that requests dense output is `sparse_output` when the installed
# OneHotEncoder accepts it, and `sparse` otherwise.
encoder_params = inspect.signature(OneHotEncoder).parameters
dense_flag = 'sparse_output' if 'sparse_output' in encoder_params else 'sparse'
encoder = OneHotEncoder(handle_unknown='ignore', **{dense_flag: False})

# Categories are learned from the training split only.
encoder.fit(train_inputs[categorical_cols])
encoded_cols = encoder.get_feature_names_out(categorical_cols)

# Encoded matrices, one per split, labelled with the generated column names
# and aligned to the row index of the source split.
train_cat, val_cat, test_cat = (
    pd.DataFrame(
        encoder.transform(split[categorical_cols]),
        columns=encoded_cols,
        index=split.index,
    )
    for split in (train_inputs, val_inputs, test_inputs)
)

# 6) Final feature matrices: the numeric and encoded parts of each split are
#    joined column-wise in a single concat (no fragmentation)
X_train, X_val, X_test = (
    pd.concat(parts, axis=1)
    for parts in (
        [train_num, train_cat],
        [val_num, val_cat],
        [test_num, test_cat],
    )
)

print(f"X_train: {X_train.shape}, X_val: {X_val.shape}, X_test: {X_test.shape}")

# 7) Model training
# Hyperparameters are collected in one place; the fixed random_state keeps the
# forest reproducible between runs.
rf_params = {
    'n_estimators': 200,
    'max_depth': 20,
    'min_samples_leaf': 5,
    'n_jobs': -1,
    'random_state': 42,
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, train_targets)

# Score of the fitted forest on each split, in train / val / test order.
split_scores = [
    rf.score(features, labels)
    for features, labels in (
        (X_train, train_targets),
        (X_val, val_targets),
        (X_test, test_targets),
    )
]
print('Accuracy train/val/test:', *split_scores)

# 8) Save the model
# The fitted preprocessing objects and the column lists are stored together
# with the model, so everything used to build the feature matrix lives in a
# single artifact.
bundle = {
    'model': rf,
    'imputer': imputer,
    'scaler': scaler,
    'encoder': encoder,
    'input_cols': input_cols,
    'target_col': target_col,
    'numeric_cols': numeric_cols,
    'categorical_cols': categorical_cols,
    'encoded_cols': encoded_cols,
}

# joblib.dump does not create missing directories, so make sure the target
# folder exists first; otherwise the save fails when `./model/` is absent
# (e.g. on a fresh checkout).
MODEL_PATH = './model/aussie_rain.joblib'
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

# Written with xz compression at level 3.
joblib.dump(bundle, MODEL_PATH, compress=('xz', 3))
print('Saved aussie_rain.joblib (compressed).')


X_train: (98988, 119), X_val: (17231, 119), X_test: (25974, 119)
Accuracy train/val/test: 0.8926536549884835 0.8543323080494458 0.8415338415338416
Saved aussie_rain.joblib (compressed).
