## imports

In [None]:
from dask.distributed import Client
# Create a Dask distributed client. No scheduler address is passed, so
# dask.distributed falls back to its default behaviour (a local cluster).
client = Client()
# Bare last expression: renders the client's summary as the cell output.
client

In [None]:
import os
# Opt in to SciPy's array-API support for this process.
# NOTE(review): presumably this has to be set before scipy is first imported
# (scipy is imported in the next cell) — confirm nothing imports it earlier.
os.environ["SCIPY_ARRAY_API"] = "1"

In [None]:
# Data Processing
import pandas as pd
import numpy as np
import dask

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

import imblearn
from dask_ml.model_selection import train_test_split 

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz
import xarray as xr
import numpy as np

from joblib import dump

## data prep

In [None]:
# Alternative store (disabled):
# ds = xr.open_zarr('../data/_ZARR_READY/la_main_data', decode_timedelta=False)
# Open the 'canada' Zarr store as an xarray Dataset; timedelta decoding is
# explicitly switched off.
ds = xr.open_zarr('../data/_ZARR_READY/canada', decode_timedelta=False)
# prior_data = xr.open_zarr('./data/_ZARR_READY/la_prior_data')
# List the available variable names so they can be checked against the
# feature list used in the next cell.
print(list(ds.data_vars))

In [None]:
# Names of the dataset variables used as model inputs.
features = ['d2m', 'lai_hv', 'lai_lv', 'mu_t2m_180', 'mu_t2m_30', 'mu_t2m_90', 'mu_tp_90', 'mu_tp_30', 'sp', 'mu_tp_180', 't2m', 'tp', 'ws10']
# Alternative (disabled) feature set:
# features = ['d2m', 'lai_hv', 'lai_lv', 'sp', 'tot_t2m', 't2m', 'tot_tp', 'tp', 'ws10']

print('stacking data..')
# No time subsetting is applied right now: the .sel(...) call is commented out,
# so ds_trim is just another name for ds.
ds_trim = ds#.sel(time=pd.date_range(start='2010-01-01', end='2010-01-02', freq='D'))
# Collapse the four dimensions into a single 'sample' dimension so that each
# element along it is one observation.
stacked = ds_trim.stack(sample = ('time', 'step', 'latitude', 'longitude'))
print('dropping na...')

# slower (potential eager eval)
# stacked = stacked.dropna(dim = 'sample')

# Persist the selected feature variables and the 'fire' label so the stacked
# arrays are not recomputed by every later step.
features_stacked = stacked[features].persist()
labels_stacked = stacked['fire'].persist()
# Boolean mask of samples whose label is not NaN. The isnan result is
# computed first (.compute()) and then negated.
valid = ~np.isnan(labels_stacked).compute()
# Keep only the labelled samples.
# NOTE(review): only the label is checked; NaNs inside the feature variables
# are not filtered here — confirm downstream steps can handle them.
features_clean = features_stacked.where(valid, drop=True)
labels_clean = labels_stacked.where(valid, drop=True)

In [None]:
# Build the (sample, variable) feature matrix and the label vector as dask arrays.
# The 'variable' chunk width follows the feature list instead of a hard-coded 13,
# so every row keeps all of its features in one chunk even if `features` changes.
# X and y use the same chunk size along 'sample' so their blocks stay aligned.
X = features_clean.to_array().transpose('sample', 'variable').chunk({'sample':700000, 'variable':len(features)}).data
y = labels_clean.chunk({'sample':700000}).data

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split repeatable. This is dask_ml's train_test_split (see the imports cell).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10)

## resampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly drop majority-class rows from the training split until the
# minority/majority ratio reaches 0.1. The sampler is seeded (same seed as the
# train/test split) so a re-run selects the same rows.
under = RandomUnderSampler(sampling_strategy=0.1, random_state=10)
X_under, y_under = under.fit_resample(X_train, y_train)
# Display the resampled labels as the cell output.
y_under

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Randomly duplicate minority-class rows of the training split until both
# classes are the same size (ratio 1). Seeded so the resampled set is
# reproducible across runs.
over = RandomOverSampler(sampling_strategy=1, random_state=10)
X_over, y_over = over.fit_resample(X_train, y_train)

In [None]:
from imblearn.pipeline import Pipeline

# Two-stage rebalancing of the training split: first over-sample the minority
# class up to a 0.01 ratio, then under-sample the majority class down to a
# 0.1 ratio. Both samplers are seeded so the result is reproducible.
over_partial = RandomOverSampler(sampling_strategy=0.01, random_state=10)
under_partial = RandomUnderSampler(sampling_strategy=0.1, random_state=10)

steps = [('o', over_partial), ('u', under_partial)]
pipeline = Pipeline(steps=steps)
X_both, y_both = pipeline.fit_resample(X_train, y_train)

In [None]:
# Display the labels produced by the over+under sampling pipeline.
y_both

### resampling sets before train-test split

In [None]:
# Rebalance the FULL dataset (before any train/test split) with an
# over-sampling step followed by an under-sampling step. Both samplers are
# seeded so the resampled set is reproducible.
# NOTE(review): random over-sampling duplicates minority rows, so splitting
# after this step can put copies of the same row in both train and test.
# NOTE(review): both steps target the same 0.01 ratio, so the under-sampling
# step likely removes few or no rows — confirm this is intended.
full_over = RandomOverSampler(sampling_strategy=0.01, random_state=10)
full_under = RandomUnderSampler(sampling_strategy=0.01, random_state=10)
steps = [('o', full_over), ('u', full_under)]

pipe = Pipeline(steps=steps)
full_X_over, full_Y_over = pipe.fit_resample(X, y)

In [None]:
# 80/20 split of the already-resampled full dataset, using the same
# random_state as the earlier split of the raw data.
full_X_train, full_X_test, full_y_train, full_y_test = train_test_split(full_X_over, full_Y_over, test_size=0.2, random_state=10)

## model

In [None]:
# Random forest trained on the split taken from the resampled full dataset.
# n_jobs=-1 uses all available cores; random_state pins the bootstrap and
# feature sampling so the fitted (and later saved) model is reproducible.
clf = RandomForestClassifier(n_jobs = -1, random_state = 10)
clf.fit(full_X_train, full_y_train)

In [None]:
## pre-patch
# Baseline forest fitted on the over+under resampled training split, using
# RandomForestClassifier as imported before the sklearnex patch is applied.
# Seeded so the fit is reproducible.
both_clf = RandomForestClassifier(random_state=10)
both_clf.fit(X_both, y_both)

In [None]:
# Predict on the complete, un-resampled dataset and print per-class metrics.
# NOTE(review): clf was fitted on rows drawn from a resampled version of X,
# so X contains its training rows — these scores are not a held-out estimate.
full_y_pred = clf.predict(X)
print(classification_report(y, full_y_pred))

In [None]:
from joblib import dump
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first (`os` is imported near the top of the notebook).
os.makedirs('./models', exist_ok=True)
# Persist the fitted forest; the return value (list of written files) is the
# cell output.
dump(clf, './models/rf_0.01-over-under.joblib')

In [None]:
from sklearnex import patch_sklearn
# Enable the Intel extension's replacements for scikit-learn estimators.
patch_sklearn()

# Re-import after patching so the name is bound again once the patch is active.
from sklearn.ensemble import RandomForestClassifier

## post-patch
# Same training data as `both_clf`, fitted after the patch. Seeded so the fit
# is reproducible.
second_both_clf = RandomForestClassifier(random_state=10)
second_both_clf.fit(X_both, y_both)

In [None]:
# Evaluate the post-patch forest on the held-out test split from the raw
# (un-resampled) data and print per-class precision/recall/F1.
y_pred = second_both_clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Score `clf` on the raw training split.
# NOTE(review): clf was fitted on full_X_train (taken from the resampled full
# dataset), not on X_train — the two sets presumably overlap; confirm how this
# report is meant to be read.
y_train_pred = clf.predict(X_train)
print(classification_report(y_train, y_train_pred))

In [None]:
from sklearn.metrics import f1_score

In [None]:
# F1 score of the post-patch forest's test predictions (y_pred comes from the
# second_both_clf cell above); shown as the cell output.
f1_score(y_test, y_pred)

In [None]:
# Load the fire CSV into a DataFrame and show its row count.
fd = pd.read_csv('../data/_FIRE/la_forest_csv/data.csv')
fd.shape[0]

In [None]:
# Shell escape: list the graphviz-related packages in the active conda environment.
!conda list graphviz

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# plot_tree draws ONE decision tree, while `clf` is a forest. The original
# cell used an undefined name `tree`; take a single fitted member instead.
tree = clf.estimators_[0]

fig, ax = plt.subplots(figsize=(12, 8))
plot_tree(tree,
          feature_names=features,
          # One name per class, in ascending label order. A single-entry list
          # raises IndexError as soon as a node's majority class is index 1.
          # Assumes label 0 = no fire and 1 = fire — TODO confirm.
          class_names=['no fire', 'fire'],
          # Only draw the top levels so the figure stays readable; set to
          # None to draw the whole tree.
          max_depth=3,
          filled=True,
          ax=ax)
ax.set_title('First tree of the random forest (top 3 levels)')
plt.show()
