## imports

In [None]:
# Start a dask.distributed client with default settings and display it as the
# cell output. No scheduler address is passed to Client().
from dask.distributed import Client
client = Client()
client

In [None]:
## ENABLE INTEL ACCELERATION
import os

# SciPy reads SCIPY_ARRAY_API when it is first imported, so the flag has to be
# in the environment before sklearnex (which pulls in scikit-learn and SciPy)
# is imported; setting it afterwards has no effect.
os.environ["SCIPY_ARRAY_API"] = "1"

from sklearnex import patch_sklearn, unpatch_sklearn

# Patch before the scikit-learn estimators are imported further down, so that
# those imports resolve to the accelerated implementations.
patch_sklearn()

# Data Processing
import pandas as pd
import numpy as np
import dask
import random_forest
from random_forest import prep_samples

# Modelling
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks, EditedNearestNeighbours
from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.pipeline import Pipeline
from dask_ml.model_selection import train_test_split 

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz
import xarray as xr
import numpy as np

from importlib import reload
from joblib import dump, load

## data prep

In [None]:
# Open the LA forest Zarr store without decoding timedelta-encoded variables.
ds = xr.open_zarr('../data/_ZARR_READY/la_forest', decode_timedelta=False)
# Alternative stores, kept for quick switching:
# ds = xr.open_zarr('../data/_ZARR_READY/canada', decode_timedelta=False)
# prior_data = xr.open_zarr('./data/_ZARR_READY/la_prior_data')
# List the names of the data variables available in the store.
print(list(ds.data_vars))

In [None]:
# Sum of the `fire` variable over the whole dataset (presumably the number of
# positive samples - confirm the encoding of `fire`). `.values` loads the full
# array into memory before summing.
ds.fire.values.sum()

In [None]:
# Reload the local random_forest module so edits to it are picked up without
# restarting the kernel, then pull all of its public names into the notebook.
import random_forest
reload(random_forest)
from random_forest import *

In [None]:
# Indices of the Canada CSV chunks (canada-<i>.csv) to resample.
samples = [1, 2, 3]

# Resamplers shared with later cells. Only `ada_small` is used in this cell;
# `downsampler` is used by the LA cell below and `enn` is currently unused.
ada_small = ADASYN(sampling_strategy=0.01)
enn = EditedNearestNeighbours()
downsampler = RandomUnderSampler(sampling_strategy=0.01)

# joblib.dump does not create missing parent directories. Create the output
# directory up front so a long resampling run is not lost at the final write.
os.makedirs('canada-adasyn', exist_ok=True)

for i in samples:

    # 'Unnamed: 0' is dropped before sample preparation (presumably the index
    # column written by to_csv - confirm).
    ds = dask.dataframe.read_csv(f'../data/canada-csv/canada-{i}.csv').drop(columns='Unnamed: 0')
    X, y = prep_samples(ds, include_tv=False, compute=False)

    # Resample with ADASYN at a 0.01 sampling strategy and store the result.
    # NOTE(review): the file name carries a "ds_0,01" tag although the sampler
    # used here is ADASYN - confirm the intended naming.
    X_resampled, y_resampled = ada_small.fit_resample(X, y)
    dump((X_resampled, y_resampled), f'canada-adasyn/canada-{i}.ds_0,01.joblib')

In [None]:
# Re-open the LA forest store, build the computed sample arrays and randomly
# under-sample them with `downsampler` (defined in the previous cell).
# NOTE(review): unlike the first open_zarr call in this notebook, this one does
# not pass decode_timedelta=False - confirm whether that is intentional.
ds = xr.open_zarr('../data/_ZARR_READY/la_forest')
X, y = prep_samples(ds, compute=True)
X_resampled, y_resampled = downsampler.fit_resample(X, y)

# joblib.dump does not create missing parent directories; make sure the target
# exists so the resampled data is not lost at write time.
os.makedirs('la-resampled', exist_ok=True)
dump((X_resampled, y_resampled), 'la-resampled/downsampler.joblib')

In [None]:
# Load two previously resampled Canada chunks and stack them row-wise into a
# single training set.
X_1, y_1 = load('canada-downsamples/canada-1.ds_0,01.joblib')
X_2, y_2 = load('canada-downsamples/canada-2.ds_0,01.joblib')

X = np.concatenate((X_1, X_2), axis=0)
y = np.concatenate((y_1, y_2), axis=0)

In [None]:
# Display the current feature matrix as the cell output.
X

In [None]:
# Fit a random forest with default hyper-parameters, using all available cores.
clf = RandomForestClassifier(n_jobs=-1)
clf.fit(X, y)

In [None]:
# Persist the fitted classifier.
# NOTE(review): the file name says "hgb.enn", but the `clf` assigned in the
# preceding cell is a RandomForestClassifier - confirm the intended name.
dump(clf, 'models/can.hgb.enn.joblib')

In [None]:
# Build test samples from the third Canada CSV chunk.
# NOTE(review): X_test / y_test are reassigned from a joblib file in the next
# cell, so running both cells in order discards this result.
test_ds = dask.dataframe.read_csv('../data/canada-csv/canada-3.csv').drop(columns='Unnamed: 0')
X_test, y_test = prep_samples(test_ds)

In [None]:
# Evaluate `clf` on the stored, resampled third Canada chunk and print the
# per-class precision / recall / F1 report.
X_test, y_test = load('canada-downsamples/canada-3.ds_0,01.joblib')
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Sum of the predicted labels (presumably the number of predicted positives,
# assuming 0/1 labels - confirm).
y_pred.sum()

In [None]:
# Two-stage resampling pipeline: ADASYN over-sampling (strategy 0.01) followed
# by random under-sampling (strategy 0.1).
rus = RandomUnderSampler(sampling_strategy=0.1)
ada = ADASYN(sampling_strategy=0.01)
steps = [('ada', ada), ('rus', rus)]
pipe = Pipeline(steps=steps)

# Apply both stages to the current X / y.
X_r, y_r = pipe.fit_resample(X, y)

In [None]:
# Undo the sklearnex patch and re-import the estimator after unpatching, then
# fit a default random forest on the pipeline-resampled data.
unpatch_sklearn()
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_jobs=-1)
clf.fit(X_r, y_r)

## resampling

In [None]:
## imports
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.pipeline import Pipeline

In [None]:
# --- Under-samplers --------------------------------------------------------
equal_undersampler = RandomUnderSampler(sampling_strategy=1)
under_partial = RandomUnderSampler(sampling_strategy=0.5) # bootstrapping

# --- SMOTE over-samplers at increasing sampling strategies -----------------
SMOTE_partial = SMOTE(sampling_strategy=0.1)
SMOTE_half = SMOTE(sampling_strategy=0.5)
equal_SMOTE = SMOTE(sampling_strategy=1)

# --- ADASYN over-samplers at the same sampling strategies ------------------
ADASYN_partial = ADASYN(sampling_strategy=0.1)
ADASYN_half = ADASYN(sampling_strategy=0.5)
equal_ADASYN = ADASYN(sampling_strategy=1)

# --- Pipelines -------------------------------------------------------------
# Over-sample to 0.1, then under-sample to 0.5. Both pipelines share the same
# `under_partial` instance.
smote_steps = [('o', SMOTE_partial), ('u', under_partial)]
smote_pipeline = Pipeline(steps=smote_steps)

adasyn_steps = [('o', ADASYN_partial), ('u', under_partial)]
ada_pipeline = Pipeline(steps=adasyn_steps)

# Experimental: two over-samplers chained (SMOTE to 0.5, then ADASYN to 1).
wack_steps = [('o', SMOTE_half), ('o2', equal_ADASYN)]
wack_pipe = Pipeline(steps=wack_steps)

In [None]:
# Build fully computed samples from the current `ds` with include_tv=True
# (presumably adds the 'tvh' / 'tvl' features used below - confirm in
# random_forest.prep_samples).
X, y = prep_samples(ds, include_tv=True, compute=True)

In [None]:
# Wrap the feature matrix in a DataFrame with one named column per feature.
# NOTE(review): the column order is assumed to match the order in which
# prep_samples emits features - confirm.
data = pd.DataFrame(
    X,
    columns=[
        'd2m', 'lai_hv', 'lai_lv',
        'mu_t2m_180', 'mu_t2m_30', 'mu_t2m_90',
        'mu_tp_90', 'mu_tp_30', 'sp', 'mu_tp_180',
        't2m', 'tp', 'ws10',
        'tvh', 'tvl',
    ],
)

In [None]:
# Write the labels to a text file as a single comma-separated line.
with open('la_fire_data.txt', 'w') as outfile:
    outfile.write(','.join(map(str, y)))

In [None]:
# Export the feature table to CSV (pandas also writes the index by default).
data.to_csv('la_forest_input_data.csv')

In [None]:
# Unshuffled 80/20 split: the first 80% of the rows become the training set and
# the remaining rows the test set.
sample_num = len(y)
train_num = int(0.8 * sample_num)

X_train, X_test = X[:train_num], X[train_num:]
y_train, y_test = y[:train_num], y[train_num:]

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state=10)

In [None]:
# Resample only the training portion with the chained SMOTE -> ADASYN pipeline;
# the test portion is not passed through it.
X_resampled, y_resampled = wack_pipe.fit_resample(X_train, y_train)

In [None]:
# Fit a histogram-based gradient boosting classifier with default settings.
# NOTE(review): despite the `rf_` prefix this is not a random forest.
rf_clf = HistGradientBoostingClassifier()
rf_clf.fit(X_resampled, y_resampled)

In [None]:
# Persist the fitted model. The path is a plain string literal: the original
# f-prefix had no placeholders to interpolate.
dump(rf_clf, 'models/veg_data/laf.hgb.smote_0,5-ada_1.joblib')

In [None]:
# Open the 'la_forest_post' store and build computed samples from it, with the
# same include_tv=True setting used for the training samples.
test_ds = xr.open_zarr('../data/_ZARR_READY/la_forest_post')
X_post, y_post = prep_samples(test_ds, compute=True, include_tv=True)

In [None]:
# Load a previously saved model and evaluate it on the post-period samples.
# NOTE(review): this is a different file from the one written by the dump cell
# above (laf.hgb.smote_0,5-ada_1) - confirm which model is meant to be scored.
clf = load('models/veg_data/laf.rf.ada_0,1-rus_0,5.joblib')

y_pred = clf.predict(X_post)
print(classification_report(y_post, y_pred))

In [None]:
## downsampled to 0.01, upsampled to 0.1
import matplotlib.pyplot as plt
import seaborn as sns

# One figure per feature, overlaying the density histogram of the original
# data (`stacked`) with that of the resampled data (`full_X_over`).
# NOTE(review): `features`, `stacked` and `full_X_over` are not defined in the
# visible part of this notebook; column i of `full_X_over` is assumed to
# correspond to features[i] - confirm.
for i, feature in enumerate(features):
    plt.figure(figsize=(20, 10))
    # plt.hist(full_X_over.transpose()[i], bins=30)
    sns.histplot(stacked[feature], bins=30, stat='density', label='Original')
    sns.histplot(full_X_over.transpose()[i], bins=30, stat='density', label='Resampled')
    plt.title(feature)
    # The histograms carry labels, but they are only rendered once a legend is
    # drawn; without it the two distributions cannot be told apart.
    plt.legend()

In [None]:
## downsampled to 0.01

import matplotlib.pyplot as plt
import seaborn as sns

# One figure per feature, overlaying the density histogram of the original
# data (`stacked`) with that of the resampled data (`full_X_over`).
# NOTE(review): `features`, `stacked` and `full_X_over` are not defined in the
# visible part of this notebook; column i of `full_X_over` is assumed to
# correspond to features[i] - confirm.
for i, feature in enumerate(features):
    plt.figure(figsize=(20, 10))
    # plt.hist(full_X_over.transpose()[i], bins=30)
    sns.histplot(stacked[feature], bins=30, stat='density', label='Original')
    sns.histplot(full_X_over.transpose()[i], bins=30, stat='density', label='Resampled')
    plt.title(feature)
    # The histograms carry labels, but they are only rendered once a legend is
    # drawn; without it the two distributions cannot be told apart.
    plt.legend()

In [None]:
# Histogram (raw counts, 30 bins) of the first row of the transposed resampled
# matrix, i.e. column 0 of `full_X_over`.
plt.figure(figsize=(20, 10))
sns.histplot(full_X_over.transpose()[0], bins=30)

In [None]:
# Randomly under-sample X / y with sampling_strategy=1.
test_under = RandomUnderSampler(sampling_strategy=1)
X_under, y_under = test_under.fit_resample(X, y)

## model

In [None]:
# Size-constrained random forest: 50 trees, depth capped at 8, at least 5
# samples per leaf, trained on all cores.
# NOTE(review): full_X_train / full_y_train are not defined in the visible part
# of this notebook.
clf = RandomForestClassifier(n_estimators=50, max_depth = 8, n_jobs=-1, min_samples_leaf=5)
clf.fit(full_X_train, full_y_train)

In [None]:
## pre-patch
# Default random forest fitted on X_both / y_both, presumably as the baseline
# for comparison with the "post-patch" fit later in the notebook.
# NOTE(review): X_both / y_both are not defined in the visible part of this
# notebook.
both_clf = RandomForestClassifier()
both_clf.fit(X_both, y_both)

In [None]:
# Score the current `clf` on the full test set and print the per-class report.
# NOTE(review): full_X_test / full_y_test are not defined in the visible part
# of this notebook.
full_y_pred = clf.predict(full_X_test)
print(classification_report(full_y_test, full_y_pred))

In [None]:
# Persist the current `clf`. `dump` is re-imported here so the cell can be run
# on its own.
from joblib import dump
dump(clf, './models/LA_FOREST_reduced_size.joblib')

## model testing

In [None]:
# Replace `clf` with a previously saved model loaded from disk.
from joblib import load
clf = load('../main/models/LA_FOREST-rf_0.01-over-under.joblib')

In [None]:
# Score the loaded model on the full test set and print the per-class report.
# NOTE(review): full_X_test / full_y_test are not defined in the visible part
# of this notebook.
full_y_pred = clf.predict(full_X_test)
print(classification_report(full_y_test, full_y_pred))

In [None]:
## test la forest model on canada data
# Point `ds` at the Canada store; subsequent cells read from it.
from random_forest import prep_samples
ds = xr.open_zarr('../data/_ZARR_READY/canada')

In [None]:
from random_forest import prep_samples
# Build samples from the daily timestamps 2009-12-31 .. 2010-12-31 (inclusive).
canada_X, canada_y = prep_samples(ds.sel(time=pd.date_range('2009-12-31', '2010-12-31', freq='D')))
# persist() is called on both results, so prep_samples presumably returns lazy
# dask collections with its default arguments - confirm.
canada_X = canada_X.persist()
canada_y = canada_y.persist()
# Resample with `pipe`, the ADASYN -> RandomUnderSampler pipeline defined in an
# earlier cell, then show the sum of the resampled labels.
canada_X_reduced, canada_y_reduced = pipe.fit_resample(canada_X, canada_y)
canada_y_reduced.sum()

In [None]:
# Score the current `clf` on the un-resampled Canada samples.
y_pred = clf.predict(canada_X)
print(classification_report(canada_y, y_pred))

In [None]:
# Shuffled 80/20 split of the resampled Canada data with a fixed seed;
# train_test_split is the dask_ml implementation imported at the top.
cX_train, cX_test, cy_train, cy_test = train_test_split(canada_X_reduced, canada_y_reduced, test_size=0.2, random_state=10, shuffle=True)
# Fit a default random forest on the training part and persist it.
canada_rf = RandomForestClassifier(n_jobs=-1)
canada_rf.fit(cX_train, cy_train)
dump(canada_rf, './models/canada_partial_date_range.joblib')

In [None]:

# Score `clf` on the un-resampled Canada samples.
# NOTE(review): this uses `clf`, not the `canada_rf` trained in the previous
# cell - confirm which model is meant to be evaluated here.
canada_y_pred = clf.predict(canada_X)
print(classification_report(canada_y, canada_y_pred))

In [None]:
from sklearn.tree import plot_tree

# Draw the first tree of the ensemble held in `clf`, with nodes colour-filled.
# NOTE(review): `features` is not defined in the visible part of this notebook.
plt.figure(figsize=(20, 10))
plot_tree(clf.estimators_[0], feature_names=features, filled=True)
plt.show()

In [None]:
# Re-import the estimator so the name reflects the current patch state.
from sklearn.ensemble import RandomForestClassifier

## post-patch
# Same default fit as `both_clf`, presumably for a before/after comparison.
second_both_clf = RandomForestClassifier()
second_both_clf.fit(X_both, y_both)

In [None]:
# Score the post-patch model on the current X_test / y_test.
y_pred = second_both_clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Score `clf` on the training data itself; comparing this report with the
# test-set report shows how much the model overfits.
y_train_pred = clf.predict(X_train)
print(classification_report(y_train, y_train_pred))