In [None]:
import pandas as pd
import math
import numpy as np
from scipy import stats

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_theme(style='whitegrid')

In [None]:
# Load the raw training table.
data = pd.read_csv('data/data_train_DF.csv')
# 'Unnamed: 0' is presumably the index written by an earlier to_csv — not a feature.
data = data.drop(columns='Unnamed: 0')

# Replacing clim1 and clim2 by wind
# (Euclidean norm of the two columns; assumes they are the two wind components — TODO confirm)
data['wind'] = np.sqrt(data['clim1']**2 + data['clim2']**2)
data = data.drop(columns=['clim1', 'clim2'])

# Relative humidity
# Magnus approximation: RH = 100 * es(dewpoint) / es(temperature), with
# es(T) proportional to exp(17.625 * T / (243.04 + T)) and T in degrees Celsius.
# The inputs are converted from Kelvin (hence the 273.15 offset). The previous
# version used `T_K - 39.11` as denominator, but 243.04 + (T_K - 273.15) is
# `T_K - 30.11`, so the constant was off by 9 degrees.
# NOTE(review): assumes clim3 is dewpoint and clim4 is air temperature, both in Kelvin — confirm.
dewpoint_c = data['clim3'] - 273.15
temperature_c = data['clim4'] - 273.15
data['RH'] = 100 * np.exp(
    17.625 * dewpoint_c / (243.04 + dewpoint_c)
    - 17.625 * temperature_c / (243.04 + temperature_c)
)

# Dropping NA in new dataframe
# Rows with a missing target (BA or CNT) are set aside before being removed from `data`.
data_validation = data[data['BA'].isna() | data['CNT'].isna()].copy()
# Reassign instead of mutating in place so the cell stays easy to reason about on re-run.
data = data.dropna()

data

In [None]:
# Column-name groups, by feature type:
#   clim_labels -> altitude summaries, clim3..clim10 and the derived wind / RH columns
#   land_labels -> lc1..lc18
clim_labels = ['altiMean', 'altiSD'] + [f'clim{k}' for k in range(3, 11)] + ['wind', 'RH']
land_labels = [f'lc{k}' for k in range(1, 19)]

In [None]:
from sklearn.preprocessing import StandardScaler

# Splitting data to train/test
# Whole years are held out so that train and test never share a year.
TRAIN_YEARS = [1995, 2005, 2015]
TEST_YEARS = [1997, 2007]

is_train = data['year'].isin(TRAIN_YEARS)
is_test = data['year'].isin(TEST_YEARS)

# `drop` returns a new frame, so the column assignments below do not touch `data`.
X_train = data.loc[is_train].drop(columns='BA')
y_train = data.loc[is_train, ['BA']]
X_test = data.loc[is_test].drop(columns='BA')
y_test = data.loc[is_test, ['BA']]

# Standardizing climate variables
# The scaler is fitted on the training rows only; the test rows are transformed
# with the *training* mean/variance. Re-fitting on the test set (as was done
# before) puts the two sets on different scales and leaks test statistics.
scaler = StandardScaler()
X_train[clim_labels] = scaler.fit_transform(X_train[clim_labels])
X_test[clim_labels] = scaler.transform(X_test[clim_labels])

In [None]:
# Adding mean CNT per voxel
# A voxel is identified by its (lon, lat) pair; the mean is computed on the
# training rows only and then looked up for the test rows.
mean_CNT_voxel = X_train.groupby(['lon','lat'])[['CNT']].mean()

# Training rows: broadcast each group's mean back onto its rows (vectorised,
# replaces a per-row `.loc` lookup).
X_train['mean_fire'] = X_train.groupby(['lon','lat'])['CNT'].transform('mean')

# Test rows: left-join on (lon, lat) so the row order and index of X_test are kept.
# Voxels that never occur in the training years have no mean; they previously
# raised a KeyError and now fall back to the overall training mean.
fallback_mean_CNT = X_train['CNT'].mean()
X_test['mean_fire'] = (
    X_test[['lon','lat']]
    .join(mean_CNT_voxel['CNT'].rename('mean_fire'), on=['lon','lat'])['mean_fire']
    .fillna(fallback_mean_CNT)
)

In [None]:
# Defining Thresholds and Scoring Function

# Severity thresholds at which the predicted distributions are evaluated.
# u_CNT: 0..10 by 1, 12..30 by 2, 40..100 by 10
u_CNT = list(range(0, 11)) + list(range(12, 31, 2)) + list(range(40, 101, 10))
# u_BA: 0, 1, 10..100 by 10, 150..300 by 50, a hand-picked middle range, 10000..50000 by 10000, 100000
u_BA = (
    [0, 1]
    + list(range(10, 101, 10))
    + list(range(150, 301, 50))
    + [400, 500, 1000, 1500, 2000, 5000]
    + list(range(10000, 50001, 10000))
    + [100000]
)


def _normalise_by_last(raw_weights):
    """Divide every weight by the last one, so the final weight equals 1."""
    return [value / raw_weights[-1] for value in raw_weights]


# One weight per threshold, increasing with the threshold and rescaled so that
# the largest threshold carries weight 1.
w_CNT = _normalise_by_last([1 - (1 + (u + 1)**2 / 1000)**(-1/4) for u in u_CNT])

w_BA = _normalise_by_last([1 - (1 + (u + 1) / 1000)**(-1/4) for u in u_BA])

def cnt_score(y_test, prediction_dist, thresholds=None, weights=None):
    """Weighted squared-error score of predicted CNT distributions.

    For every observation ``y`` and every threshold ``u`` the predicted
    probability ``P(CNT <= u)`` is compared with the indicator ``1{u >= y}``;
    the squared differences are weighted per threshold and summed over all
    observations and thresholds. Lower is better, 0 is a perfect prediction.

    Parameters
    ----------
    y_test : array-like of length n (Series, 1-D array or single-column DataFrame)
        Observed values.
    prediction_dist : array-like of shape (n, len(thresholds))
        Predicted cumulative probabilities, one column per threshold.
    thresholds : sequence of numbers, optional
        Evaluation thresholds; defaults to the notebook-level ``u_CNT``.
    weights : sequence of numbers, optional
        One weight per threshold; defaults to the notebook-level ``w_CNT``.

    Returns
    -------
    float
        The total score (0.0 when ``y_test`` is empty).

    Raises
    ------
    ValueError
        If the shapes of the inputs are inconsistent.
    """
    # Defaults are resolved at call time so the notebook-level tables are used.
    # (The previous loop iterated over an undefined name and raised NameError.)
    if thresholds is None:
        thresholds = u_CNT
    if weights is None:
        weights = w_CNT
    thresholds = np.asarray(thresholds, dtype=float)
    weights = np.asarray(weights, dtype=float)
    if weights.shape != thresholds.shape:
        raise ValueError("weights and thresholds must have the same length")

    observed = np.asarray(y_test, dtype=float).reshape(-1)
    if observed.size == 0:
        return 0.0

    predicted = np.asarray(prediction_dist, dtype=float)
    if predicted.shape != (observed.size, thresholds.size):
        raise ValueError(
            "prediction_dist has shape %s, expected %s"
            % (predicted.shape, (observed.size, thresholds.size))
        )

    # indicator[i, k] = 1 when threshold k is at or above observation i.
    indicator = (thresholds[None, :] >= observed[:, None]).astype(float)
    return float(np.sum(weights * (indicator - predicted) ** 2))

def ba_score(y_test, prediction_dist, thresholds=None, weights=None):
    """Weighted squared-error score of predicted BA distributions.

    For every observation ``y`` and every threshold ``u`` the predicted
    probability ``P(BA <= u)`` is compared with the indicator ``1{u >= y}``;
    the squared differences are weighted per threshold and summed over all
    observations and thresholds. Lower is better, 0 is a perfect prediction.

    Parameters
    ----------
    y_test : array-like of length n (Series, 1-D array or single-column DataFrame)
        Observed values, on the same scale as the thresholds.
    prediction_dist : array-like of shape (n, len(thresholds))
        Predicted cumulative probabilities, one column per threshold.
    thresholds : sequence of numbers, optional
        Evaluation thresholds; defaults to the notebook-level ``u_BA``.
    weights : sequence of numbers, optional
        One weight per threshold; defaults to the notebook-level ``w_BA``.

    Returns
    -------
    float
        The total score (0.0 when ``y_test`` is empty).

    Raises
    ------
    ValueError
        If the shapes of the inputs are inconsistent.
    """
    # Defaults are resolved at call time so the notebook-level tables are used.
    if thresholds is None:
        thresholds = u_BA
    if weights is None:
        weights = w_BA
    thresholds = np.asarray(thresholds, dtype=float)
    weights = np.asarray(weights, dtype=float)
    if weights.shape != thresholds.shape:
        raise ValueError("weights and thresholds must have the same length")

    # Flattening lets a single-column DataFrame be scored directly, without the
    # deprecated int(Series) conversion the per-row loop depended on.
    observed = np.asarray(y_test, dtype=float).reshape(-1)
    if observed.size == 0:
        return 0.0

    predicted = np.asarray(prediction_dist, dtype=float)
    if predicted.shape != (observed.size, thresholds.size):
        raise ValueError(
            "prediction_dist has shape %s, expected %s"
            % (predicted.shape, (observed.size, thresholds.size))
        )

    # indicator[i, k] = 1 when threshold k is at or above observation i.
    indicator = (thresholds[None, :] >= observed[:, None]).astype(float)
    return float(np.sum(weights * (indicator - predicted) ** 2))

In [None]:
def dist_log1p(prediction, thresholds=None, scale=None):
    """Turn point predictions on the log1p scale into per-threshold CDF values.

    Each prediction is treated as the mean of a normal distribution on the
    log1p scale; the result holds, for every prediction (row) and threshold
    (column), the normal CDF evaluated at ``log1p(threshold)``.

    Parameters
    ----------
    prediction : array-like of length n
        Point predictions on the log1p scale.
    thresholds : sequence of numbers, optional
        Thresholds on the original scale; defaults to the notebook-level ``u_BA``.
    scale : float, optional
        Standard deviation shared by all rows; defaults to ``np.std(prediction)``.

    Returns
    -------
    pandas.DataFrame
        Shape ``(n, len(thresholds))``, rows indexed ``0..n-1``, columns
        labelled by the thresholds.
    """
    if thresholds is None:
        thresholds = u_BA
    columns = list(thresholds)

    centres = np.asarray(prediction, dtype=float).reshape(-1)
    log_thresholds = np.log1p(np.asarray(columns, dtype=float))

    if scale is None:
        # np.std of an empty array is NaN with a warning; the value is unused then.
        scale = np.std(centres) if centres.size else 1.0

    if scale == 0:
        # Constant predictions (e.g. a fully shrunk linear model) have zero
        # spread; a normal CDF with scale 0 is NaN, so use the degenerate
        # step distribution located at the prediction instead.
        cdf = (log_thresholds[None, :] >= centres[:, None]).astype(float)
    else:
        # Broadcast: rows are predictions, columns are thresholds.
        cdf = stats.norm.cdf(log_thresholds[None, :], loc=centres[:, None], scale=scale)

    return pd.DataFrame(cdf, index=np.arange(centres.size), columns=columns)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LassoCV

In [None]:
%%time

# Baseline: shallow random forest fitted on log1p(BA), using every feature except CNT.
# NOTE(review): with bootstrap=False every tree is trained on the same rows; unless
# max_features restricts the candidate splits, the 500 trees may be near-identical — confirm intent.
mRF = RandomForestRegressor(
    n_estimators=500,
    max_depth=5,
    bootstrap=False,
    n_jobs=8,
    random_state=42,
)
mRF.fit(X_train.drop(columns='CNT'), np.log1p(y_train).to_numpy().ravel())

# Predict on the held-out years, convert to threshold CDFs and export.
prediction = mRF.predict(X_test.drop(columns='CNT'))
ba_distribution = dist_log1p(prediction)
ba_distribution.to_csv('prediction_ba.csv', index=False)

## SCORE=4280

In [None]:
%%time

# Two-stage model: a Lasso fits log1p(BA) first, then a shallow forest is
# trained on the Lasso's training residuals.
mLasso = Lasso(alpha=0.1)
mLasso.fit(X_train.drop(columns='CNT'), np.log1p(y_train).to_numpy().ravel())

# What the linear stage leaves unexplained on the training rows.
residuals = np.log1p(y_train).to_numpy().ravel() - mLasso.predict(X_train.drop(columns='CNT'))

mRERF = RandomForestRegressor(
    n_estimators=500,
    max_depth=5,
    bootstrap=False,
    n_jobs=10,
    random_state=42,
)
mRERF.fit(X_train.drop(columns='CNT'), residuals)

# Final prediction = linear part + forest correction of the residuals.
prediction = mLasso.predict(X_test.drop(columns='CNT')) + mRERF.predict(X_test.drop(columns='CNT'))
ba_distribution = dist_log1p(prediction)
ba_distribution.to_csv('prediction_ba.csv', index=False)

## SCORE=4345

In [None]:
# Cross-validated search over the Lasso regularisation strength.
# The grid covers 1e-3 .. 1 in ten log-spaced steps (it includes 0.1, the value
# used in the hand-tuned Lasso cell). The previous grid, logspace(0, 99), ran
# from 1 to 1e99, where essentially every candidate shrinks all coefficients
# to zero and the search cannot distinguish between them.
# NOTE(review): the exact range is a judgement call — widen it if the best alpha lands on an edge.
param_grid = {'alpha': np.logspace(-3, 0, num=10)}

mLasso = Lasso()

clf = GridSearchCV(mLasso, param_grid, n_jobs=5, cv=5)
clf.fit(X_train.drop('CNT', axis=1), np.ravel(np.log1p(y_train)))
print(clf.best_params_)

# Predict with the refitted best estimator, convert to threshold CDFs and export.
prediction = clf.predict(X_test.drop('CNT', axis=1))
ba_distribution = dist_log1p(prediction)
ba_distribution.to_csv('prediction_ba.csv', index=False)

In [None]:
# PCA followed by a random forest, tuned jointly by grid search on log1p(BA).
pca = PCA()
rForest = RandomForestRegressor(bootstrap=False, random_state=42)

pipe = Pipeline(steps=[('pca', pca), ('rF', rForest)])

# PCA cannot extract more components than there are input features; candidates
# above that limit would only produce failed fits inside the search, so they
# are filtered out up front (falling back to "all features" if none remain).
n_features = X_train.drop('CNT', axis=1).shape[1]
n_components_grid = [k for k in (5, 10, 15, 30, 40) if k <= n_features] or [n_features]

param_grid = {
    'pca__n_components': n_components_grid,
    'rF__max_depth': [1,5],
    'rF__n_estimators': [500,1000]
}

search = GridSearchCV(pipe, param_grid, n_jobs=8)
search.fit(X_train.drop('CNT', axis=1), np.ravel(np.log1p(y_train)))
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

In [None]:
# Inspect the tuned pipeline's raw predictions (log1p scale) for the test rows.
search.predict(X_test.drop(columns='CNT'))

In [None]:
# Convert the tuned pipeline's log1p-scale predictions into threshold CDFs and export them.
prediction = search.predict(X_test.drop(columns='CNT'))
ba_distribution = dist_log1p(prediction)
ba_distribution.to_csv('prediction_ba.csv', index=False)