# Random Forest Model development

In [None]:
import pandas as pd
import seaborn as sns
import openpyxl
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle

In [None]:
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import optuna

In [None]:
from Functions.helper_functions import * 

In [None]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
np.random.seed(42)

# Data

In [None]:
# Load the preprocessed training CSV of each of the six WP datasets (WP1..WP6).
train_wp1 = pd.read_csv('Data/Preprocessing/WP1_train_preprocessed.csv', sep=',')
train_wp2 = pd.read_csv('Data/Preprocessing/WP2_train_preprocessed.csv', sep=',')
train_wp3 = pd.read_csv('Data/Preprocessing/WP3_train_preprocessed.csv', sep=',')
train_wp4 = pd.read_csv('Data/Preprocessing/WP4_train_preprocessed.csv', sep=',')
train_wp5 = pd.read_csv('Data/Preprocessing/WP5_train_preprocessed.csv', sep=',')
train_wp6 = pd.read_csv('Data/Preprocessing/WP6_train_preprocessed.csv', sep=',')

In [None]:
# Load the preprocessed test CSV of each of the six WP datasets (WP1..WP6).
test_wp1 = pd.read_csv('Data/Preprocessing/WP1_test_preprocessed.csv', sep=',')
test_wp2 = pd.read_csv('Data/Preprocessing/WP2_test_preprocessed.csv', sep=',')
test_wp3 = pd.read_csv('Data/Preprocessing/WP3_test_preprocessed.csv', sep=',')
test_wp4 = pd.read_csv('Data/Preprocessing/WP4_test_preprocessed.csv', sep=',')
test_wp5 = pd.read_csv('Data/Preprocessing/WP5_test_preprocessed.csv', sep=',')
test_wp6 = pd.read_csv('Data/Preprocessing/WP6_test_preprocessed.csv', sep=',')
# Values of the 'date' column of the initial (non-preprocessed) test file;
# used later as the 'date' column of the submission frame.
test_dates = pd.read_csv('Data/Initial/test.csv', sep=',').date.values

In [None]:
to_drop = ['date','wd','forecast_time', 'forecast', "forecast_dist"]

In [None]:
trials_file = 'Data/Hyperparametrization/rf_50trials.xlsx'

# RF functions

In [None]:
def rf_cross_validation(X, y, params):
    """Run a shuffled 10-fold cross-validation of a RandomForestRegressor.

    For every fold the forest is refitted on the training part, evaluated on
    the held-out part, and the fold's evaluation is printed. Once all folds
    are done the RMSE and MAE scores are summarised with ``display_scores``.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix.
    y : array-like or Series
        Target values (single output).
    params : dict or None
        Keyword arguments for ``RandomForestRegressor``; ``None`` uses the
        estimator's defaults.

    Returns
    -------
    None
        Results are only printed.
    """
    model = RandomForestRegressor() if params is None else RandomForestRegressor(**params)

    print('-----------RF CROSS VALIDATION BEGINNING-----------')
    n_splits = 10
    kf = KFold(n_splits=n_splits, shuffle=True)

    # Wrap once so positional indexing with .iloc works whatever the input type.
    features = pd.DataFrame(X)
    targets = pd.DataFrame(y)

    rmse_scores = []
    mae_scores = []
    for fold, (train_index, test_index) in enumerate(kf.split(features, targets), start=1):
        X_train, X_test = features.iloc[train_index], features.iloc[test_index]
        Y_train, Y_test = targets.iloc[train_index], targets.iloc[test_index]

        # RandomForestRegressor.fit only takes (X, y, sample_weight); the
        # LightGBM-style eval_set/verbose arguments raise a TypeError.
        # squeeze(axis=1) hands a 1-D target to fit for the single-output case.
        model.fit(X_train, Y_train.squeeze(axis=1))

        prediction = model.predict(X_test)
        # sqrt(MSE) instead of squared=False, which newer scikit-learn removed.
        rmse_scores.append(np.sqrt(mean_squared_error(Y_test, prediction)))
        mae_scores.append(mean_absolute_error(Y_test, prediction))

        print(show_evaluation(prediction, Y_test))
        print(f'-------------------FOLD {fold}-----------------')

    print('---------------CROSS VALIDATION COMPLETE-------------')
    print('--------------------------RMSE-----------------------')
    display_scores(rmse_scores)
    print('--------------------------MAE------------------------')
    display_scores(mae_scores)

In [None]:
def hyperparametrization(trial, X, y):
    """Optuna objective body: score one sampled RandomForestRegressor setup.

    Every Optuna parameter name matches the ``RandomForestRegressor`` keyword
    it feeds, so ``study.best_trial.params`` can be unpacked straight into
    ``RandomForestRegressor(**params)``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    X : array-like or DataFrame
        Feature matrix.
    y : array-like or Series
        Target values.

    Returns
    -------
    float
        Mean RMSE over a 5-fold cross-validation (lower is better).
    """
    param = {
        # Regression criteria; 'gini'/'entropy' only exist for classifiers.
        # NOTE: 'absolute_error' is markedly slower to fit than 'squared_error'.
        'criterion': trial.suggest_categorical('criterion', ['squared_error', 'absolute_error']),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'n_estimators': trial.suggest_int('n_estimators', 1, 2000),
        'max_depth': trial.suggest_int('max_depth', 10, 100),
        'min_samples_split': trial.suggest_categorical('min_samples_split', [2, 5, 10]),
        'min_samples_leaf': trial.suggest_categorical('min_samples_leaf', [1, 2, 4]),
    }

    model = RandomForestRegressor(**param)
    # The scorer yields negated RMSE per fold; flip the sign and average so a
    # single scalar is returned to Optuna.
    neg_rmse = cross_val_score(model, X, y, n_jobs=-1, cv=5, scoring='neg_root_mean_squared_error')

    return float(-neg_rmse.mean())

## WP1 

| |  | Mean | Std | Sum up |
| --- | --- | --- | --- | --- |
| No params | RMSE | 0.10344875448880764 | 0.0019070131550065564 |  |
| After tuning 50trials| RMSE | 0.06830085723562579 | 0.0012998671387256361 | To keep, maybe redo optuna with warm start with it - first sub|
| After tuning 100trials| RMSE | 0.0695930431604128 | 0.0015123892627707553 | |
| After tuning warm start 60trials | RMSE | 0.0671868765178121 | 0.0019357892229500213 | second sub |
| --- | --- | --- | --- | --- |
| No params | MAE | 0.07306057409517844 | 0.0009857628815465762 |  |
| After tuning 50trials| MAE | 0.04481696427654311 | 0.000727722171899004 |  |
| After tuning 100trials| MAE | 0.045696725242359994 | 0.0008508558100930331 |  |
| After tuning warm start 60trials | MAE | 0.04322865305311156 | 0.0009454253743042544 | |

In [None]:
# Reorder so the target column 'wp' comes last, then discard the columns
# listed in to_drop.
wp1_X = train_wp1[[*(col for col in train_wp1 if col != "wp"), "wp"]].drop(columns=to_drop)

# Features / target split for WP1.
y1 = wp1_X['wp']
X1 = wp1_X.drop(columns='wp')


def objective_wp1(trial, data=X1, target=y1):
    """Optuna objective for WP1; delegates to ``hyperparametrization``."""
    return hyperparametrization(trial, data, target)

In [None]:
rf_cross_validation(X1, y1, None)

In [None]:
# try_these_first = []

# study = optuna.create_study(direction='minimize')
# study.enqueue_trial(try_these_first[0])
# study.enqueue_trial(try_these_first[1])

In [None]:
# WP1 search: minimise the objective over 10 trials, pass the trials table to
# write_results under the 'wp1' label and keep the best parameter set.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp1, n_trials=10)
write_results(trials_file, 'wp1', study.trials_dataframe())
best_trial = study.best_params
best_trial

In [None]:
params_1 = best_trial

In [None]:
rf_cross_validation(X1, y1, params_1)

## WP2

| |  | Mean | Std | Sum up|
| --- | --- | --- | --- | --- |
| No params | RMSE | 0.10935335541057582 | 0.0014425096116734836 | |
| After tuning - 50trials| RMSE | 0.0725081520968898 | 0.0016974702626377217 | |
| After tuning 100trials| RMSE | 0.0707064364904941 | 0.001396820290618349 | More stable, to keep |
| --- | --- | --- | --- |---|
| No params | MAE | 0.07681923856705511 | 0.0008670825615244791 | |
| After tuning - 50trials| MAE | 0.04512164110351975 |  0.0006579433030966575 | |
| After tuning 100trials| MAE | 0.04457902842458915 | 0.0006807155447311589 | |

In [None]:
# Reorder so the target column 'wp' comes last, then discard the columns
# listed in to_drop.
wp2_X = train_wp2[[*(col for col in train_wp2 if col != "wp"), "wp"]].drop(columns=to_drop)
# Features / target split for WP2.
y2 = wp2_X['wp']
X2 = wp2_X.drop(columns='wp')


def objective_wp2(trial, data=X2, target=y2):
    """Optuna objective for WP2; delegates to ``hyperparametrization``."""
    return hyperparametrization(trial, data, target)

In [None]:
rf_cross_validation(X2, y2, None)

In [None]:
# WP2 search: minimise the objective over 50 trials, pass the trials table to
# write_results under the 'wp2' label and keep the best parameter set.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp2, n_trials=50)
write_results(trials_file, 'wp2', study.trials_dataframe())
best_trial = study.best_params
best_trial

In [None]:
params_2 = best_trial

In [None]:
rf_cross_validation(X2, y2, params_2)

## WP3

| |  | Mean | Std | Sum up |
| --- | --- | --- | --- | --- |
| No params | RMSE | 0.10392558077951244 | 0.0019038044796542812 ||
| After tuning - 50trials| RMSE | 0.058253804820626545 | 0.0009893279354834155 | More stable, to keep |
| After tuning 100trials| RMSE | 0.058338944346627106 | 0.0017133930174837203 ||
| --- | --- | --- | --- | --- |
| No params | MAE | 0.07550802464973318 | 0.0012006073434917633 ||
| After tuning - 50trials| MAE | 0.03787310900962521 | 0.000442034368456366 ||
| After tuning 100trials| MAE | 0.03838030476025398 | 0.0007480100565996748 ||

In [None]:
# Put the target column 'wp' last, then discard the columns listed in to_drop.
wp3_X = train_wp3[[c for c in train_wp3 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
# Features / target split for WP3.
X3 = wp3_X.drop('wp', axis = 1)
y3 = wp3_X['wp']


def objective_wp3(trial, data=X3, target=y3):
    """Optuna objective for WP3; delegates to ``hyperparametrization``.

    ``data`` and ``target`` default to the WP3 features/target and are now
    actually forwarded (they used to be ignored in favour of the globals).
    """
    return hyperparametrization(trial, data, target)

In [None]:
rf_cross_validation(X3, y3, None)

In [None]:
# WP3 search: minimise the objective over 50 trials and keep the best
# parameter set. The trials go to the shared RF results file (trials_file),
# like the other WP studies, rather than to the LGBM workbook.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp3, n_trials=50)
write_results(trials_file, 'wp3', study.trials_dataframe())
best_trial = study.best_trial.params
best_trial

In [None]:
params_3 = best_trial

In [None]:
rf_cross_validation(X3, y3, params_3)

## WP4

| |  | Mean | Std |
| --- | --- | --- | --- |
| No params | RMSE | 0.10486204816363351 | 0.0015105949978751166 |
| After tuning - 50trials| RMSE | 0.06513233717204232 | 0.0015891617240032727 |
| After tuning 100trials| RMSE | 0.06357594848470964 | 0.0013676749030776929 |
| No params | MAE | 0.07564776733421566 | 0.00104638869825841 |
| After tuning - 50trials| MAE | 0.04219236028055372 | 0.0008190579419060266 |
| After tuning 100trials| MAE |0.04172111697148837  | 0.0009349285385250968 |

In [None]:
# Put the target column 'wp' last, then discard the columns listed in to_drop.
# (to_drop is dropped once only; dropping it a second time raises a KeyError.)
wp4_X = train_wp4[[c for c in train_wp4 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
# Features / target split for WP4; the target is read from wp4_X.
X4 = wp4_X.drop('wp', axis = 1)
y4 = wp4_X['wp']


def objective_wp4(trial, data=X4, target=y4):
    """Optuna objective for WP4; delegates to ``hyperparametrization``.

    ``data`` and ``target`` default to the WP4 features/target and are now
    actually forwarded (they used to be ignored in favour of the globals).
    """
    return hyperparametrization(trial, data, target)

In [None]:
rf_cross_validation(X4, y4, None)

In [None]:
# WP4 search: minimise the objective over 50 trials and keep the best
# parameter set. The trials go to the shared RF results file (trials_file),
# like the other WP studies, rather than to the LGBM workbook.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp4, n_trials=50)
write_results(trials_file, 'wp4', study.trials_dataframe())
best_trial = study.best_trial.params
best_trial

In [None]:
params_4 = best_trial

In [None]:
rf_cross_validation(X4, y4, params_4)

## WP5

| |  | Mean | Std |
| --- | --- | --- | --- |
| No params | RMSE | 0.11722129743692011 | 0.0017732599261516583 |
| After tuning - 50trials| RMSE | 0.07721413638593042 | 0.0011020420293213135 |
| After tuning - 100trials| RMSE | 0.07297648991888442 | 0.0014970317509404526 |
| After tuning - 100trials - warm trials | RMSE | 0.07362803793800192 | 0.0013223501622953715 |
| --- | --- | --- | --- |
| No params | MAE | 0.08497074568090211 | 0.0009101526501392155 |
| After tuning - 50trials| MAE | 0.051677856581467195 | 0.0006374939894477714 |
| After tuning - 100trials| MAE | 0.04765271414503236 | 0.0006257356756510128 |
| After tuning - 100trials - warm trials | MAE | 0.04785179154681675 | 0.0005795839605605526 |

In [None]:
# Put the target column 'wp' last, then discard the columns listed in to_drop.
wp5_X = train_wp5[[c for c in train_wp5 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
# Features / target split for WP5.
X5 = wp5_X.drop('wp', axis = 1)
y5 = wp5_X['wp']


def objective_wp5(trial, data=X5, target=y5):
    """Optuna objective for WP5; delegates to ``hyperparametrization``.

    ``data`` and ``target`` default to the WP5 features/target and are now
    actually forwarded (they used to be ignored in favour of the globals).
    """
    return hyperparametrization(trial, data, target)

In [None]:
rf_cross_validation(X5, y5, None)

In [1]:
# try_these_first = []

# study = optuna.create_study(direction='minimize')
# study.enqueue_trial(try_these_first[0])
# study.enqueue_trial(try_these_first[1])

In [None]:
# WP5 search: minimise the objective over 50 trials, pass the trials table to
# write_results under the 'wp5' label and keep the best parameter set.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp5, n_trials=50)
write_results(trials_file, 'wp5', study.trials_dataframe())
best_trial = study.best_params
best_trial

In [None]:
params_5 = best_trial

In [None]:
rf_cross_validation(X5, y5, params_5)

## WP6

| |  | Mean | Std |
| --- | --- | --- | --- |
| No params | RMSE | 0.0940394026188472 | 0.0010749562915831372 |
| After tuning - 50trials| RMSE | 0.05404362835213171 | 0.0008595325139047733 |
| After tuning 100trials| RMSE | 0.054861488499908594 | 0.0007335378238383901 |
| No params | MAE | 0.070455643271004 | 0.0006641538274191148 |
| After tuning - 50trials| MAE | 0.03657758274248596 | 0.0005325521314198646 |
| After tuning 100trials| MAE | 0.03783933495157941 | 0.00045956939815828987 |

In [None]:
# Put the target column 'wp' last, then discard the columns listed in to_drop.
wp6_X = train_wp6[[c for c in train_wp6 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
# Features / target split for WP6.
X6 = wp6_X.drop('wp', axis = 1)
y6 = wp6_X['wp']


def objective_wp6(trial, data=X6, target=y6):
    """Optuna objective for WP6; delegates to ``hyperparametrization``.

    ``data`` and ``target`` default to the WP6 features/target and are now
    actually forwarded (they used to be ignored in favour of the globals).
    """
    return hyperparametrization(trial, data, target)

In [None]:
rf_cross_validation(X6, y6, None)

In [None]:
# WP6 search: minimise the objective over 50 trials, pass the trials table to
# write_results under the 'wp6' label and keep the best parameter set.
study = optuna.create_study(direction='minimize')
study.optimize(objective_wp6, n_trials=50)
write_results(trials_file, 'wp6', study.trials_dataframe())
best_trial = study.best_params
best_trial

In [None]:
params_6 = best_trial

In [None]:
rf_cross_validation(X6, y6, params_6)

# Predictions

## Functions

In [None]:
to_drop_test = ['date','wd','forecast_time', 'forecast', "forecast_dist", 'wp']


def make_prediction_dataset(test, to_drop=to_drop_test):
    """Build the feature frame of the rows that still need a 'wp' prediction.

    Keeps the rows where 'ws', 'u' and 'v' are all present and 'wp' is
    missing, retains for each 'date' the row with the largest
    'forecast_time', and finally removes the ``to_drop`` columns.
    """
    wind_known = test[['ws', 'u', 'v']].notna().all(axis=1)
    wp_missing = test['wp'].isna()
    candidates = test[wind_known & wp_missing]

    # Descending forecast_time within each date, so the first row per date
    # (the one drop_duplicates keeps) has the largest forecast_time.
    ordered = candidates.sort_values(by=['date', 'forecast_time'], ascending=[True, False])
    one_per_date = ordered.drop_duplicates(subset='date')
    return one_per_date.drop(columns=to_drop)

In [None]:
def make_submission_file(lst_X_trains, lst_y_trains, lst_tests, lst_models, dates):
    """Fit each model on its training data and assemble the submission frame.

    Parameters
    ----------
    lst_X_trains, lst_y_trains : list
        Training features / targets, one entry per model.
    lst_tests : list
        Feature frames to predict on, one entry per model.
    lst_models : list
        Unfitted estimators exposing ``fit(X, y)`` and ``predict(X)``.
    dates : array-like
        Values of the 'date' column of the result; must have the same length
        as each prediction vector.

    Returns
    -------
    pandas.DataFrame
        Column 'date' followed by one column 'wp<k>' per model (k from 1).
        Predictions below 0 are replaced by the minimum of the training
        target, predictions above 1 by its maximum.
    """
    lst_prediction = []
    model_inputs = zip(lst_X_trains, lst_y_trains, lst_tests, lst_models)
    for model_no, (X, y, test, model) in enumerate(model_inputs, start=1):
        print(f'--------------Model {model_no}--------------')
        model.fit(X, y)
        # Computed once per model instead of once per out-of-range prediction.
        y_min, y_max = min(y), max(y)
        print(f'True:\n\tMin:{y_min}\n\tMax:{y_max}\n\tMean:{y.mean()}')
        predictions = model.predict(test)
        print(f'Prediction:\n\tMin:{min(predictions)}\n\tMax:{max(predictions)}\n\tMean:{np.mean(predictions)}')
        # Same two-step correction as before: negatives first, then values > 1.
        predictions = np.where(predictions < 0, y_min, predictions)
        predictions = np.where(predictions > 1, y_max, predictions)
        print(f'Prediction corrected:\n\tMin:{min(predictions)}\n\tMax:{max(predictions)}\n\tMean:{np.mean(predictions)}')
        lst_prediction.append(predictions)

    # Use the dates handed in by the caller, not a notebook-level global.
    columns = {'date': dates}
    for model_no, predictions in enumerate(lst_prediction, start=1):
        columns[f'wp{model_no}'] = predictions
    return pd.DataFrame(columns)

## Submission 

In [None]:
# One forest per WP dataset, each configured with that dataset's tuned parameters.
model_1, model_2, model_3, model_4, model_5, model_6 = (
    RandomForestRegressor(**tuned)
    for tuned in (params_1, params_2, params_3, params_4, params_5, params_6)
)

# Parallel lists (index k-1 holds WPk) passed on to make_submission_file.
lst_X_trains = [X1, X2, X3, X4, X5, X6]
lst_y_trains = [y1, y2, y3, y4, y5, y6]
lst_models = [model_1, model_2, model_3, model_4, model_5, model_6]

In [None]:
# Prediction-ready feature frame of every WP test set, in WP1..WP6 order.
# A comprehension avoids the append loop and no longer rebinds a global `test`.
lst_tests = [
    make_prediction_dataset(test_wp)
    for test_wp in (test_wp1, test_wp2, test_wp3, test_wp4, test_wp5, test_wp6)
]

In [None]:
df_predictions = make_submission_file(lst_X_trains, lst_y_trains, lst_tests, lst_models, test_dates)

In [None]:
df_predictions.to_csv('Predictions/submission_nb_3_full_rf.csv', index=False, sep=';')

In [None]:
# pkl_model = "Models/LGBM/LGBM-wp1-50trials_best_warm_start.pkl"
# with open(pkl_model, 'wb') as file:
#     pickle.dump(model_1, file)
    
    
# pkl_model = "Models/LGBM/LGBM-wp2-100trials_best.pkl"
# with open(pkl_model, 'wb') as file:
#     pickle.dump(model_2, file)
    

# pkl_model = "Models/LGBM/LGBM-wp3-50trials_best.pkl"
# with open(pkl_model, 'wb') as file:
#     pickle.dump(model_3, file)


# pkl_model = "Models/LGBM/LGBM-wp4-100trials_best.pkl"
# with open(pkl_model, 'wb') as file:
#     pickle.dump(model_4, file)


# pkl_model = "Models/LGBM/LGBM-wp5-100trials_best.pkl"
# with open(pkl_model, 'wb') as file:
#     pickle.dump(model_5, file)


# pkl_model = "Models/LGBM/LGBM-wp6-100trials_best.pkl"
# with open(pkl_model, 'wb') as file:
#     pickle.dump(model_6, file)