# Bias correction of numerical prediction model temperature forecast Dataset

URL: http://archive.ics.uci.edu/ml/machine-learning-databases/00514/

## Content

1) [Data preprocessing](#dataproc)

2) [Model training and evaluation](#train) 

2.a) [Random Forest](#rf)
    
2.b) [Linear regression](#linear)

2.c) [Lasso Regression](#lasso)

2.d) [kNN](#knn)

---

In [None]:
# Basic imports
import sys
import math
import numpy as np
import pandas as pd
import glob
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

print(sys.executable)

# models for random forest
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display

# models for linear regression
from sklearn.linear_model import LinearRegression
from sklearn import linear_model

# models for Lasso regression
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# models for kNN
from sklearn.neighbors import KNeighborsRegressor

# statistic tools
from sklearn import metrics
from statistics import stdev
from sklearn.model_selection import GridSearchCV

# preprocessing
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

---
# 1) Data preprocessing
---

In [None]:
# Might be good idea to have a function for dataset analysis --> standardized output
# Just a prototype/idea for now
def analyse_dataset(frame, name='not-given'):
    """Print a standardized overview of a DataFrame.

    The report consists of the ``DataFrame.info()`` summary, the shape, the
    index and the column labels, each section preceded by a 40-dash rule.

    Args:
        frame: The pandas DataFrame to describe.
        name: Label shown in the header line (e.g. the source file name).

    Returns:
        None. Everything is written to standard output.
    """
    separator = '-' * 40
    print(f"Analysis of <{name}>")
    print(separator)
    print("Info:")
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    frame.info()
    sections = (
        ("Shape:", frame.shape),
        ("Index:", frame.index),
        ("Columns:", frame.columns),
    )
    for title, value in sections:
        print(separator)
        print(title)
        print(value)

In [None]:
# Load the comma-separated dataset from the working directory and echo it.
dataset_name = 'Bias_correction_ucl.csv'
print("Opening: ", dataset_name)
df = pd.read_csv(dataset_name,  sep = ',')  # header handling left at the pandas default
print(df)

In [None]:
# Quick analysis
print(f"Analysis of <{dataset_name}>")
print('-'*40)
print("Info:")
# DataFrame.info() prints its report directly and returns None; wrapping it
# in print() would append a stray "None" line to the output.
df.info()

print("Columns:")
print(df.columns)

In [None]:
# Map every column name of the raw frame to its positional index
# (looked up later when the feature importances are printed).
col_dict = {c: i for i, c in enumerate(df.columns)}

## Description of data columns

1. station - used weather station number: 1 to 25
2. Date - Present day: yyyy-mm-dd ('2013-06-30' to '2017-08-30')
3. Present_Tmax - Maximum air temperature between 0 and 21 h on the present day (°C): 20 to 37.6
4. Present_Tmin - Minimum air temperature between 0 and 21 h on the present day (°C): 11.3 to 29.9
5. LDAPS_RHmin - LDAPS model forecast of next-day minimum relative humidity (%): 19.8 to 98.5
6. LDAPS_RHmax - LDAPS model forecast of next-day maximum relative humidity (%): 58.9 to 100
7. LDAPS_Tmax_lapse - LDAPS model forecast of next-day maximum air temperature applied lapse rate (°C): 17.6 to 38.5
8. LDAPS_Tmin_lapse - LDAPS model forecast of next-day minimum air temperature applied lapse rate (°C): 14.3 to 29.6
9. LDAPS_WS - LDAPS model forecast of next-day average wind speed (m/s): 2.9 to 21.9
10. LDAPS_LH - LDAPS model forecast of next-day average latent heat flux (W/m2): -13.6 to 213.4
11. LDAPS_CC1 - LDAPS model forecast of next-day 1st 6-hour split average cloud cover (0-5 h) (%): 0 to 0.97
12. LDAPS_CC2 - LDAPS model forecast of next-day 2nd 6-hour split average cloud cover (6-11 h) (%): 0 to 0.97
13. LDAPS_CC3 - LDAPS model forecast of next-day 3rd 6-hour split average cloud cover (12-17 h) (%): 0 to 0.98
14. LDAPS_CC4 - LDAPS model forecast of next-day 4th 6-hour split average cloud cover (18-23 h) (%): 0 to 0.97
15. LDAPS_PPT1 - LDAPS model forecast of next-day 1st 6-hour split average precipitation (0-5 h) (%): 0 to 23.7
16. LDAPS_PPT2 - LDAPS model forecast of next-day 2nd 6-hour split average precipitation (6-11 h) (%): 0 to 21.6
17. LDAPS_PPT3 - LDAPS model forecast of next-day 3rd 6-hour split average precipitation (12-17 h) (%): 0 to 15.8
18. LDAPS_PPT4 - LDAPS model forecast of next-day 4th 6-hour split average precipitation (18-23 h) (%): 0 to 16.7
19. lat - Latitude (°): 37.456 to 37.645
20. lon - Longitude (°): 126.826 to 127.135
21. DEM - Elevation (m): 12.4 to 212.3
22. Slope - Slope (°): 0.1 to 5.2
23. Solar radiation - Daily incoming solar radiation (wh/m2): 4329.5 to 5992.9
24. Next_Tmax - The next-day maximum air temperature (°C): 17.4 to 38.9
25. Next_Tmin - The next-day minimum air temperature (°C): 11.3 to 29.8

In [None]:
def display_all(df):
    """Display *df* with the pandas row and column display limits raised to 1000."""
    # One context manager takes both option/value pairs; the previous
    # settings are restored when the block exits.
    wide_view = pd.option_context(
        "display.max_rows", 1000,
        "display.max_columns", 1000,
    )
    with wide_view:
        display(df)

## First look at the data

In [None]:
# Last rows of the raw frame, transposed so that every column becomes a row.
display_all(df.tail().transpose())
print('#'*40)
display('Some more info')
print('#'*40)
# DataFrame.info() prints its summary itself and returns None, so handing it
# to display() would only render an extra "None".
df.info()

In [None]:
# Split into train and test
def split_simple(df, n):
    """Cut *df* at slice position *n* and return both parts as copies.

    Args:
        df: The frame to split.
        n: Number to split at (used as ``df[:n]`` / ``df[n:]``).

    Returns:
        A ``(head, tail)`` tuple; both parts are independent copies, so
        modifying them does not touch *df*.
    """
    head = df[:n].copy()
    tail = df[n:].copy()
    return head, tail

In [None]:
# Remove the station/date identifiers and the Next_Tmax column; Next_Tmin
# stays in the frame.
cols_to_drop = ['station', 'Date', 'Next_Tmax']
df_prep_rf = df.drop(columns=cols_to_drop)
# (row positions, column positions) of the NaN entries that remain.
np.where(np.isnan(df_prep_rf))

In [None]:
# Fix missing values: every NaN is replaced by the mean of its column.
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
df_imputed = imp.fit_transform(df_prep_rf)

# fit_transform hands back a bare array, so the frame is rebuilt with the
# original column labels.
df_prep_rf = pd.DataFrame(df_imputed, columns=df_prep_rf.columns)

# NOTE(review): the imputer is fitted on all rows before the train/test split
# and also fills any missing Next_Tmin values - confirm this is intended.
# Sanity check: the returned index arrays should now be empty.
np.where(np.isnan(df_prep_rf))

In [None]:
# Inspect the column labels and the row index of the prepared frame.
display(df_prep_rf.columns.values)
display(df_prep_rf.index)

In [None]:
# Show the prepared frame.
display(df_prep_rf)

In [None]:
# Second name for the same frame object (no copy is made).
df_rf = df_prep_rf

In [None]:
# Scatter plot with a fitted regression line for each selected predictor
# against Next_Tmin.
# lmplot is a figure-level function that opens its own figure, so no
# plt.figure() call is made beforehand (it would only leave an empty figure).
for feature in ("LDAPS_Tmin_lapse", "Present_Tmin", "LDAPS_RHmin",
                "LDAPS_RHmax", "DEM"):
    # Keyword arguments are required here: seaborn >= 0.12 no longer accepts
    # x, y and data positionally, while older releases accept the keywords too.
    sns.lmplot(x=feature, y="Next_Tmin", data=df_prep_rf)

---
# 2. Model training and evaluation
---

# a) Random forest

In [None]:
# Feature importance
from prettytable import PrettyTable as PT # pip install PTable
def print_RF_featureImportance(rf, X):
    """Print the importance score of every feature of a fitted forest.

    Each feature is reported twice: as a plain text line and as a row of a
    summary table that is printed at the end. The third value shown for a
    feature is its entry in the module-level ``col_dict``.

    Args:
        rf: Fitted estimator exposing ``feature_importances_``.
        X: Frame whose ``columns`` are paired, in order, with the importances
            (presumably the frame the estimator was trained on).
    """
    summary = PT()
    summary.field_names = ['Feature', 'Score', 'Comment']
    scored_features = zip(X.columns.values, rf.feature_importances_)
    for feature, importance in scored_features:
        comment = col_dict[feature]
        print(f"{feature}: {importance:.5f}\t\t... {comment}")
        summary.add_row([feature, round(importance, ndigits=4), comment])
    print(summary)

def print_GridSearchResult(grid):
    """Print the winning parameters and the winning estimator of a search.

    Args:
        grid: Object exposing ``best_params_`` and ``best_estimator_``.
    """
    for attribute in ("best_params_", "best_estimator_"):
        print(getattr(grid, attribute))

In [None]:
# Split for random forest: hold out a fixed share of the rows for testing.
rnd_state = 42  # fixed seed for the split
ratio = 0.2  # test/num_samples

num_instances = df_rf.shape[0]
print(f"From {num_instances} using {num_instances*ratio:.0f} for testing and {num_instances*(1-ratio):.0f} for training. Ratio = {ratio*100:.2f}%")

# Next_Tmin is the label; every other column is a feature.
y = df_rf['Next_Tmin']
X = df_rf.drop(columns=['Next_Tmin'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=ratio, random_state=rnd_state
)
display(X_test)


In [None]:
# Baseline: RandomForestRegressor with only the number of parallel jobs set.
before = 0
n_cores = 2
rf_model = RandomForestRegressor(n_jobs=n_cores)
rf_model.fit(X_train, y_train)

# Evaluate on the held-out rows: RMSE plus the score() of the estimator.
pred = rf_model.predict(X_test)
error = math.sqrt(metrics.mean_squared_error(y_test, pred))

print("Model performance:")
print(f"RMSE: {error}")
print(f"R^2-score: {rf_model.score(X_test, y_test)}")

In [None]:
# Report the importance the baseline forest assigned to each training feature.
print_RF_featureImportance(rf_model, X_train)

In [None]:
# Predictions of the baseline forest on the test rows.
rf_model_prediction = rf_model.predict(X_test)

In [None]:
# Overlay the distribution of the true test values with the distribution of
# the baseline forest's predictions (red).
sns.distplot(y_test)
sns.distplot(rf_model_prediction, color="red")

In [None]:
# Distribution of the residuals (true value minus baseline prediction).
sns.distplot(y_test-rf_model_prediction)

In [None]:
# Same forest as the baseline, but grown with 500 instead of the default
# 100 trees.
n_cores = 2
number_of_trees = 500 # default = 100
rf = RandomForestRegressor(n_jobs=n_cores, n_estimators=number_of_trees, bootstrap=True) #, verbose=1)

rf.fit(X_train, y_train)

# Evaluate THIS forest: the metrics have to be recomputed from rf, otherwise
# the printout repeats the numbers of the previously trained rf_model.
pred = rf.predict(X_test)
error = math.sqrt(metrics.mean_squared_error(y_test, pred))
print("Model performance:")
print("RMSE: {}".format(error))
print("R^2-score: {}".format(rf.score(X_test, y_test)))

print("Feature importance")
print_RF_featureImportance(rf, X_train)
rf_RD = rf

In [None]:
# Predictions of the 500-tree forest on the test rows.
rfRD_prediction = rf_RD.predict(X_test)

In [None]:
# Overlay the true test values with the predictions of the 500-tree forest
# (red); rfRD_prediction is the array that belongs to this model.
sns.distplot(y_test)
sns.distplot(rfRD_prediction, color="red")

In [None]:
# Distribution of the residuals (true value minus 500-tree forest prediction).
sns.distplot(y_test-rfRD_prediction)

## Optimize Hyperparameters via GridSearch


## Notes on the RandomForestRegressor from scikit-learn
-----
The default values for the parameters controlling the size of the trees
(e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
unpruned trees which can potentially be very large on some data sets. To
reduce memory consumption, the complexity and size of the trees should be
controlled by setting those parameter values.

## Number of variables/features per tree --> 'max_features'

A good starting point is *the square root of the number of features presented to the tree*; from there, try a few values below and above it.

## Number of trees in the forest --> 'n_estimators'

The more the merrier

In [None]:
# Candidate value for 'max_features': the rounded square root of the number
# of feature columns in X.
from numpy import sqrt
num_features = X.shape[1]
print(num_features)
sqrt_num_features = round(sqrt(num_features), 0)
# Last expression of the cell, so the notebook echoes the value.
sqrt_num_features

In [None]:
# Brute-force search over the forest size and the number of features tried
# per split, scored by negative mean squared error.
n_cores = 2
k = 10  # number of cross-validation folds
param_grid = [
    {
        'n_estimators': [10, 30, 100],
        'max_features': list(range(3, 7)),
    },
]
forest_reg = RandomForestRegressor(n_jobs=n_cores)
grid_search = GridSearchCV(
    forest_reg,
    param_grid,
    n_jobs=n_cores,
    cv=k,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

In [None]:
print_GridSearchResult(grid_search)
grid_search.scorer_
# score() of a search object applies the search's own scoring, i.e. the
# negative MSE here - it is NOT an R^2 value.
scores = grid_search.score(X_test, y_test)
pred = grid_search.predict(X_test)
error = math.sqrt(metrics.mean_squared_error(y_test, pred))

print("Model performance:")
print("RMSE: {}".format(error))
# Compute R^2 explicitly so the printed label matches the printed number.
print("R^2-score: {}".format(metrics.r2_score(y_test, pred)))

---
# b) Linear regression
---

In [None]:
# Second name for the prepared frame (no copy); the cells below keep using
# the existing X_train / X_test split.
df_lin = df_prep_rf

In [None]:
# Ordinary least squares on the training split.
# The former `normalize=True` argument is omitted: it was deprecated in
# scikit-learn 1.0 and removed in 1.2 (TypeError on current releases), and
# for plain least squares it does not change the reported coefficients or
# the predictions.
linreg = LinearRegression()
linreg.fit(X_train, y_train)

In [None]:
# Evaluate the linear model on the held-out rows.
pred_linreg = linreg.predict(X_test)
error = math.sqrt(metrics.mean_squared_error(y_test, pred_linreg))

print("Model performance:")
print(f"RMSE: {error}")
print(f"R^2-score: {linreg.score(X_test, y_test)}")

# True test values vs. predictions (red).
sns.distplot(y_test)
sns.distplot(pred_linreg, color='red')

In [None]:
# Distribution of the residuals (true value minus linear-model prediction).
sns.distplot(y_test-pred_linreg)

---
# c) Lasso Regression
---

In [None]:
# Lasso regression with a 5-fold cross-validated search over the
# regularisation strength alpha, scored by negative mean squared error.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this line raises TypeError on current releases. Unlike for plain
# least squares, dropping it changes the Lasso fit; a scaler-in-a-Pipeline
# replacement would need the alpha grid re-tuned - confirm the targeted
# scikit-learn version before changing it.
lasso = Lasso(normalize = True)
parameters = {'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,2,5,10,20,30,35,40,45,50,55,100]}
lasso_regressor = GridSearchCV(lasso, parameters, scoring = 'neg_mean_squared_error',cv = 5)

In [None]:
# Run the alpha search on the training split and report the winner.
lasso_regressor.fit(X_train,y_train)
print(lasso_regressor.best_params_)
# The search is scored with 'neg_mean_squared_error', so this value is a
# negated MSE (closer to zero is better).
print(lasso_regressor.best_score_)

In [None]:
# Test-set predictions from the fitted Lasso search object.
y_lasso_pred = lasso_regressor.predict(X_test)

In [None]:
# True test values vs. Lasso predictions (red).
sns.distplot(y_test)
sns.distplot(y_lasso_pred, color='red')

In [None]:
# Distribution of the residuals (true value minus Lasso prediction).
sns.distplot(y_test-y_lasso_pred)

---
# d) kNN
---

In [None]:
df_knn = df_prep_rf

from sklearn.preprocessing import MinMaxScaler

# Bring every feature onto the range [0, 1] before the neighbour search.
scaler = MinMaxScaler(feature_range=(0,1))

# The per-feature minimum and maximum are learned from the training rows only.
X_train_knn_scaled = scaler.fit_transform(X_train)
X_train_knn = pd.DataFrame(X_train_knn_scaled)

# The test rows are mapped with the training min/max (transform, not
# fit_transform): refitting on the test set would scale both sets differently
# and leak test-set statistics into the evaluation.
X_test_knn_scaled = scaler.transform(X_test)
X_test_knn = pd.DataFrame(X_test_knn_scaled)

In [None]:
# Manual sweep over k = 1..35: fit one model per k and record its test RMSE.
rmse_val_knn = []  # one RMSE per k, in ascending order of k
for k in range(1, 36):
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X_train_knn, y_train)
    pred = model.predict(X_test_knn)
    error = math.sqrt(metrics.mean_squared_error(y_test, pred))
    rmse_val_knn.append(error)
    print(f"RMSE for k={k}: {error}")
    print(f"R^2 for k={k}: {model.score(X_test_knn, y_test)}\n")

In [None]:
# Plot the recorded RMSE values against k; the x-axis covers 1..35, one
# position per entry appended by the sweep.
plt.figure(figsize=(15,8))
plt.plot(range(1,36), rmse_val_knn, color='blue', linestyle='dashed', marker='o',
        markerfacecolor='red', markersize=5)
plt.title('RMSE vs. k-Value')
plt.xlabel('k')
plt.ylabel('RMSE')

## Finding the optimal k-value via grid search

In [None]:
# Search the same candidates as the manual sweep, k = 1..35. range() excludes
# its upper bound, so the stop value has to be 36 to include k = 35.
params = {'n_neighbors': range(1, 36)}

knn = KNeighborsRegressor()

# No `scoring` is given, so the candidates are ranked by the estimator's own
# score() method.
# NOTE(review): cv=100 means 100 fits per candidate - confirm this is intended.
model = GridSearchCV(knn, params, cv=100)
model.fit(X_train_knn, y_train)
print("Best k-Value is: ", model.best_params_['n_neighbors'])

In [None]:
# Fit a fresh kNN regressor with the k chosen by the grid search.
model_cv = KNeighborsRegressor(n_neighbors=model.best_params_['n_neighbors'])
model_cv.fit(X_train_knn, y_train)
# Predict with the model that was just fitted (model_cv), not with the
# grid-search object.
pred_cv = model_cv.predict(X_test_knn)
# True test values vs. predictions (red).
sns.distplot(y_test)
sns.distplot(pred_cv, color='red')

In [None]:
# Distribution of the residuals (true value minus kNN prediction).
sns.distplot(y_test-pred_cv)