In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
#import plotly.express as px
import math
import gc
import seaborn as sns

from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the statistics table; later cells read its `location`, `target` and
# `mean_*` columns. The path is relative to the notebook's working directory.
df_stat = pd.read_csv('./data/df_stat.csv')

def metric(y, x):
    """Return the root-mean-squared error of two equally long sequences.

    The arguments are handed to ``mean_squared_error`` as ``(x, y)``; the
    square root of that value is rounded to three decimals.
    """
    mse = mean_squared_error(x, y)
    rmse = np.sqrt(mse)
    return round(rmse, 3)

# One-hot encode `location`; drop_first=True omits the first category's column.
df_stat = pd.get_dummies(
    df_stat,
    columns=['location'],
    drop_first=True,
)

# Score bookkeeping: each model cell appends one entry to all three lists,
# which the final cell turns into a comparison plot.
RMSE_train, RMSE_test, model_name = [], [], []

# Hand-picked statistical features (order preserved).
# Currently excluded candidates: mean_wind_dir, mean_wind_spd, median_atmos_press,
# median_wind_dir, median_rel_humidity, median_temp, median_precip, min_wind_spd,
# min_precip, max_wind_spd, all var_* and ptp_* columns, and the location_B/C/D/E dummies.
feat_stat = [
    'mean_atmos_press',
    'median_wind_spd',
    'mean_wind_dir_sin',
    'mean_wind_dir_cos',
    'mean_temp',
    'mean_rel_humidity',
    'std_wind_dir',
    'std_rel_humidity',
    'std_atmos_press',
    'std_wind_spd',
    'std_temp',
    'min_atmos_press',
    'mean_precip',
    'min_temp',
    'std_precip',
    'max_atmos_press',
    'max_rel_humidity',
    'min_wind_dir',
    'max_wind_dir',
    'max_temp',
    'min_rel_humidity',
    'max_precip',
]

# Seed shared by every randomized step in the notebook.
RSEED = 42

In [None]:
# Encode the wind direction (degrees) as a sine/cosine pair so that 0 and 360
# map to the same point instead of opposite ends of a linear scale.
wind_dir_rad = 2 * np.pi * df_stat['mean_wind_dir'] / 360
df_stat['mean_wind_dir_sin'] = np.sin(wind_dir_rad)
df_stat['mean_wind_dir_cos'] = np.cos(wind_dir_rad)

In [None]:
# Sanity check of the new cosine feature: as a cosine, min/max must lie in [-1, 1].
df_stat.mean_wind_dir_cos.describe()

In [None]:
# Load the time-series table and discard the `Unnamed: 0` column
# (presumably an index saved by an earlier to_csv -- confirm).
# Use df_time.isna().sum() in a cell of its own to inspect missing values.
df_time = pd.read_csv('./data/df_time.csv').drop(columns='Unnamed: 0')

# Column labels iterated by the missing-value fill below.
col_names = df_time.columns

# Impute missing time-series readings with the matching per-row mean from df_stat.
#
# Each key is a substring looked up in the df_time column name and each value is
# the df_stat column supplying the replacement. The first matching key wins, so
# the order mirrors an if/elif chain. Columns matching no key are left untouched.
fill_source = {
    'temp': 'mean_temp',
    'precip': 'mean_precip',
    'rel_humidity': 'mean_rel_humidity',
    'wind_dir': 'mean_wind_dir',
    'wind_spd': 'mean_wind_spd',
    'atmos_press': 'mean_atmos_press',
}

for cn in col_names:
    stat_col = next((src for key, src in fill_source.items() if key in cn), None)
    if stat_col is None or not df_time[cn].isna().any():
        continue
    # fillna with a Series aligns on the index, i.e. row i of df_time receives
    # row i of df_stat. Assigning the whole column back avoids chained
    # `df[col][i] = ...` assignment, which is slow and does not modify the
    # frame when pandas Copy-on-Write is active.
    df_time[cn] = df_time[cn].fillna(df_stat[stat_col])

In [None]:
# The ranked-importance file is produced by a later cell of this notebook
# (fi_sort.to_csv), so it may not exist yet on a first run. Nothing below
# consumes imp_features at the moment, so a missing file must not abort
# "Restart & Run All".
try:
    imp_features = pd.read_csv('./data/feat_imp_ranked.csv')
except FileNotFoundError:
    imp_features = None

# Only the statistical features are used for now; df_all is the same object as
# df_stat (not a copy). Time-series columns could be concatenated here later.
df_all = df_stat
df_all.shape

In [None]:
# Show the first row to inspect the available columns and their order.
df_stat.head(1)

In [None]:
# Spot check: mean temperature of the row labelled 1.
df_stat.loc[1, 'mean_temp']

In [None]:
# Total number of missing values in df_time, summed over all columns.
df_time.isnull().sum().sum()

In [None]:
# Preview the first rows of the combined feature table.
df_all.head()

In [None]:
# Feature matrix: every df_stat column from position 3 onwards.
# NOTE(review): this relies on column order -- it assumes the first three columns
# are non-features and that `target` is among them; confirm, otherwise the target
# leaks into X. The `feat_stat` list defined earlier is not applied here.
X = df_stat.iloc[:, 3:]
y = df_stat['target']

# Hold out 20 % of the rows for testing; seeded for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RSEED
)

# Remember the feature labels for later cells.
columns = X_train.columns

In [None]:
# Preview the feature matrix before scaling.
X.head()

In [None]:
# Standardize the features. The scaler is fitted on the training split only and
# then applied unchanged to the test split.
scalerStand = StandardScaler()
train_scaled = scalerStand.fit_transform(X_train)
test_scaled = scalerStand.transform(X_test)

# Wrap the scaled arrays in DataFrames again, restoring the feature labels.
# The rows get a fresh 0..n-1 index.
X_train = pd.DataFrame(train_scaled, columns=columns)
X_test = pd.DataFrame(test_scaled, columns=columns)

## Random Forest Regressor

In [None]:
# Baseline random forest with default hyper-parameters. The seed is fixed so the
# scores below -- and the feature-importance ranking derived from this model --
# are the same after every kernel restart.
rfr = RandomForestRegressor(random_state=RSEED)
rfr = rfr.fit(X_train, y_train)

# Predict on both splits to compare training and test error.
y_pred_rfr = rfr.predict(X_test)
y_pred_train_rfr = rfr.predict(X_train)

# RMSE on train and test set: computed once, then printed and recorded.
rmse_train_rfr = metric(y_train, y_pred_train_rfr)
rmse_test_rfr = metric(y_test, y_pred_rfr)
print("Train RMSE:", rmse_train_rfr)
print("Test RMSE:", rmse_test_rfr)
RMSE_train.append(rmse_train_rfr)
RMSE_test.append(rmse_test_rfr)
model_name.append('Random Forest')

In [None]:
# Feature importances of the fitted forest, indexed by feature label and shown
# as a bar chart.
importances = rfr.feature_importances_
fi = pd.DataFrame(importances).set_index(columns)
fi.plot.bar(figsize=(15, 10))

In [None]:
# Turn the importance table into a (feature, importance) ranking, highest first.
fi_sort = fi.reset_index()
fi_sort.columns = ['feature', 'importance']
fi_sort = fi_sort.sort_values(by='importance', ascending=False)

# Ranked feature names as a plain list.
feat = fi_sort['feature'].tolist()

fi_sort.head(20)

In [None]:
# Persist the ranking. An earlier cell reads this same path into `imp_features`,
# so this file must already exist for a clean top-to-bottom run.
# The DataFrame index is written too (to_csv default), so readers will see an
# extra unnamed first column.
fi_sort.to_csv("./data/feat_imp_ranked.csv")

In [None]:
# Hyperparameter grid
param_grid_rfr = {
    'n_estimators': np.linspace(100, 150, 5).astype(int),
    # 'auto' is intentionally not listed: recent scikit-learn releases reject it,
    # and for regressors it meant "all features", which None already covers.
    'max_features': ['sqrt', 'log2', None],
    # Further dimensions to explore if needed: max_depth, max_leaf_nodes,
    # min_samples_split, bootstrap.
}

# Exhaustive grid search with 5-fold cross-validation on all available cores
gs_rfr = GridSearchCV(rfr, param_grid_rfr, n_jobs = -1,
                        cv = 5,
                        verbose = 5)

# Fit
gs_rfr.fit(X_train, y_train)

# A bare expression in the middle of a cell is not displayed, so print it.
print("Best params:", gs_rfr.best_params_)

# Predict on both splits with the tuned model
y_pred_gs_rfr = gs_rfr.predict(X_test)
y_pred_train_gs_rfr = gs_rfr.predict(X_train)

# RMSE on train and test set: computed once, then printed and recorded.
rmse_train_gs_rfr = metric(y_train, y_pred_train_gs_rfr)
rmse_test_gs_rfr = metric(y_test, y_pred_gs_rfr)
print("Train RMSE:", rmse_train_gs_rfr)
print("Test RMSE:", rmse_test_gs_rfr)
RMSE_train.append(rmse_train_gs_rfr)
RMSE_test.append(rmse_test_gs_rfr)
model_name.append('Random Forest Opt')

## KNN

In [None]:
# Baseline k-nearest-neighbours regressor with default settings.
knn = KNeighborsRegressor().fit(X_train, y_train)

# Predictions for the held-out and the training rows.
y_pred_knn = knn.predict(X_test)
y_pred_train_knn = knn.predict(X_train)

# Report both errors and record them for the final comparison.
rmse_train_knn = metric(y_train, y_pred_train_knn)
rmse_test_knn = metric(y_test, y_pred_knn)
print("Train RMSE:", rmse_train_knn)
print("Test RMSE:", rmse_test_knn)
RMSE_train.append(rmse_train_knn)
RMSE_test.append(rmse_test_knn)
model_name.append('KNN')

In [None]:
# Candidate settings: neighbour weighting, neighbourhood size and the
# Minkowski exponent p.
param_grid_knn = dict(
    weights=['uniform', 'distance'],
    n_neighbors=[5, 15, 25, 35],
    p=[1, 2, 3, 100],
)

# Exhaustive grid search (not a random search) with 5-fold cross-validation.
gs_knn = GridSearchCV(
    knn,
    param_grid_knn,
    n_jobs=-1,
    cv=5,
    verbose=5,
)
gs_knn.fit(X_train, y_train)

gs_knn.best_params_

In [None]:
# Predictions of the tuned KNN for the held-out and the training rows.
y_pred_gs_knn = gs_knn.predict(X_test)
y_pred_train_gs_knn = gs_knn.predict(X_train)

# Report both errors and record them for the final comparison.
rmse_train_gs_knn = metric(y_train, y_pred_train_gs_knn)
rmse_test_gs_knn = metric(y_test, y_pred_gs_knn)
print("Train RMSE:", rmse_train_gs_knn)
print("Test RMSE:", rmse_test_gs_knn)
RMSE_train.append(rmse_train_gs_knn)
RMSE_test.append(rmse_test_gs_knn)
model_name.append('KNN Opt')

## Compare train and test RMSE across models

In [None]:
# Collect the recorded scores into one table (one row per model) and draw the
# train/test RMSE side by side. Each model cell appends to these lists, so
# re-running a model cell adds a duplicate row here.
RMSE = {
    'RMSE_train': RMSE_train,
    'RMSE_test': RMSE_test,
    'Model': model_name,
}
baseline = pd.DataFrame(RMSE).set_index('Model')
baseline.plot.bar()