# Random Forest Model (months) with previous clustering
# In preparation for the May 19th submission to the Kopuru challenge

In [133]:
# Base packages -----------------------------------
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Linear Regression -------------------------------
from statsmodels.formula.api import ols

# SKLearn -----------------------------------------
from sklearn.model_selection import train_test_split


# Random Forest Regressor -------------------------
from sklearn.ensemble import RandomForestRegressor

# Loading and massaging data

In [134]:
# Read the three input CSV files from their local relative paths into DataFrames:
# the training set, the set to predict on, and the municipality cluster labels.
queen_train, queen_predict, queen_cluster = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Feeder_years/WBds03_QUEENtrain_years.csv",    # training rows
        "../Feeder_years/WBds03_QUEENpredict_years.csv",  # rows to predict
        "../../../Other_open_data/cluster.csv",           # cluster label per municipality
    )
)

In [135]:
#queen_train = queen_train.loc[queen_train.year_x == 2019,:].copy(True)

In [136]:
# Attach the cluster columns to both frames with a left join on the municipality
# code ('municip_code' on the left, 'CODIGO MUNICIPIO' in queen_cluster).
# A left join keeps every train/predict row; rows whose code has no match in
# queen_cluster end up with NaN in the columns coming from queen_cluster, so
# they are not selected by the later `Cluster == k` filters.
merge_options = dict(how='left', left_on='municip_code', right_on='CODIGO MUNICIPIO')
queen_train = queen_train.merge(queen_cluster, **merge_options)
queen_predict = queen_predict.merge(queen_cluster, **merge_options)

In [137]:
# Display the column names of the merged training frame
queen_train.columns

Index(['year_x', 'municip_name', 'species', 'municip_code', 'year_offset',
       'waspbust_id', 'colonies_amount', 'food_fruit', 'food_apple',
       'food_txakoli', 'food_kiwi', 'food_pear', 'food_blueberry',
       'food_raspberry', 'station_code', 'freez', 'hum', 'lev_max', 'lev_mid',
       'lev_min', 'rain', 'rain_1mm', 'rain_cum', 'rain_max_10',
       'rain_max_day', 'sun', 'temp_avg', 'temp_max_abs', 'temp_max_avg',
       'temp_min_abs', 'wind_avg', 'wind_max', 'wind_max_avg', 'population',
       'CODIGO MUNICIPIO', 'Cluster'],
      dtype='object')

In [138]:
# Remove the year, species, municipality and station identifier columns from the
# training frame so they are not passed to the models as features.
# The result is reassigned instead of using inplace=True, and errors='ignore'
# skips columns that are already gone, so re-running this cell does not raise
# a KeyError.
queen_train = queen_train.drop(
    columns=['year_x', 'year_offset', 'species', 'municip_name', 'municip_code',
             'station_code', 'CODIGO MUNICIPIO'],
    errors='ignore',
)

# Model

In [139]:
# Create four independent random forests (1000 trees each, fixed seed 42),
# one per cluster label 0-3.
rf_0, rf_1, rf_2, rf_3 = (
    RandomForestRegressor(n_estimators=1000, random_state=42) for _ in range(4)
)

In [140]:
# Fit one forest per cluster.
# For each cluster the rows are selected once; the target is 'waspbust_id' and
# every other remaining column of queen_train is used as a feature.

train_rows_0 = queen_train[queen_train.Cluster == 0]
train_x_0 = train_rows_0.drop(columns='waspbust_id')
train_y_0 = train_rows_0['waspbust_id']

train_rows_1 = queen_train[queen_train.Cluster == 1]
train_x_1 = train_rows_1.drop(columns='waspbust_id')
train_y_1 = train_rows_1['waspbust_id']

train_rows_2 = queen_train[queen_train.Cluster == 2]
train_x_2 = train_rows_2.drop(columns='waspbust_id')
train_y_2 = train_rows_2['waspbust_id']

train_rows_3 = queen_train[queen_train.Cluster == 3]
train_x_3 = train_rows_3.drop(columns='waspbust_id')
train_y_3 = train_rows_3['waspbust_id']

rf_0.fit(train_x_0, train_y_0)
rf_1.fit(train_x_1, train_y_1)
rf_2.fit(train_x_2, train_y_2)
rf_3.fit(train_x_3, train_y_3)


RandomForestRegressor(n_estimators=1000, random_state=42)

# Feature importance

In [141]:
# Feature importances of the cluster-0 forest, most important first.
feature_list_0 = list(train_x_0.columns)
importances_0 = list(rf_0.feature_importances_)

# Pair every feature name with its importance rounded to 4 decimals
feature_importances_0 = [
    (name, round(score, 4)) for name, score in zip(feature_list_0, importances_0)
]

# Stable in-place sort, descending by the rounded importance
feature_importances_0.sort(key=lambda pair: pair[1], reverse=True)

# One line per feature
for name, score in feature_importances_0:
    print('Variable: {:20} Importance: {}'.format(name, score))

Variable: food_fruit           Importance: 0.1721
Variable: food_txakoli         Importance: 0.1328
Variable: population           Importance: 0.0925
Variable: lev_max              Importance: 0.0656
Variable: lev_mid              Importance: 0.0649
Variable: wind_max             Importance: 0.062
Variable: rain                 Importance: 0.0589
Variable: rain_max_10          Importance: 0.0501
Variable: temp_avg             Importance: 0.0322
Variable: temp_max_abs         Importance: 0.03
Variable: hum                  Importance: 0.0249
Variable: lev_min              Importance: 0.0217
Variable: rain_1mm             Importance: 0.0209
Variable: sun                  Importance: 0.0202
Variable: temp_max_avg         Importance: 0.0189
Variable: temp_min_abs         Importance: 0.017
Variable: colonies_amount      Importance: 0.014
Variable: rain_max_day         Importance: 0.014
Variable: wind_max_avg         Importance: 0.0139
Variable: food_apple           Importance: 0.0129
Variab

In [142]:
# Feature importances of the cluster-1 forest, most important first.
feature_list_1 = list(train_x_1.columns)
importances_1 = list(rf_1.feature_importances_)

# Pair every feature name with its importance rounded to 4 decimals
feature_importances_1 = [
    (name, round(score, 4)) for name, score in zip(feature_list_1, importances_1)
]

# Stable in-place sort, descending by the rounded importance
feature_importances_1.sort(key=lambda pair: pair[1], reverse=True)

# One line per feature
for name, score in feature_importances_1:
    print('Variable: {:20} Importance: {}'.format(name, score))

Variable: food_fruit           Importance: 0.2162
Variable: rain                 Importance: 0.1958
Variable: rain_max_day         Importance: 0.1614
Variable: rain_1mm             Importance: 0.0648
Variable: rain_max_10          Importance: 0.0462
Variable: temp_max_avg         Importance: 0.0347
Variable: rain_cum             Importance: 0.0283
Variable: colonies_amount      Importance: 0.0258
Variable: temp_max_abs         Importance: 0.025
Variable: wind_max             Importance: 0.0243
Variable: lev_mid              Importance: 0.0216
Variable: lev_min              Importance: 0.0206
Variable: population           Importance: 0.0205
Variable: temp_min_abs         Importance: 0.0177
Variable: hum                  Importance: 0.0128
Variable: wind_avg             Importance: 0.0115
Variable: temp_avg             Importance: 0.0114
Variable: sun                  Importance: 0.0111
Variable: food_apple           Importance: 0.0106
Variable: lev_max              Importance: 0.0098
V

In [143]:
# Feature importances of the cluster-2 forest, most important first.
feature_list_2 = list(train_x_2.columns)
importances_2 = list(rf_2.feature_importances_)

# Pair every feature name with its importance rounded to 4 decimals
feature_importances_2 = [
    (name, round(score, 4)) for name, score in zip(feature_list_2, importances_2)
]

# Stable in-place sort, descending by the rounded importance
feature_importances_2.sort(key=lambda pair: pair[1], reverse=True)

# One line per feature
for name, score in feature_importances_2:
    print('Variable: {:20} Importance: {}'.format(name, score))

Variable: lev_max              Importance: 0.4178
Variable: population           Importance: 0.1464
Variable: rain_max_10          Importance: 0.0646
Variable: lev_min              Importance: 0.0572
Variable: food_apple           Importance: 0.0492
Variable: rain_1mm             Importance: 0.0281
Variable: rain_max_day         Importance: 0.0269
Variable: food_fruit           Importance: 0.0234
Variable: lev_mid              Importance: 0.0212
Variable: temp_max_abs         Importance: 0.0204
Variable: food_kiwi            Importance: 0.0182
Variable: temp_avg             Importance: 0.0174
Variable: freez                Importance: 0.0134
Variable: hum                  Importance: 0.0113
Variable: food_txakoli         Importance: 0.011
Variable: temp_max_avg         Importance: 0.0108
Variable: wind_max_avg         Importance: 0.0102
Variable: rain                 Importance: 0.008
Variable: temp_min_abs         Importance: 0.0079
Variable: food_blueberry       Importance: 0.0073
Va

In [144]:
# Feature importances of the cluster-3 forest, most important first.
feature_list_3 = list(train_x_3.columns)
importances_3 = list(rf_3.feature_importances_)

# Pair every feature name with its importance rounded to 4 decimals
feature_importances_3 = [
    (name, round(score, 4)) for name, score in zip(feature_list_3, importances_3)
]

# Stable in-place sort, descending by the rounded importance
feature_importances_3.sort(key=lambda pair: pair[1], reverse=True)

# One line per feature
for name, score in feature_importances_3:
    print('Variable: {:20} Importance: {}'.format(name, score))

Variable: rain_1mm             Importance: 0.3503
Variable: population           Importance: 0.1206
Variable: wind_max_avg         Importance: 0.0893
Variable: rain                 Importance: 0.0765
Variable: colonies_amount      Importance: 0.0658
Variable: wind_max             Importance: 0.0338
Variable: rain_cum             Importance: 0.0333
Variable: lev_mid              Importance: 0.0314
Variable: lev_min              Importance: 0.0279
Variable: hum                  Importance: 0.0251
Variable: food_apple           Importance: 0.0242
Variable: temp_max_avg         Importance: 0.0206
Variable: rain_max_day         Importance: 0.0167
Variable: lev_max              Importance: 0.0153
Variable: food_fruit           Importance: 0.0137
Variable: wind_avg             Importance: 0.0117
Variable: food_kiwi            Importance: 0.0099
Variable: temp_max_abs         Importance: 0.0078
Variable: sun                  Importance: 0.005
Variable: food_blueberry       Importance: 0.0045
V

# New model with relevant variables

In [None]:
# Refit each cluster's forest on a hand-picked subset of its features.
# The targets train_y_0..train_y_3 from the first training cell are reused, and
# fitting again replaces what rf_0..rf_3 learned from the full feature set.

features_0 = ['food_fruit', 'food_txakoli', 'population', 'lev_max']
features_1 = ['food_fruit']
features_2 = ['lev_max']
features_3 = ['rain_1mm']

train2_x_0 = queen_train.loc[queen_train.Cluster == 0, features_0]
train2_x_1 = queen_train.loc[queen_train.Cluster == 1, features_1]
train2_x_2 = queen_train.loc[queen_train.Cluster == 2, features_2]
train2_x_3 = queen_train.loc[queen_train.Cluster == 3, features_3]

rf_0.fit(train2_x_0, train_y_0)
rf_1.fit(train2_x_1, train_y_1)
rf_2.fit(train2_x_2, train_y_2)
rf_3.fit(train2_x_3, train_y_3)

# Predictions

In [125]:
# Split the prediction frame into one deep copy per cluster label, so columns
# added to a per-cluster frame later do not touch queen_predict itself.
queen_predict_0 = queen_predict[queen_predict.Cluster == 0].copy(deep=True)
queen_predict_1 = queen_predict[queen_predict.Cluster == 1].copy(deep=True)
queen_predict_2 = queen_predict[queen_predict.Cluster == 2].copy(deep=True)
queen_predict_3 = queen_predict[queen_predict.Cluster == 3].copy(deep=True)


In [126]:
# Predict with each cluster's forest, passing the same feature columns (in the
# same order) that the forest was refitted on.
predictions_0 = rf_0.predict(queen_predict_0[['food_fruit', 'food_txakoli', 'population', 'lev_max']])
predictions_1 = rf_1.predict(queen_predict_1[['food_fruit']])
predictions_2 = rf_2.predict(queen_predict_2[['lev_max']])
predictions_3 = rf_3.predict(queen_predict_3[['rain_1mm']])

In [127]:
# Store each cluster's predictions in a new 'nests_2020' column of its frame
for frame, predicted in (
    (queen_predict_0, predictions_0),
    (queen_predict_1, predictions_1),
    (queen_predict_2, predictions_2),
    (queen_predict_3, predictions_3),
):
    frame['nests_2020'] = predicted

In [128]:
def _nests_by_municipality(cluster_frame):
    """Sum 'nests_2020' per (municip_code, municip_name), round, and drop rows with NaN."""
    key_columns = ['municip_code', 'municip_name']
    subset = cluster_frame[key_columns + ['nests_2020']]
    return subset.groupby(by=key_columns, as_index=False).sum().round().dropna()


# One aggregated frame per cluster
HEX_0 = _nests_by_municipality(queen_predict_0)
HEX_1 = _nests_by_municipality(queen_predict_1)
HEX_2 = _nests_by_municipality(queen_predict_2)
HEX_3 = _nests_by_municipality(queen_predict_3)


In [129]:
# Hand-written rows for six municipalities, appended to the model output when
# the submission is assembled.
# NOTE(review): presumably these municipalities are absent from the per-cluster
# predictions — confirm against the cluster file.
# 'Urduña/Orduña' is spelled with a real "ñ"; the mis-decoded UTF-8 form
# ("Ã±") would otherwise be written verbatim into the exported CSV.
aux = pd.DataFrame({
    "municip_code": [48020, 48022, 48071, 48088, 48074, 48051],
    "municip_name": ['Bilbao', 'Karrantza Harana/Valle de Carranza', 'Muskiz',
                     'Ubide', 'Urduña/Orduña', 'Lanestosa'],
    "nests_2020": [0, 1, 0, 0, 1, 1],
})

In [130]:
# Stack the four per-cluster results and the hand-written rows into a single
# frame with a fresh 0..n-1 index. pd.concat is used because DataFrame.append
# was removed in pandas 2.0.
HEX = pd.concat([HEX_0, HEX_1, HEX_2, HEX_3, aux], ignore_index=True)

In [131]:
# Relabel the three columns with the Spanish headers used in the exported file
submission_columns = ['CODIGO MUNICIPIO', 'NOMBRE MUNICIPIO', 'NIDOS 2020']
HEX.columns = submission_columns

In [132]:
# Write the submission to a CSV file in the working directory, without the index column
HEX.to_csv(
    'WaspBusters_20210519_RandomForestyears.csv',
    index=False,
)