# Random Forest Model (months) with previous clustering
# In preparation for the May 19th submission to the Kopuru challenge

In [1]:
# Base packages -----------------------------------
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Linear Regression -------------------------------
from statsmodels.formula.api import ols

# SKLearn -----------------------------------------
from sklearn.model_selection import train_test_split


# Random Forest Regressor -------------------------
from sklearn.ensemble import RandomForestRegressor

# Loading and massaging data

In [2]:
# Read the training features, the prediction features and the cluster lookup
# table from paths relative to this notebook's working directory.
# NOTE(review): the previous comments tagged both feature files as
# "2018+2019 test df" -- confirm what each file actually contains.
queen_train = pd.read_csv('../Feeder_months/WBds03_QUEENtrain_months_DataWig.csv')
queen_predict = pd.read_csv('../Feeder_months/WBds03_QUEENpredict_months_DataWig.csv')
queen_cluster = pd.read_csv('../../../Other_open_data/cluster.csv')

In [3]:
# Train on the 2019 rows only; the deep copy detaches the subset from the
# table that was loaded from disk.
queen_train = queen_train[queen_train['year_x'] == 2019].copy(deep=True)

In [4]:
# Attach the columns of the cluster table to both feature tables, matching
# 'municip_code' against 'CODIGO MUNICIPIO'. The left join keeps every row of
# the feature tables; rows without a match get NaN in the joined columns.
queen_train = queen_train.merge(
    queen_cluster,
    how='left',
    left_on='municip_code',
    right_on='CODIGO MUNICIPIO',
)
queen_predict = queen_predict.merge(
    queen_cluster,
    how='left',
    left_on='municip_code',
    right_on='CODIGO MUNICIPIO',
)

In [5]:
# Remove the columns that must not reach the models: every column left in
# queen_train other than 'waspbust_id' is later used as a training feature.
queen_train = queen_train.drop(
    columns=[
        'year_x',
        'year_offset',
        'species',
        'municip_name',
        'municip_code',
        'station_code',
        'index',
        'MMM',
        'station_name',
        'CODIGO MUNICIPIO',
    ]
)

# Model

In [6]:
# One independent forest per cluster (0-3), each with 1000 trees and the same
# fixed seed so that repeated runs build the same trees.
rf_0, rf_1, rf_2, rf_3 = (
    RandomForestRegressor(n_estimators=1000, random_state=42)
    for _ in range(4)
)

In [7]:
# Fit one forest per cluster.
# For each cluster the rows are selected once; the target is 'waspbust_id'
# and every other remaining column is used as a feature.

cluster_0_rows = queen_train[queen_train['Cluster'] == 0]
train_x_0 = cluster_0_rows.drop(columns='waspbust_id')
train_y_0 = cluster_0_rows['waspbust_id']

cluster_1_rows = queen_train[queen_train['Cluster'] == 1]
train_x_1 = cluster_1_rows.drop(columns='waspbust_id')
train_y_1 = cluster_1_rows['waspbust_id']

cluster_2_rows = queen_train[queen_train['Cluster'] == 2]
train_x_2 = cluster_2_rows.drop(columns='waspbust_id')
train_y_2 = cluster_2_rows['waspbust_id']

cluster_3_rows = queen_train[queen_train['Cluster'] == 3]
train_x_3 = cluster_3_rows.drop(columns='waspbust_id')
train_y_3 = cluster_3_rows['waspbust_id']

rf_0.fit(train_x_0, train_y_0)
rf_1.fit(train_x_1, train_y_1)
rf_2.fit(train_x_2, train_y_2)
rf_3.fit(train_x_3, train_y_3)


RandomForestRegressor(n_estimators=1000, random_state=42)

# Feature importance

In [8]:
# Feature importances of the cluster-0 forest
feature_list_0 = list(train_x_0.columns)
importances_0 = list(rf_0.feature_importances_)

# Pair every feature name with its importance (rounded to 4 decimals) and
# order the pairs from most to least important
feature_importances_0 = [
    (name, round(score, 4))
    for name, score in zip(feature_list_0, importances_0)
]
feature_importances_0.sort(key=lambda pair: pair[1], reverse=True)

# One report line per feature
for name, score in feature_importances_0:
    print('Variable: {:20} Importance: {}'.format(name, score))

Variable: food_fruit           Importance: 0.2292
Variable: weath_solar          Importance: 0.0857
Variable: weath_minLevel       Importance: 0.0625
Variable: food_txakoli         Importance: 0.061
Variable: population           Importance: 0.0602
Variable: weath_maxTemp        Importance: 0.0561
Variable: weath_midLevel       Importance: 0.0515
Variable: weath_humidity       Importance: 0.0365
Variable: weath_meanTemp       Importance: 0.0348
Variable: weath_minTemp        Importance: 0.0345
Variable: weath_maxWindM       Importance: 0.0292
Variable: weath_10minRainfall  Importance: 0.0282
Variable: weath_maxMeanTemp    Importance: 0.0276
Variable: weath_maxLevel       Importance: 0.0269
Variable: weath_meanWindM      Importance: 0.0258
Variable: weath_days_rain      Importance: 0.0201
Variable: month                Importance: 0.02
Variable: food_apple           Importance: 0.0182
Variable: weath_meanDayMaxWind Importance: 0.0174
Variable: weath_accuRainfall   Importance: 0.0154
Var

In [9]:
# Feature importances of the cluster-1 forest
feature_list_1 = list(train_x_1.columns)
importances_1 = list(rf_1.feature_importances_)

# Pair every feature name with its importance (rounded to 4 decimals) and
# order the pairs from most to least important
feature_importances_1 = [
    (name, round(score, 4))
    for name, score in zip(feature_list_1, importances_1)
]
feature_importances_1.sort(key=lambda pair: pair[1], reverse=True)

# One report line per feature
for name, score in feature_importances_1:
    print('Variable: {:20} Importance: {}'.format(name, score))

Variable: weath_minTemp        Importance: 0.183
Variable: weath_maxMeanTemp    Importance: 0.0907
Variable: food_fruit           Importance: 0.0859
Variable: weath_10minRainfall  Importance: 0.0728
Variable: colonies_amount      Importance: 0.0575
Variable: weath_meanTemp       Importance: 0.047
Variable: weath_maxTemp        Importance: 0.0432
Variable: weath_humidity       Importance: 0.0363
Variable: weath_meanWindM      Importance: 0.0338
Variable: weath_maxWindM       Importance: 0.0332
Variable: weath_minLevel       Importance: 0.0324
Variable: weath_days_rain1mm   Importance: 0.0314
Variable: population           Importance: 0.0298
Variable: weath_1dayRainfall   Importance: 0.0282
Variable: weath_days_rain      Importance: 0.0272
Variable: weath_midLevel       Importance: 0.0244
Variable: weath_maxLevel       Importance: 0.0233
Variable: weath_accuRainfall   Importance: 0.0232
Variable: weath_solar          Importance: 0.0221
Variable: weath_meanDayMaxWind Importance: 0.0212
Va

In [10]:
# Feature importances of the cluster-2 forest
feature_list_2 = list(train_x_2.columns)
importances_2 = list(rf_2.feature_importances_)

# Pair every feature name with its importance (rounded to 4 decimals) and
# order the pairs from most to least important
feature_importances_2 = [
    (name, round(score, 4))
    for name, score in zip(feature_list_2, importances_2)
]
feature_importances_2.sort(key=lambda pair: pair[1], reverse=True)

# One report line per feature
for name, score in feature_importances_2:
    print('Variable: {:20} Importance: {}'.format(name, score))

Variable: population           Importance: 0.1633
Variable: weath_solar          Importance: 0.1311
Variable: weath_days_rain1mm   Importance: 0.0672
Variable: weath_meanTemp       Importance: 0.0668
Variable: weath_minTemp        Importance: 0.0655
Variable: weath_10minRainfall  Importance: 0.0582
Variable: weath_maxMeanTemp    Importance: 0.054
Variable: weath_minLevel       Importance: 0.0413
Variable: food_fruit           Importance: 0.0403
Variable: weath_maxTemp        Importance: 0.0397
Variable: food_kiwi            Importance: 0.0325
Variable: weath_maxLevel       Importance: 0.0318
Variable: weath_meanWindM      Importance: 0.0291
Variable: weath_midLevel       Importance: 0.0238
Variable: weath_1dayRainfall   Importance: 0.022
Variable: food_txakoli         Importance: 0.0187
Variable: weath_humidity       Importance: 0.018
Variable: weath_maxWindM       Importance: 0.0176
Variable: weath_days_rain      Importance: 0.0166
Variable: food_apple           Importance: 0.0156
Var

In [11]:
# Feature importances of the cluster-3 forest
feature_list_3 = list(train_x_3.columns)
importances_3 = list(rf_3.feature_importances_)

# Pair every feature name with its importance (rounded to 4 decimals) and
# order the pairs from most to least important
feature_importances_3 = [
    (name, round(score, 4))
    for name, score in zip(feature_list_3, importances_3)
]
feature_importances_3.sort(key=lambda pair: pair[1], reverse=True)

# One report line per feature
for name, score in feature_importances_3:
    print('Variable: {:20} Importance: {}'.format(name, score))

Variable: weath_accuRainfall   Importance: 0.1343
Variable: weath_days_rain1mm   Importance: 0.1079
Variable: weath_maxWindM       Importance: 0.1033
Variable: weath_minTemp        Importance: 0.0757
Variable: weath_maxMeanTemp    Importance: 0.0683
Variable: weath_solar          Importance: 0.068
Variable: weath_1dayRainfall   Importance: 0.0667
Variable: weath_meanTemp       Importance: 0.0639
Variable: weath_maxLevel       Importance: 0.0476
Variable: weath_maxTemp        Importance: 0.0451
Variable: weath_meanDayMaxWind Importance: 0.0339
Variable: weath_midLevel       Importance: 0.029
Variable: weath_days_rain      Importance: 0.0229
Variable: weath_meanWindM      Importance: 0.0225
Variable: weath_humidity       Importance: 0.0194
Variable: weath_10minRainfall  Importance: 0.0166
Variable: population           Importance: 0.0144
Variable: weath_minLevel       Importance: 0.0142
Variable: food_fruit           Importance: 0.0124
Variable: food_apple           Importance: 0.0095
Va

# New model with relevant variables

In [18]:
# Refit each cluster's forest on a hand-picked subset of its features.
# fit() is called on the same rf_0..rf_3 objects as before, so the earlier
# all-feature fits are replaced; the targets train_y_* are reused unchanged.

train2_x_0 = queen_train.loc[queen_train['Cluster'].eq(0), ['food_fruit']]
train2_x_1 = queen_train.loc[queen_train['Cluster'].eq(1), ['weath_minTemp']]
train2_x_2 = queen_train.loc[
    queen_train['Cluster'].eq(2),
    ['weath_solar', 'population'],
]
train2_x_3 = queen_train.loc[
    queen_train['Cluster'].eq(3),
    ['weath_accuRainfall', 'weath_days_rain1mm', 'weath_maxWindM'],
]

rf_0.fit(train2_x_0, train_y_0)
rf_1.fit(train2_x_1, train_y_1)
rf_2.fit(train2_x_2, train_y_2)
rf_3.fit(train2_x_3, train_y_3)

RandomForestRegressor(n_estimators=1000, random_state=42)

# Predictions

In [19]:
# Split the prediction table into one independent copy per cluster label.
# Rows whose 'Cluster' value is not 0-3 (including NaN) land in none of them.
queen_predict_0 = queen_predict[queen_predict['Cluster'] == 0].copy(deep=True)
queen_predict_1 = queen_predict[queen_predict['Cluster'] == 1].copy(deep=True)
queen_predict_2 = queen_predict[queen_predict['Cluster'] == 2].copy(deep=True)
queen_predict_3 = queen_predict[queen_predict['Cluster'] == 3].copy(deep=True)


In [20]:
# Predict with each cluster's forest, passing exactly the columns (and column
# order) that the forest was refitted on.
predictions_0 = rf_0.predict(queen_predict_0[['food_fruit']])
predictions_1 = rf_1.predict(queen_predict_1[['weath_minTemp']])
predictions_2 = rf_2.predict(queen_predict_2[['weath_solar', 'population']])
predictions_3 = rf_3.predict(
    queen_predict_3[['weath_accuRainfall', 'weath_days_rain1mm', 'weath_maxWindM']]
)

In [21]:
# Store each cluster's predictions next to the rows they were computed from,
# in a new 'nests_2020' column.
queen_predict_0 = queen_predict_0.assign(nests_2020=predictions_0)
queen_predict_1 = queen_predict_1.assign(nests_2020=predictions_1)
queen_predict_2 = queen_predict_2.assign(nests_2020=predictions_2)
queen_predict_3 = queen_predict_3.assign(nests_2020=predictions_3)

In [22]:
# Per cluster: sum the row-level predictions for each municipality, round the
# totals, and drop any row that still contains a missing value.
HEX_0 = (
    queen_predict_0[['municip_code', 'municip_name', 'nests_2020']]
    .groupby(by=['municip_code', 'municip_name'], as_index=False)
    .sum()
    .round()
    .dropna()
)
HEX_1 = (
    queen_predict_1[['municip_code', 'municip_name', 'nests_2020']]
    .groupby(by=['municip_code', 'municip_name'], as_index=False)
    .sum()
    .round()
    .dropna()
)
HEX_2 = (
    queen_predict_2[['municip_code', 'municip_name', 'nests_2020']]
    .groupby(by=['municip_code', 'municip_name'], as_index=False)
    .sum()
    .round()
    .dropna()
)
HEX_3 = (
    queen_predict_3[['municip_code', 'municip_name', 'nests_2020']]
    .groupby(by=['municip_code', 'municip_name'], as_index=False)
    .sum()
    .round()
    .dropna()
)


In [23]:
# Hand-entered nest counts for six municipalities, added to the model output
# when the final table is assembled.
# NOTE(review): presumably these municipalities are absent from the
# per-cluster predictions -- confirm, and record where the values come from.
# The fifth name was stored as mis-decoded text ("Ã±" in place of "ñ"); it is
# spelled correctly here so the submission file carries the real name.
aux = pd.DataFrame({
    "municip_code": [48020, 48022, 48071, 48088, 48074, 48051],
    "municip_name": [
        'Bilbao',
        'Karrantza Harana/Valle de Carranza',
        'Muskiz',
        'Ubide',
        'Urduña/Orduña',
        'Lanestosa',
    ],
    "nests_2020": [0, 1, 0, 0, 1, 1],
})

In [24]:
# Stack the four per-cluster tables and the manual rows into one table with a
# fresh 0..n-1 index. pd.concat is used because DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0.
HEX = pd.concat([HEX_0, HEX_1, HEX_2, HEX_3, aux], ignore_index=True)

In [25]:
# Rename the three columns to the headers used in the output file.
HEX = HEX.rename(
    columns={
        'municip_code': 'CODIGO MUNICIPIO',
        'municip_name': 'NOMBRE MUNICIPIO',
        'nests_2020': 'NIDOS 2020',
    }
)

In [26]:
# Write the final table to the working directory, without the row index.
HEX.to_csv(
    path_or_buf='WaspBusters_20210519_RandomForestmonths.csv',
    index=False,
)