# Random Forest Model (months) with previous clustering
# In preparation for the May 19th submission to the Kopuru challenge

In [44]:
# Base packages -----------------------------------
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Linear Regression -------------------------------
from statsmodels.formula.api import ols

# SKLearn -----------------------------------------
from sklearn.model_selection import train_test_split


# Random Forest Regressort------------------------
from sklearn.ensemble import RandomForestRegressor

In [45]:
## Functions

In [46]:
# Function that checks if final Output is ready for submission or needs revision   

def check_data(HEX):

    if HEX.shape == (112, 3):
        print(HEX.shape,": Shape is correct.")
    else:
        print(HEX.shape,": Shape is **INCORRECT!**")

    if HEX["CODIGO MUNICIPIO"].nunique() == 112:
        print(HEX["CODIGO MUNICIPIO"].nunique(),": Number of unique municipalities is correct.")
    else:
        print(HEX["CODIGO MUNICIPIO"].nunique(),": Number of unique municipalities is **INCORRECT!**")
        
    if any(HEX["NIDOS 2020"] < 0):
        print("**INCORRECT!** At least one municipality has NESTS <= 0.")
    else:
        print("Great! All municipalities have NESTS >= 0.")
    
    print("The Total 2020 Nests' Prediction is", int(HEX["NIDOS 2020"].sum()))

# Loading and massaging data

In [4]:
# Importing datasets from GitHub as Pandas Dataframes
queen_train = pd.read_csv("../Feeder_years/WBds03_QUEENtrainYEARS.csv") #2018+2019 test df
queen_predict = pd.read_csv("../Feeder_years/WBds03_QUEENpredictYEARS.csv") #2018+2019 test df
queen_cluster = pd.read_csv("../../../Other_open_data/cluster.csv") 

In [7]:
queen_train.head()

Unnamed: 0,municip_name,municip_code,year_offset,NESTS,colonies_amount,food_fruit,food_apple,food_txakoli,food_kiwi,food_pear,...,rain_max_day,sun,temp_avg,temp_max_abs,temp_max_avg,temp_min_abs,wind_avg,wind_max,wind_max_avg,population
0,Abadiño,48001,2017,54,160.0,0.0,0.0,0.0,0.0,0.0,...,63.1,12.529042,13.283333,38.4,26.1,0.8,10.868514,82.102336,40.190008,7522
1,Abadiño,48001,2018,22,160.0,0.0,0.0,0.0,0.0,0.0,...,71.2,12.519172,13.675,37.0,26.6,2.2,11.036587,76.642772,40.302092,7533
2,Abanto y Ciérvana-Abanto Zierbena,48002,2017,92,161.0,0.0,0.0,0.0,0.0,0.0,...,74.9,11.183333,13.433333,34.1,23.6,2.9,11.541667,142.9,59.4,9543
3,Abanto y Ciérvana-Abanto Zierbena,48002,2018,47,161.0,0.0,0.0,0.0,0.0,0.0,...,43.3,12.086274,13.7,36.0,24.8,3.0,12.816667,118.1,70.7,9545
4,Ajangiz,48911,2017,19,0.0,17.0,1.0,3.0,15.0,0.0,...,47.0,12.526743,14.033333,37.8,25.6,2.0,11.07345,80.263148,40.287502,474


In [8]:
queen_train = queen_train.loc[queen_train.year_offset == 2018,:].copy(True)

In [9]:
# Adding cluster labels
queen_train = pd.merge(queen_train, queen_cluster, how = 'left', left_on = 'municip_code', right_on = 'CODIGO MUNICIPIO')
queen_predict = pd.merge(queen_predict, queen_cluster, how = 'left', left_on = 'municip_code', right_on = 'CODIGO MUNICIPIO')

In [10]:
queen_train.columns

Index(['municip_name', 'municip_code', 'year_offset', 'NESTS',
       'colonies_amount', 'food_fruit', 'food_apple', 'food_txakoli',
       'food_kiwi', 'food_pear', 'food_blueberry', 'food_raspberry',
       'station_code', 'freez', 'hum', 'lev_max', 'lev_mid', 'lev_min', 'rain',
       'rain_1mm', 'rain_cum', 'rain_max_10', 'rain_max_day', 'sun',
       'temp_avg', 'temp_max_abs', 'temp_max_avg', 'temp_min_abs', 'wind_avg',
       'wind_max', 'wind_max_avg', 'population', 'CODIGO MUNICIPIO',
       'Cluster'],
      dtype='object')

In [12]:
queen_train.drop(columns=['year_offset','municip_name','municip_code','station_code','CODIGO MUNICIPIO'], inplace=True)

# Model

In [13]:
# Instantiate the models with 100 decision trees
rf_0 = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf_1 = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf_2 = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf_3 = RandomForestRegressor(n_estimators = 1000, random_state = 42)

In [14]:
# Train the model by cluster

train_x_0 = queen_train.loc[queen_train.Cluster == 0 ,:].drop('NESTS', axis = 1)
train_y_0 = queen_train.loc[queen_train.Cluster == 0 ,:].loc[:, 'NESTS']

train_x_1 = queen_train.loc[queen_train.Cluster == 1 ,:].drop('NESTS', axis = 1)
train_y_1 = queen_train.loc[queen_train.Cluster == 1 ,:].loc[:, 'NESTS']

train_x_2 = queen_train.loc[queen_train.Cluster == 2 ,:].drop('NESTS', axis = 1)
train_y_2 = queen_train.loc[queen_train.Cluster == 2 ,:].loc[:, 'NESTS']

train_x_3 = queen_train.loc[queen_train.Cluster == 3 ,:].drop('NESTS', axis = 1)
train_y_3 = queen_train.loc[queen_train.Cluster == 3 ,:].loc[:, 'NESTS']

rf_0.fit(train_x_0, train_y_0)
rf_1.fit(train_x_1, train_y_1)
rf_2.fit(train_x_2, train_y_2)
rf_3.fit(train_x_3, train_y_3)


RandomForestRegressor(n_estimators=1000, random_state=42)

# Feature importance

In [15]:
# Get numerical feature importances
feature_list_0 = list(train_x_0.columns)

# List of tuples with variable and importance
importances_0 = list(rf_0.feature_importances_)
feature_importances_0 = [(feature, round(importance, 4)) for feature, importance in zip(feature_list_0, importances_0)]

# Sort the feature importances by most important first
feature_importances_0 = sorted(feature_importances_0, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances_0];

Variable: food_fruit           Importance: 0.2054
Variable: population           Importance: 0.1825
Variable: food_txakoli         Importance: 0.1184
Variable: lev_max              Importance: 0.0426
Variable: lev_mid              Importance: 0.037
Variable: hum                  Importance: 0.0361
Variable: wind_avg             Importance: 0.0354
Variable: temp_max_abs         Importance: 0.0318
Variable: rain_max_day         Importance: 0.0311
Variable: temp_max_avg         Importance: 0.0268
Variable: colonies_amount      Importance: 0.0223
Variable: temp_avg             Importance: 0.0213
Variable: rain_max_10          Importance: 0.0209
Variable: wind_max_avg         Importance: 0.0185
Variable: lev_min              Importance: 0.0184
Variable: sun                  Importance: 0.0178
Variable: rain_1mm             Importance: 0.017
Variable: food_kiwi            Importance: 0.0165
Variable: food_apple           Importance: 0.016
Variable: food_blueberry       Importance: 0.016
Vari

In [16]:
# Get numerical feature importances
feature_list_1 = list(train_x_1.columns)

# List of tuples with variable and importance
importances_1 = list(rf_1.feature_importances_)
feature_importances_1 = [(feature, round(importance, 4)) for feature, importance in zip(feature_list_1, importances_1)]

# Sort the feature importances by most important first
feature_importances_1 = sorted(feature_importances_1, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances_1];

Variable: colonies_amount      Importance: 0.1478
Variable: rain_cum             Importance: 0.1162
Variable: food_fruit           Importance: 0.116
Variable: population           Importance: 0.0864
Variable: temp_max_avg         Importance: 0.0741
Variable: temp_min_abs         Importance: 0.0422
Variable: food_txakoli         Importance: 0.0363
Variable: rain_1mm             Importance: 0.0357
Variable: sun                  Importance: 0.0326
Variable: lev_max              Importance: 0.03
Variable: wind_avg             Importance: 0.0271
Variable: rain_max_10          Importance: 0.0268
Variable: lev_mid              Importance: 0.0263
Variable: wind_max             Importance: 0.0251
Variable: food_kiwi            Importance: 0.0239
Variable: lev_min              Importance: 0.0236
Variable: wind_max_avg         Importance: 0.0236
Variable: temp_avg             Importance: 0.0206
Variable: temp_max_abs         Importance: 0.0194
Variable: rain_max_day         Importance: 0.0184
Var

In [17]:
# Get numerical feature importances
feature_list_2 = list(train_x_2.columns)

# List of tuples with variable and importance
importances_2 = list(rf_2.feature_importances_)
feature_importances_2 = [(feature, round(importance, 4)) for feature, importance in zip(feature_list_2, importances_2)]

# Sort the feature importances by most important first
feature_importances_2 = sorted(feature_importances_2, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances_2];

Variable: population           Importance: 0.3192
Variable: rain_max_day         Importance: 0.0724
Variable: lev_min              Importance: 0.0696
Variable: hum                  Importance: 0.068
Variable: lev_mid              Importance: 0.0604
Variable: rain_1mm             Importance: 0.0582
Variable: temp_max_abs         Importance: 0.0553
Variable: food_fruit           Importance: 0.0513
Variable: temp_avg             Importance: 0.0468
Variable: food_txakoli         Importance: 0.0413
Variable: lev_max              Importance: 0.0386
Variable: sun                  Importance: 0.0302
Variable: temp_max_avg         Importance: 0.0202
Variable: food_kiwi            Importance: 0.0149
Variable: wind_max             Importance: 0.0132
Variable: food_apple           Importance: 0.0077
Variable: wind_max_avg         Importance: 0.0077
Variable: food_pear            Importance: 0.0054
Variable: temp_min_abs         Importance: 0.0045
Variable: wind_avg             Importance: 0.0039
V

In [18]:
# Get numerical feature importances
feature_list_3 = list(train_x_3.columns)

# List of tuples with variable and importance
importances_3 = list(rf_3.feature_importances_)
feature_importances_3 = [(feature, round(importance, 4)) for feature, importance in zip(feature_list_3, importances_3)]

# Sort the feature importances by most important first
feature_importances_3 = sorted(feature_importances_3, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances_3];

Variable: food_apple           Importance: 0.1493
Variable: lev_min              Importance: 0.1408
Variable: colonies_amount      Importance: 0.1265
Variable: wind_max_avg         Importance: 0.077
Variable: lev_max              Importance: 0.0588
Variable: wind_max             Importance: 0.0587
Variable: temp_max_avg         Importance: 0.0525
Variable: hum                  Importance: 0.0375
Variable: rain_1mm             Importance: 0.0317
Variable: rain_cum             Importance: 0.0316
Variable: rain_max_day         Importance: 0.0273
Variable: food_kiwi            Importance: 0.0247
Variable: rain                 Importance: 0.0241
Variable: lev_mid              Importance: 0.0228
Variable: population           Importance: 0.021
Variable: sun                  Importance: 0.0201
Variable: temp_max_abs         Importance: 0.018
Variable: food_fruit           Importance: 0.0171
Variable: rain_max_10          Importance: 0.0146
Variable: wind_avg             Importance: 0.0112
Var

# New model with relevant variables

In [65]:
# Train new models by cluster

train2_x_0 = queen_train.loc[queen_train.Cluster == 0 ,['food_fruit', 'food_txakoli', 'population', 'wind_avg', 'hum']]   
train2_x_1 = queen_train.loc[queen_train.Cluster == 1 ,['colonies_amount','rain_cum','food_fruit', 'temp_max_avg', 'temp_min_abs', 'food_txakoli']]
train2_x_2 = queen_train.loc[queen_train.Cluster == 2 ,['population', 'rain_max_day', 'rain_1mm', 'hum', 'temp_max_abs', 'food_fruit']] 
train2_x_3 = queen_train.loc[queen_train.Cluster == 3 ,['food_apple', 'colonies_amount','wind_max_avg', 'temp_max_avg']]
 
rf_0.fit(train2_x_0, train_y_0)
rf_1.fit(train2_x_1, train_y_1)
rf_2.fit(train2_x_2, train_y_2)
rf_3.fit(train2_x_3, train_y_3)

RandomForestRegressor(n_estimators=1000, random_state=42)

# Predictions

In [66]:
queen_predict_0 = queen_predict.loc[queen_predict.Cluster == 0, :].copy(True)
queen_predict_1 = queen_predict.loc[queen_predict.Cluster == 1 , :].copy(True)
queen_predict_2 = queen_predict.loc[queen_predict.Cluster == 2 ,:].copy(True)
queen_predict_3 = queen_predict.loc[queen_predict.Cluster == 3 ,:].copy(True)


In [67]:
predictions_0 = rf_0.predict(queen_predict_0.loc[:,['food_fruit', 'food_txakoli', 'population', 'wind_avg', 'hum']])
predictions_1 = rf_1.predict(queen_predict_1.loc[:,['colonies_amount','rain_cum','food_fruit', 'temp_max_avg', 'temp_min_abs', 'food_txakoli']])
predictions_2 = rf_2.predict(queen_predict_2.loc[:,['population', 'rain_max_day', 'rain_1mm', 'hum', 'temp_max_abs', 'food_fruit']])
predictions_3 = rf_3.predict(queen_predict_3.loc[:,['food_apple', 'colonies_amount','wind_max_avg', 'temp_max_avg']])

In [68]:
queen_predict_0['nests_2020'] = predictions_0
queen_predict_1['nests_2020'] = predictions_1
queen_predict_2['nests_2020'] = predictions_2
queen_predict_3['nests_2020'] = predictions_3

In [69]:
HEX_0 = queen_predict_0.loc[:,['municip_code','municip_name','nests_2020']].groupby(by=['municip_code','municip_name'], as_index=False).sum().round().dropna()
HEX_1 = queen_predict_1.loc[:,['municip_code','municip_name','nests_2020']].groupby(by=['municip_code','municip_name'], as_index=False).sum().round().dropna()
HEX_2 = queen_predict_2.loc[:,['municip_code','municip_name','nests_2020']].groupby(by=['municip_code','municip_name'], as_index=False).sum().round().dropna()
HEX_3 = queen_predict_3.loc[:,['municip_code','municip_name','nests_2020']].groupby(by=['municip_code','municip_name'], as_index=False).sum().round().dropna()


In [70]:
aux = pd.DataFrame({"municip_code":[48020, 48022, 48071, 48088,48074,48051],
                    "municip_name":['Bilbao','Karrantza Harana/Valle de Carranza','Muskiz', 'Ubide','UrduÃ±a/OrduÃ±a','Lanestosa'],
                    "nests_2020":[0, 1, 0, 0, 1, 1]})

In [71]:
HEX = HEX_0.append(HEX_1, ignore_index = True).append(HEX_2, ignore_index = True).append(HEX_3, ignore_index = True).append(aux, ignore_index = True)

In [72]:
HEX.loc[HEX.municip_code.isin(['48015','48036']),'nests_2020'] = 0

In [73]:
HEX.columns = ['CODIGO MUNICIPIO','NOMBRE MUNICIPIO','NIDOS 2020']

In [74]:
check_data(HEX)

(112, 3) : Shape is correct.
112 : Number of unique municipalities is correct.
Great! All municipalities have NESTS >= 0.
The Total 2020 Nests' Prediction is 1945


In [75]:
HEX.to_csv('WaspBusters_20210526_RandomForestyears_v1.csv', index=False)