# Random Forest Model: years with clusters done by MB.


In [229]:
# Base packages -----------------------------------
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Linear Regression -------------------------------
from statsmodels.formula.api import ols

# SKLearn -----------------------------------------
from sklearn.model_selection import train_test_split


# Random Forest Regressor -------------------------
from sklearn.ensemble import RandomForestRegressor

## Functions

In [230]:
# Function that checks if final Output is ready for submission or needs revision   

def check_data(HEX, expected_municipalities=112):
    """Print a sanity report for the final submission table.

    Parameters
    ----------
    HEX : pandas.DataFrame
        Submission table. Must contain the columns "CODIGO MUNICIPIO" and
        "NIDOS 2020"; the expected layout is three columns in total.
    expected_municipalities : int, default 112
        Expected number of rows and of distinct "CODIGO MUNICIPIO" values.

    Returns
    -------
    None
        Every check result is printed; nothing is returned or raised by the
        checks themselves.
    """
    expected_shape = (expected_municipalities, 3)

    if HEX.shape == expected_shape:
        print(HEX.shape,": Shape is correct.")
    else:
        print(HEX.shape,": Shape is **INCORRECT!**")

    unique_municipalities = HEX["CODIGO MUNICIPIO"].nunique()
    if unique_municipalities == expected_municipalities:
        print(unique_municipalities,": Number of unique municipalities is correct.")
    else:
        print(unique_municipalities,": Number of unique municipalities is **INCORRECT!**")

    # Zero nests is a valid prediction; only strictly negative values are
    # rejected, so the failure message must say "< 0" (not "<= 0").
    if (HEX["NIDOS 2020"] < 0).any():
        print("**INCORRECT!** At least one municipality has NESTS < 0.")
    else:
        print("Great! All municipalities have NESTS >= 0.")

    print("The Total 2020 Nests' Prediction is", int(HEX["NIDOS 2020"].sum()))

# Loading and massaging data

In [231]:
# Load the three input CSV files from the sibling "Feeder_years" directory
# (paths are relative to this notebook's working directory).
#   queen_train   -> labelled rows (includes the NESTS target column)
#   queen_predict -> rows the final model is asked to predict on
#   queen_cluster -> cluster label per municip_code, merged in further below
queen_train = pd.read_csv(filepath_or_buffer="../Feeder_years/WBds03_QUEENtrainYEARS.csv")
queen_predict = pd.read_csv(filepath_or_buffer="../Feeder_years/WBds03_QUEENpredictYEARS.csv")
queen_cluster = pd.read_csv(filepath_or_buffer="../Feeder_years/WBds_CLUSTERSnests.csv")

In [232]:
# Preview the first rows of the raw training frame (one row per municipality and year_offset).
queen_train.head()

Unnamed: 0,municip_name,municip_code,year_offset,NESTS,colonies_amount,food_fruit,food_apple,food_txakoli,food_kiwi,food_pear,...,weath_meanDayMaxWind,population,cluster_size,cluster_cosmo,cluster_survive,cluster_food,cluster_weather_wet,cluster_weather_temp,cluster_weather_wind,cluster_weather_level
0,Abadiño,48001,2017,54,160.0,0.0,0.0,0.0,0.0,0.0,...,40.190008,7522,0,0,0,0,1,1,0,1
1,Abadiño,48001,2018,22,160.0,0.0,0.0,0.0,0.0,0.0,...,40.302092,7533,0,0,0,0,1,1,0,1
2,Abanto y Ciérvana-Abanto Zierbena,48002,2017,92,161.0,0.0,0.0,0.0,0.0,0.0,...,59.4,9543,4,1,1,0,1,1,1,1
3,Abanto y Ciérvana-Abanto Zierbena,48002,2018,47,161.0,0.0,0.0,0.0,0.0,0.0,...,70.7,9545,4,1,1,0,1,1,1,1
4,Ajangiz,48911,2017,19,0.0,17.0,1.0,3.0,15.0,0.0,...,40.287502,474,1,0,0,0,1,1,0,1


In [233]:
# Restrict the training frame to the rows whose year_offset is 2018; take a
# deep copy so later edits never write into a view of the original frame.
is_2018 = queen_train["year_offset"] == 2018
queen_train = queen_train[is_2018].copy(deep=True)

In [234]:
# Attach the cluster table to both frames with a left join on municip_code.
# The join duplicates municip_name into municip_name_x / municip_name_y and
# adds the Cluster column. NOTE: running this cell twice merges a second time.
queen_train = queen_train.merge(queen_cluster, how="left", on="municip_code")
queen_predict = queen_predict.merge(queen_cluster, how="left", on="municip_code")

In [235]:
# List the column labels after the merge (expect municip_name_x, municip_name_y and Cluster).
queen_train.columns

Index(['municip_name_x', 'municip_code', 'year_offset', 'NESTS',
       'colonies_amount', 'food_fruit', 'food_apple', 'food_txakoli',
       'food_kiwi', 'food_pear', 'food_blueberry', 'food_raspberry',
       'station_code', 'weath_days_frost', 'weath_humidity', 'weath_maxLevel',
       'weath_midLevel', 'weath_minLevel', 'weath_days_rain',
       'weath_days_rain1mm', 'weath_accuRainfall', 'weath_10minRainfall',
       'weath_1dayRainfall', 'weath_solar', 'weath_meanTemp', 'weath_maxTemp',
       'weath_maxMeanTemp', 'weath_minTemp', 'weath_meanWindM',
       'weath_maxWindM', 'weath_meanDayMaxWind', 'population', 'cluster_size',
       'cluster_cosmo', 'cluster_survive', 'cluster_food',
       'cluster_weather_wet', 'cluster_weather_temp', 'cluster_weather_wind',
       'cluster_weather_level', 'municip_name_y', 'Cluster'],
      dtype='object')

In [236]:
# Spot-check the first two merged rows.
queen_train.head(2)

Unnamed: 0,municip_name_x,municip_code,year_offset,NESTS,colonies_amount,food_fruit,food_apple,food_txakoli,food_kiwi,food_pear,...,cluster_size,cluster_cosmo,cluster_survive,cluster_food,cluster_weather_wet,cluster_weather_temp,cluster_weather_wind,cluster_weather_level,municip_name_y,Cluster
0,Abadiño,48001,2018,22,160.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,1,0,1,Abadiño,2.0
1,Abanto y Ciérvana-Abanto Zierbena,48002,2018,47,161.0,0.0,0.0,0.0,0.0,0.0,...,4,1,1,0,1,1,1,1,Abanto y Ciérvana-Abanto Zierbena,2.0


In [237]:
# (rows, columns) of the merged 2018 training frame.
queen_train.shape

(112, 42)

In [238]:
# Feature matrix: remove the target, the identifier/name columns and the
# older cluster_* columns, then discard every row that still has a missing
# value in one of the remaining features.
non_feature_columns = [
    'NESTS',
    'year_offset',
    'municip_name_x',
    'municip_name_y',
    'municip_code',
    'station_code',
    'cluster_size',
    'cluster_cosmo',
    'cluster_survive',
    'cluster_food',
    'cluster_weather_wet',
    'cluster_weather_temp',
    'cluster_weather_wind',
    'cluster_weather_level',
]
X = queen_train.drop(columns=non_feature_columns).copy(deep=True).dropna()

In [239]:
# (rows, features) of the feature matrix after dropping rows with missing values.
X.shape

(111, 28)

In [240]:
# Target vector. Select NESTS for exactly the rows that are present in X so
# features and target are guaranteed to be row-aligned. Calling dropna() on
# the full frame instead would filter on columns that X does not contain
# (names, station_code, cluster_* ...) and could silently keep or drop a
# different set of rows than X did.
y = queen_train.loc[X.index, 'NESTS']

In [241]:
# Number of target values; must equal the number of rows in X.
y.shape

(111,)

In [242]:
# dtype of the target Series.
y.dtype

dtype('int64')

# Model

In [243]:
# Build a random forest regressor with 1000 trees; the fixed random_state
# makes repeated fits on the same data reproducible.
forest_params = {"n_estimators": 1000, "random_state": 42}
model = RandomForestRegressor(**forest_params)

In [244]:
# Fit the forest on every candidate feature; this fit is only used to rank
# the features by importance in the next section.
model.fit(X=X, y=y)

RandomForestRegressor(n_estimators=1000, random_state=42)

# Feature importance

In [245]:
# Names of the features, in the column order the model was fitted on
feature_list = list(X.columns)

# Pair every feature with its importance, rounded to 4 decimals
importances = list(model.feature_importances_)
feature_importances = [(feature, round(importance, 4)) for feature, importance in zip(feature_list, importances)]

# Sort by importance, most important first (ties keep their original order)
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print one line per feature. A plain loop replaces the original list
# comprehension, which was used only for its print side effect and built a
# throwaway list of None values.
for variable, score in feature_importances:
    print('Variable: {:20} Importance: {}'.format(variable, score))

Variable: population           Importance: 0.4075
Variable: Cluster              Importance: 0.1664
Variable: food_fruit           Importance: 0.088
Variable: food_txakoli         Importance: 0.0548
Variable: weath_minLevel       Importance: 0.0276
Variable: colonies_amount      Importance: 0.0242
Variable: weath_midLevel       Importance: 0.0198
Variable: food_blueberry       Importance: 0.0159
Variable: weath_10minRainfall  Importance: 0.0152
Variable: weath_1dayRainfall   Importance: 0.014
Variable: weath_humidity       Importance: 0.0134
Variable: weath_days_rain1mm   Importance: 0.0134
Variable: weath_maxLevel       Importance: 0.0129
Variable: weath_solar          Importance: 0.0125
Variable: food_apple           Importance: 0.0121
Variable: weath_days_frost     Importance: 0.0108
Variable: weath_meanTemp       Importance: 0.0108
Variable: weath_minTemp        Importance: 0.0099
Variable: weath_maxTemp        Importance: 0.0098
Variable: weath_accuRainfall   Importance: 0.0095
Va

# New model with relevant variables

In [246]:
# Refit the SAME model object on the three top-ranked features only.
# This overwrites the full-feature fit from above, so the feature-importance
# cell must not be re-run after this one without refitting first.
# Next candidates by importance, currently left out: food_txakoli,
# weath_minLevel, colonies_amount, weath_midLevel, food_blueberry.
selected_features = ['population', 'Cluster', 'food_fruit']
X_1 = X[selected_features]
model.fit(X_1, y)

RandomForestRegressor(n_estimators=1000, random_state=42)

# Predictions

In [247]:
# Prediction frame: every row except municipality 48020 (added back by hand
# as "Bilbao" with 0 nests further below). Deep copy so that adding the
# prediction column does not touch queen_predict.
not_48020 = queen_predict["municip_code"] != 48020
X_test = queen_predict[not_48020].copy(deep=True)

In [248]:
# Predict with the same three features, in the same order, that the reduced
# model was fitted on.
prediction_inputs = X_test[['population', 'Cluster', 'food_fruit']]
predictions = model.predict(prediction_inputs)

In [249]:
# Store the model output next to the rows it was computed from (adds a column to X_test in place).
X_test['nests_2020'] = predictions

In [250]:
# Collapse to one row per municipality: sum the predicted nests per
# (municip_code, municip_name_x), round to whole numbers and drop any row
# left with a missing value.
HEX = (
    X_test[['municip_code', 'municip_name_x', 'nests_2020']]
    .groupby(['municip_code', 'municip_name_x'], as_index=False)
    .sum()
    .round()
    .dropna()
)

In [251]:
# (rows, columns) of the aggregated predictions, before municipality 48020 is added back.
HEX.shape

(111, 3)

In [252]:
# Single-row frame for municipality 48020 ("Bilbao"), which was excluded
# from the model predictions; it is given 0 nests by hand.
aux = pd.DataFrame([
    {"municip_code": 48020, "municip_name_x": 'Bilbao', "nests_2020": 0},
])

In [253]:
# Add the hand-made row for municipality 48020 to the predictions.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat with ignore_index=True produces the same frame on every version.
# NOTE: running this cell twice adds the row twice.
HEX = pd.concat([HEX, aux], ignore_index=True)

In [226]:
# Manual overrides of the predicted nest counts for specific municipality
# codes. TODO: the rationale for these codes and values is not recorded in
# this notebook - document where they come from.
manual_nests = (
    (0, [48020, 48071, 48088]),
    (1, [48022, 48074, 48051]),
)
for forced_value, codes in manual_nests:
    HEX.loc[HEX["municip_code"].isin(codes), 'nests_2020'] = forced_value

In [227]:
# Relabel the three columns, by position, with the headers expected in the
# submission file (municipality code, municipality name, 2020 nests).
HEX = HEX.set_axis(['CODIGO MUNICIPIO', 'NOMBRE MUNICIPIO', 'NIDOS 2020'], axis=1)

In [228]:
# Final sanity report: shape, number of distinct municipalities, negative values and total nests.
check_data(HEX)

(112, 3) : Shape is correct.
112 : Number of unique municipalities is correct.
Great! All municipalities have NESTS >= 0.
The Total 2020 Nests' Prediction is 1930


In [75]:
# Write the submission table to the current working directory, without the
# DataFrame index column.
HEX.to_csv(path_or_buf='WaspBusters_20210526_RandomForestyears_v1.csv', index=False)