# HEX algorithm **Kopuru Vespa Velutina Competition**

**XGBoost model**

Purpose: Predict the number of Nests in each of Biscay's 112 municipalities for the year 2020.

Output: *(WaspBusters_20210609_batch_XGBy_48019prodigal.csv)*

@authors:
* mario.bejar@student.ie.edu
* pedro.geirinhas@student.ie.edu
* a.berrizbeitia@student.ie.edu
* pcasaverde@student.ie.edu

## Libraries

In [1]:
# Base packages -----------------------------------
import numpy as np
import pandas as pd

# Visualization -----------------------------------
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (15, 10)
import seaborn as sns
plt.style.use("seaborn-notebook")

# Scaling data ------------------------------------
from sklearn import preprocessing

# Grid search -------------------------------------
from sklearn.model_selection import GridSearchCV

# Confusion matrix --------------------------------
#from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# XGBoost -----------------------------------------
from xgboost import XGBRegressor
from xgboost import plot_importance

## Functions

In [2]:
# Function that checks if final Output is ready for submission or needs revision   

def check_data(HEX):
    
    def template_checker(HEX):
        submission_df = (HEX["CODIGO MUNICIPIO"].astype("string") + HEX["NOMBRE MUNICIPIO"]).sort_values().reset_index(drop=True)
        template_df = (template["CODIGO MUNICIPIO"].astype("string") + template["NOMBRE MUNICIPIO"]).sort_values().reset_index(drop=True)
        check_df = pd.DataFrame({"submission_df":submission_df,"template_df":template_df})
        check_df["check"] = check_df.submission_df == check_df.template_df
        if (check_df.check == False).any():
            pd.options.display.max_rows = 112
            return check_df.loc[check_df.check == False,:]
        else:  
            return "All Municipality Names and Codes to be submitted match the Template"
    
    print("Submission form Shape is", HEX.shape)
    print("Number of Municipalities is", HEX["CODIGO MUNICIPIO"].nunique())
    print("The Total 2020 Nests' Prediction is", int(HEX["NIDOS 2020"].sum()))

    assert HEX.shape == (112, 3), "Error: Shape is incorrect."
    assert HEX["CODIGO MUNICIPIO"].nunique() == 112, "Error: Number of unique municipalities is correct."    
    return template_checker(HEX)

## Get the data

In [3]:
QUEEN_train = pd.read_csv('../Feeder_years/WBds03_QUEENtrainYEARS.csv', sep=',')
QUEEN_predict = pd.read_csv('../Feeder_years/WBds03_QUEENpredictYEARS.csv', sep=',')

clustersMario = pd.read_csv("../auxiliary_files/WBds_CLUSTERSnests.csv")

template = pd.read_csv("../../../Input_open_data/ds01_PLANTILLA-RETO-AVISPAS-KOPURU.csv",sep=";", encoding="utf-8")

In [4]:
#QUEEN_predict.isnull().sum()

In [5]:
QUEEN_train.shape

(224, 40)

In [6]:
QUEEN_predict.shape

(112, 40)

### Add in more Clusters (nest amount clusters)

In [7]:
QUEEN_train = pd.merge(QUEEN_train, clustersMario, how = 'left', on = ['municip_code', 'municip_name'])
QUEEN_predict = pd.merge(QUEEN_predict, clustersMario, how = 'left', on = ['municip_code', 'municip_name'])

In [8]:
QUEEN_train.fillna(4, inplace=True)
QUEEN_predict.fillna(4, inplace=True)

In [9]:
QUEEN_train.shape

(224, 41)

In [10]:
QUEEN_predict.shape

(112, 41)

In [11]:
QUEEN_predict.Cluster.value_counts()

0.0    66
2.0    41
1.0     4
4.0     1
Name: Cluster, dtype: int64

## Prediction time!

### 1. Choose the model class

In [12]:
XGBRegressor

xgboost.sklearn.XGBRegressor

### 2. Instantiate the model

In [13]:
xgb = XGBRegressor(random_state=23)

### 3. Prepare Feature matrix and Target variable

In [14]:
# The target variable
y_train = QUEEN_train.loc[:, ['municip_code', 'year_offset', 'NESTS']]
y_train.set_index(['year_offset', 'municip_code'], inplace=True)

y_predict = QUEEN_predict.loc[:, ['municip_code', 'year_offset', 'NESTS']]
y_predict.set_index(['year_offset', 'municip_code'], inplace=True)

# The features matrix
X_train = QUEEN_train.drop(['municip_name', 'station_code', 'NESTS'], axis=1)
X_train.set_index(['year_offset', 'municip_code'], inplace=True)

X_predict = QUEEN_predict.drop(['municip_name', 'station_code', 'NESTS'], axis=1)
X_predict.set_index(['year_offset', 'municip_code'], inplace=True)

In [15]:
X_train.shape

(224, 36)

In [16]:
y_train.shape

(224, 1)

In [17]:
X_predict.shape

(112, 36)

In [18]:
y_predict.shape

(112, 1)

### 4. Fit the model to the training data sets

#### Scale and get feature importance

In [19]:
#X = X_train
#y = y_train
#scalators = X.columns
#X[scalators] = preprocessing.minmax_scale(X[scalators])

In [20]:
# define the model
#model_fi = XGBRegressor(random_state=23)

# fit the model
#model_fi.fit(X, y)

In [21]:
# get importance
#importance = model_fi.feature_importances_
# summarize feature importance
#for i,v in enumerate(importance):
#	print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
#plot_importance(model_fi, height=0.5, xlabel="F-Score", ylabel="Feature Importance", grid=False)
#plt.show()

#### Now, do fit the model but only with the relevant features

In [22]:
X_train = X_train.loc[:, ['population', 'food_fruit', 'colonies_amount', 'weath_days_frost', 'weath_humidity', 'food_txakoli', 'food_apple']]

X_predict = X_predict.loc[:, ['population', 'food_fruit', 'colonies_amount', 'weath_days_frost', 'weath_humidity', 'food_txakoli', 'food_apple']]

In [23]:
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=6, num_parallel_tree=1, random_state=23,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

### 5. Predict the labels for new data

In [24]:
y_predict = xgb.predict(X_predict)

In [25]:
accuracy_train = xgb.score(X_train, y_train)
print(f"Accuracy on the training set: {accuracy_train:.0%}")

Accuracy on the training set: 100%


In [26]:
accuracy_predict = xgb.score(X_predict, y_predict)
print(f"Accuracy on the test set: {accuracy_predict:.0%}")

Accuracy on the test set: 100%


In [27]:
y_predict.shape

(112,)

In [28]:
QUEEN_predict['NESTS'] = y_predict

In [29]:
QUEEN_predict.NESTS.sum()

2745.5986

In [30]:
QUEEN_predict.NESTS[QUEEN_predict.NESTS < 0] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  QUEEN_predict.NESTS[QUEEN_predict.NESTS < 0] = 0


In [31]:
QUEEN_predict.NESTS.sum()

2745.755

## Prepare the dataset for submission

In [32]:
HEX = QUEEN_predict.loc[:,['municip_code', 'municip_name', 'NESTS']]

## Adjust manually for Bilbao 48020 and generate the output

In [33]:
HEX.loc[HEX.municip_code.isin([48020]), 'NESTS'] = 0

In [34]:
HEX.loc[HEX.municip_code.isin([48022, 48071, 48088, 48074, 48051, 48020]), :]

Unnamed: 0,municip_code,municip_name,NESTS
27,48020,Bilbao,0.0
60,48022,Karrantza Harana/Valle de Carranza,29.934254
62,48051,Lanestosa,4.323457
83,48071,Muskiz,14.258961
99,48088,Ubide,5.978913
102,48074,Urduña/Orduña,30.56222


In [35]:
HEX.columns = ["CODIGO MUNICIPIO", "NOMBRE MUNICIPIO", "NIDOS 2020"] # change column names to Spanish (Competition template)

In [36]:
check_data(HEX)

Submission form Shape is (112, 3)
Number of Municipalities is 112
The Total 2020 Nests' Prediction is 2745


'All Municipality Names and Codes to be submitted match the Template'

### Export dataset for submission

In [37]:
HEX.to_csv('WaspBusters_20210609_128-yXGB-prodigal-noGSCV-noSort-FI-no0s.csv', index=False)

## VERSION Manual adjustments

In [38]:
HEX.columns = ['municip_code', 'municip_name', 'NESTS'] # change column names to Spanish (Competition template)

In [39]:
HEX.loc[HEX.municip_code.isin([48022, 48071, 48088, 48074, 48051]), 'NESTS'] = [0,0,1,0,1]

In [40]:
HEX.loc[HEX.municip_code.isin([48022, 48071, 48088, 48074, 48051, 48020]), :]

Unnamed: 0,municip_code,municip_name,NESTS
27,48020,Bilbao,0.0
60,48022,Karrantza Harana/Valle de Carranza,0.0
62,48051,Lanestosa,0.0
83,48071,Muskiz,1.0
99,48088,Ubide,0.0
102,48074,Urduña/Orduña,1.0


In [41]:
HEX.columns = ["CODIGO MUNICIPIO", "NOMBRE MUNICIPIO", "NIDOS 2020"] # change column names to Spanish (Competition template)

In [42]:
check_data(HEX)

Submission form Shape is (112, 3)
Number of Municipalities is 112
The Total 2020 Nests' Prediction is 2662


'All Municipality Names and Codes to be submitted match the Template'

### Export dataset for submission

In [43]:
HEX.to_csv('WaspBusters_20210609_127-yXGB-prodigal-noGSCV-noSort-FI-0s.csv', index=False)