# HEX algorithm **Kopuru Vespa Velutina Competition**

**XGBoost model**

Purpose: Predict the number of Nests in each of Biscay's 112 municipalities for the year 2020.

Output: *(WaspBusters_20210609_batch_XGBy_48019prodigal.csv)*

@authors:
* mario.bejar@student.ie.edu
* pedro.geirinhas@student.ie.edu
* a.berrizbeitia@student.ie.edu
* pcasaverde@student.ie.edu

## Libraries

In [231]:
# Base packages -----------------------------------
import numpy as np
import pandas as pd

# Visualization -----------------------------------
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (15, 10)
import seaborn as sns
plt.style.use("seaborn-notebook")

# Scaling data ------------------------------------
from sklearn import preprocessing

# Grid search -------------------------------------
from sklearn.model_selection import GridSearchCV

# Confusion matrix --------------------------------
#from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# XGBoost -----------------------------------------
from xgboost import XGBRegressor
from xgboost import plot_importance

## Functions

In [232]:
# Function that checks if final Output is ready for submission or needs revision   

def check_data(HEX):
    
    def template_checker(HEX):
        submission_df = (HEX["CODIGO MUNICIPIO"].astype("string") + HEX["NOMBRE MUNICIPIO"]).sort_values().reset_index(drop=True)
        template_df = (template["CODIGO MUNICIPIO"].astype("string") + template["NOMBRE MUNICIPIO"]).sort_values().reset_index(drop=True)
        check_df = pd.DataFrame({"submission_df":submission_df,"template_df":template_df})
        check_df["check"] = check_df.submission_df == check_df.template_df
        if (check_df.check == False).any():
            pd.options.display.max_rows = 112
            return check_df.loc[check_df.check == False,:]
        else:  
            return "All Municipality Names and Codes to be submitted match the Template"
    
    print("Submission form Shape is", HEX.shape)
    print("Number of Municipalities is", HEX["CODIGO MUNICIPIO"].nunique())
    print("The Total 2020 Nests' Prediction is", int(HEX["NIDOS 2020"].sum()))

    assert HEX.shape == (112, 3), "Error: Shape is incorrect."
    assert HEX["CODIGO MUNICIPIO"].nunique() == 112, "Error: Number of unique municipalities is correct."    
    return template_checker(HEX)

## Get the data

In [233]:
QUEEN_train = pd.read_csv('../Feeder_months/WBds03_QUEENtrainMONTHS.csv', sep=',')
QUEEN_predict = pd.read_csv('../Feeder_months/WBds03_QUEENpredictMONTHS.csv', sep=',')

clustersMario = pd.read_csv("../auxiliary_files/WBds_CLUSTERSnests.csv")

template = pd.read_csv("../../../Input_open_data/ds01_PLANTILLA-RETO-AVISPAS-KOPURU.csv",sep=";", encoding="utf-8")

In [234]:
#QUEEN_predict.isnull().sum()

In [235]:
QUEEN_train.shape

(2688, 43)

In [236]:
QUEEN_predict.shape

(1344, 43)

### Add in more Clusters (nest amount clusters)

In [237]:
QUEEN_train = pd.merge(QUEEN_train, clustersMario, how = 'left', on = ['municip_code', 'municip_name'])
QUEEN_predict = pd.merge(QUEEN_predict, clustersMario, how = 'left', on = ['municip_code', 'municip_name'])

In [238]:
QUEEN_train.fillna(4, inplace=True)
QUEEN_predict.fillna(4, inplace=True)

In [239]:
QUEEN_train.shape

(2688, 44)

In [240]:
QUEEN_predict.shape

(1344, 44)

In [241]:
QUEEN_predict.Cluster.value_counts()

0.0    792
2.0    492
1.0     48
4.0     12
Name: Cluster, dtype: int64

## Get hyperparameters with GridsearchCV using 2018's features (i.e. 2019's nests) as the test year

In [242]:
# The target variable
hyper_y_train = QUEEN_train.loc[QUEEN_train.year_offset.isin([2017]), ['municip_code', 'year_offset', 'month', 'NESTS']]
hyper_y_train = hyper_y_train.sort_values(by=['year_offset', 'month', 'municip_code'], ascending=True)
hyper_y_train.set_index(['year_offset', 'month', 'municip_code'], inplace=True)

hyper_y_test = QUEEN_train.loc[QUEEN_train.year_offset.isin([2018]), ['municip_code', 'year_offset', 'month', 'NESTS']]
hyper_y_test = hyper_y_test.sort_values(by=['year_offset', 'month', 'municip_code'], ascending=True)
hyper_y_test.set_index(['year_offset', 'month', 'municip_code'], inplace=True)


# The features matrix
hyperXtrain = QUEEN_train.loc[QUEEN_train.year_offset.isin([2017]), :].drop(['municip_name', 'station_code', 'station_name', 'NESTS'], axis=1)
hyperXtrain = hyperXtrain.sort_values(by=['year_offset', 'month', 'municip_code'], ascending=True)
hyperXtrain.set_index(['year_offset', 'month', 'municip_code'], inplace=True)

hyperXtest = QUEEN_train.loc[QUEEN_train.year_offset.isin([2018]), :].drop(['municip_name', 'station_code', 'station_name', 'NESTS'], axis=1)
hyperXtest = hyperXtest.sort_values(by=['year_offset', 'month', 'municip_code'], ascending=True)
hyperXtest.set_index(['year_offset', 'month', 'municip_code'], inplace=True)

In [243]:
xgb1 = XGBRegressor(random_state=23)
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:linear'],
              'learning_rate': [.03, 0.05, .07], #so called `eta` value
              'max_depth': [5, 6, 7],
             'min_child_weight': [4],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500]}

xgb_grid = GridSearchCV(xgb1,
                        parameters,
                        cv = 3,
                        n_jobs = 5,
                        verbose=True)

xgb_grid.fit(hyperXtrain, hyper_y_train)

print(xgb_grid.best_score_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


-0.8732560531236078


In [244]:
print(xgb_grid.best_params_)

{'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}


In [245]:
y_xgb_grid = xgb_grid.best_estimator_.predict(hyperXtest)

In [246]:
#matrix = confusion_matrix(hyper_y_test, y_xgb_grid)

In [247]:
#ax = sns.heatmap(
#    matrix.T, square=True, annot=True, fmt="d", cbar=False, cmap="viridis",
#    xticklabels=["0", "1"], yticklabels=["0", "1"]
#)
#ax.set_xlabel("True label")
#ax.set_ylabel("Predicted label");

In [248]:
#print(classification_report(hyper_y_test, y_xgb_grid))

## Prediction time!

### 1. Choose the model class

In [249]:
XGBRegressor

xgboost.sklearn.XGBRegressor

### 2. Instantiate the model

In [250]:
xgb = xgb_grid.best_estimator_

### 3. Prepare Feature matrix and Target variable

In [251]:
# The target variable
y_train = QUEEN_train.loc[:, ['municip_code', 'year_offset', 'month', 'NESTS']]
y_train = y_train.sort_values(by=['year_offset', 'month', 'municip_code'], ascending=True)
y_train.set_index(['year_offset', 'month', 'municip_code'], inplace=True)

y_predict = QUEEN_predict.loc[:, ['municip_code', 'year_offset', 'month', 'NESTS']]
y_predict = y_predict.sort_values(by=['year_offset', 'month', 'municip_code'], ascending=True)
y_predict.set_index(['year_offset', 'month', 'municip_code'], inplace=True)

# The features matrix
X_train = QUEEN_train.drop(['municip_name', 'station_code', 'station_name', 'NESTS'], axis=1)
X_train = X_train.sort_values(by=['year_offset', 'month', 'municip_code'], ascending=True)
X_train.set_index(['year_offset', 'month', 'municip_code'], inplace=True)

X_predict = QUEEN_predict.drop(['municip_name', 'station_code', 'station_name', 'NESTS'], axis=1)
X_predict = X_predict.sort_values(by=['year_offset', 'month', 'municip_code'], ascending=True)
X_predict.set_index(['year_offset', 'month', 'municip_code'], inplace=True)

In [252]:
X_train.shape

(2688, 37)

In [253]:
y_train.shape

(2688, 1)

In [254]:
X_predict.shape

(1344, 37)

In [255]:
y_predict.shape

(1344, 1)

### 4. Fit the model to the training data sets

#### Scale and get feature importance

In [256]:
#X = X_train
#y = y_train
#scalators = X.columns
#X[scalators] = preprocessing.minmax_scale(X[scalators])

In [257]:
# define the model
#model_fi = XGBRegressor(random_state=23)

# fit the model
#model_fi.fit(X, y)

In [258]:
# get importance
#importance = model_fi.feature_importances_
# summarize feature importance
#for i,v in enumerate(importance):
#	print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
#plot_importance(model_fi, height=0.5, xlabel="F-Score", ylabel="Feature Importance", grid=False)
#plt.show()

#### Now, do fit the model but only with the relevant features

In [259]:
X_train = X_train.loc[:, ['population', 'weath_humidity', 'food_fruit', 'weath_maxLevel', 'food_txakoli', 'weath_midLevel', 'weath_minLevel', 'colonies_amount', 'weath_maxWindM', 'weath_meanWindM', 'weath_accuRainfall', 'weath_10minRainfall', 'food_kiwi', 'food_apple', 'weath_days_rain1mm', 'weath_meanDayMaxWind', 'weath_meanTemp']]

X_predict = X_predict.loc[:, ['population', 'weath_humidity', 'food_fruit', 'weath_maxLevel', 'food_txakoli', 'weath_midLevel', 'weath_minLevel', 'colonies_amount', 'weath_maxWindM', 'weath_meanWindM', 'weath_accuRainfall', 'weath_10minRainfall', 'food_kiwi', 'food_apple', 'weath_days_rain1mm', 'weath_meanDayMaxWind', 'weath_meanTemp']]

In [260]:
xgb.fit(X_train, y_train)

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.03, max_delta_step=0, max_depth=5,
             min_child_weight=4, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=4, nthread=4, num_parallel_tree=1,
             objective='reg:linear', random_state=23, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, silent=1, subsample=0.7, tree_method='exact',
             validate_parameters=1, verbosity=None)

### 5. Predict the labels for new data

In [261]:
y_predict = xgb.predict(X_predict)

In [262]:
accuracy_train = xgb.score(X_train, y_train)
print(f"Accuracy on the training set: {accuracy_train:.0%}")

Accuracy on the training set: 89%


In [263]:
accuracy_predict = xgb.score(X_predict, y_predict)
print(f"Accuracy on the test set: {accuracy_predict:.0%}")

Accuracy on the test set: 100%


In [264]:
y_predict.shape

(1344,)

In [265]:
QUEEN_predict['NESTS'] = y_predict

In [266]:
QUEEN_predict.NESTS.sum()

2952.7798

In [267]:
QUEEN_predict.NESTS[QUEEN_predict.NESTS < 0] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  QUEEN_predict.NESTS[QUEEN_predict.NESTS < 0] = 0


In [268]:
QUEEN_predict.NESTS.sum()

2955.921

## Prepare the dataset for submission

In [269]:
HEX = QUEEN_predict.loc[:,['municip_code', 'municip_name', 'NESTS']].groupby(by=['municip_code', 'municip_name'], as_index=False).sum()

## Adjust manually for Bilbao 48020 and generate the output

In [270]:
HEX.loc[HEX.municip_code.isin([48020]), 'NESTS'] = 0

In [271]:
HEX.loc[HEX.municip_code.isin([48022, 48071, 48088, 48074, 48051, 48020]), :]

Unnamed: 0,municip_code,municip_name,NESTS
19,48020,Bilbao,0.0
21,48022,Karrantza Harana/Valle de Carranza,30.041466
50,48051,Lanestosa,43.537819
70,48071,Muskiz,21.277731
73,48074,Urduña/Orduña,13.911097
87,48088,Ubide,26.038284


In [272]:
HEX.columns = ["CODIGO MUNICIPIO", "NOMBRE MUNICIPIO", "NIDOS 2020"] # change column names to Spanish (Competition template)

In [273]:
check_data(HEX)

Submission form Shape is (112, 3)
Number of Municipalities is 112
The Total 2020 Nests' Prediction is 2949


'All Municipality Names and Codes to be submitted match the Template'

### Export dataset for submission

In [274]:
HEX.to_csv('WaspBusters_20210609_132-mXGB-prodigal-GSCV-sorted-FI-no0s.csv', index=False)

## VERSION Manual adjustments

In [275]:
HEX.columns = ['municip_code', 'municip_name', 'NESTS'] # change column names to Spanish (Competition template)

In [276]:
HEX.loc[HEX.municip_code.isin([48022, 48071, 48088, 48074, 48051]), 'NESTS'] = [0,0,1,0,1]

In [277]:
HEX.loc[HEX.municip_code.isin([48022, 48071, 48088, 48074, 48051, 48020]), :]

Unnamed: 0,municip_code,municip_name,NESTS
19,48020,Bilbao,0.0
21,48022,Karrantza Harana/Valle de Carranza,0.0
50,48051,Lanestosa,0.0
70,48071,Muskiz,1.0
73,48074,Urduña/Orduña,0.0
87,48088,Ubide,1.0


In [278]:
HEX.columns = ["CODIGO MUNICIPIO", "NOMBRE MUNICIPIO", "NIDOS 2020"] # change column names to Spanish (Competition template)

In [279]:
check_data(HEX)

Submission form Shape is (112, 3)
Number of Municipalities is 112
The Total 2020 Nests' Prediction is 2816


'All Municipality Names and Codes to be submitted match the Template'

### Export dataset for submission

In [280]:
HEX.to_csv('WaspBusters_20210609_131-mXGB-prodigal-GSCV-sorted-FI-0s.csv', index=False)