<a href="https://colab.research.google.com/github/IEwaspbusters/KopuruVespaCompetitionIE/blob/main/Competition_subs/2021-04-28_submit/batch_LARVAE/HEX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# XGBoost Years: Prediction with Cluster Variables and selected Weather Variables (according to Feature importance)

## Import the Data & Modules

In [1]:
# Base packages -----------------------------------
import pandas as pd
import numpy as np

# Data Viz -----------------------------------
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (15, 10) # to set figure size when ploting feature_importance


# XGBoost -------------------------------
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import plot_importance # built-in function to plot features ordered by their importance

# SKLearn -----------------------------------------
from sklearn import preprocessing # scaling data

#Cluster
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from plotnine import *

In [2]:
# Function that checks if final Output is ready for submission or needs revision   

def check_data(HEX):
    
    def template_checker(HEX):
        submission_df = (HEX["CODIGO MUNICIPIO"].astype("string")+HEX["NOMBRE MUNICIPIO"]).sort_values().reset_index(drop=True)
        template_df = (template["CODIGO MUNICIPIO"].astype("string")+template["NOMBRE MUNICIPIO"]).sort_values().reset_index(drop=True)
        check_df = pd.DataFrame({"submission_df":submission_df,"template_df":template_df})
        check_df["check"] = check_df.submission_df == check_df.template_df
        if (check_df.check == False).any():
            pd.options.display.max_rows = 112
            return check_df.loc[check_df.check == False,:]
        else:  
            return "All Municipality Names and Codes to be submitted match the Template"
    
    print("Submission form Shape is", HEX.shape)
    print("Number of Municipalities is", HEX["CODIGO MUNICIPIO"].nunique())
    print("The Total 2020 Nests' Prediction is", int(HEX["NIDOS 2020"].sum()))

    assert HEX.shape == (112, 3), "Error: Shape is incorrect."
    assert HEX["CODIGO MUNICIPIO"].nunique() == 112, "Error: Number of unique municipalities is correct."    
    return template_checker(HEX)

In [3]:
# Importing datasets from GitHub as Pandas Dataframes
queen_train = pd.read_csv("../Feeder_years/WBds03_QUEENtrainYEARS.csv", encoding="utf-8") #2018+2019 test df
queen_predict = pd.read_csv("../Feeder_years/WBds03_QUEENpredictYEARS.csv", encoding="utf-8") #2020 prediction df
template = pd.read_csv("../../../Input_open_data/ds01_PLANTILLA-RETO-AVISPAS-KOPURU.csv",sep=";", encoding="utf-8")
den_com = pd.read_excel("../../../Other_open_data/densidad comercial.xlsx")
cluster= pd.read_csv("../Feeder_years/WBds_CLUSTERSnests.csv")

In [20]:
den_com_melt= pd.melt(den_com, id_vars=['Código municipio'], value_vars=['2019', '2018', '2017'], var_name='year_offset', 
                      value_name='densidad')
den_com_melt.rename({'Código municipio':'municip_code'}, axis=1, inplace=True)
den_com_melt["densidad"] = den_com_melt["densidad"].apply(lambda x: x.replace(",", "."))
den_com_melt['year_offset']= den_com_melt['year_offset'].apply(str)

## New queen Train dataset

In [99]:
df_train= queen_train.iloc[:,:33]
df_train['year_offset']= df_train['year_offset'].apply(str)

df_train = df_train.merge(den_com_melt,\
             how='left', left_on=['municip_code','year_offset'],\
             right_on=['municip_code','year_offset']).merge(cluster, how='left', on= 'municip_code') #Merge Densidad comercial + Cluster

#Cleaning
df_train.drop(['municip_name_y','station_code'], axis=1, inplace=True)
df_train.rename({'municip_name_x': 'municip_name'}, axis=1, inplace=True)


## New queen predict dataset

In [165]:
queen_predict['year_offset']= queen_predict['year_offset'].apply(str)

df_predict= queen_predict.loc[:,['municip_name', 'municip_code', 'year_offset','population']].merge(den_com_melt,\
             how='left', left_on=['municip_code','year_offset'],\
             right_on=['municip_code','year_offset']).merge(cluster, how='left',on='municip_code')

df_predict.drop(['municip_name_y'], axis=1, inplace=True)
df_predict.rename({'municip_name_x': 'municip_name'}, axis=1, inplace=True)

#Aux to predict (X_Predict)
aux_predict= df_predict.iloc[:,3:]

In [173]:
y = df_train.NESTS

# X will be the explanatory variables. Remove response variable and non desired categorical columns such as (municip code, year, etc...)
X = df_train.loc[:,['population', 'densidad','Cluster']]



## Forecasting

In [174]:
# Scale the datasets using MinMaxScaler

X_scaled = preprocessing.minmax_scale(X) # this creates a numpy array
X_scaled = pd.DataFrame(X_scaled,index=X.index,columns=X.columns)

In [175]:
# selecting the XGBoost model and fitting with the train data
model = XGBRegressor()

In [176]:
# selecting the XGBoost model and fitting with the train data for each cluster

model.fit(X_scaled, y)

XGBRegressor()

In [177]:
# make a prediction
X_scaled_pred = preprocessing.minmax_scale(aux_predict)
X_scaled_pred = pd.DataFrame(X_scaled_pred,index=aux_predict.index,columns=aux_predict.columns)
df_predict['nests_2020'] = model.predict(X_scaled_pred)

## Add Each Cluster Predictions to the original DataFrame and Save it as a `.csv file`

In [179]:
# Remove the Municipalities to which we did not assign a Cluster, since there was not reliable data for us to predict 

df_predict = df_predict.loc[~df_predict.municip_code.isin([48071, 48074, 48022, 48088, 48051, 48020]),:]

In [180]:
# Create a new DataFrame with the Municipalities to insert manualy

HEX_aux = pd.DataFrame({"CODIGO MUNICIPIO":[48022, 48071, 48088, 48074, 48051, 48020],\
             "NOMBRE MUNICIPIO":["Karrantza Harana/Valle de Carranza","Muskiz","Ubide","Urduña/Orduña","Lanestosa","Bilbao"],\
             "NIDOS 2020":[0,0,1,0,1,0]})

In [181]:
HEX = df_predict.loc[:,["municip_code","municip_name","nests_2020"]].round() # create a new Dataframe for Kopuru submission
HEX.columns = ["CODIGO MUNICIPIO","NOMBRE MUNICIPIO","NIDOS 2020"] # change column names to Spanish (Decidata template)
HEX = HEX.append(HEX_aux, ignore_index=True) # Add rows of municipalities to add manually

In [182]:
# Final check

check_data(HEX)

Submission form Shape is (112, 3)
Number of Municipalities is 112
The Total 2020 Nests' Prediction is 2801


'All Municipality Names and Codes to be submitted match the Template'

In [183]:
# reset max_rows to default values (used in function to see which rows did not match template)

pd.reset_option("max_rows")

In [184]:
# Save the new dataFrame as a .csv in the current working directory on Windows

HEX.to_csv("WaspBusters_20210608_XGyears_ClusterMB_PC4_Zeros.csv", index=False)