## Modules

In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error

import tensorflow
from tensorflow.python.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.wrappers.scikit_learn import KerasRegressor




## Loading data

In [200]:
df=pd.read_csv('D:/Bootcamp/Kopuru/Predictions data/WBds03_QUEENtrain_months.csv', sep=',')
df.head()

Unnamed: 0,year_x,municip_name,species,municip_code,month,year_offset,waspbust_id,colonies_amount,food_fruit,food_apple,...,weath_1dayRainfall,weath_solar,weath_meanTemp,weath_maxTemp,weath_maxMeanTemp,weath_minTemp,weath_meanWindM,weath_maxWindM,weath_meanDayMaxWind,population
0,2018,Abadiño,Vespa Velutina,48001,1,2017,1,160,0,0,...,63.1,12.452224,4.3,16.8,8.5,0.8,10.34961,82.102336,40.167811,7.533
1,2018,Abadiño,Vespa Velutina,48001,2,2017,4,160,0,0,...,24.4,12.549345,9.5,17.9,14.8,4.6,10.267276,75.442306,40.125148,7.533
2,2018,Abadiño,Vespa Velutina,48001,3,2017,1,160,0,0,...,23.0,12.526446,11.1,27.5,16.8,6.0,10.09632,74.288904,40.108287,7.533
3,2018,Abadiño,Vespa Velutina,48001,5,2017,3,160,0,0,...,40.5,12.460106,16.7,35.5,23.9,10.2,10.573962,72.719699,40.122893,7.533
4,2018,Abadiño,Vespa Velutina,48001,6,2017,1,160,0,0,...,18.0,12.548247,19.0,38.4,25.1,14.1,11.004038,73.728702,40.190008,7.533


## Split variables

In [201]:
X=df.loc[:,['population', 'weath_meanTemp', 'food_txakoli']]
y=df[['waspbust_id']]

In [202]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Find relevant variables

I used Random forest to see importance variables.

In [98]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators = 100,
                           n_jobs = -1,
                           oob_score = True,
                           bootstrap = True,
                           random_state = 42)
rf.fit(X_train, y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


RandomForestRegressor(n_jobs=-1, oob_score=True, random_state=42)

In [99]:
from sklearn.metrics import r2_score
from rfpimp import permutation_importances

def r2(rf, X_train, y_train):
    return r2_score(y_train, rf.predict(X_train))

perm_imp_rfpimp = permutation_importances(rf, X_train, y_train, r2)

In [100]:
perm_imp_rfpimp

Unnamed: 0_level_0,Importance
Feature,Unnamed: 1_level_1
population,0.827754
weath_meanTemp,0.361284
food_txakoli,0.14605
food_fruit,0.105834
colonies_amount,0.102203
weath_minTemp,0.085933
weath_meanWindM,0.080957
weath_humidity,0.0577
food_apple,0.056399
weath_accuRainfall,0.051394


## Creating neural network

In [203]:
model = Sequential()
model.add(Dense(10, input_dim=3, kernel_initializer='normal', activation='relu'))
model.add(Dense(1000, activation='relu'))
model.add(Dense(1, activation='selu'))
model.summary()

Model: "sequential_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_54 (Dense)             (None, 10)                40        
_________________________________________________________________
dense_55 (Dense)             (None, 1000)              11000     
_________________________________________________________________
dense_56 (Dense)             (None, 1)                 1001      
Total params: 12,041
Trainable params: 12,041
Non-trainable params: 0
_________________________________________________________________


# Creating model

In [204]:
model.compile(loss='mse', optimizer='adam', metrics=['mse','mae'])
history=model.fit(X_train, y_train, epochs=30, batch_size=150, verbose=1, validation_split=0.15)
predictions = model.predict(X_test)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


## Measuring 

In [206]:
import math
MSE = mean_squared_error(y_test,predictions)
RMSE = math.sqrt(MSE)
print(RMSE)

3.3430013433382975


## Forecasting

In [207]:
predict=pd.read_csv('D:/Bootcamp/Kopuru/Predictions data/WBds03_QUEENpredict_months.csv', sep=',')
data=predict.loc[:,['population', 'weath_meanTemp', 'food_txakoli','municip_name','municip_code']]
X_pred=predict.loc[:,['population', 'weath_meanTemp', 'food_txakoli']]
data['pred_2020'] = model.predict(X_pred)
data = data.loc[:,['municip_code','municip_name','pred_2020']].groupby(by=['municip_code','municip_name'], as_index=False).sum().round().dropna()

data.columns = ['CODIGO MUNICIPIO','NOMBRE MUNICIPIO','NIDOS 2020']
bilbao_dic = {'CODIGO MUNICIPIO':'48020', 'NOMBRE MUNICIPIO':'Bilbao','NIDOS 2020':'0'}
bilbao=pd.DataFrame([bilbao_dic])
data= data.append(bilbao)
data.head()

Unnamed: 0,CODIGO MUNICIPIO,NOMBRE MUNICIPIO,NIDOS 2020
0,48001,AbadiÃ±o,41.0
1,48002,Abanto y CiÃ©rvana-Abanto Zierbena,47.0
2,48003,Amorebieta-Etxano,61.0
3,48004,Amoroto,24.0
4,48005,Arakaldo,30.0


In [208]:
data.to_csv('D:/Bootcamp/Kopuru/Predictions data/WaspBusters_20210511_batch_DeepMonths.csv', index=False)