# HIVE algorithm **Kopuru Vespa Velutina Competition**

Purpose: to process the weather data from Biscay's weather stations into a workable dataset.

Output: METEO dataset *(WBds02_METEO.csv)*

@authors:
* mario.bejar@student.ie.edu
* pedro.geirinhas@student.ie.edu
* a.berrizbeitia@student.ie.edu
* pcasaverde@student.ie.edu

# Part 1: ...

## 1.1 Get the data

In [None]:
import pandas as pd
import numpy as np
import glob, os

In [None]:
# Folder holding the raw open-data CSV exports.
path = r'../../Input_open_data' # use your path
files = glob.glob(path + "/*.csv")


def _read_tagged(csv_path):
    """Read one ';'-separated file without a header row and tag its rows.

    The tag (column 'new') is the file name up to its first dot.
    """
    source_tag = os.path.basename(csv_path).split('.')[0]
    raw = pd.read_csv(csv_path, header=None, sep=';')
    return raw.assign(new=source_tag)


# Stack every file into one frame; the original per-file row indices are kept.
df = pd.concat([_read_tagged(csv_path) for csv_path in files])

In [None]:
# Promote the first loaded row to column headers. The file-tag column 'new' is
# left out of the mapping so it keeps its name regardless of which file glob
# returned first (previously it was renamed to the first file's tag and a
# hard-coded 'DÍAS DE HELADA 2016' rename had to undo that).
_header_row = df.iloc[0].drop(labels=['new'])
df.rename(columns=_header_row, inplace=True)

# Drop the repeated header rows ('KOD.' / 'COD.') and rows without a station
# code, then discard the two columns that are not used further on.
_is_header_row = df['COD.'].isin(['KOD.', 'COD.'])
df = df.loc[~_is_header_row, :].dropna(subset=['COD.']).drop(columns=["cota (m)", "SUMA"])

# Extract the four-digit year from the file tag.
# Raw string: '\d' inside a plain literal is an invalid escape sequence.
df['year'] = df['new'].str.extract(r'(\d\d\d\d)', expand=True)

# Helper used to build the 'merge_cod' key
def str_join(df, sep, *cols):
    """Concatenate several columns of *df* row-wise into one string Series.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to join.
    sep : str
        Separator placed between the column values.
    *cols : str
        Names of the columns to join, in output order.

    Returns
    -------
    pandas.Series
        Every listed column converted with ``astype(str)`` and joined with
        *sep*. The Series keeps the name of the first column. A single column
        is returned as strings too, so the result type does not depend on how
        many columns were passed.

    Raises
    ------
    TypeError
        If no column name is given.
    """
    if not cols:
        raise TypeError("str_join() needs at least one column name")

    # Convert each column once instead of re-converting the running result.
    first, *rest = [df[col].astype(str) for col in cols]
    if not rest:
        return first
    return first.str.cat(rest, sep=sep)

## 1.2 Get the variables

In [None]:
# Variables -------------------------------------------------------------------
# Each weather variable lives in its own set of source files. For every
# variable the matching rows are selected by file tag, reshaped from one
# column per month to one row per month, and keyed by
# 'COD._ESTACION_year_month' so the variables can be merged afterwards.

# Month columns as named in the source headers.
_MONTH_COLS = ['ENE', 'FEB', 'MAR', 'ABR', 'MAY', 'JUN',
               'JUL', 'AGO', 'SET', 'OCT', 'NOV', 'DIC']
# Columns identifying one station/year row.
_ID_COLS = ['COD.', 'ESTACION', 'year']


def _melt_variable(data, pattern, value_name):
    """Return the long-format table of one weather variable.

    Parameters
    ----------
    data : pandas.DataFrame
        Combined table with the file tag in column 'new'.
    pattern : str
        Text searched for in 'new' (via ``str.contains``) to pick the rows.
    value_name : str
        Name of the resulting value column.

    Returns
    -------
    pandas.DataFrame
        Two columns, in this order: *value_name* and 'merge_cod'.
    """
    selected = data[data['new'].str.contains(pattern)].drop(columns=['new'])
    long_form = pd.melt(selected, id_vars=_ID_COLS, value_vars=_MONTH_COLS,
                        var_name='month', value_name=value_name)
    long_form['merge_cod'] = str_join(long_form, '_', *_ID_COLS, 'month')
    return long_form.drop(columns=_ID_COLS + ['month'])


## Freeze
freez = _melt_variable(df, "HELADA", 'freez')
# Key column first: freez is the left-most frame of the merge in section 1.3.
cols = ['merge_cod', 'freez']
freez = freez.reindex(columns=cols)

## Rain
rain = _melt_variable(df, "DÍAS DE PRECIPITACIÓN 20", 'rain')
rain_1mm = _melt_variable(df, "DÍAS DE PRECIPITACIÓN IGUAL O SUPERIOR", 'rain_1mm')
rain_cum = _melt_variable(df, "PRECIPITACIÓN ACUMULADA", 'rain_cum')
rain_max_10 = _melt_variable(df, "PRECIPITACIÓN MÁXIMA EN 10 MINUTOS ", 'rain_max_10')
rain_max_day = _melt_variable(df, "PRECIPITACIÓN MÁXIMA EN UN DÍA ", 'rain_max_day')

## Humidity and irradiation
hum = _melt_variable(df, "HUMEDAD MEDIA", 'hum')
sun = _melt_variable(df, "IRRADIACIÓN MEDIA", 'sun')

## Levels
lev_max = _melt_variable(df, "NIVEL MÁXIMO", 'lev_max')
lev_mid = _melt_variable(df, "NIVEL MEDIO", 'lev_mid')
# The original note says lev_min is "only 2019" -- presumably only the 2019
# files carry it; verify against the input folder.
lev_min = _melt_variable(df, "NIVEL MÍNIMO", 'lev_min')

## Temperature
temp_max_abs = _melt_variable(df, "TEMPERATURA MÁXIMA ABSOLUTA", 'temp_max_abs')
temp_max_avg = _melt_variable(df, "TEMPERATURA MÁXIMA MEDIA", 'temp_max_avg')
temp_avg = _melt_variable(df, "TEMPERATURA MEDIA", 'temp_avg')
# NOTE(review): the variable is called temp_min_abs but the pattern selects
# "TEMPERATURA MÍNIMA MEDIA " files -- confirm which one is intended.
temp_min_abs = _melt_variable(df, "TEMPERATURA MÍNIMA MEDIA ", 'temp_min_abs')

## Wind
wind_max = _melt_variable(df, "VELOCIDAD DE LA RACHA MÁXIMA ", 'wind_max')
wind_avg = _melt_variable(df, "VELOCIDAD MEDIA ", 'wind_avg')
wind_max_avg = _melt_variable(df, "MEDIA DE LAS VELOCIDADES MÁXIMAS ", 'wind_max_avg')

## 1.3 Merge the data

In [None]:
# Outer-join every variable table onto the freeze table through 'merge_cod',
# so a station/month stays in the result when at least one variable has it.
_merge_order = [
    hum,
    lev_max, lev_mid, lev_min,
    rain, rain_1mm, rain_cum, rain_max_10, rain_max_day,
    sun,
    temp_avg, temp_max_abs, temp_max_avg, temp_min_abs,
    wind_avg, wind_max, wind_max_avg,
]
m_data = freez
for _variable_table in _merge_order:
    m_data = m_data.merge(_variable_table, on='merge_cod', how='outer')

# Keep a second copy of the key, then split the key back into its parts.
m_data['code_merge'] = m_data['merge_cod']
_key_parts = m_data['merge_cod'].str.split("_", expand=True)
# NOTE(review): ' estacion' has a leading space in its name; left as-is
# because renaming it would change the output columns -- confirm intent.
m_data[['codigo',' estacion','year', 'month']] = _key_parts

## 1.4 Export the data

In [None]:
#m_data.to_csv (r'estaciones.csv', index = False, header=True)

# Re-save the auxiliary input files in the working directory using pandas'
# default CSV settings (header row written, row index omitted).

# fruta
fruta = pd.read_csv('../../Input_open_data/FRUTALES-DECLARADOS-KOPURU.csv', sep=';')
fruta.to_csv(path_or_buf='fruta.csv', index=False)

# met
met = pd.read_csv('../../Input_open_data/LOCALIZACION-ESTACIONES-METEOROLOGICAS.csv', sep=';')
met.to_csv(path_or_buf='met.csv', index=False)

# apicu
apicu = pd.read_csv('../../Input_open_data/APICULTURA_COLMENAS_KOPURU.csv', sep=';')
apicu.to_csv(path_or_buf='apicu.csv', index=False)

# nido (Excel workbook, not CSV)
nido = pd.read_excel('../../Input_open_data/datos-nidos-avispa-asiatica.xlsx')
nido.to_csv(path_or_buf='nido.csv', index=False)

# Part 2: ...

## 2.1 Get the data

In [None]:
from datawig import SimpleImputer
from datawig.utils import random_split
from sklearn.metrics import f1_score, classification_report, precision_score, recall_score, r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.datasets import make_regression

In [None]:
#seasons= pd.read_csv('D:/Bootcamp/Data/estaciones.csv')
# Start the imputation from an independent copy of the merged table.
seasons = m_data.copy(deep=True)
# Last expression of the cell: shows the available columns in the notebook.
seasons.columns

## 2.2 Impute the NaNs

In [None]:
#Hum----------------------------------------------------------------------------------------

df_train, df_test = random_split(seasons, split_ratios=[0.8, 0.2])

#Initialize a SimpleImputer model
imputer_hum = SimpleImputer(
input_columns=['month','freez', 'temp_avg', 'rain','wind_avg','rain_1mm','rain_cum','rain_max_10','rain_max_day'],
output_column='hum',
output_path = 'imputer_model'
)

#Fit an imputer model on the train data
imputer_hum.fit(train_df=df_train)
predictions_hum = imputer_hum.predict(df_test)

pre_hum= predictions_hum.loc[~predictions_hum['hum'].isnull(),['hum','hum_imputed'] ]

#Calculate f1 score
r2_hum = r2_score(pre_hum['hum'], pre_hum['hum_imputed'])
msq_hum = mean_squared_error(pre_hum['hum'], pre_hum['hum_imputed'])


#completing hum data

seasons_1= imputer_hum.predict(seasons.loc[seasons['hum'].isnull(),:])
del seasons_1["hum"]
seasons_1=seasons_1.rename(columns={'hum_imputed':'hum'}).append(seasons.loc[~seasons['hum'].isnull(),:])


#Freez----------------------------------------------------------------------------------------

df_train, df_test= random_split(seasons_1, split_ratios=[0.8, 0.2])

#Initialize a SimpleImputer model
imputer_freez = SimpleImputer(
input_columns=['month','hum', 'temp_avg', 'rain','wind_avg'],
output_column='freez',
output_path = 'imputer_model'
)

#Fit an imputer model on the train data
imputer_freez.fit(train_df=df_train)
predictions_freez = imputer_freez.predict(df_test)

pre_freez= predictions_freez.loc[~predictions_freez['freez'].isnull(),['freez','freez_imputed'] ]

#Calculate R2 & MSE
r2_freez = r2_score(pre_freez['freez'], pre_freez['freez_imputed'])
msq_freez = mean_squared_error(pre_freez['freez'], pre_freez['freez_imputed'])

seasons_2= imputer_freez.predict(seasons_1.loc[seasons_1['freez'].isnull(),:])
del seasons_2["freez"]
seasons_2=seasons_2.rename(columns={'freez_imputed':'freez'}).append(seasons_1.loc[~seasons['freez'].isnull(),:])



#Rain----------------------------------------------------------------------------------------

df_train, df_test = random_split(seasons_2, split_ratios=[0.8, 0.2])

#Initialize a SimpleImputer model
imputer_rain = SimpleImputer(
input_columns=['month','hum', 'temp_avg','wind_avg', 'freez','rain_1mm','rain_cum','rain_max_10','rain_max_day'],
output_column='rain',
output_path = 'imputer_model'
)

#Fit an imputer model on the train data
imputer_rain.fit(train_df=df_train)
predictions_rain = imputer_rain.predict(df_test)

pre_rain= predictions_rain.loc[~predictions_rain['rain'].isnull(),['rain','rain_imputed'] ]

#Calculate R2 & MSE
r2_rain = r2_score(pre_rain['rain'], pre_rain['rain_imputed'])
msq_rain = mean_squared_error(pre_rain['rain'], pre_rain['rain_imputed'])

seasons_3= imputer_rain.predict(seasons_2.loc[seasons_2['rain'].isnull(),:])
del seasons_3["rain"]
seasons_3=seasons_3.rename(columns={'rain_imputed':'rain'}).append(seasons_2.loc[~seasons['rain'].isnull(),:])


#lev_max----------------------------------------------------------------------------------------

df_train, df_test = random_split(seasons_3, split_ratios=[0.8, 0.2])

#Initialize a SimpleImputer model
imputer_lev_max = SimpleImputer(
input_columns=['hum', 'temp_avg','wind_avg', 'rain', 'freez','sun','lev_mid','lev_min'],
output_column='lev_max',
output_path = 'imputer_model'
)

#Fit an imputer model on the train data
imputer_lev_max.fit(train_df=df_train)
predictions_lev_max = imputer_lev_max.predict(df_test)

pre_lev_max= predictions_lev_max.loc[~predictions_lev_max['lev_max'].isnull(),['lev_max','lev_max_imputed'] ]

#Calculate R2 & MSE
r2_lev_max = r2_score(pre_lev_max['lev_max'], pre_lev_max['lev_max_imputed'])
msq_lev_max = mean_squared_error(pre_lev_max['lev_max'], pre_lev_max['lev_max_imputed'])

seasons_4= imputer_lev_max.predict(seasons_3.loc[seasons_3['lev_max'].isnull(),:])
del seasons_4["lev_max"]
seasons_4=seasons_4.rename(columns={'lev_max_imputed':'lev_max'}).append(seasons_3.loc[~seasons['lev_max'].isnull(),:])


#lev_mid----------------------------------------------------------------------------------------

df_train, df_test = random_split(seasons_4, split_ratios=[0.8, 0.2])

#Initialize a SimpleImputer model
imputer_lev_mid = SimpleImputer(
input_columns=['hum', 'temp_avg','wind_avg', 'rain', 'freez','sun','lev_min','lev_max'],
output_column='lev_mid',
output_path = 'imputer_model'
)

#Fit an imputer model on the train data
imputer_lev_mid.fit(train_df=df_train)
predictions_lev_mid = imputer_lev_mid.predict(df_test)

pre_lev_mid= predictions_lev_mid.loc[~predictions_lev_mid['lev_mid'].isnull(),['lev_mid','lev_mid_imputed'] ]

#Calculate R2 & MSE
r2_lev_mid = r2_score(pre_lev_mid['lev_mid'], pre_lev_mid['lev_mid_imputed'])
msq_lev_mid = mean_squared_error(pre_lev_mid['lev_mid'], pre_lev_mid['lev_mid_imputed'])

seasons_5= imputer_lev_mid.predict(seasons_4.loc[seasons_4['lev_mid'].isnull(),:])
del seasons_5["lev_mid"]
seasons_5=seasons_5.rename(columns={'lev_mid_imputed':'lev_mid'}).append(seasons_4.loc[~seasons['lev_mid'].isnull(),:])



#lev_min----------------------------------------------------------------------------------------

df_train, df_test = random_split(seasons_5, split_ratios=[0.8, 0.2])

#Initialize a SimpleImputer model
imputer_lev_min = SimpleImputer(
input_columns=['hum', 'temp_avg','wind_avg', 'rain', 'freez','sun','lev_mid','lev_max'],
output_column='lev_min',
output_path = 'imputer_model'
)

#Fit an imputer model on the train data
imputer_lev_min.fit(train_df=df_train)
predictions_lev_min = imputer_lev_min.predict(df_test)

pre_lev_min= predictions_lev_min.loc[~predictions_lev_min['lev_min'].isnull(),['lev_min','lev_min_imputed'] ]

#Calculate R2 & MSE
r2_lev_min = r2_score(pre_lev_min['lev_min'], pre_lev_min['lev_min_imputed'])
msq_lev_min = mean_squared_error(pre_lev_min['lev_min'], pre_lev_min['lev_min_imputed'])

seasons_6= imputer_lev_min.predict(seasons_5.loc[seasons_5['lev_min'].isnull(),:])
del seasons_6["lev_min"]
seasons_6=seasons_6.rename(columns={'lev_min_imputed':'lev_min'}).append(seasons_5.loc[~seasons['lev_min'].isnull(),:])


#rain_1mm----------------------------------------------------------------------------------------

df_train, df_test = random_split(seasons_6, split_ratios=[0.8, 0.2])

#Initialize a SimpleImputer model
imputer_rain_1mmn = SimpleImputer(
input_columns=['hum', 'temp_avg','wind_avg', 'rain', 'freez','sun','rain_cum','rain_max_10','rain_max_day'],
output_column='rain_1mm',
output_path = 'imputer_model'
)

#Fit an imputer model on the train data
imputer_rain_1mmn.fit(train_df=df_train)
predictions_rain_1mm = imputer_rain_1mmn.predict(df_test)

pre_rain_1mm= predictions_rain_1mm.loc[~predictions_rain_1mm['rain_1mm'].isnull(),['rain_1mm','rain_1mm_imputed'] ]

#Calculate R2 & MSE
r2_rain_1mm = r2_score(pre_rain_1mm['rain_1mm'], pre_rain_1mm['rain_1mm_imputed'])
msq_rain_1mm = mean_squared_error(pre_rain_1mm['rain_1mm'], pre_rain_1mm['rain_1mm_imputed'])

seasons_7= imputer_rain_1mmn.predict(seasons_6.loc[seasons_6['rain_1mm'].isnull(),:])
del seasons_7["rain_1mm"]
seasons_7=seasons_7.rename(columns={'rain_1mm_imputed':'rain_1mm'}).append(seasons_6.loc[~seasons['rain_1mm'].isnull(),:])


#rain_cum ----------------------------------------------------------------------------------------

df_train, df_test = random_split(seasons_7, split_ratios=[0.8, 0.2])

#Initialize a SimpleImputer model
imputer_rain_cum  = SimpleImputer(
input_columns=['hum', 'temp_avg','wind_avg', 'freez','sun','rain_1mm','rain_max_10','rain_max_day','lev_max','lev_mid','lev_min'],
output_column='rain_cum',
output_path = 'imputer_model'
)

#Fit an imputer model on the train data
imputer_rain_cum.fit(train_df=df_train)
predictions_rain_cum  = imputer_rain_cum.predict(df_test)

pre_rain_cum = predictions_rain_cum.loc[~predictions_rain_cum ['rain_cum'].isnull(),['rain_cum','rain_cum_imputed'] ]

#Calculate R2 & MSE
r2_rain_cum = r2_score(pre_rain_cum['rain_cum'], pre_rain_cum['rain_cum_imputed'])
msq_rain_cum = mean_squared_error(pre_rain_cum['rain_cum'], pre_rain_cum['rain_cum_imputed'])

seasons_8= imputer_rain_cum.predict(seasons_7.loc[seasons_7['rain_cum'].isnull(),:])
del seasons_8["rain_cum"]
seasons_8=seasons_8.rename(columns={'rain_cum_imputed':'rain_cum'}).append(seasons_7.loc[~seasons['rain_cum'].isnull(),:])

#rain_max_10----------------------------------------------------------------------------------------

df_train, df_test = random_split(seasons_8, split_ratios=[0.8, 0.2])

#Initialize a SimpleImputer model
imputer_rain_max_10 = SimpleImputer(
input_columns=['hum', 'temp_avg','wind_avg', 'rain', 'freez','sun','rain_cum','rain_1mm','rain_max_day'],
output_column='rain_max_10',
output_path = 'imputer_model'
)

#Fit an imputer model on the train data
imputer_rain_max_10.fit(train_df=df_train)
predictions_rain_max_10 = imputer_rain_max_10.predict(df_test)

pre_rain_max_10= predictions_rain_max_10.loc[~predictions_rain_max_10['rain_max_10'].isnull(),['rain_max_10','rain_max_10_imputed'] ]

#Calculate R2 & MSE
r2_rain_max_10 = r2_score(pre_rain_max_10['rain_max_10'], pre_rain_max_10['rain_max_10_imputed'])
msq_rain_max_10= mean_squared_error(pre_rain_max_10['rain_max_10'], pre_rain_max_10['rain_max_10_imputed'])

seasons_9= imputer_rain_max_10.predict(seasons_8.loc[seasons_8['rain_max_10'].isnull(),:])
del seasons_9["rain_max_10"]
seasons_9=seasons_9.rename(columns={'rain_max_10_imputed':'rain_max_10'}).append(seasons_8.loc[~seasons['rain_max_10'].isnull(),:])


#rain_max_day----------------------------------------------------------------------------------------

df_train, df_test = random_split(seasons_9, split_ratios=[0.8, 0.2])

#Initialize a SimpleImputer model
imputer_rain_max_day = SimpleImputer(
input_columns=['month','hum', 'temp_avg','wind_avg', 'rain', 'freez','sun','rain_cum','rain_1mm','rain_max_10'],
output_column='rain_max_day',
output_path = 'imputer_model'
)

#Fit an imputer model on the train data
imputer_rain_max_day.fit(train_df=df_train)
predictions_rain_max_day = imputer_rain_max_day.predict(df_test)

pre_rain_max_day =predictions_rain_max_day.loc[~predictions_rain_max_day['rain_max_day'].isnull(),['rain_max_day','rain_max_day_imputed'] ]

#Calculate R2 & MSE
r2_rain_max_day = r2_score(pre_rain_max_day['rain_max_day'], pre_rain_max_day['rain_max_day_imputed'])
msq_rain_max_day= mean_squared_error(pre_rain_max_day['rain_max_day'], pre_rain_max_day['rain_max_day_imputed'])

seasons_10= imputer_rain_max_day.predict(seasons_9.loc[seasons_9['rain_max_day'].isnull(),:])
del seasons_10["rain_max_day"]
seasons_10=seasons_10.rename(columns={'rain_max_day_imputed':'rain_max_day'}).append(seasons_9.loc[~seasons['rain_max_10'].isnull(),:])



#temp_avg----------------------------------------------------------------------------------------

df_train, df_test = random_split(seasons_10, split_ratios=[0.8, 0.2])

#Initialize a SimpleImputer model
imputer_temp_avg = SimpleImputer(
input_columns=['hum','wind_avg', 'rain', 'freez','sun','temp_max_abs','temp_max_avg','temp_min_abs'],
output_column='temp_avg',
output_path = 'imputer_model'
)

#Fit an imputer model on the train data
imputer_temp_avg.fit(train_df=df_train)
predictions_temp_avg = imputer_temp_avg.predict(df_test)

pre_temp_avg =predictions_temp_avg.loc[~predictions_temp_avg['temp_avg'].isnull(),['temp_avg','temp_avg_imputed'] ]

#Calculate R2 & MSE
r2_temp_avg = r2_score(pre_temp_avg['temp_avg'], pre_temp_avg['temp_avg_imputed'])
msq_temp_avg= mean_squared_error(pre_temp_avg['temp_avg'], pre_temp_avg['temp_avg_imputed'])

seasons_11= imputer_temp_avg.predict(seasons_10.loc[seasons_10['temp_avg'].isnull(),:])
del seasons_11["temp_avg"]
seasons_11=seasons_11.rename(columns={'temp_avg_imputed':'temp_avg'}).append(seasons_10.loc[~seasons['temp_avg'].isnull(),:])




#temp_max_abs----------------------------------------------------------------------------------------

df_train, df_test = random_split(seasons_11, split_ratios=[0.8, 0.2])

#Initialize a SimpleImputer model
imputer_temp_max_abs = SimpleImputer(
input_columns=['hum','wind_avg', 'rain', 'freez','sun','temp_max_avg','temp_avg','temp_min_abs'],
output_column='temp_max_abs',
output_path = 'imputer_model'
)

#Fit an imputer model on the train data
imputer_temp_max_abs.fit(train_df=df_train)
predictions_temp_max_abs = imputer_temp_max_abs.predict(df_test)

pre_temp_max_abs=predictions_temp_max_abs.loc[~predictions_temp_max_abs['temp_max_abs'].isnull(),['temp_max_abs','temp_max_abs_imputed'] ]

#Calculate R2 & MSE
r2_temp_max_abs= r2_score(pre_temp_max_abs['temp_max_abs'], pre_temp_max_abs['temp_max_abs_imputed'])
msq_temp_max_abs= mean_squared_error(pre_temp_max_abs['temp_max_abs'], pre_temp_max_abs['temp_max_abs_imputed'])

seasons_12= imputer_temp_max_abs.predict(seasons_11.loc[seasons_11['temp_max_abs'].isnull(),:])
del seasons_12["temp_max_abs"]
seasons_12=seasons_12.rename(columns={'temp_max_abs_imputed':'temp_max_abs'}).append(seasons_11.loc[~seasons['temp_max_abs'].isnull(),:])


#temp_max_avg----------------------------------------------------------------------------------------

df_train, df_test = random_split(seasons_12, split_ratios=[0.8, 0.2])

#Initialize a SimpleImputer model
imputer_temp_max_avg = SimpleImputer(
input_columns=['hum','wind_avg', 'rain', 'freez','sun','temp_max_abs','temp_avg','temp_min_abs'],
output_column='temp_max_avg',
output_path = 'imputer_model'
)

#Fit an imputer model on the train data
imputer_temp_max_avg.fit(train_df=df_train)
predictions_temp_max_avg= imputer_temp_max_avg.predict(df_test)

pre_temp_max_avg=predictions_temp_max_avg.loc[~predictions_temp_max_avg['temp_max_avg'].isnull(),['temp_max_avg','temp_max_avg_imputed'] ]

#Calculate R2 & MSE
r2_temp_max_avg= r2_score(pre_temp_max_avg['temp_max_avg'], pre_temp_max_avg['temp_max_avg_imputed'])
msq_temp_max_avg= mean_squared_error(pre_temp_max_avg['temp_max_avg'], pre_temp_max_avg['temp_max_avg_imputed'])

seasons_13= imputer_temp_max_avg.predict(seasons_12.loc[seasons_12['temp_max_avg'].isnull(),:])
del seasons_13["temp_max_avg"]
seasons_13=seasons_13.rename(columns={'temp_max_avg_imputed':'temp_max_avg'}).append(seasons_12.loc[~seasons['temp_max_avg'].isnull(),:])


#temp_min_abs----------------------------------------------------------------------------------------

df_train, df_test = random_split(seasons_13, split_ratios=[0.8, 0.2])

#Initialize a SimpleImputer model
imputer_temp_min_abs = SimpleImputer(
input_columns=['hum','wind_avg', 'rain', 'freez','sun','temp_max_abs','temp_avg','temp_max_avg'],
output_column='temp_min_abs',
output_path = 'imputer_model'
)

#Fit an imputer model on the train data
imputer_temp_min_abs.fit(train_df=df_train)
predictions_temp_min_abs= imputer_temp_min_abs.predict(df_test)

pre_temp_min_abs=predictions_temp_min_abs.loc[~predictions_temp_min_abs['temp_min_abs'].isnull(),['temp_min_abs','temp_min_abs_imputed'] ]

#Calculate R2 & MSE
r2_temp_min_abs= r2_score(pre_temp_min_abs['temp_min_abs'], pre_temp_min_abs['temp_min_abs_imputed'])
msq_temp_min_abs= mean_squared_error(pre_temp_min_abs['temp_min_abs'], pre_temp_min_abs['temp_min_abs_imputed'])

seasons_14= imputer_temp_min_abs.predict(seasons_13.loc[seasons_13['temp_min_abs'].isnull(),:])
del seasons_14["temp_min_abs"]
seasons_14=seasons_14.rename(columns={'temp_min_abs_imputed':'temp_min_abs'}).append(seasons_13.loc[~seasons['temp_min_abs'].isnull(),:])


#wind_avg----------------------------------------------------------------------------------------

df_train, df_test = random_split(seasons_14, split_ratios=[0.8, 0.2])

#Initialize a SimpleImputer model
imputer_wind_avg = SimpleImputer(
input_columns=['hum','wind_max', 'rain', 'freez','sun','temp_avg', 'wind_max_avg'],
output_column='wind_avg',
output_path = 'imputer_model'
)

#Fit an imputer model on the train data
imputer_wind_avg.fit(train_df=df_train)
predictions_wind_avg= imputer_wind_avg.predict(df_test)

pre_wind_avg=predictions_wind_avg.loc[~predictions_wind_avg['wind_avg'].isnull(),['wind_avg','wind_avg_imputed'] ]

#Calculate R2 & MSE
r2_wind_avg= r2_score(pre_wind_avg['wind_avg'], pre_wind_avg['wind_avg_imputed'])
msq_wind_avg= mean_squared_error(pre_wind_avg['wind_avg'], pre_wind_avg['wind_avg_imputed'])

seasons_15= imputer_wind_avg.predict(seasons_14.loc[seasons_14['wind_avg'].isnull(),:])
del seasons_15["wind_avg"]
seasons_15=seasons_15.rename(columns={'wind_avg_imputed':'wind_avg'}).append(seasons_14.loc[~seasons['wind_avg'].isnull(),:])


#wind_max----------------------------------------------------------------------------------------

df_train, df_test = random_split(seasons_15, split_ratios=[0.8, 0.2])

#Initialize a SimpleImputer model
imputer_wind_max = SimpleImputer(
input_columns=['hum','wind_avg', 'rain', 'freez','sun','temp_avg', 'wind_max_avg'],
output_column='wind_max',
output_path = 'imputer_model'
)

#Fit an imputer model on the train data
imputer_wind_max.fit(train_df=df_train)
predictions_wind_max= imputer_wind_max.predict(df_test)

pre_wind_max=predictions_wind_max.loc[~predictions_wind_max['wind_max'].isnull(),['wind_max','wind_max_imputed'] ]

#Calculate R2 & MSE
r2_wind_max= r2_score(pre_wind_max['wind_max'], pre_wind_max['wind_max_imputed'])
msq_wind_max= mean_squared_error(pre_wind_max['wind_max'], pre_wind_max['wind_max_imputed'])

seasons_16= imputer_wind_max.predict(seasons_15.loc[seasons_15['wind_max'].isnull(),:])
del seasons_16["wind_max"]
seasons_16=seasons_16.rename(columns={'wind_max_imputed':'wind_max'}).append(seasons_15.loc[~seasons['wind_max'].isnull(),:])



#wind_max_avg----------------------------------------------------------------------------------------

# Split seasons_16 into train / test frames with ratios 0.8 / 0.2.
df_train, df_test = random_split(seasons_16, split_ratios=[0.8, 0.2])

#Initialize a SimpleImputer model
# NOTE(review): 'wind_max_avg' is listed both as an input column and as the
# output column, so the target is among the model's own features while being
# absent on the rows imputed below. The scores computed here may therefore be
# optimistic -- confirm whether this is intended.
imputer_wind_max_avg = SimpleImputer(
    input_columns=['hum', 'wind_max', 'rain', 'freez', 'sun', 'temp_avg', 'wind_max_avg'],
    output_column='wind_max_avg',
    output_path='imputer_model'
)

#Fit an imputer model on the train data
imputer_wind_max_avg.fit(train_df=df_train)
predictions_wind_max_avg = imputer_wind_max_avg.predict(df_test)

# Keep only the test rows whose true wind_max_avg is known so they can be scored.
pre_wind_max_avg = predictions_wind_max_avg.loc[
    predictions_wind_max_avg['wind_max_avg'].notnull(),
    ['wind_max_avg', 'wind_max_avg_imputed']
]

#Calculate R2 & MSE
r2_wind_max_avg = r2_score(pre_wind_max_avg['wind_max_avg'], pre_wind_max_avg['wind_max_avg_imputed'])
msq_wind_max_avg = mean_squared_error(pre_wind_max_avg['wind_max_avg'], pre_wind_max_avg['wind_max_avg_imputed'])

# Rows of seasons_16 whose wind_max_avg is missing; the mask is taken from the
# frame being filtered so that it always lines up with it.
wind_max_avg_missing = seasons_16['wind_max_avg'].isnull()

# Predict wind_max_avg for the missing rows, then promote the imputed column to
# 'wind_max_avg' and put the rows that already had a value back underneath.
seasons_17 = imputer_wind_max_avg.predict(seasons_16.loc[wind_max_avg_missing, :])
del seasons_17["wind_max_avg"]
# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
seasons_17 = pd.concat([
    seasons_17.rename(columns={'wind_max_avg_imputed': 'wind_max_avg'}),
    seasons_16.loc[~wind_max_avg_missing, :],
])


#sun----------------------------------------------------------------------------------------

# Split seasons_17 into train / test frames with ratios 0.8 / 0.2.
df_train, df_test = random_split(seasons_17, split_ratios=[0.8, 0.2])

#Initialize a SimpleImputer model
# NOTE(review): 'sun' is listed both as an input column and as the output
# column, so the target is among the model's own features while being absent
# on the rows imputed below. The scores computed here may therefore be
# optimistic -- confirm whether this is intended.
imputer_sun = SimpleImputer(
    input_columns=['hum', 'wind_avg', 'rain', 'freez', 'sun', 'temp_avg'],
    output_column='sun',
    output_path='imputer_model'
)

#Fit an imputer model on the train data
imputer_sun.fit(train_df=df_train)
predictions_sun = imputer_sun.predict(df_test)

# Keep only the test rows whose true sun value is known so they can be scored.
pre_sun = predictions_sun.loc[predictions_sun['sun'].notnull(), ['sun', 'sun_imputed']]

#Calculate R2 & MSE
r2_sun = r2_score(pre_sun['sun'], pre_sun['sun_imputed'])
msq_sun = mean_squared_error(pre_sun['sun'], pre_sun['sun_imputed'])

# Rows of seasons_17 whose sun value is missing; the mask is taken from the
# frame being filtered so that it always lines up with it.
sun_missing = seasons_17['sun'].isnull()

# Predict sun for the missing rows, then promote the imputed column to 'sun'
# and put the rows that already had a value back underneath.
seasons_18 = imputer_sun.predict(seasons_17.loc[sun_missing, :])
del seasons_18["sun"]
# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
seasons_18 = pd.concat([
    seasons_18.rename(columns={'sun_imputed': 'sun'}),
    seasons_17.loc[~sun_missing, :],
])

## 2.3 Export the data

In [None]:
#seasons_18.to_csv('seasons_impute.csv')

# Part 3: Yearly aggregation of the imputed data

## 3.1 Get the data

In [None]:
# Work on an independent copy of the imputed frame (alternatively, reload it
# from the 'seasons_impute.csv' export with pd.read_csv).
imp_season = seasons_18.copy(deep=True)
# Show the column names available for the yearly aggregation.
imp_season.columns

## 3.2 Generate the YEARLY dataframes

In [None]:
# Columns carried into the yearly summaries: the grouping keys ('codigo',
# 'year') plus every variable aggregated below.
yearly_columns = [
    'codigo', 'freez', 'hum', 'lev_max', 'lev_mid', 'lev_min', 'rain',
    'rain_1mm', 'rain_cum', 'rain_max_10', 'rain_max_day', 'sun',
    'temp_avg', 'temp_max_abs', 'temp_max_avg', 'temp_min_abs',
    'wind_avg', 'wind_max', 'wind_max_avg', 'year',
]

# Aggregation applied to each variable within a (codigo, year) group.
# The key order fixes the column order of the result.
yearly_aggregations = {
    'freez': 'mean',
    'hum': 'mean',
    'lev_max': 'max',
    'lev_mid': 'mean',
    'lev_min': 'min',
    'rain': 'mean',
    'rain_1mm': 'mean',
    'rain_cum': 'mean',
    'rain_max_10': 'max',
    'rain_max_day': 'max',
    'sun': 'mean',
    'temp_avg': 'mean',
    'temp_max_abs': 'max',
    'temp_max_avg': 'max',
    'temp_min_abs': 'min',
    'wind_avg': 'mean',
    'wind_max': 'max',
    'wind_max_avg': 'max',
}


def _yearly_summary(frame, year):
    """Return one aggregated row per (codigo, year) for the rows of *frame* in *year*."""
    selected = frame.loc[frame.year == year, yearly_columns]
    grouped = selected.groupby(['codigo', 'year'], as_index=True)
    return grouped.agg(yearly_aggregations).reset_index()


#2017
imp_season_anual_17 = _yearly_summary(imp_season, 2017)

#2018
imp_season_anual_18 = _yearly_summary(imp_season, 2018)

#2019
imp_season_anual_19 = _yearly_summary(imp_season, 2019)

# Stack the three years in chronological order. pd.concat replaces
# DataFrame.append, which no longer exists in pandas >= 2.0.
imp_season_anual = pd.concat([imp_season_anual_17, imp_season_anual_18, imp_season_anual_19])

## 3.3 Export the data

In [None]:
# Write the yearly METEO dataset to disk, leaving the DataFrame index out of the file.
imp_season_anual.to_csv(path_or_buf='WBds02_METEO.csv', index=False)