# Imports

In [149]:
import wbgapi as wb
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import numpy as np
from sklearn.metrics import mean_squared_error as mse
from yellowbrick.regressor import residuals_plot, ResidualsPlot
from yellowbrick.regressor import prediction_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.multioutput import MultiOutputRegressor
import pickle

from numpy import mean
from numpy import std
from numpy import absolute

from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoLars

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold


# Models

In [2]:
MODELS = {
    "LassoLars": MultiOutputRegressor( LassoLars(alpha=.1, normalize=False)),
    "Linear regression":MultiOutputRegressor(LinearRegression()),
    "Ridge": MultiOutputRegressor(RidgeCV()),
    "Lasso": MultiOutputRegressor(Lasso()),
    "ElasticNet": MultiOutputRegressor( ElasticNet(random_state=42)),
    "RandomForestRegressor": MultiOutputRegressor(RandomForestRegressor(max_depth=4, random_state=42)),
    "Decision Tree Regressor":MultiOutputRegressor(DecisionTreeRegressor(max_depth=5)),
    "MultiO/P GBR" :MultiOutputRegressor(GradientBoostingRegressor(n_estimators=5)),
    "MultiO/P AdaB" :MultiOutputRegressor(AdaBoostRegressor(n_estimators=5)),
    "XGBRegressor": MultiOutputRegressor(XGBRegressor(max_depth=3, n_estimators=100, n_jobs=3,
                           objective='reg:squarederror',
                           random_state=42, learning_rate=0.03)),
    "K-nn": MultiOutputRegressor( KNeighborsRegressor()),
    "LinearRegression":MultiOutputRegressor(LinearRegression()),
    
}

# Functions

In [3]:
def read_data(codes):
    worldbank_data = wb.data.DataFrame(codes, 
                                       ['DEU', 'AUS', 'CAN', 'USA', 'FRA', 'ITA', 'JPN', 'GBR', 'ZAF', 'SAU', 'ARG', 'BRA', 'CHN', 'KOR', 'IND', 'IDN', 'MEX', 'RUS', 'TUR'],
                                      range(1990, 2020, 1))
    return worldbank_data

In [4]:
def get_data(data):
    dict_data={}
    
    for index, row in data.items():
        if len(row)>19:
            for index2, row2 in row.items():
                if not dict_data.get(index2[1]):
                    dict_data[index2[1]]=[]
                dict_data.get(index2[1]).append(row2)
        else:
            for index2, row2 in row.items():
                if not dict_data.get("coluna"):
                    dict_data["coluna"]=[]
                dict_data.get("coluna").append(row2)
    return pd.DataFrame(dict_data)

In [171]:
def get_model(input, output):
    
    data_input = get_data(input)
    data_output = get_data(output)
    
    normalizer = MinMaxScaler(feature_range = (-1, 1))
 
    x = normalizer.fit_transform(data_input.values)
    y = normalizer.transform(data_output.values)
    
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, 
                                                    random_state=42)
    
    model = MultiOutputRegressor(XGBRegressor(max_depth=3, n_estimators=100, n_jobs=3,
                           objective='reg:squarederror',
                           random_state=42, learning_rate=0.05))
  
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test) # Predictions
    y_true = y_test
    R_squared = r2_score(y_true,y_pred,multioutput="variance_weighted")
    MSE = mse(y_true, y_pred)
    RMSE = np.sqrt(MSE)
    print("\nRMSE: ", np.round(RMSE, 2))
    print()
    print("R-Squared: ", np.round(R_squared, 2))
    return [model, normalizer]

In [260]:
def get_the_best_model(input,output):
    data_input = get_data(input)
    data_output = get_data(output)
    
    normalizer = MinMaxScaler(feature_range = (-1, 1))
    
    print("x-shape: ", data_input.shape)
    print("y-shape: ", data_output.shape)
    
    data = pd.concat([data_input,data_output],axis=1)
    
    len_in = len(data_input.columns)
    len_out = len(data_output.columns)
    
    data = normalizer.fit_transform(data.values)
    
    x = data[:,0:len_in]
    y = data[:,len_in:len_in+len_out]


    
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, 
                                                    random_state=42)
    y_test_predict = dict()
    y_mse = dict()

    the_best ={"RMSE":10, "MSE": 10, "MAE": 10, "R_squared": 0, 'model':dict() }

    for name, estimator in MODELS.items():
        print("Model name: ", name)
        try:
            estimator.fit(X_train, y_train)                    
        except:
            continue
        y_test_predict[name] = estimator.predict(X_test) 

        # Metrics
        y_mse[name] = mean_squared_error(y_test, estimator.predict(X_test))
        y_true = y_test 

        cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
        n_scores = cross_val_score(estimator, x, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=3)
        n_scores = absolute(n_scores)

        MSE = mse(y_true, y_test_predict[name])
        RMSE = np.sqrt(MSE)
        R_squared = r2_score(y_true, y_test_predict[name],multioutput="variance_weighted")
        MAE = mean(n_scores)
        print("RMSE: ", np.round(RMSE, 3))
        print("MSE:" , np.round(y_mse[name],3))
        print("R-Squared: ", np.round(R_squared, 3))
        print(f"MAE: {np.round(MAE,3)} ({np.round(std(n_scores),3)})")
        print()

        if(RMSE < the_best.get('RMSE') and MSE<the_best.get('MSE') and ( (1-R_squared)>0 and R_squared> the_best.get('R_squared') ) ):
            print("New best->", name) 
            the_best['RMSE'] = RMSE
            the_best['MSE'] = MSE
            the_best['R_squared']= R_squared
            the_best['model']={'name':name, 'estimator': estimator}

    print('The best model',the_best.get('model').get('name'))
    
    return [the_best.get('model').get('estimator'), normalizer]

In [261]:
def save(nome, model, normalizer):
    with open(f'models/normalizer_{nome}.pkl','wb') as f:
        pickle.dump(normalizer,f)
    with open(f'models/model_{nome}.pkl','wb') as f:
        pickle.dump(model,f)
    print('salvo')

## Agricultura e desenvolvimento rural

## Inputs

In [262]:
inputs_agricultura= ['NV.AGR.TOTL.CD']
df_inputs_agricultura = read_data(inputs_agricultura)
df_inputs_agricultura.fillna(df_inputs_agricultura.mean(), inplace=True)


## Outputs

In [263]:
outputs_agricultura = ['SL.AGR.EMPL.ZS','SP.RUR.TOTL.ZS','AG.PRD.FOOD.XD']
df_outputs_agricultura = read_data(outputs_agricultura)
df_outputs_agricultura.fillna(df_outputs_agricultura.mean(), inplace=True)
df_outputs_agricultura=df_outputs_agricultura.groupby(level=0).sum()

In [264]:
model_agricultura, normalizer_agricultura =  get_the_best_model(df_inputs_agricultura,df_outputs_agricultura)

x-shape:  (570, 1)
y-shape:  (570, 1)
Model name:  LassoLars
RMSE:  0.349
MSE: 0.122
R-Squared:  -0.007
MAE: 0.238 (0.028)

Model name:  Linear regression
RMSE:  0.318
MSE: 0.101
R-Squared:  0.165
MAE: 0.219 (0.029)

New best-> Linear regression
Model name:  Ridge
RMSE:  0.319
MSE: 0.102
R-Squared:  0.161
MAE: 0.219 (0.029)

Model name:  Lasso
RMSE:  0.349
MSE: 0.122
R-Squared:  -0.007
MAE: 0.238 (0.028)

Model name:  ElasticNet
RMSE:  0.349
MSE: 0.122
R-Squared:  -0.007
MAE: 0.238 (0.028)

Model name:  RandomForestRegressor
RMSE:  0.27
MSE: 0.073
R-Squared:  0.399
MAE: 0.204 (0.026)

New best-> RandomForestRegressor
Model name:  Decision Tree Regressor
RMSE:  0.258
MSE: 0.067
R-Squared:  0.45
MAE: 0.214 (0.029)

New best-> Decision Tree Regressor
Model name:  MultiO/P GBR
RMSE:  0.311
MSE: 0.097
R-Squared:  0.201
MAE: 0.218 (0.026)

Model name:  MultiO/P AdaB
RMSE:  0.289
MSE: 0.083
R-Squared:  0.313
MAE: 0.211 (0.025)

Model name:  XGBRegressor
RMSE:  0.276
MSE: 0.076
R-Squared:  0.3

In [265]:
save('agricultura',model_agricultura,normalizer_agricultura)

salvo


## Educação

## Inputs

In [266]:
inputs_educacao=["SE.XPD.PRIM.ZS", "SE.XPD.SECO.ZS", "SE.XPD.TERT.ZS", "SE.XPD.TOTL.GB.ZS"]
df_inputs_educacao = read_data(inputs_educacao)
df_inputs_educacao.fillna(df_inputs_educacao.mean(), inplace=True)

## Outputs

In [13]:
outputs_educacao = ["SE.ADT.LITR.ZS","SE.ADT.1524.LT.ZS","SE.PRM.CMPT.ZS", 'SE.SEC.CMPT.LO.ZS','SE.SEC.UNER.LO.ZS','SE.PRM.UNER.ZS']
df_outputs_educacao = read_data(outputs_educacao)
df_outputs_educacao.fillna(df_outputs_educacao.mean(), inplace=True)

## Outputs Negative

In [14]:
ouputs_negative_educacao = ['SE.SEC.CMPT.LO.ZS','SE.SEC.UNER.LO.ZS','SE.PRM.UNER.ZS']
# df_ouputs_negative_educacao = read_data(ouputs_negative_educacao)
# df_ouputs_negative_educacao.fillna(df_ouputs_negative_educacao.mean(), inplace=True)
# df_ouputs_negative_educacao = df_ouputs_negative_educacao * -1
# pd.concat([df_outputs_educacao, df_ouputs_negative_educacao])

In [15]:
df_outputs_educacao

Unnamed: 0_level_0,Unnamed: 1_level_0,YR1990,YR1991,YR1992,YR1993,YR1994,YR1995,YR1996,YR1997,YR1998,YR1999,...,YR2010,YR2011,YR2012,YR2013,YR2014,YR2015,YR2016,YR2017,YR2018,YR2019
economy,series,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
ARG,SE.ADT.1524.LT.ZS,57.697009,98.304642,52.413408,49.270402,49.455680,46.976121,46.996034,48.144630,35.856585,41.947214,...,99.298637,99.628242,99.375870,99.551323,99.296707,99.559700,99.502708,52.101998,99.505524,50.533718
ARG,SE.ADT.LITR.ZS,57.697009,96.040718,52.413408,49.270402,49.455680,46.976121,46.996034,48.144630,35.856585,41.947214,...,98.954971,99.108330,99.105232,99.121948,98.993889,99.179962,99.125008,52.101998,99.003868,50.533718
ARG,SE.PRM.CMPT.ZS,57.697009,50.125052,52.413408,49.270402,98.577599,46.976121,92.963257,91.528763,99.151718,97.202042,...,103.567413,103.887711,103.352249,101.645012,103.106438,102.973740,103.893570,99.962761,98.706253,98.547241
ARG,SE.PRM.UNER.ZS,57.697009,3.824100,52.413408,49.270402,49.455680,46.976121,46.996034,0.000000,0.265700,0.165470,...,0.603730,0.565900,0.571760,0.408090,0.098040,0.056300,0.452390,0.432690,0.403450,0.422470
ARG,SE.SEC.CMPT.LO.ZS,57.697009,50.125052,52.413408,49.270402,49.455680,46.976121,46.996034,48.144630,35.856585,80.332611,...,81.547379,85.269440,86.210251,88.456207,88.916321,90.115860,89.205307,89.817039,90.867340,93.539200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZAF,SE.ADT.LITR.ZS,57.697009,50.125052,52.413408,49.270402,49.455680,46.976121,82.402100,48.144630,35.856585,41.947214,...,92.877319,93.102142,93.729469,48.267920,94.139900,94.367920,50.200440,87.046669,55.320413,95.022972
ZAF,SE.PRM.CMPT.ZS,57.697009,77.067719,52.413408,49.270402,49.455680,46.976121,46.996034,48.144630,84.870483,79.950081,...,57.337221,58.558475,49.989132,48.267920,53.484998,90.110489,87.255333,52.101998,90.344498,91.999489
ZAF,SE.PRM.UNER.ZS,57.697009,12.885810,52.413408,49.270402,10.883730,10.261400,46.996034,48.144630,9.817430,8.657320,...,57.337221,58.558475,49.989132,48.267920,53.484998,4.354490,50.200440,7.550620,11.007960,10.787040
ZAF,SE.SEC.CMPT.LO.ZS,57.697009,56.152618,52.413408,49.270402,49.455680,46.976121,46.996034,48.144630,35.856585,74.023102,...,57.337221,58.558475,49.989132,48.267920,53.484998,52.729689,80.768852,52.101998,80.033234,80.372330


In [16]:
def to_negative(row):
    print(row.index[0][1])
    if row.index[0][1] in ouputs_negative_educacao:
        print(row.index[0][1])
        row = row * -1
    return row

In [17]:
ouputs_negative_educacao

['SE.SEC.CMPT.LO.ZS', 'SE.SEC.UNER.LO.ZS', 'SE.PRM.UNER.ZS']

In [18]:
# df_outputs_educacao.apply(to_negative, axis=0)

In [19]:
# df_outputs_educacao.min()

In [21]:
# df_ouputs_negative_educacao.fillna(df_ouputs_negative_educacao.mean(), inplace=True)
# df_ouputs_negative_educacao = df_ouputs_negative_educacao.groupby(level=0).sum()

## Mudanças climáticas e meio ambiente

## Inputs

In [267]:
inputs_ambiente = ['EN.ATM.METH.AG.ZS','EN.ATM.METH.AG.KT.CE','EN.ATM.NOXE.AG.ZS','EN.ATM.CO2E.KD.GD']
df_inputs_ambiente = read_data(inputs_ambiente)
df_inputs_ambiente.fillna(df_inputs_ambiente.mean(), inplace=True)

## Ouputs

In [268]:
outputs_ambiente = ["EG.ELC.ACCS.ZS","EG.ELC.HYRO.ZS","EG.ELC.RNWX.ZS"]
df_outputs_ambiente = read_data(outputs_ambiente)
df_outputs_ambiente.fillna(df_outputs_ambiente.mean(), inplace=True)
df_outputs_ambiente=df_outputs_ambiente.groupby(level=0).sum()

## Model

In [269]:
df_outputs_ambiente=df_outputs_ambiente*100

In [270]:
model_ambiente,normalizer_ambiente=get_the_best_model(df_inputs_ambiente, df_outputs_ambiente)

x-shape:  (570, 4)
y-shape:  (570, 1)
Model name:  LassoLars
Model name:  Linear regression
Model name:  Ridge
Model name:  Lasso
Model name:  ElasticNet
Model name:  RandomForestRegressor
Model name:  Decision Tree Regressor
Model name:  MultiO/P GBR
Model name:  MultiO/P AdaB
Model name:  XGBRegressor
RMSE:  0.133
MSE: 0.018
R-Squared:  0.934
MAE: 0.111 (0.014)

New best-> XGBRegressor
Model name:  K-nn
Model name:  LinearRegression
The best model XGBRegressor


In [271]:
save('ambiente',model_ambiente,normalizer_ambiente)

salvo


## Saúde

In [272]:
inputs_saude = ['SH.XPD.CHEX.GD.ZS','SH.XPD.GHED.CH.ZS']
df_inputs_saude = read_data(inputs_saude)
df_inputs_saude.fillna(df_inputs_saude.mean(), inplace=True)


## Outputs

In [273]:
outputs_saude = ['SH.STA.BASS.ZS','SH.MED.BEDS.ZS','SH.MED.NUMW.P3','SH.MED.PHYS.ZS']
df_outputs_saude = read_data(outputs_saude)
df_outputs_saude.fillna(df_outputs_saude.mean(), inplace=True)
df_outputs_saude=df_outputs_saude.groupby(level=0).sum()

## Model

In [274]:
# df_outputs_saude[df_outputs_saude.isnull().all(axis=1)]

In [275]:
model_saude,normalizer_saude=get_the_best_model(df_inputs_saude, df_outputs_saude)

x-shape:  (570, 2)
y-shape:  (570, 1)
Model name:  LassoLars
Model name:  Linear regression
Model name:  Ridge
Model name:  Lasso
Model name:  ElasticNet
Model name:  RandomForestRegressor
Model name:  Decision Tree Regressor
Model name:  MultiO/P GBR
Model name:  MultiO/P AdaB
Model name:  XGBRegressor
RMSE:  0.355
MSE: 0.126
R-Squared:  0.245
MAE: 0.206 (0.03)

New best-> XGBRegressor
Model name:  K-nn
Model name:  LinearRegression
The best model XGBRegressor


In [276]:

save('saude',model_saude,normalizer_saude)

salvo


## Infraestreutura, Ciência e Tecnologia

## Input

In [277]:
inputs_ciencia = ['GB.XPD.RSDV.GD.ZS']
df_inputs_ciencia = read_data(inputs_ciencia)
df_inputs_ciencia.fillna(df_inputs_ciencia.mean(), inplace=True)

## Output

In [278]:
outputs_ciencia =['IT.NET.SECR.P6','EP.PMP.SGAS.CD','IP.JRN.ARTC.SC','IP.IDS.NRCT','IP.IDS.RSCT','IS.RRS.GOOD.MT.K6']
df_outputs_ciencia = read_data(outputs_ciencia)
df_outputs_ciencia.fillna(df_outputs_ciencia.mean(), inplace=True)
df_outputs_ciencia=df_outputs_ciencia.groupby(level=0).sum()

## Model

In [279]:
model_ciencia,normalizer_ciencia=get_the_best_model(df_inputs_ciencia, df_outputs_ciencia)
save('ciencia',model_ciencia,normalizer_ciencia)

x-shape:  (570, 1)
y-shape:  (570, 1)
Model name:  LassoLars
Model name:  Linear regression
Model name:  Ridge
Model name:  Lasso
Model name:  ElasticNet
Model name:  RandomForestRegressor
Model name:  Decision Tree Regressor
Model name:  MultiO/P GBR
Model name:  MultiO/P AdaB
Model name:  XGBRegressor
RMSE:  0.284
MSE: 0.081
R-Squared:  0.343
MAE: 0.241 (0.041)

New best-> XGBRegressor
Model name:  K-nn
Model name:  LinearRegression
The best model XGBRegressor
salvo


## Desenvolvimento 

In [35]:
inputs_desenvolvimento = ['SI.POV.GINI']
df_inputs_desenvolvimento = read_data(inputs_desenvolvimento)
df_inputs_desenvolvimento.fillna(df_inputs_desenvolvimento.mean(), inplace=True)

In [36]:
outputs_desenvolvimento = ['SL.EMP.VULN.ZS','SL.UEM.TOTL.NE.ZS']
df_outputs_desenvolvimento = read_data(outputs_desenvolvimento)
df_outputs_desenvolvimento.fillna(df_outputs_desenvolvimento.mean(), inplace=True)

In [None]:
ouputs_negative_desenvolvimento = ['SI.DST.50MD']
df_ouputs_negative_desenvolvimento = read_data(ouputs_negative_desenvolvimento)
df_ouputs_negative_desenvolvimento.fillna(df_ouputs_negative_desenvolvimento.mean(), inplace=True)

## Banco Central

In [280]:
inputs_banco = ['FR.INR.DPST']

In [281]:
df_inputs_banco= read_data(inputs_banco)
df_inputs_banco.fillna(df_inputs_banco.mean(), inplace=True)

In [282]:
outputs_banco = ['FM.LBL.BMNY.GD.ZS','FM.LBL.BMNY.ZG','FP.CPI.TOTL.ZG','NY.GDP.DEFL.KD.ZG.AD','PA.NUS.FCRF']
df_outputs_banco = read_data(outputs_banco)
df_outputs_banco.fillna(df_outputs_banco.mean(), inplace=True)
df_outputs_banco=df_outputs_banco.groupby(level=0).sum()

In [283]:
model_banco,normalizer_banco=get_the_best_model(df_inputs_banco, df_outputs_banco)
save('banco',model_banco,normalizer_banco)

x-shape:  (570, 1)
y-shape:  (570, 1)
Model name:  LassoLars
RMSE:  0.259
MSE: 0.067
R-Squared:  -0.003
MAE: 0.135 (0.036)

Model name:  Linear regression
RMSE:  0.251
MSE: 0.063
R-Squared:  0.059
MAE: 0.13 (0.036)

New best-> Linear regression
Model name:  Ridge
RMSE:  0.247
MSE: 0.061
R-Squared:  0.086
MAE: 0.13 (0.036)

New best-> Ridge
Model name:  Lasso
RMSE:  0.259
MSE: 0.067
R-Squared:  -0.003
MAE: 0.135 (0.036)

Model name:  ElasticNet
RMSE:  0.259
MSE: 0.067
R-Squared:  -0.003
MAE: 0.135 (0.036)

Model name:  RandomForestRegressor
RMSE:  0.229
MSE: 0.053
R-Squared:  0.21
MAE: 0.126 (0.036)

New best-> RandomForestRegressor
Model name:  Decision Tree Regressor
RMSE:  0.265
MSE: 0.07
R-Squared:  -0.055
MAE: 0.13 (0.035)

Model name:  MultiO/P GBR
RMSE:  0.245
MSE: 0.06
R-Squared:  0.102
MAE: 0.131 (0.036)

Model name:  MultiO/P AdaB
RMSE:  0.3
MSE: 0.09
R-Squared:  -0.35
MAE: 0.179 (0.049)

Model name:  XGBRegressor
RMSE:  0.244
MSE: 0.06
R-Squared:  0.106
MAE: 0.177 (0.033)

Mo

# Economia

## Input

In [284]:
inputs_economia =['NY.GDP.FCST.CD','NV.IND.TOTL.CD','NV.IND.MANF.CD','NV.SRV.TOTL.CD']
df_inputs_economia = read_data(inputs_economia)
df_inputs_economia.fillna(df_inputs_economia.mean(), inplace=True)

## Output

In [285]:
outputs_economia = ['NY.GDS.TOTL.CD','NE.RSB.GNFS.CD','NE.CON.TOTL.CD','IC.REG.STRT.BUS.DFRN']
df_outputs_economia = read_data(outputs_economia)
df_outputs_economia.fillna(df_outputs_economia.mean(), inplace=True)
df_outputs_economia=df_outputs_economia.groupby(level=0).sum()


## Model

In [286]:
model_economia, normalizer_economia = get_the_best_model(df_inputs_economia, df_outputs_economia)
save('economia', model_economia, normalizer_economia)

x-shape:  (570, 4)
y-shape:  (570, 1)
Model name:  LassoLars
RMSE:  0.238
MSE: 0.057
R-Squared:  -0.015
MAE: 0.181 (0.03)

Model name:  Linear regression
RMSE:  0.062
MSE: 0.004
R-Squared:  0.931
MAE: 0.023 (0.01)

New best-> Linear regression
Model name:  Ridge
RMSE:  0.056
MSE: 0.003
R-Squared:  0.943
MAE: 0.027 (0.008)

New best-> Ridge
Model name:  Lasso
RMSE:  0.238
MSE: 0.057
R-Squared:  -0.015
MAE: 0.181 (0.03)

Model name:  ElasticNet
RMSE:  0.238
MSE: 0.057
R-Squared:  -0.015
MAE: 0.181 (0.03)

Model name:  RandomForestRegressor
RMSE:  0.025
MSE: 0.001
R-Squared:  0.989
MAE: 0.02 (0.005)

New best-> RandomForestRegressor
Model name:  Decision Tree Regressor
RMSE:  0.038
MSE: 0.001
R-Squared:  0.975
MAE: 0.022 (0.006)

Model name:  MultiO/P GBR
RMSE:  0.146
MSE: 0.021
R-Squared:  0.619
MAE: 0.112 (0.019)

Model name:  MultiO/P AdaB
RMSE:  0.046
MSE: 0.002
R-Squared:  0.963
MAE: 0.04 (0.007)

Model name:  XGBRegressor
RMSE:  0.07
MSE: 0.005
R-Squared:  0.913
MAE: 0.069 (0.002)



## Trash

In [254]:
sad = normalizer_economia.transform([[ -9.786788e+10,5.091778e+10,7.895158e+10,1.413524e+11,0]])
x_teste_economia=[sad[0][:4]]

In [255]:
result = model_economia.predict(x_teste_economia)
result = [result[0][0],0,0,12]


In [256]:
normalizer_agricultura.inverse_transform([result])

array([[5.93120862e+09, 6.77800263e+01, 3.16899998e+01, 4.39895000e+02]])

In [97]:
# df_inputs_economia

In [None]:
def get_data(data):
    dict_data={}
    for index, row in dict_input.items():
        for index2, row2 in row.items():
            if not dict_data.get(index2[1]):
                dict_data[index2[1]]=[]
            dict_data.get(index2[1]).append(row2)
    return pd.DataFrame(dict_data)

In [None]:
df_inputs_agricultura

In [None]:
teste = {}
for index, row in df_inputs_agricultura.items():
    if len(row)>19:
        for index2, row2 in row.items():
            if not teste.get(index2[1]):
                teste[index2[1]]=[]
            teste.get(index2[1]).append(row2)
    else:
        for index2, row2 in row.items():
            if not teste.get("coluna"):
                teste["coluna"]=[]
            teste.get("coluna").append(row2)
            
#     if not teste.get(index[1]):
#         teste[index[1]]=[]
#     teste.get(index[1]).append({row[0]:row[1]})
    
z=pd.DataFrame(teste)
z

In [None]:
teste = {}
for index, row in df_inputs_economia.items():
    for index2, row2 in row.items():
        if not teste.get(index2[1]):
            teste[index2[1]]=[]
        teste.get(index2[1]).append(row2)
#     if not teste.get(index[1]):
#         teste[index[1]]=[]
#     teste.get(index[1]).append({row[0]:row[1]})
    
w=pd.DataFrame(teste)

In [None]:
# df_outputs_economia[df_outputs_economia.isnull().all(axis=1)]

In [None]:
# df_outputs_economia.fillna(df_outputs_economia.mean(), inplace=True)
# df_outputs_economia=df_outputs_economia.groupby(level=0).sum()

In [None]:
# df_outputs_economia.transpose().
x = w.values
y = z.values


## MODELOS

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import numpy as np
from sklearn.metrics import mean_squared_error as mse
from yellowbrick.regressor import residuals_plot,ResidualsPlot
from yellowbrick.regressor import prediction_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.multioutput import MultiOutputRegressor
import pickle


In [None]:
x = get_data(df_inputs_economia)
y = get_data(df_outputs_economia)
sc= MinMaxScaler(feature_range = (-1, 1))

In [None]:
x = sc.fit_transform(x)
y = sc.fit_transform(y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, 
                                                    random_state=42)

In [None]:
print(x.shape)
print(y.shape)

In [None]:
model=MultiOutputRegressor(LGBMRegressor())

In [None]:
# xgb_reg = MultiOutputRegressor(XGBRegressor(max_depth=3, n_estimators=100, n_jobs=3,
#                            objective='reg:squarederror',
#                            random_state=42, learning_rate=0.05))

In [None]:
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test) # Predictions
y_true = y_test # True values

In [None]:
MSE = mse(y_true, y_pred)
RMSE = np.sqrt(MSE)

In [None]:
R_squared = r2_score(y_true, y_pred)

In [None]:
print("\nRMSE: ", np.round(RMSE, 2))
print()
print("R-Squared: ", np.round(R_squared, 2))

In [None]:
# Making the Prediction Error Plot
print("\nPrediction Error Plot")
print(prediction_error(model, X_train, y_train, X_test, y_test))

In [None]:
# Making the Residuals Plot
print("\nResiduals Plot")
print(residuals_plot(model, X_train, y_train, X_test, y_test)) 

In [None]:
model1=MultiOutputRegressor(LinearRegression())

In [None]:
model1.fit(X_train, y_train)

In [None]:
y_pred = model1.predict(X_test) # Predictions
y_true = y_test # True values

In [None]:
y_true = y_test # True values
MSE = mse(y_true, y_pred)
RMSE = np.sqrt(MSE)
R_squared = r2_score(y_true, y_pred,multioutput='variance_weighted')
print("\nRMSE: ", np.round(RMSE, 2))
print()
print("R-Squared: ", np.round(R_squared, 2))

In [None]:
print("\nRMSE: ", np.round(RMSE, 2))
print()
print("R-Squared: ", np.round(R_squared, 2))

In [None]:
# Making the Prediction Error Plot
print("\nPrediction Error Plot")
print(prediction_error(model, X_train, y_train, X_test, y_test))

In [None]:
def save(nome, model, normalizer) 
    with open(f'models/normalizer_{nome}.pkl','wb') as f:
        pickle.dump(normalizer,f)
    with open(f'models/model_{nome}.pkl','wb') as f:
        pickle.dump(model,f)

In [None]:
testando=sc.fit_transform([[12313131,31234123,412312412,312312312]])
testando

In [None]:
xgb_reg.predict(testando)

In [None]:
pip install scikit-image

In [None]:

# 1.0 Call libraries
# For data manipulation
import numpy as np

# 1.1 For plotting faces
import matplotlib.pyplot as plt   
from skimage.io import imshow

##Metrics
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold



In [None]:
def show_image(test,X_test,y_test_predict,name,n_faces,y_mse):
  ## scattor plot
  plt.figure(figsize=(8,6))
  plt.scatter(y_test_predict[name],y_test,cmap='plasma')
  plt.title(name)
  plt.show()
  print('RMSE for ',name,' is ',y_mse[name])


In [None]:
ESTIMATORS = {
                              # Accept default parameters
    "Linear regression":MultiOutputRegressor(  LinearRegression()),
    "Ridge": MultiOutputRegressor( RidgeCV()),
    "Lasso": MultiOutputRegressor( Lasso()),
    "ElasticNet": MultiOutputRegressor( ElasticNet(random_state=0)),
    "RandomForestRegressor": MultiOutputRegressor( RandomForestRegressor(max_depth=4, random_state=2)),
    "Decision Tree Regressor":MultiOutputRegressor( DecisionTreeRegressor(max_depth=5)),
    "MultiO/P GBR" :MultiOutputRegressor(GradientBoostingRegressor(n_estimators=5)),
    "MultiO/P AdaB" :MultiOutputRegressor(AdaBoostRegressor(n_estimators=5)),
    "XGBRegressor": MultiOutputRegressor(XGBRegressor(max_depth=3, n_estimators=100, n_jobs=3,
                           objective='reg:squarederror',
                           random_state=42, learning_rate=0.05)),
    "K-nn": MultiOutputRegressor( KNeighborsRegressor()),
    "LinearRegression":MultiOutputRegressor(LinearRegression()),
    
}

In [None]:
y_test_predict = dict()
y_mse = dict()

the_best ={"model":[]}

for name, estimator in ESTIMATORS.items(): 
    estimator.fit(X_train, y_train)                    
    y_test_predict[name] = estimator.predict(X_test) 
    
    # Metrics
    y_mse[name] = mean_squared_error(y_test, estimator.predict(X_test))
    y_true = y_test 
    
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    n_scores = cross_val_score(estimator, x, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-2)
    n_scores = absolute(n_scores)
    
    MSE = y_mse[name] 
    RMSE = np.sqrt(MSE)
    R_squared = r2_score(y_true, y_test_predict[name],multioutput="variance_weighted")
    MAE = mean(n_scores)
    print("Model name:", name)
    print("RMSE: ", np.round(RMSE, 3))
    print("MSE:" , np.round(MSE,3))
    print("R-Squared: ", np.round(R_squared, 3))
    print(f"MAE: {np.round(MAE,3)} ({np.round(std(n_scores),3)})")
    print()

In [None]:
y_test_predict = dict()
y_mse = dict()

the_best ={"RMSE":10, "MSE": 10, "MAE": 10, "R_squared": 0, 'model':dict() }

for name, estimator in ESTIMATORS.items(): 
    estimator.fit(X_train, y_train)                    
    y_test_predict[name] = estimator.predict(X_test) 
    
    # Metrics
    y_mse[name] = mean_squared_error(y_test, estimator.predict(X_test))
    y_true = y_test 
    
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    n_scores = cross_val_score(estimator, x, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-2)
    n_scores = absolute(n_scores)
    
    MSE = mse(y_true, y_test_predict[name])
    RMSE = np.sqrt(MSE)
    R_squared = r2_score(y_true, y_test_predict[name],multioutput="variance_weighted")
    MAE = mean(n_scores)
    print("Model name:", name)
    print("RMSE: ", np.round(RMSE, 3))
    print("MSE:" , np.round(y_mse[name],3))
    print("R-Squared: ", np.round(R_squared, 3))
    print(f"MAE: {np.round(MAE,3)} ({np.round(std(n_scores),3)})")
    print()
    
    if(RMSE < the_best.get('RMSE') and MSE<the_best.get('MSE') and ( (1-R_squared)>0 and R_squared> the_best.get('R_squared') ) ):
        print("New best->", name) 
        the_best["RMSE"] = RMSE
        the_best['MSE'] = MSE
        the_best["R_squared"]= R_squared
        the_best['model']={'name':name, 'estimator': estimator}
            
print('The best model',the_best.get("model").get('name'))

In [196]:
a=get_data(df_inputs_economia)
b=get_data(df_outputs_economia)
c= pd.concat([a, b], axis=1)
al= len(a.columns)
bl= len(b.columns)
print(al)
print(bl)
print(c.iloc[:,0:al].columns)
print(a.columns)
print(c.iloc[:,al:al+bl].columns)
print(b.columns)


4
3
Index(['NV.IND.MANF.CD', 'NV.IND.TOTL.CD', 'NV.SRV.TOTL.CD', 'NY.GDP.FCST.CD'], dtype='object')
Index(['NV.IND.MANF.CD', 'NV.IND.TOTL.CD', 'NV.SRV.TOTL.CD', 'NY.GDP.FCST.CD'], dtype='object')
Index(['NE.CON.TOTL.CD', 'NE.RSB.GNFS.CD', 'NY.GDS.TOTL.CD'], dtype='object')
Index(['NE.CON.TOTL.CD', 'NE.RSB.GNFS.CD', 'NY.GDS.TOTL.CD'], dtype='object')


In [224]:

# input['NY.GDP.FCST.CD','NV.IND.TOTL.CD','NV.IND.MANF.CD','NV.SRV.TOTL.CD']
# out['NY.GDS.TOTL.CD','NE.RSB.GNFS.CD','NE.CON.TOTL.CD','IC.REG.STRT.BUS.DFRN']


model_eco=pickle.load(open("models/model_economia.pkl",'rb'))
normalizer_eco=pickle.load(open("models/normalizer_economia.pkl",'rb'))


In [227]:
normalizer_eco.transform([[123,123,123,3,4,4,3]])

array([[-1.00520908, -1.01088954, -1.0042351 , -1.00933533, -1.00809216,
         0.37337072, -1.00721503]])