In [18]:
import sqlite3
import sys

import numpy as np
import pandas as pd
import xgboost as xgb
from numba import jit
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

sys.path.append("..") # append the directory above where the databese is.
from utils import config
%load_ext line_profiler
# Display-only setting: floats are rendered in fixed-point notation with two
# decimals instead of scientific notation. The stored values are not rounded.
pd.set_option('display.float_format', '{:.2f}'.format)  # show 2 decimal places

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [None]:
from utils import models

In [None]:
# Hyper-parameters that are unpacked as keyword arguments into the project's
# GDP-growth predictor.
# NOTE(review): if GDPGrowthPredictor forwards `loss` to scikit-learn's
# GradientBoostingRegressor, 'ls' is a legacy alias that newer scikit-learn
# releases no longer accept ('squared_error' replaces it) - confirm against
# utils.models before upgrading.
gbm_hyperparams = dict(
    n_estimators=100,
    max_depth=10,
    learning_rate=0.1,
    loss='ls',
)
gbm_model = models.GDPGrowthPredictor(**gbm_hyperparams)

In [None]:
# Hand the model the database path from the project config (the same path that
# is opened with sqlite3 further down).
# NOTE(review): what the model does with this attribute is defined in
# utils.models and is not visible here.
gbm_model.training_dataset = config.DATABASE_PATH

In [None]:
# Echo the attribute as the cell output to check what the assignment stored.
gbm_model.training_dataset

In [None]:
def add_others_GDP(df_origin, gdp_column=None):
    """Append, to every row, each country's GDP-growth value for that row's year.

    For every country ``c`` found in the ``CountryCode`` index level a new column
    named ``f"{gdp_column}_Country.{c}"`` is added. In the row ``(country, year)``
    that column holds the value of ``gdp_column`` for country ``c`` in ``year``,
    so every row carries the figures of all countries for its own year.

    The lookup is done by year label, so the panel does not have to be
    rectangular: a (country, year) pair that is absent from ``df_origin`` simply
    yields NaN in the corresponding cells.

    Parameters
    ----------
    df_origin : pandas.DataFrame
        Frame indexed by a MultiIndex with levels ``CountryCode`` and ``Year``.
        The index must not contain duplicate (CountryCode, Year) pairs.
    gdp_column : hashable, optional
        Name of the column to spread across countries. Defaults to
        ``config.GDP_GROWTH``.

    Returns
    -------
    pandas.DataFrame
        A new frame: the columns of ``df_origin`` (including the original
        ``gdp_column``) followed by one column per country. ``df_origin`` itself
        is not modified.
    """
    if gdp_column is None:
        gdp_column = config.GDP_GROWTH

    # Countries in order of first appearance, which fixes the order of the new columns.
    countries = df_origin.index.get_level_values("CountryCode").unique()

    # Year x CountryCode table of the GDP column.
    gdp_by_year = df_origin[gdp_column].unstack(level="CountryCode").reindex(columns=countries)
    gdp_by_year.columns = [f"{gdp_column}_Country.{country}" for country in gdp_by_year.columns]

    # Look up, for every original row, the table row of that row's year, then
    # put the original (CountryCode, Year) index back so the join is one-to-one.
    row_years = df_origin.index.get_level_values("Year")
    others = gdp_by_year.reindex(row_years)
    others.index = df_origin.index

    return df_origin.join(others)


In [None]:
def clean_and_pivote(df):
    """Pivot the long indicator table and discard countries without GDP-growth rows.

    ``df`` is expected to have the columns ``CountryCode``, ``Year``,
    ``IndicatorCode`` and ``Value``. The result has one row per
    (CountryCode, Year), one column per indicator code, and no rows for
    countries that have no row whose indicator is ``config.GDP_GROWTH``.
    """
    is_gdp_row = df["IndicatorCode"] == config.GDP_GROWTH
    countries_with_gdp = set(df.loc[is_gdp_row, "CountryCode"])
    countries_without_gdp = [
        country for country in df["CountryCode"].unique() if country not in countries_with_gdp
    ]

    pivoted = df.pivot(index=["CountryCode", "Year"], columns="IndicatorCode", values="Value")
    # Labels without an explicit level are matched against the outermost
    # (CountryCode) index level, removing every year of those countries.
    return pivoted.drop(countries_without_gdp)

In [None]:
def lags_f(df, lags=0):
    """Build, for every country and year, one wide row holding that year and its lags.

    For each country and each year ``y`` (taken from the distinct ``Year`` labels
    after skipping the first ``lags`` of them) the output row contains the values
    of every column of ``df`` for the years ``y, y - 1, ..., y - lags``. Columns
    are grouped by lag: all columns suffixed ``.LAG:0`` first, then ``.LAG:1``,
    and so on.

    The frame is assembled with one positional lookup per lag instead of one tiny
    frame per (country, year, lag), and nothing is printed while it runs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by a MultiIndex whose levels are ``CountryCode`` (outer)
        and ``Year`` (inner, integer labels). The index must be unique.
    lags : int, default 0
        Number of preceding years to attach to each row.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``CountryCode`` and ``Year.range``, where ``Year.range`` is
        the string ``f"{year} - {year - lags}"``.

    Raises
    ------
    ValueError
        If ``lags`` is negative, or if there is no country / no year left after
        skipping the first ``lags`` years.
    KeyError
        If some country lacks one of the years needed for a window.
    """
    if lags < 0:
        raise ValueError(f"lags must be non-negative, got {lags}")

    countries = df.index.get_level_values("CountryCode").unique()
    years = df.index.get_level_values("Year").unique()[lags:]
    if len(countries) == 0 or len(years) == 0:
        raise ValueError(
            "No lag windows to build: need at least one country and more than `lags` distinct years"
        )

    # One output row per (country, window end year), countries outermost.
    windows = pd.MultiIndex.from_product([countries, years], names=["CountryCode", "Year"])
    window_country = windows.get_level_values("CountryCode")
    window_year = windows.get_level_values("Year")

    lagged_blocks = []
    for lag in range(lags + 1):
        # Rows of df that supply the values `lag` years before each window end.
        source = pd.MultiIndex.from_arrays(
            [window_country, window_year - lag], names=["CountryCode", "Year"]
        )
        positions = df.index.get_indexer(source)
        absent = positions < 0
        if absent.any():
            raise KeyError(
                f"rows missing for lag {lag}, e.g. {source[absent].tolist()[:5]}"
            )
        block = df.iloc[positions].rename(columns=lambda column: f"{column}.LAG:{lag}")
        # Re-label with the window key so the per-lag blocks align side by side.
        block.index = windows
        lagged_blocks.append(block)

    result = pd.concat(lagged_blocks, axis=1)
    result.index = pd.MultiIndex.from_arrays(
        [window_country, [f"{year} - {year - lags}" for year in window_year]],
        names=["CountryCode", "Year.range"],
    )
    return result

In [None]:
# Read the whole CountryIndicators table into memory.
# A sqlite3 connection used as a context manager only commits or rolls back the
# open transaction; it does not close the connection. Close it explicitly so the
# database handle is released even if the read fails.
connection = sqlite3.connect(config.DATABASE_PATH)
try:
    df = pd.read_sql("SELECT * FROM CountryIndicators", connection)
finally:
    connection.close()

In [None]:
%%time
# Pivot the long table to one row per (CountryCode, Year) with one column per
# indicator, dropping countries that have no GDP-growth rows.
df_cleaned = clean_and_pivote(df)

In [None]:
%%time
# Add one GDP-growth column per country (named "<GDP_GROWTH>_Country.<code>")
# next to the existing indicator columns.
df_features = add_others_GDP(df_cleaned) 

In [None]:
%%time
# Widen every (country, year) row with the 3 preceding years; columns receive a
# ".LAG:<k>" suffix and the index becomes (CountryCode, Year.range).
df_year_ranges = lags_f(df_features, 3)

In [None]:

# Remove the "2010 - 2007" window of every country from the feature table.
# NOTE(review): the label is hard-coded and only matches windows built with
# lags=3 ("<year> - <year - lags>"); presumably 2010 is the last year in the
# data, so this window has no following year to predict - confirm.
df_features = df_year_ranges.drop(index="2010 - 2007", level="Year.range")


In [None]:
# Take the GDP-growth column as the regression target. The column is selected
# rather than popped so df_cleaned is left untouched and this cell can be re-run
# (a second pop() would raise KeyError because the column is already gone).
# NOTE(review): years 1960-1970 are removed here while the feature windows are
# built with lags=3 - check in the printed indexes below that features and
# target really cover the same (country, year) rows.
df_target = df_cleaned[config.GDP_GROWTH].drop(index=range(1960, 1971), level="Year")
print(f"{df_features.index} \t {df_target.index}")
# Persist the target next to the notebook for inspection outside the kernel.
df_target.to_csv("target.csv")

In [None]:
# Inspect the target series (plain-text rendering).
print(df_target)

In [None]:
# Preview of the lagged features joined with the target; the result is only
# displayed, not stored.
# NOTE(review): df_year_ranges is indexed by (CountryCode, Year.range) while
# df_target is indexed by (CountryCode, Year), so the only index level the two
# share by name is CountryCode - this join cannot line rows up by year. Verify
# that the output is what was intended.
df_year_ranges.join(df_target, how="left")

In [None]:
# NOTE(review): this cell looks like a leftover from an earlier version of the
# notebook - confirm before running it:
#   * `df_pivoted` is not defined in any visible cell;
#   * it reads config.GDP, whereas the other cells use config.GDP_GROWTH;
#   * df_features is indexed by "Year.range" at this point, so dropping on a
#     "Year" level is not expected to work;
#   * it overwrites the df_target built from df_cleaned a few cells earlier.
df_target = df_pivoted.pop(config.GDP)
df_features.drop(index=2010, level="Year", inplace=True)
df_target.drop(index=1960, level="Year", inplace=True)
# Up to this point only the basic features are selected. The next step is to add
# more features, such as the GDP of each country.


In [None]:
X = df_features
y = df_target.to_frame()
print(X.shape)
print(y.shape)

# Split first, then impute: the fill values must be learned from the training
# rows only, otherwise statistics of the held-out rows leak into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Feature imputer: fitted on the training split, re-used unchanged on the test split.
imp_mode = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
X_train = imp_mode.fit_transform(X_train)
X_test = imp_mode.transform(X_test)

# The target gets its own imputer so the feature imputer keeps its fitted state.
# NOTE(review): filling missing targets with the most frequent value is kept
# from the original cell; dropping rows without a target may be preferable.
target_imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
y_train = target_imputer.fit_transform(y_train)
y_test = target_imputer.transform(y_test)

In [None]:
# Sanity check: both training arrays should report the same number of rows.
print(X_train.shape)
print(y_train.shape)

In [None]:
# Gradient-boosted regressor; the boosting starts from the mean of the training target.
model = xgb.XGBRegressor(max_depth=7, learning_rate=0.01, subsample=0.8, n_estimators=1000, base_score=y_train.mean())
# model.fit(X_train, y_train, eval_set=[(X_test,y_test)], eval_metric="rmse", verbose=1000, early_stopping_rounds=20)
model.fit(X_train, y_train)
# Score regular predictions. output_margin=True asks for the raw, untransformed
# margin, which is only guaranteed to be on the target's scale for objectives
# without a link function, so it is not used for the metrics below.
pred = model.predict(X_test)
# mean_absolute_error, mean_squared_error and r2_score come from sklearn.metrics.
print("Model absolute error =", mean_absolute_error(y_test, pred))
print("Model squared error =", mean_squared_error(y_test, pred))
print("R2 =", r2_score(y_test, pred))