In [1]:
# General libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import plotly.express as px
from pandas_datareader import data as pdr
import yfinance as yfin
import datetime as dt
from datetime import datetime
import seaborn as sns
import urllib.request
from PIL import Image
import re
from path import Path
from scipy import stats
import statsmodels.api as sm
from IPython.display import display, HTML
from scipy.stats import pearsonr
import itertools
from scipy.stats import ttest_ind
import statsmodels.tsa.stattools as tsa
import pickle
import joblib
from joblib import dump
from scipy.stats import norm


# FRED library
from fredapi import Fred
# API Key
fred_key = '2e3cf97d1b456831253eda002ce25948'

## Machine Learning libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
# Regression Models
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
# Deep Learning Models
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, GRU, Conv1D, Dense, Flatten, Dropout
from statsmodels.tsa.statespace.varmax import VARMAX
from keras.wrappers.scikit_learn import KerasRegressor
from keras.wrappers.scikit_learn import KerasClassifier
from kerasbeats import prep_time_series, NBeatsModel
from keras.losses import MeanSquaredLogarithmicError
from keras.optimizers import Adam
from keras import Model
from sklearn.model_selection import train_test_split

# Metrics and processing
from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error as mape

import warnings
warnings.filterwarnings('ignore')

from definitions import *

### Setting up the configuration

In [2]:
## Date varialbes
years = 20
period="W"
cutoff_date=datetime.today().strftime('%Y-%m-%d')
Ymd_str = datetime.today().strftime('%Y%m%d')
Ym_str = datetime.today().strftime('%Y%m')
Y_str = datetime.today().strftime('%Y')
timeframe = 365*years
today = datetime.today()
end = today.strftime("%Y-%m-%d")
start = (today - dt.timedelta(days=timeframe)).strftime("%Y-%m-%d")
periods = period

## Config varialbes
ROOT_PATH = Path(ROOT_PATH)
config_paths = [
    "config\Market_Data_Config.csv",
    "config\Economic_Data_Config.csv",
    "config\Calc_Data_Config.csv"]
market_config = ROOT_PATH.joinpath(config_paths[0])
economic_config = ROOT_PATH.joinpath(config_paths[1]).abspath()
calc_config = ROOT_PATH.joinpath(config_paths[2]).abspath()
target_list = pd.read_csv(market_config, sep=";", header=0).loc[:, "Codigo"].to_list()
markets_used = pd.read_csv(market_config, sep=";", header=0).loc[:, "Codigo"].to_list()
markets_remove = pd.read_csv(market_config, sep=";", header=0).loc[:, "Market"].to_list()
# markets_used = ['SPY', 'GDX', 'BND']
target = "Financials"
seed = 2  # ML random seed
extract = True
cross_val=5
medidas = ["mean", "median", "mode", "Min", "Percentil_25", "Percentil_75","Max", "var", "std", "skew", "kurt"]
extract = True

### Creating the necessary folders

In [28]:
## Paths variables
PARENT_DIR = ROOT_PATH
DATA_FOLDER = "data/result/processed_data"
PREDICT_FOLDER = "data/result/prediction_data"
RAW_FOLDER = "data/result/raw_data"
MODEL_FOLDER = "model"

data_path = PARENT_DIR+"/"+ DATA_FOLDER+"/"+ Y_str+"/"+ Ym_str+"/"+ Ymd_str
predict_path = PARENT_DIR+"/"+ PREDICT_FOLDER+"/"+ Y_str+"/"+ Ym_str+"/"+ Ymd_str
model_path = PARENT_DIR+"/"+ MODEL_FOLDER+"/"+ Y_str+"/"+ Ym_str+"/"+ Ymd_str
raw_path = PARENT_DIR+"/"+ RAW_FOLDER+"/"+ Y_str+"/"+ Ym_str+"/"+ Ymd_str
try:
    os.makedirs(data_path)
except:
    print("No folder created: "+DATA_FOLDER)
try:
    os.makedirs(predict_path)
except:
    print("No folder created: "+PREDICT_FOLDER)
try:
    os.makedirs(model_path)
except:
    print("No folder created: "+MODEL_FOLDER)
try:
    os.makedirs(raw_path)
except:
    print("No folder created: "+RAW_FOLDER)

No folder created: data/result/processed_data
No folder created: data/result/prediction_data
No folder created: model
No folder created: data/result/raw_data


### Data configuration

In [4]:
def convert_to_dictionary(config, markets_used):
        config = config.dropna()
        if markets_used is None:
            pass
        else:
            markets_used = markets_used
            config = config[config["Codigo"].isin(markets_used)]
        config.set_index('Codigo', inplace=True)
        if config is None:
            raise ValueError("No config loaded.")
        if config.columns.size >2:
            config = config.iloc[:, :2]
        return config.to_dict()

In [5]:
market = pd.read_csv(market_config, sep=';', decimal=',', header=0)
market_dict = convert_to_dictionary(market, markets_used=markets_used)
market_dict = market_dict['Market']

In [6]:
economic = pd.read_csv(economic_config, sep=';', decimal=',', header=0)
economic_dict = convert_to_dictionary(economic, markets_used=None)
economic_dict = economic_dict['Indicador']

indicators = {}
for ind in list(economic["Tipo"].unique()):
    indicators[ind] = economic[economic["Tipo"] == ind]["Indicador"].to_list()

### Extract Indicator Data

In [7]:
fred = Fred(api_key=fred_key)

indicators_df = pd.DataFrame() # DataFrame para almacenar los datos extraidos
# Acceder al diccionario de indicadores economicos para extraer sus datos (utilizando el objeto 'fred' y la funcion 'get_series') y almacenarlos en un DataFrame
for code,indicator in economic_dict.items():
    print(f'   -->Extracting {indicator}...')
    indicators_df[indicator] = fred.get_series(code)
path = ROOT_PATH.joinpath('data', 'result', 'raw_data', 'indicators_df.csv')
indicators_df.to_csv(path)

   -->Extracting 10-Year Treasury Yield...
   -->Extracting 2-Year Treasury Yield...
   -->Extracting Federal Funds Effective Rate...
   -->Extracting 1-Month Treasury Yield...
   -->Extracting 3-Month Treasury Yield...
   -->Extracting 5-Year Treasury Yield...
   -->Extracting 20-Year Treasury Yield...
   -->Extracting 30-Year Treasury Yield...
   -->Extracting 15-Year Mortgage Rate...
   -->Extracting 30-Year Mortgage Rate...
   -->Extracting 10-Year Euro Gov Bond Yield...
   -->Extracting 3-Month Euro Gov Bond Yield...
   -->Extracting 3-Month China Gov Bond Yield...
   -->Extracting 10-Year India Gov Bond Yield...
   -->Extracting CPI...
   -->Extracting PPI...
   -->Extracting China CPI...
   -->Extracting Europe CPI...
   -->Extracting Unemployment Rate...
   -->Extracting GDP...
   -->Extracting Real GDP...
   -->Extracting Consumer Confidence Index...
   -->Extracting Government Debt...
   -->Extracting Debt Service vs Disposable Income...
   -->Extracting Credit Card Delinquen

#### Clean Indicator Data

In [10]:
fill_method="ffill"
indicator_dict = indicators
df_indicators = indicators_df.loc[start:end]
df_indicators.index = pd.to_datetime(df_indicators.index, utc=True, format='%Y-%m-%d')
df_indicators = df_indicators.resample(periods).last()
# Rellenar los datos vacios con el dato anterior
df_indicators_limpio = df_indicators.fillna(method=fill_method)
if fill_method == 'ffill':
    # Rellenar los siguientes datos vacios con el ultimo dato
    df_indicators_limpio.fillna(method='bfill', inplace=True) 
# Guardar las tablas de indicadores en csv
for ind_name, ind_list in indicator_dict.items():
    path = ROOT_PATH.joinpath('data', 'result',  'processed_data', 'indicators', ind_name+'.csv')
    df_indicators[ind_list].dropna().to_csv(path)
# Generar el diferencial de los datos
df_indicators_diff = df_indicators_limpio.diff().fillna(0)
# Generar el dataframe de rendimiento de los datos
df_indicators_rets = df_indicators_limpio.pct_change().fillna(0)
dfs = [df_indicators, df_indicators_limpio, df_indicators_diff, df_indicators_rets]
for df in dfs:
    try:
        df.index = df.index.strftime("%Y-%m-%d")
    except:
        print("Error in date formatting")
df_indicators_cum = df_indicators_rets.cumsum().fillna(0)

#### Extract Market Data

In [13]:
stocks = list(market_dict.keys())
yfin.pdr_override()
# Extraer los precios de !Yahoo Finanzas para cada uno de los indices y almacenarlos en el DataFrame 'markets'
markets = pdr.get_data_yahoo(stocks,start=start,end=end)
# Filtrar el DataFrame quedandonos con la columna de 'Adj Close' y el rango temporal previamente definido
market_hist = markets["Adj Close"].loc[start:end]# Guardar el DataFrame como un archivo csv
path = ROOT_PATH.joinpath('data', 'result', 'raw_data', 'market_df.csv')
market_hist.to_csv(path)

[*********************100%***********************]  17 of 17 completed


#### Clean Indicator Data

In [16]:
resample=periods
fill_method="ffill"
df_markets = market_hist.rename(columns=market_dict)
# Filtrar los datos de mercado de los primeros 23 años
df_markets = df_markets.loc[start:end]
df_markets.index = pd.to_datetime(df_markets.index, utc=True, format='%Y-%m-%d')
# df_markets.index = df_markets.index.strftime('%Y-%m-%d')

# Hacer el resampleo de datos
df_markets = df_markets.resample(resample).fillna(method=fill_method)
if fill_method == 'ffill':
    # Rellenar los siguientes datos vacios con el ultimo dato
    df_markets.fillna(method='bfill', inplace=True) 

# Crear DataFrame de rendimiento diario de mercados
df_market_rets = df_markets.pct_change().fillna(0)
df_market_rets.index = pd.to_datetime(df_market_rets.index, utc=True, format='%Y-%m-%d')
df_market_rets.index = df_market_rets.index.strftime('%Y-%m-%d')
# Crear DataFrame de rendimiento acumulado de mercados
df_market_cum = df_market_rets.cumsum().fillna(0)

# Crear DataFrame de diferencial de mercados
df_market_diff = df_markets.diff().fillna(0)

# Guardar tablas procesadas de mercados

path = ROOT_PATH.joinpath('data', 'result', 'processed_data', 'markets')
df_markets.to_csv(path.joinpath('market_hist.csv'))
df_market_rets.to_csv(path.joinpath('market_rets.csv'))
df_market_cum.to_csv(path.joinpath('market_cum.csv'))
df_market_diff.to_csv(path.joinpath('market_diff.csv'))

#### Merge indicator and market data

In [23]:
list_market_dfs = [df_markets,df_market_rets,df_market_cum,df_market_diff]
list_indicators_dfs = [df_indicators_limpio,df_indicators_rets,df_indicators_cum,df_indicators_diff]

list_all_dfs = []
for df_indicators, df_markets in zip(list_indicators_dfs, list_market_dfs):
    if type(df_indicators.index[0]) != str:
        try:
            df_indicators.index = df_indicators.index.strftime("%Y-%m-%d")
        except:
            print("Don't change date format")
    if type(df_markets.index[0]) != str:
        try:
            df_markets.index = df_markets.index.strftime("%Y-%m-%d").str.split(" ").str[0]
        except:
            print("Don't change date format")
    df = pd.merge(df_indicators,df_markets, left_index=True, right_index=True,how='outer').fillna(method='ffill')
    list_all_dfs.append(df)
path = ROOT_PATH.joinpath('data', 'result', 'processed_data', 'indicators', 'model_data.csv')
list_all_dfs[1].to_csv(path)
df_all_data, df_all_data_rets, df_all_data_cum, df_all_data_diff = list_all_dfs[0], list_all_dfs[1], list_all_dfs[2], list_all_dfs[3]


#### Remove outliers

In [25]:
df = df_all_data_rets.copy()
threshold_mad = 6

median = np.median(df)
mad = np.median(np.abs(df - median))
threshold_mad = threshold_mad
modified_z_scores = 0.6745 * (df - median) / mad
outliers_mad = df[np.abs(modified_z_scores) > threshold_mad]
df_no_outliers = df[np.abs(modified_z_scores) <= threshold_mad]

for column in df_no_outliers.columns:
    null_indexes = df_no_outliers[column].isnull()
    null_indexes_shifted = null_indexes.shift(1, fill_value=False)
    null_indexes_shifted_rev = null_indexes.shift(-1, fill_value=False)
    mask = null_indexes | null_indexes_shifted | null_indexes_shifted_rev

    while mask.any():  # Repeat until there are no more NaN values surrounded by non-NaN values
        avg_values = df_no_outliers[column].rolling(3, min_periods=1, center=True).mean()  # Calculate rolling average of size 3
        df_no_outliers[column] = np.where(mask, avg_values, df_no_outliers[column])  # Replace NaN values with rolling average values

        null_indexes = df_no_outliers[column].isnull()
        null_indexes_shifted = null_indexes.shift(1, fill_value=False)
        null_indexes_shifted_rev = null_indexes.shift(-1, fill_value=False)
        mask = null_indexes | null_indexes_shifted | null_indexes_shifted_rev
df = df_no_outliers.copy()

#### Add feature data

In [26]:
df_all_data["CAPE Ratio"] = df_all_data["SP500"]/(df_all_data["Corporate Profits"]*0.01)
df["CAPE Ratio"] = df_all_data["SP500"]/(df_all_data["Corporate Profits"]*0.01)

# df_all_data["CAPE Ratio"].plot()
def trend_line(df, name, deg=2):
    coef = np.polyfit(range(0,len(df[name])), df[name], deg)
    x_trend = np.linspace(0,len(df[name]),len(df[name]))
    y_trend = np.polyval(coef, x_trend)
    df = pd.DataFrame(y_trend, index=df.index, columns=[name])
    return df

# df_all_data = pd.DataFrame()
df_all_data["SP_GDP"] = df_all_data["SP500"]/(df_all_data["GDP"]*.01)
df_all_data["SP_GDP_trend"] = trend_line(df_all_data, "SP_GDP", deg=5)
df["SP_GDP"] = df_all_data["SP500"]/(df_all_data["GDP"]*.01)
df["SP_GDP_trend"] = trend_line(df_all_data, "SP_GDP", deg=5)

# df_all_data = pd.DataFrame()
std = df_all_data["SP_GDP"].std()
df_all_data["SP_GDP_1std"] = df_all_data["SP_GDP_trend"] + (std)
df["SP_GDP_1std"] = df_all_data["SP_GDP_trend"] + (std)

# df_all_data = df_all_data.copy()
# df_ts = df_all_data.loc[:,df_all_data.columns.str.contains(f"t-")]
# df_all_data.drop(df_ts.columns,axis=1,inplace=True)
for ma in df_all_data.columns:
    df_all_data[f"{ma}_MA"] = df_all_data[[ma]].rolling(window=52).mean().fillna(method="ffill").fillna(method="bfill")
    df_all_data[f"{ma}_std"] = df_all_data[[ma]].rolling(window=52).std().fillna(method="ffill").fillna(method="bfill")
    df_all_data[f"{ma}_trend"] = trend_line(df_all_data[[ma]], ma, deg=6)
    df_all_data[f"{ma}_MA_trend_dif"] = df_all_data[f"{ma}_trend"] - df_all_data[f"{ma}_MA"]
    
    df[f"{ma}_MA"] = df_all_data[[ma]].rolling(window=52).mean().fillna(method="ffill").fillna(method="bfill")
    df[f"{ma}_trend"] = trend_line(df_all_data[[ma]], ma, deg=6)
    df[f"{ma}_MA_trend_dif"] = df_all_data[f"{ma}_trend"] - df_all_data[f"{ma}_MA"]

## Machine Learning

### Feature Selection

In [30]:
#### Indentifying the most important features
##### Splitting the data

##### Creating the baseline for feature importance
baseline_models = {
                # "LinearRegression": LinearRegression(), 
                # "PolynomialFeatures": PolynomialFeatures(),
                # "DecisionTree": DecisionTreeRegressor(), 
                "RandomForest": RandomForestRegressor(),
                "GradientBoosting": GradientBoostingRegressor(), 
                "SVR": SVR(), 
                "KNeighborsRegressor": KNeighborsRegressor(),
                "XGBRegressor": XGBRegressor(),
                }
#### Splitting the data
data = df.copy()
test_size = 0.15
X = data.drop([target], axis=1)
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False)
#### Creating the baseline
model_results = pd.DataFrame()
model_scores_dict = {}
model_mse_dict = {}
model_r2_dict = {}

# For model prediction data saving
preds = {}
preds[target] = pd.Series(y_test)

for name, model in baseline_models.items():
    print("Processing "+name)
    if name == "PolynomialFeatures":
        pass
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # y_pred = pd.DataFrame(y_pred,index=y_test.index,columns=[name+"_"+target+"_pred"])
        y_pred = pd.Series(y_pred,index=y_test.index)
        preds[name+"_"+target+"_pred"] = y_pred
        score = model.score(X_train, y_train)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        model_scores_dict[name] = score
        model_mse_dict[name] = mse
        model_r2_dict[name] = r2

model_results = model_results.append([model_scores_dict,model_mse_dict,model_r2_dict], ignore_index=True).T.sort_values(by=0, ascending=False)
model_results.columns = ["score","mse","r2"]
model_results["rmse"] = np.sqrt(model_results["mse"])
baseline_preds = pd.DataFrame(preds)

# Pick the models that have a score above 0.75
model_results = model_results[model_results.loc[:,"score"]>(max(model_results.loc[:,"score"])*.75)]
# Sort by rmse
model_results.sort_values(by="rmse",ascending=True,inplace=True)
# Pick the model with the best rmse
best_model_name = model_results.index[0]
best_model = baseline_models[best_model_name]
#### Feature importance
df_data = df.loc[:cutoff_date]
model = best_model
accepted_importance = 0.85

X = df_data.drop([target], axis=1)
y = df_data[target]

feat_imp_model = model
feat_imp_model.fit(X,y)
score = feat_imp_model.score(X,y)

feature_importance = feat_imp_model.feature_importances_

df_feature_importance = pd.DataFrame(index=X.columns,data=feature_importance, columns=["Importance"]).sort_values(by="Importance", ascending=False)
df_feature_importance["Cum_Importance"] = df_feature_importance.cumsum()
df_top_feature_importance = df_feature_importance[df_feature_importance["Cum_Importance"] < accepted_importance]

df_top_data = df_data.loc[:,df_data.columns.isin(df_top_feature_importance.index)]
df_top_data = pd.concat([df_top_data, df_data[[target]]], axis=1).dropna()     

#### Feature removal
def feature_removal(df, df_top_data, model_results, best_model_name, score):
    best_model_score = model_results.loc[best_model_name,"score"]
    if score > best_model_score*.9:
        print("We choose to remove "+str(len(df.columns)-len(df_top_data.columns))+" features")
        df = df_top_data.copy()
    else:
        print("We choose to keep the original df with "+str(len(df_top_data.columns))+" features")
    return df

df = feature_removal(df, df_top_data, model_results, best_model_name, score)
## Saving the processed data (ready for ML)
df.to_csv(data_path+f"/processed_data_{target}_{Ymd_str}.csv", index=True, index_label="Date")

### Picking the Best Model

In [39]:
baseline_models = {
                # "LinearRegression": LinearRegression(), 
                # "PolynomialFeatures": PolynomialFeatures(),
                # "DecisionTree": DecisionTreeRegressor(), 
                "RandomForest": RandomForestRegressor(),
                "GradientBoosting": GradientBoostingRegressor(), 
                "SVR": SVR(), 
                "KNeighborsRegressor": KNeighborsRegressor(),
                "XGBRegressor": XGBRegressor(),
                }
#### Splitting the data
data = df.copy()
test_size = 0.15
X = data.drop([target], axis=1)
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False)
#### Creating the baseline
model_results = pd.DataFrame()
model_scores_dict = {}
model_mse_dict = {}
model_r2_dict = {}

# For model prediction data saving
preds = {}
preds[target] = pd.Series(y_test)

for name, model in baseline_models.items():
    print("Processing "+name)
    if name == "PolynomialFeatures":
        pass
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # y_pred = pd.DataFrame(y_pred,index=y_test.index,columns=[name+"_"+target+"_pred"])
        y_pred = pd.Series(y_pred,index=y_test.index)
        preds[name+"_"+target+"_pred"] = y_pred
        score = model.score(X_train, y_train)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        model_scores_dict[name] = score
        model_mse_dict[name] = mse
        model_r2_dict[name] = r2

model_results = model_results.append([model_scores_dict,model_mse_dict,model_r2_dict], ignore_index=True).T.sort_values(by=0, ascending=False)
model_results.columns = ["score","mse","r2"]
model_results["rmse"] = np.sqrt(model_results["mse"])
baseline_preds = pd.DataFrame(preds)

# Pick the models that have a score above 0.75
model_results = model_results[model_results.loc[:,"score"]>(max(model_results.loc[:,"score"])*.75)]
# Sort by rmse
model_results.sort_values(by="rmse",ascending=True,inplace=True)
# Pick the model with the best rmse
best_model_name = model_results.index[0]
best_model = baseline_models[best_model_name]

Processing RandomForest
Processing GradientBoosting
Processing SVR
Processing KNeighborsRegressor
Processing XGBRegressor


#### Define the grids

In [42]:
params_RandomForest = {
    "n_estimators": [120],
    "max_depth": [10,15,17],
    "max_features": ["sqrt", 3, 4]                          
    }

params_GradientBoosting = {
    'n_estimators': [100, 150],  # 50, 
    'learning_rate': [0.01, 0.05, 0.1],  
    'max_depth': [5, 7],  
    }

params_XGBRegressor = {
    'n_estimators': [150, 250],  # 100
    'learning_rate': [0.01, 0.05, 0.1],  
    'max_depth': [ 5, 7, 11],
    # 'subsample': [0.8, 1.0],
    # 'max_leaf_nodes': [32, 64, 108]
    }

params_KNeighborsRegressor = {
    'n_neighbors': [3, 5, 7, 9],  
    'weights': ['uniform', 'distance'],  
    'p': [1, 2],  
    }

params_SVR = {
    'C': [0.1, 1.0, 10.0],  
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  
    'degree': [2, 3, 4],  
    'gamma': ['scale', 'auto', 0.1, 1.0],  
    }

baseline_params = {
    "RandomForest":params_RandomForest,
    "GradientBoosting":params_GradientBoosting,
    "SVR":params_SVR,
    "KNeighborsRegressor":params_KNeighborsRegressor,
    "XGBRegressor":params_XGBRegressor
}

#### Perform the grid search

In [43]:
def model_gridSearch(baseline_models,baseline_params,model_results,X_train,y_train,X_test,y_test,cross_val=5):
    y_test = y_test.copy()
    models_gridsearch = {}
    for name, model in baseline_models.items():
        if name in model_results.index.values:
            for mod,params in baseline_params.items():
                if name == mod:
                    models_gridsearch[mod] = GridSearchCV(model, params, cv=cross_val, scoring="neg_root_mean_squared_error", verbose=1, n_jobs=1)
                    models_gridsearch[mod].fit(X_train, y_train)
    best_grids = [(i, j.best_score_) for i, j in models_gridsearch.items()]
    best_grids = pd.DataFrame(best_grids, columns=["Grid", "Best score"]).sort_values(by="Best score", ascending=False)
    top_model = models_gridsearch[best_grids.loc[0,"Grid"]]
    return models_gridsearch, best_grids, top_model

In [44]:
models_gridsearch, best_grids, top_model = model_gridSearch(baseline_models,baseline_params,model_results,X_train,y_train,X_test,y_test,cross_val=cross_val)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


KeyboardInterrupt: 

In [None]:
dump(top_model, model_path+f"\{target}_best_model.joblib")