In [2]:
from utils.libraries import *
from utils.objects import *
from utils.functions import *
from definitions import *

In [3]:
years=20
period="W"
root_path=ROOT_PATH
target="SP500"
cutoff_date="2023-07-07"
cross_val=5
data_path=None

In [4]:
## Collecting & Preparing the data
### Setting up the configuration
# Se utiliza un rango de 20 años para la extraccion de datos económicos
timeframe = 365*years
today = datetime.today()
end = today.strftime("%Y-%m-%d")
start = (today - dt.timedelta(days=timeframe)).strftime("%Y-%m-%d")
periods = period

medidas = ["mean", "median", "mode", "Min", "Percentil_25", "Percentil_75","Max", "var", "std", "skew", "kurt"]

ROOT_PATH = Path(root_path)
config_paths = [
    "config\Market_Data_Config.csv",
    "config\Economic_Data_Config.csv",
    "config\Calc_Data_Config.csv"
]
market_config = ROOT_PATH.joinpath(config_paths[0])
economic_config = ROOT_PATH.joinpath(config_paths[1]).abspath()
calc_config = ROOT_PATH.joinpath(config_paths[2]).abspath()

target_list = pd.read_csv(market_config, sep=";", header=0).loc[:, "Codigo"].to_list()

markets_used = pd.read_csv(market_config, sep=";", header=0).loc[:, "Codigo"].to_list()

markets_remove = pd.read_csv(market_config, sep=";", header=0).loc[:, "Market"].to_list()
# markets_used = ['SPY', 'GDX', 'BND']

target = target

# ML random seed
seed = 2

extract = True

cutoff_date = cutoff_date

In [5]:
### Loading and extracting the data
# Initialize the Economic Data Analyzer class
eda = EconomicDataAnalyzer()
# Load the Market Data
print("> Load the market data config")
market = eda.read_config(market_config)
market_dict = eda.convert_to_dictionary(markets_used=markets_used)
market_dict = market_dict['Market']
# Load the economic data config
print("> Load the economic data config")
econ = eda.read_config(economic_config)
fred_series_dict = eda.convert_to_dictionary(markets_used=None)
fred_series_dict = fred_series_dict["Indicador"]
calc = eda.read_config(calc_config)
series_calc_dict = eda.convert_to_dictionary(markets_used=None)
series_calc_dict = series_calc_dict["Indicador"]
# Setting up the indicator dictionaries
print("> Setting up the indicator dictionaries")
indicators = {}
for ind in list(econ["Tipo"].unique()):
    indicators[ind] = econ[econ["Tipo"] == ind]["Indicador"].to_list()
if extract == True:
    # Extracting the indicator data
    print("> Extracting the indicator data")
    indicators_df = eda.indicator_extraction(fred_series_dict, series_calc_dict, root_path=ROOT_PATH)
    # Extracting the market data
    print("> Extracting the market data")
    stocks = list(market_dict.keys())
    market_df = eda.market_extraction(stocks, start, end, root_path=ROOT_PATH)
else:
    print("No data extraction, reading data from data file")
    path = ROOT_PATH.joinpath('data', 'raw', 'indicators_df.csv')
    indicators_df = pd.read_csv(path)
    path = ROOT_PATH.joinpath('data', 'raw', 'market_df.csv')
    market_df = pd.read_csv(path)

> Load the market data config
> Load the economic data config
> Setting up the indicator dictionaries
> Extracting the indicator data
   -->Extracting 10-Year Treasury Yield...
   -->Extracting 2-Year Treasury Yield...
   -->Extracting Federal Funds Effective Rate...
   -->Extracting 1-Month Treasury Yield...
   -->Extracting 3-Month Treasury Yield...
   -->Extracting 5-Year Treasury Yield...
   -->Extracting 20-Year Treasury Yield...
   -->Extracting 30-Year Treasury Yield...
   -->Extracting 15-Year Mortgage Rate...
   -->Extracting 30-Year Mortgage Rate...
   -->Extracting Unemployment Rate...
   -->Extracting GDP...
   -->Extracting Real GDP...
   -->Extracting CPI...
   -->Extracting PPI...
   -->Extracting Consumer Confidence Index...
   -->Extracting Government Debt...
   -->Extracting Debt Service vs Disposable Income...
   -->Extracting US Dollar Index...
   -->Extracting Corporate Profits...
   -->Extracting EBIT...
   -->Extracting Retained Earnings...
   -->Extracting Build

In [6]:
# ## Extracting European economic data
# import eurostat
# #### EU Yield Curve
# yield_curve = eurostat.get_data_df("irt_euryld_d")
# yld_crv = yield_curve.loc[(~yield_curve["bonds"].str.contains("AAA")) & (yield_curve["yld_curv"].str.contains("INS_FWD")) & (~yield_curve["maturity"].str.contains("M")),:]
# yld_crv.set_index("maturity", inplace=True)
# yld_crv["maturity_int"] = yld_crv.index.str.split("Y").str[1].astype(int)
# yld_crv.sort_values("maturity_int",ascending=True, inplace=True)
# yld_crv.drop(["maturity_int"], axis=1, inplace=True)
# yld_crv = yld_crv.T
# yld_crv = yld_crv.loc[yld_crv.index.drop(["freq",'yld_curv', 'bonds', 'geo\TIME_PERIOD'])]
# yld_crv.index = pd.to_datetime(yld_crv.index)
# yld_lst = []
# for yld in yld_crv.columns:
#     yld_lst.append(f"EU_yield_{yld}")
# yld_crv.set_axis(yld_lst, axis=1, inplace=True)
# intr_inds = pd.merge(indicators_df, yld_crv, left_index=True, right_index=True)
# #### EU HICP
# hicp = eurostat.get_data_df("PRC_HICP_MIDX")
# hicp = hicp.loc[(hicp["geo\TIME_PERIOD"] == "EU") & (hicp["coicop"] == "CP00") & (hicp["unit"] == "I15"),:].T
# hicp.drop(["freq", "geo\TIME_PERIOD", "coicop", "unit"], axis=0, inplace=True)
# hicp.set_axis(["EU CPI"], axis=1, inplace=True)
# hicp.index = pd.to_datetime(hicp.index)
# intr_inds = pd.merge(intr_inds, hicp, left_index=True, right_index=True)
# #### EU Government Deficit
# gov_def = eurostat.get_data_df("GOV_10DD_EDPT1")
# gov_def = gov_def.loc[(gov_def["geo\TIME_PERIOD"] == "EA20") & (gov_def["sector"] == "S13") & (gov_def["na_item"] == "B9") & (gov_def["unit"] == "PC_GDP"), :].T
# gov_def.set_axis(gov_def.loc["geo\TIME_PERIOD"].values, axis=1, inplace=True)
# gov_def.drop(["freq", "geo\TIME_PERIOD", "sector", "na_item", "unit"], axis=0, inplace=True)
# gov_def.plot()

In [7]:
## Preparing the data
### Data cleaning
# Cleaning the indicator data
print("> Cleaning the indicator data")
df_indicators, df_indicators_cum, df_indicators_diff, df_indicators_rets, df_indicators_limpio = eda.limpiar_indicators(
    df_indicators=indicators_df, 
    indicator_dict=indicators, 
    resample=periods, 
    fill_method="ffill", 
    start=start, 
    end=end, 
    root_path=ROOT_PATH)
# Cleaning the market data
print("> Cleaning market data")
df_market, df_markets_rets, df_markets_cum, df_markets_diff  = eda.limpiar_markets(
    markets_dict=market_dict,
    df_markets=market_df,
    resample=periods, 
    fill_method="ffill", 
    start=start, 
    end=end, 
    root_path=ROOT_PATH)
### Merge indicator and market data
list_market_dfs = [df_market,df_markets_rets,df_markets_cum,df_markets_diff]
list_indicators_dfs = [df_indicators_limpio,df_indicators_rets,df_indicators_cum,df_indicators_diff]

df_all_data, df_all_data_rets, df_all_data_cum, df_all_data_diff = eda.merge_data(list_market_dfs, list_indicators_dfs, root_path=ROOT_PATH)

> Cleaning the indicator data
> Cleaning market data


In [8]:
## Feature Engineering
### Remove Outliers 
df = eda.remove_outliers(df_all_data_rets)
### Adding features
df_all_data["CAPE Ratio"] = df_all_data["SP500"]/(df_all_data["Corporate Profits"]*0.01)
df["CAPE Ratio"] = df_all_data["SP500"]/(df_all_data["Corporate Profits"]*0.01)

# df_all_data["CAPE Ratio"].plot()
def trend_line(df, name, deg=2):
    coef = np.polyfit(range(0,len(df[name])), df[name], deg)
    x_trend = np.linspace(0,len(df[name]),len(df[name]))
    y_trend = np.polyval(coef, x_trend)
    df = pd.DataFrame(y_trend, index=df.index, columns=[name])
    return df

# df_all_data = pd.DataFrame()
df_all_data["SP_GDP"] = df_all_data["SP500"]/(df_all_data["GDP"]*.01)
df_all_data["SP_GDP_trend"] = trend_line(df_all_data, "SP_GDP", deg=5)
df["SP_GDP"] = df_all_data["SP500"]/(df_all_data["GDP"]*.01)
df["SP_GDP_trend"] = trend_line(df_all_data, "SP_GDP", deg=5)

# df_all_data = pd.DataFrame()
std = df_all_data["SP_GDP"].std()
df_all_data["SP_GDP_1std"] = df_all_data["SP_GDP_trend"] + (std)
df["SP_GDP_1std"] = df_all_data["SP_GDP_trend"] + (std)

# df_all_data = df_all_data.copy()
# df_ts = df_all_data.loc[:,df_all_data.columns.str.contains(f"t-")]
# df_all_data.drop(df_ts.columns,axis=1,inplace=True)
for ma in df_all_data.columns:
    df_all_data[f"{ma}_MA"] = df_all_data[[ma]].rolling(window=52).mean().fillna(method="ffill").fillna(method="bfill")
    df_all_data[f"{ma}_std"] = df_all_data[[ma]].rolling(window=52).std().fillna(method="ffill").fillna(method="bfill")
    df_all_data[f"{ma}_trend"] = trend_line(df_all_data[[ma]], ma, deg=6)
    df_all_data[f"{ma}_MA_trend_dif"] = df_all_data[f"{ma}_trend"] - df_all_data[f"{ma}_MA"]
    
    df[f"{ma}_MA"] = df_all_data[[ma]].rolling(window=52).mean().fillna(method="ffill").fillna(method="bfill")
    df[f"{ma}_trend"] = trend_line(df_all_data[[ma]], ma, deg=6)
    df[f"{ma}_MA_trend_dif"] = df_all_data[f"{ma}_trend"] - df_all_data[f"{ma}_MA"]
### Creating lags in the data
list_data_dfs = [df_all_data,df_all_data_rets,df_all_data_cum,df_all_data_diff]

df_all_lag_data, df_all_lag_data_rets, df_all_lag_data_cum, df_all_lag_data_diff = eda.lag_data(list_data_dfs, target, n_lags=24)
df = eda.remove_outliers(df_all_lag_data_rets)

In [9]:
for mkt in markets_remove:
    if mkt == target:
        pass
    else:
        for df_col in df.columns:
            if mkt in df_col:
                try:
                    df.drop(df_col, axis=1, inplace=True)
                except:
                    pass

In [10]:
## Data Preprocessing
econ_ml = Preprocessor()
### Feature Reduction
#### Feature selection by correlation
df_feat_corr = pd.DataFrame(df.corr().loc[target,:].sort_values(ascending=False))
df_feat_relevant_corr = df_feat_corr[(df_feat_corr[target]>0.05) | (df_feat_corr[target]<-0.05)]
df_feat_relevant_corr
#### Indentifying the most important features
##### Splitting the data

##### Creating the baseline for feature importance
baseline_models = econ_ml.define_baseline_models()

X_train, X_test, y_train, y_test = econ_ml.train_test_split_data(data=df, target_col=target, test_size=0.15)
model_results, baseline_preds, best_model, best_model_name = econ_ml.baseline_ml(target, X_train, X_test, y_train, y_test, baseline_models)

print("> Performing feature importance analysis")
df_top_data, feature_importance, top_feature_importance, score = econ_ml.feature_importance(target=target, 
                                                                                                df_data=df.loc[:cutoff_date],
                                                                                                model=best_model,
                                                                                                accepted_importance=0.85)
#### Feature removal
def feature_removal(df, df_top_data, model_results, best_model_name, score):
    best_model_score = model_results.loc[best_model_name,"score"]
    if score > best_model_score*.9:
        print("We choose to remove "+str(len(df.columns)-len(df_top_data.columns))+" features")
        df = df_top_data.copy()
    else:
        print("We choose to keep the original df with "+str(len(df_top_data.columns))+" features")
    return df

df = feature_removal(df, df_top_data, model_results, best_model_name, score)

Processing RandomForest
Processing GradientBoosting
Processing SVR
Processing KNeighborsRegressor
Processing XGBRegressor
--> We choose ['RandomForest' 'GradientBoosting' 'XGBRegressor'] as the best models due to their high scores and rmse
> Performing feature importance analysis
We choose to remove 507 features


In [49]:
# Calculating the stats

mean = df.mean()
var = df.var()
drift = mean - (.5 * var)
std = df.std()

# Setting the Monte Carlo Varuables
ind = 0
T = 104
num_ports = 100
date_range = pd.date_range(start=cutoff_date, periods=T, freq="W")

dict_future = {}
df_mean_future = pd.DataFrame(index=pd.date_range(start=cutoff_date, periods=T, freq="W"))
for ind, col in enumerate(df.columns):
    # Calculating the Weekly Returns
    weekly_rets = np.exp(drift.values[ind] + 2*std.values[ind] * norm.ppf(np.random.rand(T, num_ports)))

    # Getting the most current weekly return (run it back if it's too small)
    n = -1
    S0 = 0
    while (S0 < 0.01) and (S0 > -0.01):
        S0 = df.cumsum().iloc[n,ind]
        n = n - 1
    # Creating the empty list and filling the first row
    price_list = np.zeros_like(weekly_rets)
    price_list[0] = S0

    # Performing Monte Carlo Situlation a 'num_ports' times
    for t in range(1,T):
        price_list[t] = price_list[t-1] * weekly_rets[t]
        dict_future[col] = pd.DataFrame(price_list,index=date_range)
        mean_future = pd.DataFrame(dict_future[col].mean(axis=1))
        df_mean_future[col] = mean_future


In [81]:
df_mean_future.index = pd.to_datetime(df_mean_future.index)
df.index = pd.to_datetime(df.index)
df_new = pd.concat([df,df_mean_future.pct_change()], axis=1, join="outer")

In [82]:
df_new["10-Year Treasury Yield"]

Unnamed: 0,10-Year Treasury Yield,10-Year Treasury Yield.1,10-Year Treasury Yield.2
2004-02-01,0.017115,,
2004-02-08,-0.009615,,
2004-02-15,-0.016990,,
2004-02-22,0.012346,,
2004-02-29,-0.026829,,
...,...,...,...
2025-06-01,,-0.001823,-0.001823
2025-06-08,,-0.012904,-0.012904
2025-06-15,,-0.005071,-0.005071
2025-06-22,,-0.003196,-0.003196


In [70]:
df_new1 = pd.concat([df,df_mean_future.pct_change()], axis=1)
df_new1

Unnamed: 0,10-Year Treasury Yield,2-Year Treasury Yield,3-Month Treasury Yield,5-Year Treasury Yield,20-Year Treasury Yield,30-Year Treasury Yield,15-Year Mortgage Rate,30-Year Mortgage Rate,US Dollar Index,Industrial Production,...,10yTrea30yFRM (t-15),10yTrea30yFRM (t-16),10yTrea30yFRM (t-17),10yTrea30yFRM (t-19),10yTrea30yFRM (t-20),10yTrea30yFRM (t-21),10yTrea30yFRM (t-22),10yTrea30yFRM (t-23),10yTrea30yFRM (t-24),SP500
2023-07-09 00:00:00,,,,,,,,,,,...,,,,,,,,,,
2023-07-16 00:00:00,,,,,,,,,,,...,0.001488,0.013378,0.007406,-0.004469,0.021049,0.006923,0.007136,-0.000342,0.009989,0.003534
2023-07-23 00:00:00,,,,,,,,,,,...,0.001539,-0.002542,-0.002981,-0.009435,-0.006425,0.002118,-0.008611,-0.006888,0.005571,0.008106
2023-07-30 00:00:00,,,,,,,,,,,...,0.005195,-0.001906,0.000420,0.015289,-0.003763,0.007241,0.017117,-0.008964,0.000190,0.007580
2023-08-06 00:00:00,,,,,,,,,,,...,-0.000584,-0.004897,-0.003692,0.000878,0.008801,0.001590,0.002059,0.006657,-0.002167,0.002323
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-04,-0.028947,-0.008811,0.029963,-0.020408,-0.024213,-0.020202,0.035176,0.033486,-0.004766,-0.00542,...,,,,,,,,,,
2023-06-11,0.016260,0.020000,-0.023636,0.020833,0.004963,0.002577,-0.017799,-0.011782,-0.004206,0.00000,...,,,,,,,,,,
2023-06-18,0.005333,0.023965,-0.005587,0.017857,0.000000,-0.007712,0.004942,-0.002981,-0.007769,0.00000,...,,,,,,,,,,
2023-06-25,-0.007958,0.002128,0.013109,0.000000,-0.009877,-0.010363,-0.011475,-0.002990,0.006295,0.00000,...,,,,,,,,,,


In [62]:
## Performing Machine Learning
### Pick the best model

test_size = T/len(df)

X_train, X_test, y_train, y_test = econ_ml.train_test_split_data(data=df, target_col=target, test_size=0.15)
model_results, baseline_preds, best_model, best_model_name = econ_ml.baseline_ml(target, X_train, X_test, y_train, y_test, baseline_models)
print("> Performing Machine Learning")
### Define the grids
params_RandomForest = {
    "n_estimators": [120],
    "max_depth": [10,15,17],
    "max_features": ["sqrt", 3, 4]                          
    }

params_GradientBoosting = {
    'n_estimators': [50, 100, 150],  
    'learning_rate': [0.01, 0.05, 0.1],  
    'max_depth': [3, 5, 7],  
    }

params_XGBRegressor = {
    'n_estimators': [100, 150, 250],  
    'learning_rate': [0.01, 0.05, 0.1],  
    'max_depth': [ 5, 7, 11],
    'subsample': [0.8, 1.0],
    'max_leaf_nodes': [32, 64, 108]
    }

params_KNeighborsRegressor = {
    'n_neighbors': [3, 5, 7, 9],  
    'weights': ['uniform', 'distance'],  
    'p': [1, 2],  
    }

params_SVR = {
    'C': [0.1, 1.0, 10.0],  
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  
    'degree': [2, 3, 4],  
    'gamma': ['scale', 'auto', 0.1, 1.0],  
    }

baseline_params = {
    "RandomForest":params_RandomForest,
    "GradientBoosting":params_GradientBoosting,
    "SVR":params_SVR,
    "KNeighborsRegressor":params_KNeighborsRegressor,
    "XGBRegressor":params_XGBRegressor
}
print(">>> Performing Grid Search")
def model_gridSearch(baseline_models,baseline_params,model_results,X_train,y_train,X_test,y_test,cross_val=5):
    y_test = y_test.copy()
    models_gridsearch = {}
    for name, model in baseline_models.items():
        if name in model_results.index.values:
            for mod,params in baseline_params.items():
                if name == mod:
                    models_gridsearch[mod] = GridSearchCV(model, params, cv=cross_val, scoring="neg_root_mean_squared_error", verbose=1, n_jobs=1)
                    models_gridsearch[mod].fit(X_train, y_train)
    best_grids = [(i, j.best_score_) for i, j in models_gridsearch.items()]
    best_grids = pd.DataFrame(best_grids, columns=["Grid", "Best score"]).sort_values(by="Best score", ascending=False)
    y_pred = models_gridsearch[best_grids.loc[0,"Grid"]].predict(X_test)
    y_pred = pd.DataFrame(y_pred, columns=[target+"_Prediction"],index=y_test.index)
    y_pred.index, y_test.index = pd.to_datetime(y_test.index), pd.to_datetime(y_test.index)
    model_pred = pd.concat([y_test, y_pred], axis=1)
    top_model = models_gridsearch[best_grids.loc[0,"Grid"]]
    return models_gridsearch, best_grids, y_pred, y_test, model_pred, top_model
models_gridsearch, best_grids, y_pred, y_test, model_pred, top_model = model_gridSearch(baseline_models,baseline_params,model_results,X_train,y_train,X_test,y_test,cross_val=cross_val)
try:
    X_test.index = pd.to_datetime(X_test.index)
except:
    pass
full_test = pd.concat([model_pred, X_test], axis=1)
print(">>> Saving the best model and the data")
# Save the best model
dump(top_model, r'C:\Users\Joan Oliver\Documents\GitHub\Economic_Market_Forecasting\Economic_Market_Forecasting\EMF_webapp\EMF_project\models'+f"\{target}_best_model.joblib")
# Save the data
model_pred.to_csv(data_path+f"\{target}_data.csv")

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().