In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Let pandas display up to 1000 columns and 1000 rows before truncating,
# so the wide indicator tables can be inspected in full.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout


In [None]:
import os, uuid
# Invoke `nvidia-smi` through the shell as a quick check that a GPU driver is
# present; os.system returns the command's exit status.
os.system("nvidia-smi")

In [None]:
# GPU devices visible to TensorFlow (an empty list means no GPU was detected).
tf.config.list_physical_devices('GPU')

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Load the raw indicator panel, parse its date column and renumber the rows from 0.
indicators = (
    pd.read_parquet("../../data/indicators/US/all_indicators_raw_outer.parquet", engine="pyarrow")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .reset_index(drop=True)
)

In [None]:
# NBER recession flags, restricted to observations from 1962 onwards.
nber_recessions = pd.read_parquet("../../data/indicators/US/nber_recession.parquet")
nber_recessions = nber_recessions.assign(date=pd.to_datetime(nber_recessions["date"]))
nber_recessions = nber_recessions.loc[nber_recessions["date"] >= "1962-01-01"]

In [None]:
# Index-level series for the US top 500, outer-joined onto the indicators by date
# so that no date present in either source is lost.
us_top_500 = pd.read_parquet("../../data/indicators/US/us_top_500.parquet", engine="pyarrow")
us_top_500 = us_top_500.assign(date=pd.to_datetime(us_top_500["date"]))
data = indicators.merge(us_top_500, how="outer", on=["date"])

In [None]:
# Use the date column as the index; the freq-based shifts and resamples below rely on it.
data = data.set_index("date")

In [None]:
# Seed NumPy and TensorFlow. np.random.seed alone does not control TensorFlow's
# own generator, which Keras uses for weight initialisation and dropout, so
# without the second call the trained models differ from run to run.
np.random.seed(49)
tf.random.set_seed(49)

In [None]:
# Working copies under shorter names; the source columns are left as they are.
data["ism_prod"] = data["ISM_prod_index"].copy()
data["vix"] = data["vix_SP500_close"].copy()
# Divide by 100 (presumably percent -> fraction; confirm against the data source).
# NOTE: this overwrites the column, so running the cell twice divides twice.
data["inflation"] = data["inflation"]/100
# Discard dvps_12m observations dated before 1997 (the reason is not visible here — TODO document).
data.loc[data.index < pd.Timestamp("1997-01-01"), "dvps_12m"] = np.nan

In [None]:
# Daily series: move every observation's timestamp forward by one calendar day.
# The assignment re-aligns the shifted series on data's index.
for daily_column in (
    "vix",
    "market_cap_usd",
    "credit_spread",
    "rate_fed_funds",
    "rate_1_year",
    "rate_3_year",
    "rate_5_year",
    "rate_10_year",
):
    data[daily_column] = data[daily_column].shift(1, freq="D")

In [None]:
#Shifting appropriate date periods, weekly data:
# Drop the missing rows, move the remaining timestamps forward by one "W" offset,
# and let the assignment re-align the result on data's index.
data["initial_claims"] = data["initial_claims"].dropna().shift(1, freq = "W")

In [None]:
#Shifting appropriate date periods, monthly and quarterly data:
# Each series has its timestamps moved by the offsets below and is then re-aligned
# on data's index by the assignment. The offsets presumably mimic publication
# lags; the individual lag lengths are not justified in this notebook (TODO document).

# GNP / GDP: five month-start steps forward (written as 3 + 2).
data["real_gnp"] = data["real_gnp"].dropna().shift(3 + 2, freq = "MS")
data["real_gdp"] = data["real_gdp"].dropna().shift(3 + 2, freq = "MS")
# Money supply: one month-start step forward.
data["M1"] = data["M1"].dropna().shift(1, freq = "MS")
data["M2"] = data["M2"].dropna().shift(1, freq = "MS")
# ISM: monthly mean labelled at month end, then moved one day forward.
data["ism_prod"] = data["ism_prod"].resample("ME").mean().shift(1, freq="D")
# PCE: one month-start step plus seven days.
data["pce"] = data["pce"].dropna().shift(1, freq = "MS").shift(7, freq = "D")
# Unemployment: two month-start steps forward.
data["unemployment"] = data["unemployment"].dropna().shift(2, freq = "MS")
# Earnings yield: one day back, last value per quarter (quarter-end label),
# one day forward again, then two month-start steps forward.
data["earnings_yield"] = data["earnings_yield_12m"].dropna().shift(-1, freq = "D").resample("QE").last().shift(1, freq = "D").shift(2, freq="MS")
# Dividend yield: zero-period shift, i.e. no lag is applied.
data["dividend_yield"] = data["dividend_yield_12m"].dropna().shift(0, freq = "MS")
# EPS: same quarter-end construction as the earnings yield.
data["eps"] = data["eps_12m"].dropna().shift(-1, freq = "D").resample("QE").last().shift(1, freq = "D").shift(2, freq="MS")
# Dividends per share: zero-period shift, i.e. no lag is applied.
data["dvps"] = data["dvps_12m"].dropna().shift(0, freq = "MS")
# Inflation: two month-start steps forward.
data["inflation"] = data["inflation"].dropna().shift(2, freq = "MS")

#### Data is resampled to month-end and then shifted forward by one day, so the value on the first day of each month contains information from the previous month

In [None]:
# Daily data: average per calendar month, stamp the average on the first day of
# the following month (month end + 1 day) and take the month-over-month change.
daily_change_sources = {
    "vix_change": "vix",
    "mc_change": "market_cap_usd",
    "credit_spread_change": "credit_spread",
    "rate_fed_funds_change": "rate_fed_funds",
    "rate_1_year_change": "rate_1_year",
    "rate_3_year_change": "rate_3_year",
    "rate_5_year_change": "rate_5_year",
    "rate_10_year_change": "rate_10_year",
}
for change_column, source_column in daily_change_sources.items():
    monthly_mean = data[source_column].resample("ME").mean()
    data[change_column] = monthly_mean.shift(1, freq="D").dropna().pct_change()

In [None]:
#Weekly data, resample to monthly, pct_change
# Monthly mean labelled at month end, moved one day forward, empty months dropped,
# then the relative change between consecutive remaining months.
data["initial_claims_change"] = data["initial_claims"].resample("ME").mean().shift(1, freq="D").dropna().pct_change()

In [None]:
# Monthly and quarterly data: relative change between consecutive non-missing
# observations of each (already lagged) series.
level_change_sources = {
    "real_gnp_change": "real_gnp",
    "real_gdp_change": "real_gdp",
    "m1_change": "M1",
    "m2_change": "M2",
    "ism_prod_change": "ism_prod",
    "pce_change": "pce",
    "unemployment_change": "unemployment",
    "earnings_yield_change": "earnings_yield",
    "dividend_yield_change": "dividend_yield",
    "eps_change": "eps",
    "dvps_change": "dvps",
    "inflation_change": "inflation",
}
for change_column, level_column in level_change_sources.items():
    data[change_column] = data[level_column].dropna().pct_change()


In [None]:
def create_sequences(X, lookback):
    """Stack every run of `lookback` consecutive rows of X into one array.

    Window k covers rows k .. k + lookback - 1, so an input of n rows yields
    n - lookback + 1 windows (none when n < lookback).
    """
    n_windows = len(X) - lookback + 1
    windows = [X[start:start + lookback] for start in range(n_windows)]
    return np.array(windows)

In [None]:
def prepare_data(data, features, target_data, train_test_split_date, lookback, n_test_periods = None, embargo_periods=1, scale_data = False):
    """Split features and labels at a date and build the LSTM input windows.

    Parameters
    ----------
    data : DataFrame indexed by date that contains the `features` columns.
    features : list of column names used as model inputs.
    target_data : labels indexed by date. Assumed to line up row for row with
        `data` (same dates, same order); this is not checked here.
    train_test_split_date : rows dated before it are training rows, rows dated
        on or after it are test rows.
    lookback : number of consecutive rows in every input window.
    n_test_periods : if given, only the first `n_test_periods` test rows are kept.
    embargo_periods : number of most recent training windows (and their labels
        and dates) removed from the training sequences.
    scale_data : standardise the features with a StandardScaler fitted on the
        training rows only.

    Returns
    -------
    (X_train_seq, y_train_seq, X_test_seq, y_test_seq,
     X_train, X_test, y_train, y_test, train_dates, test_dates)

    X_test_seq[k] is the window of `lookback` rows that ENDS on test_dates[k],
    and y_test_seq[k] is the label of that same date. The first lookback - 1
    test windows therefore reach back into the last training rows.
    """
    X_data = data[features]
    y_data = target_data

    X_train = X_data[X_data.index < train_test_split_date]
    y_train = y_data[y_data.index < train_test_split_date]
    X_test = X_data[X_data.index >= train_test_split_date]
    y_test = y_data[y_data.index >= train_test_split_date]

    if n_test_periods:
        X_test = X_test.iloc[:n_test_periods]
        y_test = y_test.iloc[:n_test_periods]

    if scale_data:
        scaler = StandardScaler()
        scaler.fit(X_train)
        # Build new frames instead of writing into the filtered slices above,
        # which is what triggers pandas' SettingWithCopyWarning.
        X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
        if len(X_test):
            X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    X_train_seq = create_sequences(X_train.values, lookback)
    y_train_seq = y_train.values[lookback - 1:]

    # Prepend the last lookback - 1 training rows so that every test date gets
    # a full window ending on that date. The slice start is computed explicitly
    # because a negative-index slice degenerates to "everything" when
    # lookback == 1.
    n_history = min(max(lookback - 1, 0), len(X_train))
    history = X_train.values[len(X_train) - n_history:]
    X_test_seq = create_sequences(np.concatenate([history, X_test.values]), lookback)

    # With too little training history the earliest test rows cannot get a
    # full window; drop their labels and dates so everything stays aligned.
    n_without_window = max(lookback - 1, 0) - n_history
    y_test_seq = y_test.values[n_without_window:]
    test_dates = X_test.index[n_without_window:]

    train_dates = X_train.iloc[lookback - 1:].index

    if embargo_periods:
        train_dates = train_dates[:-embargo_periods]
        X_train_seq = X_train_seq[:-embargo_periods]
        y_train_seq = y_train_seq[:-embargo_periods]

    return X_train_seq, y_train_seq, X_test_seq, y_test_seq, X_train, X_test, y_train, y_test, train_dates, test_dates

In [None]:
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import StandardScaler
def process_data(data, feature, winsorize_std = 3, winsorize_quantile=None, scale_data=False, log_transform=False, plot=False):
    """Clean one feature column and return it as a Series.

    Steps, applied in this order to the non-missing values of `data[feature]`:

    1. Quantile winsorisation (if `winsorize_quantile` is given): clip to the
       [q, 1 - q] quantiles.
    2. Standard-deviation winsorisation (if `winsorize_std` is given): clip to
       mean +/- winsorize_std * std.
    3. `log_transform`: replace x by log(1 + x). If any value is <= -1 the
       transform is impossible; a message is printed and None is returned.
    4. `scale_data`: standardise with a StandardScaler.
    5. `plot`: print the feature name and plot the result.

    Only one of the two winsorisation modes may be active; pass
    winsorize_std=None to use winsorize_quantile. `data` itself is not modified.
    All statistics are computed on whatever rows are passed in.

    Returns the processed Series (indexed like the non-missing input rows),
    or None when the log transform is not possible.
    """
    assert(not (winsorize_std and winsorize_quantile))

    values = data[[feature]].copy().dropna()[feature]

    if winsorize_quantile is not None:
        values = values.clip(lower=values.quantile(winsorize_quantile), upper=values.quantile(1 - winsorize_quantile))

    if winsorize_std is not None:
        # Clip around the mean. Bounds of +/- k*std around zero are only right
        # for zero-mean data; for a level series (e.g. values near 100 with
        # std 1) they would clamp every observation to the upper bound.
        center = values.mean()
        half_width = values.std() * winsorize_std
        values = values.clip(lower=center - half_width, upper=center + half_width)

    if log_transform:
        if (values.min() + 1) <= 0:
            print(values.min())
            print(f"Feature {feature} has too negative values, cannot log transform")
            return
        values = np.log(1 + values)

    if scale_data:
        # The scaler is only created when it is actually used.
        scaler = StandardScaler()
        scaled = scaler.fit_transform(values.values.reshape(-1, 1)).ravel()
        values = pd.Series(scaled, index=values.index, name=feature)

    if plot:
        print(feature)
        values.plot()
        plt.show()

    return values

In [None]:
# Inclusive date bounds applied to the modelling sample further down.
min_date = pd.Timestamp("1962-01-01")
max_date = pd.Timestamp("2023-12-31")   

In [None]:
# Columns fed to the model. Apart from "inflation" (a level), every active
# entry is a period-over-period change computed above. Commented-out entries
# are alternatives that are currently excluded.
features = [
                "mc_change", 
                "inflation",
                #"inflation_change",
                #"unemployment", 
                "unemployment_change", 
                #"rate_fed_funds",
                "rate_fed_funds_change", 
                "initial_claims_change",
                #"ism_prod_index",
                "ism_prod_change",
                "real_gnp_change", 
                "real_gdp_change", 
                "m1_change", 
                "m2_change", 
                #"rate_1_year",
                #"rate_3_year",
                #"rate_5_year",
                #"rate_10_year",
                "rate_1_year_change",
                "rate_3_year_change",
                "rate_5_year_change",
                "rate_10_year_change",
                #"earnings_yield",
                "earnings_yield_change",
                "eps_change",
                #"dvps_change",
                #"credit_spread",
                "credit_spread_change",
                #"pce_change",
                #"vix_change"
                    ]

In [None]:
# First and last date with a non-missing value, per selected feature.
for feature in features:
    observed_dates = data[feature].dropna().index
    print(feature, str(observed_dates.min()), str(observed_dates.max()))

In [None]:
# pandas offset alias of the modelling grid: "MS" = month start.
resample_freq = "MS"

In [None]:
# Work on a copy so the feature processing below leaves `data` unchanged.
data_copy = data.copy()


In [None]:
# Replace every feature by its processed version: winsorised using 3 standard
# deviations, log(1 + x) transformed and standardised. The returned Series is
# re-aligned on data_copy's index by the assignment.
# NOTE(review): the clipping bounds and scaling statistics are computed over all
# available dates, including those later used as test periods.
# NOTE(review): process_data returns None when the log transform is impossible;
# the column would then be overwritten with missing values.
for feature in features:
    data_copy[feature] = process_data(data_copy, feature, winsorize_std = 3, winsorize_quantile=None, scale_data=True, log_transform=True, plot=False)

In [None]:
#Resample to monthly
for feature in features:
    data_copy[feature] = data_copy[feature].resample(resample_freq).first().ffill()

In [None]:
# Keep the dates inside [min_date, max_date], then only the rows on which every
# selected feature is available; print the resulting sample range.
inside_sample = (data_copy.index >= min_date) & (data_copy.index <= max_date)
data_copy = data_copy[inside_sample]
data_copy = data_copy[features].dropna()
print(data_copy.index.min())
print(data_copy.index.max())

# Classification probability

In [None]:
#Use recession months as class 1
#PROBLEMATIC PERIOD, NEEDS TO BE RETRAINED AT SPECIFIC TIMES

# Target: NBER recession indicator on the monthly grid, as a 0/1 Series.
y_data = nber_recessions.copy()
y_data.set_index("date", inplace=True)
y_data["class"] = y_data["recession"]
# Sum the flags within each month ...
y_data = y_data.resample(resample_freq).sum()
# ... and call the month a recession month when the sum is at least 15
# (presumably daily flags, i.e. roughly half the month — confirm).
y_data["class"] = y_data["class"].apply(lambda x: 1 if x >= 15 else 0)
# Restrict the labels to the date range covered by the features.
y_data = y_data[y_data.index >= data_copy.index.min()]
y_data = y_data[y_data.index <= data_copy.index.max()]
print(y_data.index.min())
print(y_data.index.max())
# Keep only the label column, as a Series.
y_data = y_data["class"]

In [None]:
# Dates classified as bear market, one row per date, parsed with the pyarrow
# CSV engine; the date column is converted to timestamps.
bear_dates = pd.read_csv("../../time_periods/bear_dates_sp500.csv", engine="pyarrow")
bear_dates["date"] = pd.to_datetime(bear_dates["date"])

In [None]:
#Use bear dates as class 1
#PROBLEMATIC PERIOD, NEEDS TO BE RETRAINED AT SPECIFIC TIMES

# Target: 1 for months with more than 15 bear-market days, else 0.
y_data = bear_dates.copy()
y_data.set_index("date", inplace=True)
y_data["class"] = 1
# Expand to a complete daily calendar; days absent from the file get 0.
y_data = y_data.resample("D").asfreq().fillna(0)
# Count the bear days in every month.
y_data = y_data.resample(resample_freq).sum()
y_data["class"] = y_data["class"].apply(lambda x: 1 if x > 15 else 0)
# Restrict the labels to the date range covered by the features.
y_data = y_data[y_data.index >= data_copy.index.min()]
y_data = y_data[y_data.index <= data_copy.index.max()]
print(y_data.index.min())
print(y_data.index.max())
# Keep the labels as a Series, like the recession and market-cap targets.
# With a DataFrame, `y_train[y_train == 0]` keeps every row (non-matching cells
# just become NaN), so both class counts equal the total and the class weights
# silently collapse to 0.5 each.
y_data = y_data["class"]



In [None]:
# Dates classified as bull market, one row per date, parsed with the pyarrow
# CSV engine; the date column is converted to timestamps.
bull_dates = pd.read_csv("../../time_periods/bull_dates_sp500.csv", engine="pyarrow")
bull_dates["date"] = pd.to_datetime(bull_dates["date"])

In [None]:
#Use bull dates as class 1
#PROBLEMATIC PERIOD, NEEDS TO BE RETRAINED AT SPECIFIC TIMES

# Target: 1 for months with more than 15 bull-market days, else 0.
y_data = bull_dates.copy()
y_data.set_index("date", inplace=True)
y_data["class"] = 1
# Expand to a complete daily calendar; days absent from the file get 0.
y_data = y_data.resample("D").asfreq().fillna(0)
# Count the bull days in every month.
y_data = y_data.resample(resample_freq).sum()
y_data["class"] = y_data["class"].apply(lambda x: 1 if x > 15 else 0)
# Restrict the labels to the date range covered by the features.
y_data = y_data[y_data.index >= data_copy.index.min()]
y_data = y_data[y_data.index <= data_copy.index.max()]
print(y_data.index.min())
print(y_data.index.max())
# Keep the labels as a Series, like the recession and market-cap targets.
# With a DataFrame, `y_train[y_train == 0]` keeps every row (non-matching cells
# just become NaN), so both class counts equal the total and the class weights
# silently collapse to 0.5 each.
y_data = y_data["class"]

In [None]:
# Use negative change in market cap as class 1
# The sign is taken from the RAW monthly change in `data`. The "mc_change"
# column of data_copy has been standardised by this point, so a value below
# zero there only means "below the sample average", not "the market fell".
mc_change_raw = data["mc_change"].resample(resample_freq).first().ffill()
# Same dates as the features; shift(-1) makes the label for month t the change
# observed in month t + 1.
y_data = mc_change_raw.reindex(data_copy.index).to_frame("mc_change").shift(-1)
print(y_data.index.min())
print(y_data.index.max())
y_data["class"] = 0
# The final month has no following observation (NaN) and therefore stays class 0.
y_data.loc[y_data["mc_change"] < 0, "class"] = 1
y_data = y_data["class"]

### Different compositions of the LSTM model are used. See the thesis

# BEAR, BULL, AND RECESSIONS

## Testing one model composition

In [None]:
# Number of consecutive monthly rows in each LSTM input window.
lookback = 70

In [None]:
# Two stacked LSTM layers followed by a funnel of dense layers; the single
# sigmoid unit outputs the probability of class 1.
model = Sequential([
    LSTM(1000, return_sequences=True, input_shape=(lookback, len(features))),
    LSTM(1000, return_sequences=False, input_shape=(lookback, len(features))),
    Dense(1000, activation='relu'),
    Dropout(0.2),
    Dense(500, activation='relu'),
    Dense(250, activation='relu'),
    Dense(100, activation='relu'),
    Dense(50, activation='relu'),
    Dense(1, activation='sigmoid'),  # Output layer for negative market probability
])
model.compile(loss='binary_crossentropy', optimizer='adam')

In [None]:
# Retrain the model at every split date in the list below and predict all
# months from that date onwards.

train_split_dates_nber = [pd.Timestamp("1981-07-08"), pd.Timestamp("1983-07-08"), pd.Timestamp("1992-12-22"), 
                        pd.Timestamp("2003-07-17"), pd.Timestamp("2010-09-20")]


train_split_dates_bear = [pd.Timestamp("1983-02-15"), pd.Timestamp("1985-02-01"), pd.Timestamp("1988-06-07"), 
                        pd.Timestamp("1991-04-10"), pd.Timestamp("1999-04-10"), pd.Timestamp("2000-11-25"),
                        pd.Timestamp("2003-09-12"), pd.Timestamp("2009-09-10"), pd.Timestamp("2011-01-10"),  
                        pd.Timestamp("2012-04-04"), pd.Timestamp("2016-08-12"), pd.Timestamp("2019-06-28"), 
                        #  pd.Timestamp("2020-09-28"), pd.Timestamp("2023-04-15"), pd.Timestamp("2024-05-01"), 
                         ]

# NOTE(review): this list is identical to the bear list above — confirm that is intended.
train_split_dates_bull = [pd.Timestamp("1983-02-15"), pd.Timestamp("1985-02-01"), pd.Timestamp("1988-06-07"), 
                        pd.Timestamp("1991-04-10"), pd.Timestamp("1999-04-10"), pd.Timestamp("2000-11-25"),
                        pd.Timestamp("2003-09-12"), pd.Timestamp("2009-09-10"), pd.Timestamp("2011-01-10"),  
                        pd.Timestamp("2012-04-04"), pd.Timestamp("2016-08-12"), pd.Timestamp("2019-06-28"), 
                        #  pd.Timestamp("2020-09-28"), pd.Timestamp("2023-04-15"), pd.Timestamp("2024-05-01"), 
                         ]

# Select the list that matches the labels currently stored in y_data.
train_split_dates = train_split_dates_bear

# MonthBegin(0) rolls a mid-month date forward to the first day of the next
# month; dates that already are a month start stay unchanged.
train_split_dates_next_month = [date + pd.offsets.MonthBegin(0) for date in train_split_dates]

# One frame of predictions per split date, concatenated once after the loop.
test_result_frames = []

for current_train_test_split_date in train_split_dates_next_month:

    print(current_train_test_split_date)
    X_train_seq, y_train_seq, X_test_seq, y_test_seq, X_train, X_test, y_train, y_test, train_dates, test_dates = prepare_data(data_copy,
        features, y_data, current_train_test_split_date, lookback = lookback)

    # Balanced class weights: n_samples / (2 * n_class). Counting on a flat
    # array gives the right counts for a Series as well as a one-column frame.
    train_labels = np.asarray(y_train).ravel()
    num_0 = int((train_labels == 0).sum())
    num_1 = int((train_labels == 1).sum())
    num_both = len(train_labels)

    if num_0 and num_1:
        weight_0 = (1 / num_0) * (num_both / 2)
        weight_1 = (1 / num_1) * (num_both / 2)
        class_weights = {0: weight_0, 1: weight_1}
    else:
        # One class is absent from the training rows, so its weight would
        # divide by zero; fit without class weights instead.
        print("Only one class in the training data, fitting without class weights")
        class_weights = None

    # The model is not re-initialised between split dates: each fit continues
    # from the weights reached at the previous split.
    model.fit(X_train_seq, y_train_seq, epochs=40, verbose=0, class_weight=class_weights)

    train_results = model.predict(X_train_seq)
    test_results = model.predict(X_test_seq)

    print(train_dates)
    print(train_dates[-1])

    print(test_results)
    print(test_dates)

    new_test_results_df = pd.DataFrame(test_results, index=test_dates, columns=["p"])
    new_test_results_df["split_date"] = current_train_test_split_date
    new_test_results_df["real_class"] = y_data[y_data.index.isin(new_test_results_df.index)]
    test_result_frames.append(new_test_results_df)

    print("New test results df:", new_test_results_df.shape)

test_results_df = pd.concat(test_result_frames)

# Predicted class: 0 when p < 0.5, otherwise 1.
test_results_df["pred_class"] = np.where(test_results_df["p"] < 0.5, 0, 1)

# Sequential view: for every split keep only the predictions dated before the
# next split, i.e. those made by the most recently trained model. The last
# split keeps all of its predictions. Rows with a missing value are dropped.
seq_frames = []
for i, date in enumerate(train_split_dates_next_month):
    split_rows = test_results_df[test_results_df["split_date"] == date]
    if (i + 1) < len(train_split_dates_next_month):
        split_rows = split_rows[split_rows.index < train_split_dates_next_month[i+1]]
    if len(split_rows):
        seq_frames.append(split_rows)
test_results_df_seq = pd.concat(seq_frames).dropna()


save_results = True

if save_results:
    # A random id keeps the results of separate runs in separate folders.
    model_id = uuid.uuid4()

    #model_name = f"nber_recession_class_train_before_1980_win_std_3_scale_log_retrained_{str(model_id)[:8]}"
    #model_name = f"qbull_class_train_before_1980_win_std_3_scale_log_retrained_{str(model_id)[:8]}"
    model_name = f"qbear_class_train_before_1980_win_std_3_scale_log_retrained_{str(model_id)[:8]}"

    directory = f"../../results/regime/lstm/{model_name}"
    os.makedirs(directory, exist_ok=True)
    test_results_df.to_csv(f"{directory}/test_results.csv")
    test_results_df_seq.to_csv(f"{directory}/test_results_seq.csv")
    with open(f"{directory}/summary.txt", 'w') as f:
        model.summary(print_fn=lambda x: f.write(x + '\n'))

## Testing multiple compositions

In [None]:
# For every model composition: retrain at each split date in the list below,
# predict all months from that date onwards and save the results.

train_split_dates_nber = [pd.Timestamp("1981-07-08"), pd.Timestamp("1983-07-08"), pd.Timestamp("1992-12-22"), 
                        pd.Timestamp("2003-07-17"), pd.Timestamp("2010-09-20")]


train_split_dates_bear = [pd.Timestamp("1975-04-01"), pd.Timestamp("1983-02-15"), pd.Timestamp("1985-02-01"), pd.Timestamp("1988-06-07"), 
                        pd.Timestamp("1991-04-10"), pd.Timestamp("1999-04-10"), pd.Timestamp("2000-11-25"),
                        pd.Timestamp("2003-09-12"), pd.Timestamp("2009-09-10"), pd.Timestamp("2011-01-10"),  
                        pd.Timestamp("2012-04-04"), pd.Timestamp("2016-08-12"), pd.Timestamp("2019-06-28"), 
                        pd.Timestamp("2020-09-28"), pd.Timestamp("2023-04-15"), pd.Timestamp("2024-05-01"), 
                         ]

train_split_dates_bull = [
    pd.Timestamp('1976-01-15'), pd.Timestamp('1981-05-26'), pd.Timestamp('1983-12-23'),
    pd.Timestamp('1988-02-25'), pd.Timestamp('1990-04-09'), pd.Timestamp('1994-08-01'),
    pd.Timestamp('1999-01-17'),pd.Timestamp('2000-09-24'),pd.Timestamp('2004-09-01'),
    pd.Timestamp('2005-08-28'),pd.Timestamp('2006-11-08'),pd.Timestamp('2008-01-17'),
    pd.Timestamp('2010-10-23'),pd.Timestamp('2011-11-02'),pd.Timestamp('2015-11-19'),
    pd.Timestamp('2018-07-25'),pd.Timestamp('2019-03-20'),pd.Timestamp('2020-08-19'),
    pd.Timestamp('2022-07-03')
                         ]

# Select the list that matches the labels currently stored in y_data.
train_split_dates = train_split_dates_bear

# Grid of compositions: window length, width multiplier of the dense layers,
# and whether the last (narrowest) hidden dense layer is included.
lookback_list = [2,5]
dense_scale_list = [0.5,0.75,1,1.25,1.5,2]
last_layer_list = [True, False]

# MonthBegin(0) rolls a mid-month date forward to the first day of the next
# month; dates that already are a month start stay unchanged.
train_split_dates_next_month = [date + pd.offsets.MonthBegin(0) for date in train_split_dates]

# Splits after this date have no test rows at all.
last_available_date = data_copy.index.max()

for lookback in lookback_list:
    for dense_scale in dense_scale_list:
        for last_layer in last_layer_list:

            # A fresh model per composition.
            model = Sequential()
            model.add(LSTM(1000, return_sequences=True, input_shape=(lookback, len(features))))
            model.add(LSTM(1000, return_sequences=False, input_shape=(lookback, len(features))))
            model.add(Dense(int(dense_scale*1000), activation='relu'))
            model.add(Dropout(0.2))
            model.add(Dense(int(dense_scale*500), activation='relu'))
            model.add(Dense(int(dense_scale*250), activation='relu'))
            model.add(Dense(int(dense_scale*100), activation='relu'))
            if last_layer:
                model.add(Dense(int(dense_scale*50), activation='relu'))
            model.add(Dense(1, activation='sigmoid'))  # Output layer for negative market probability
            model.compile(loss='binary_crossentropy', optimizer='adam')

            # One frame of predictions per split date, concatenated once after the loop.
            test_result_frames = []

            for current_train_test_split_date in train_split_dates_next_month:

                print(current_train_test_split_date)

                if current_train_test_split_date > last_available_date:
                    # Nothing to predict after this split (e.g. a 2024 split
                    # with data ending in 2023); an empty test set would make
                    # the sequence building and model.predict fail.
                    print("Split date is after the last available date, skipping")
                    continue

                X_train_seq, y_train_seq, X_test_seq, y_test_seq, X_train, X_test, y_train, y_test, train_dates, test_dates = prepare_data(data_copy,
                    features, y_data, current_train_test_split_date, lookback = lookback)

                # Balanced class weights: n_samples / (2 * n_class). Counting on
                # a flat array gives the right counts for a Series as well as a
                # one-column frame.
                train_labels = np.asarray(y_train).ravel()
                num_0 = int((train_labels == 0).sum())
                num_1 = int((train_labels == 1).sum())
                num_both = len(train_labels)

                if num_0 and num_1:
                    weight_0 = (1 / num_0) * (num_both / 2)
                    weight_1 = (1 / num_1) * (num_both / 2)
                    class_weights = {0: weight_0, 1: weight_1}
                else:
                    # One class is absent from the training rows, so its weight
                    # would divide by zero; fit without class weights instead.
                    print("Only one class in the training data, fitting without class weights")
                    class_weights = None

                # Within one composition the model is not re-initialised between
                # split dates: each fit continues from the previous weights.
                model.fit(X_train_seq, y_train_seq, epochs=40, verbose=0, class_weight=class_weights)

                train_results = model.predict(X_train_seq)
                test_results = model.predict(X_test_seq)

                print(train_dates)
                print(train_dates[-1])

                print(test_results)
                print(test_dates)

                new_test_results_df = pd.DataFrame(test_results, index=test_dates, columns=["p"])
                new_test_results_df["split_date"] = current_train_test_split_date
                new_test_results_df["real_class"] = y_data[y_data.index.isin(new_test_results_df.index)]
                test_result_frames.append(new_test_results_df)

                print("New test results df:", new_test_results_df.shape)

            test_results_df = pd.concat(test_result_frames)

            # Predicted class: 0 when p < 0.5, otherwise 1.
            test_results_df["pred_class"] = np.where(test_results_df["p"] < 0.5, 0, 1)

            # Sequential view: for every split keep only the predictions dated
            # before the next split, i.e. those made by the most recently
            # trained model. Rows with a missing value are dropped.
            seq_frames = []
            for i, date in enumerate(train_split_dates_next_month):
                split_rows = test_results_df[test_results_df["split_date"] == date]
                if (i + 1) < len(train_split_dates_next_month):
                    split_rows = split_rows[split_rows.index < train_split_dates_next_month[i+1]]
                if len(split_rows):
                    seq_frames.append(split_rows)
            test_results_df_seq = pd.concat(seq_frames).dropna()

            save_results = True

            if save_results:
                # A random id keeps every composition in its own folder; the
                # architecture itself is recorded in summary.txt.
                model_id = uuid.uuid4()

                #model_name = f"nber_recession_class_train_before_1980_win_std_3_scale_log_retrained_{str(model_id)[:8]}"
                #model_name = f"qbull_class_train_before_1980_win_std_3_scale_log_retrained_{str(model_id)[:8]}"

                model_name = f"qbear_class_train_before_1980_win_std_3_scale_log_retrained_{str(model_id)[:8]}"

                directory = f"../../results/regime/lstm/{model_name}"
                os.makedirs(directory, exist_ok=True)
                test_results_df.to_csv(f"{directory}/test_results.csv")
                test_results_df_seq.to_csv(f"{directory}/test_results_seq.csv")
                with open(f"{directory}/summary.txt", 'w') as f:
                    model.summary(print_fn=lambda x: f.write(x + '\n'))

# Index Market Cap change

In [None]:
# Walk forward from 1980: every `time_between_training` months retrain the
# model on everything before the split date and predict the next
# `time_between_training` months.
# NOTE(review): this cell uses whatever `model` and `lookback` were defined by
# earlier cells and does not re-initialise the model, so weights learned there
# carry over — confirm this is intended.
time_between_training = 1 #months
embargo_months = 1

train_test_split_date = pd.Timestamp("1980-01-01")

current_train_test_split_date = train_test_split_date

# Splits after this date have no test rows at all.
last_available_date = data_copy.index.max()

# One frame of predictions per split, concatenated once after the loop.
test_result_frames = []
train_results_df = None

while current_train_test_split_date + pd.DateOffset(months=time_between_training) < max_date:

    if current_train_test_split_date > last_available_date:
        # The features end before max_date; nothing is left to predict.
        break

    print(current_train_test_split_date)
    X_train_seq, y_train_seq, X_test_seq, y_test_seq, X_train, X_test, y_train, y_test, train_dates, test_dates = prepare_data(data_copy,
        features, y_data, current_train_test_split_date, lookback = lookback, n_test_periods=time_between_training,
        embargo_periods=embargo_months)

    # Balanced class weights: n_samples / (2 * n_class). Counting on a flat
    # array gives the right counts for a Series as well as a one-column frame.
    train_labels = np.asarray(y_train).ravel()
    num_0 = int((train_labels == 0).sum())
    num_1 = int((train_labels == 1).sum())
    num_both = len(train_labels)

    if num_0 and num_1:
        weight_0 = (1 / num_0) * (num_both / 2)
        weight_1 = (1 / num_1) * (num_both / 2)
        class_weights = {0: weight_0, 1: weight_1}
    else:
        # One class is absent from the training rows, so its weight would
        # divide by zero; fit without class weights instead.
        print("Only one class in the training data, fitting without class weights")
        class_weights = None

    model.fit(X_train_seq, y_train_seq, epochs=40, verbose=0, class_weight=class_weights)

    train_results = model.predict(X_train_seq)
    test_results = model.predict(X_test_seq)

    print(train_dates)
    print(train_dates[-1])

    print(test_results)
    print(test_dates)

    if train_results_df is None:
        # In-sample predictions are kept for the first split only.
        train_results_df = pd.DataFrame(train_results, index=train_dates, columns=["p"])

    new_test_results_df = pd.DataFrame(test_results, index=test_dates, columns=["p"])
    test_result_frames.append(new_test_results_df)

    print("New test results df:", new_test_results_df.shape)

    current_train_test_split_date = current_train_test_split_date + pd.DateOffset(months=time_between_training)

test_results_df = pd.concat(test_result_frames)

# Predicted class: 0 when p < 0.5, otherwise 1.
test_results_df["pred_class"] = np.where(test_results_df["p"] < 0.5, 0, 1)
test_results_df["real_class"] = y_data[y_data.index.isin(test_results_df.index)]

train_results_df["pred_class"] = np.where(train_results_df["p"] < 0.5, 0, 1)

# In-sample predictions of the first split followed by all out-of-sample ones.
all_results = pd.concat([train_results_df, test_results_df])

# A random id keeps the results of separate runs in separate folders.
model_id = uuid.uuid4()

model_name = f"mc_change_class_train_before_1980_win_std_3_scale_log_retrained_{str(model_id)[:8]}"

directory = f"../../results/regime/lstm/{model_name}"
os.makedirs(directory, exist_ok=True)
test_results_df.to_csv(f"{directory}/test_results.csv")
with open(f"{directory}/summary.txt", 'w') as f:
    model.summary(print_fn=lambda x: f.write(x + '\n'))
    