# Feature Selection for log_diff_model (Multivariate Model)

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras import Sequential
from bdi_predict.ml_logic.sequence_gen import WindowGenerator
from tensorflow.keras import layers
from sklearn.model_selection import TimeSeriesSplit
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
bdi = pd.read_csv("../raw_data/data/BDI/log_diff_BDI_daily.csv")
bdi.head(2)

Unnamed: 0.1,Unnamed: 0,Date,Price,abs_price,log_price,log_diff
0,0,1995-01-03,1964.0,,3.293141,
1,1,1995-01-04,1961.0,-3.0,3.292478,-0.000664


In [3]:
bdi["Date"] = pd.to_datetime(bdi["Date"])
bdi.set_index("Date", inplace=True)
bdi= bdi[bdi.index != "1995-01-03"]
bdi.drop(columns="Unnamed: 0", inplace=True)
bdi.head(2)

Unnamed: 0_level_0,Price,abs_price,log_price,log_diff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1995-01-04,1961.0,-3.0,3.292478,-0.000664
1995-01-05,1967.0,6.0,3.293804,0.001327


In [4]:
merged = pd.read_csv("../raw_data/data/merged_daily_data.csv")
merged["Date"] = pd.to_datetime(merged["Date"])
merged.set_index("Date", inplace=True)
cip = merged[["CIP"]]

In [5]:
feat = pd.read_csv("../raw_data/data/merged_features.csv")
feat["Date"] = pd.to_datetime(feat["Date"])
feat.set_index("Date", inplace=True)
feat.drop(columns="Unnamed: 0", inplace=True)
feat = feat[["close_y", "Nickel_x", "DAP"]]
feat.rename(columns={"close_y":"iron", 
                    "Nickel_x":"nickel", 
                    "DAP":"DAP"}, inplace=True)
comb=feat
comb

Unnamed: 0_level_0,iron,nickel,DAP
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1995-01-03,12.270000,8505.450000,198.630000
1995-01-04,12.270000,8505.450000,198.630000
1995-01-05,12.270000,8505.450000,198.630000
1995-01-06,12.270000,8505.450000,198.630000
1995-01-09,12.270000,8505.450000,198.630000
...,...,...,...
2022-11-08,27.540690,10264.567966,306.576271
2022-11-09,26.013621,10003.955424,297.296610
2022-11-10,24.486552,9743.342881,288.016949
2022-11-11,22.959483,9482.730339,278.737288


In [6]:
cass = pd.read_csv("../raw_data/data/CASS/CASS FREIGHT INDEX.csv")
cass = cass[["time", "close"]]
cass["time"] = pd.to_datetime(cass["time"], origin="unix")
cass.set_index("time", inplace=True)
datetime_index = pd.date_range(start="1994-01-01", end="2021-12-01", name="Date")
len(datetime_index)
cass_df = pd.DataFrame(index=datetime_index)
cass_df["cass"] = cass["close"]
cass_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 10197 entries, 1994-01-01 to 2021-12-01
Freq: D
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   cass    0 non-null      float64
dtypes: float64(1)
memory usage: 159.3 KB


In [7]:
df = pd.merge(bdi, cip, on="Date", how="inner")
#df.drop(columns="Price_y", inplace=True)


In [10]:
df.head(3)

Unnamed: 0_level_0,Price,abs_price,log_price,log_diff,CIP
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1995-01-04,1961.0,-3.0,3.292478,-0.000664,11.988118
1995-01-05,1967.0,6.0,3.293804,0.001327,12.173925
1995-01-06,1983.0,16.0,3.297323,0.003518,12.35472


# Data Cleaning

## Holdout Method (manual, chronological)

In [11]:
input_length = len(df)

In [12]:
def train_test_split(df:pd.DataFrame,
                     train_test_ratio: float,
                     input_length: int) -> tuple:
    '''
    Returns a train dataframe and a test dataframe (df_train, df_test)
    from which one can sample (X,y) sequences using TimeseriesGenerator.
    df_train should contain all the timesteps until round(train_test_ratio * len(fold))   
    '''
    
    # TRAIN SET

    last_train_idx = round(train_test_ratio * len(df))
    df_train = df.iloc[0:last_train_idx, :]

    # TEST SET
 
    first_test_idx = last_train_idx - input_length
    df_test = df.iloc[first_test_idx:, :]

    return (df_train, df_test)

In [13]:
df_train, df_test = train_test_split(df=df, train_test_ratio=0.8, input_length=input_length)

In [58]:
X_train = df_train[["Price", "CIP"]]
y_train = df_train["log_diff"]

X_test  = df_test[["Price", "CIP"]]
y_test = df_test["log_diff"]

In [59]:
#Feature Scaling

scaler_X = MinMaxScaler()

X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.fit_transform(X_test)



In [60]:
generator = TimeseriesGenerator(X_train_scaled, y_train, length=20, batch_size=8, sampling_rate=1, stride=1)

In [61]:
len(generator)

704

In [62]:
generator_val = TimeseriesGenerator(X_test_scaled, y_test, length=20, batch_size=8, sampling_rate=1, stride=1)

In [63]:
for X, y in generator:
    print(X.shape, y.shape)
    print(X[0], y[0])
    #listy = [float(X[0][i]) for i in range(len(X[0]))]
    #print(listy)
    break

(8, 20, 2) (8,)
[[0.14519214 0.39141632]
 [0.14571379 0.39855159]
 [0.14710485 0.40549438]
 [0.14806121 0.42519177]
 [0.14988698 0.43138855]
 [0.15023474 0.43740479]
 [0.15188663 0.4432429 ]
 [0.15379934 0.44890526]
 [0.15440793 0.46486177]
 [0.15475569 0.46984505]
 [0.15475569 0.47466453]
 [0.15466875 0.47932262]
 [0.1537124  0.48382169]
 [0.15240828 0.49638873]
 [0.15136498 0.50027566]
 [0.1501478  0.50401553]
 [0.14823509 0.50761073]
 [0.14710485 0.51106366]
 [0.1465832  0.52059267]
 [0.14623544 0.52350039]] -0.0015435717560743


# The Model

In [64]:
# Learning Rate Scheduler: 

lr_schedule = ExponentialDecay(
    initial_learning_rate=1e-3,
    decay_steps=10000,
    decay_rate=0.9)

In [65]:
#Optimizers

rmsprop = RMSprop(learning_rate=lr_schedule) #or 0.001 for inital test
adam = Adam(learning_rate=lr_schedule)

In [66]:
#Early Stopping Criterion:

es = EarlyStopping(monitor="val_loss", patience=500, restore_best_weights=True)

In [67]:
from functools import partial
from tensorflow.keras.layers import LeakyReLU

In [68]:
leaky_relu = LeakyReLU(alpha=0.01)

In [69]:
#Initializing and compiling model:

def init_model():
    
    
    model = Sequential()

    #LSTM layers
    model.add(layers.LSTM(60, activation="tanh", input_shape=(20,2), return_sequences=False))

    #Dense layers
    model.add(layers.Dense(120, activation=leaky_relu))
    model.add(layers.Dense(1, activation="linear"))

    #compiling model
    model.compile(loss="mse", optimizer=rmsprop, metrics="mae")
    
    return model

# Training and Testing

In [70]:
#Instantiating model

model = init_model()

In [71]:
#Model layers & params overview:
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_7 (LSTM)               (None, 60)                15120     
                                                                 
 dense_11 (Dense)            (None, 120)               7320      
                                                                 
 dense_12 (Dense)            (None, 1)                 121       
                                                                 
Total params: 22,561
Trainable params: 22,561
Non-trainable params: 0
_________________________________________________________________


In [72]:
#TRAINING THE MODEL:

history = model.fit(
    generator, epochs=1000, validation_data=generator_val, shuffle=True
)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000

KeyboardInterrupt: 

In [None]:
model.evaluate(generator_val)

In [None]:
#model.save("cip_model_best")

In [None]:
# Learning Curves:
plot_history(history)

In [None]:
#Baseline MAE
0.0048812746058099115

# Learning Curves viz code:

In [35]:
def plot_history(history):
    
    fig, ax = plt.subplots(1,2, figsize=(20,7))
    # Loss:MSE
    ax[0].plot(history.history['loss'])
    ax[0].plot(history.history['val_loss'])
    ax[0].set_title('MSE')
    ax[0].set_ylabel('Loss')
    ax[0].set_xlabel('Epoch')
    ax[0].legend(['Train', 'Validation'], loc='best')
    ax[0].grid(axis="x",linewidth=0.5)
    ax[0].grid(axis="y",linewidth=0.5)
    
    # Metrics:MAE
    
    ax[1].plot(history.history['mae'])
    ax[1].plot(history.history['val_mae'])
    ax[1].set_title('MAE')
    ax[1].set_ylabel('MAE')
    ax[1].set_xlabel('Epoch')
    ax[1].legend(['Train', 'Validation'], loc='best')
    ax[1].grid(axis="x",linewidth=0.5)
    ax[1].grid(axis="y",linewidth=0.5)
                        
    return ax