# Feature Selection for log_diff_model (Multivariate Model)

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras import Sequential
from bdi_predict.ml_logic.sequence_gen import WindowGenerator
from tensorflow.keras import layers
from tensorflow.keras import models
from sklearn.model_selection import TimeSeriesSplit
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
bdi = pd.read_csv("../raw_data/data/BDI/log_diff_BDI_daily.csv")
bdi["Date"] = pd.to_datetime(bdi["Date"])
bdi.set_index("Date", inplace=True)
bdi= bdi[bdi.index != "1995-01-03"]
bdi.drop(columns="Unnamed: 0", inplace=True)
bdi.head(2)

Unnamed: 0_level_0,Price,abs_price,log_price,log_diff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1995-01-04,1961.0,-3.0,3.292478,-0.000664
1995-01-05,1967.0,6.0,3.293804,0.001327


In [3]:
merged = pd.read_csv("../raw_data/data/merged_daily_data.csv")
merged["Date"] = pd.to_datetime(merged["Date"])
merged.set_index("Date", inplace=True)
cip = merged[["CIP"]]
cip.head(2)

Unnamed: 0_level_0,CIP
Date,Unnamed: 1_level_1
1995-01-02,11.601218
1995-01-03,11.797236


In [4]:
df = pd.read_csv("../api_data/cip_model_data.csv")
df.tail(21)

Unnamed: 0,Date,Price,log_price,log_diff,CIP
7044,2022-10-17,1819.0,3.259833,-0.000906,4.3
7045,2022-10-18,1762.0,3.246006,-0.013827,4.3
7046,2022-10-19,1705.0,3.231724,-0.014282,4.3
7047,2022-10-20,1648.0,3.216957,-0.014767,4.3
7048,2022-10-21,1591.0,3.20167,-0.015287,4.3
7049,2022-10-24,1534.0,3.185825,-0.015845,4.3
7050,2022-10-25,1491.8,3.173711,-0.012115,4.3
7051,2022-10-26,1449.6,3.161248,-0.012462,4.3
7052,2022-10-27,1407.4,3.148418,-0.012831,4.3
7053,2022-10-28,1365.2,3.135196,-0.013221,4.3


# Setting up the X_pred input

In [5]:
input_length = len(df)

In [6]:
def train_test_split(df:pd.DataFrame,
                     train_test_ratio: float,
                     input_length: int) -> tuple:
    '''
    Returns a train dataframe and a test dataframe (df_train, df_test)
    from which one can sample (X,y) sequences using TimeseriesGenerator.
    df_train should contain all the timesteps until round(train_test_ratio * len(fold))   
    '''
    
    # TRAIN SET

    last_train_idx = round(train_test_ratio * len(df))
    df_train = df.iloc[0:last_train_idx, :]

    # TEST SET
 
    first_test_idx = last_train_idx - input_length
    df_test = df.iloc[first_test_idx:, :]

    return (df_train, df_test)

In [7]:
df_train, df_test = train_test_split(df=df, train_test_ratio=0.8, input_length=input_length)

X_train = df_train[["Price", "CIP"]]
y_train = df_train["log_diff"]

X_test  = df_test[["Price", "CIP"]]
y_test = df_test["log_diff"]

In [8]:
#Feature Scaling - fitting to the training data

scaler_X = MinMaxScaler()

scaler_X.fit(X_train)
X_train_scaled = scaler_X.transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

In [19]:
prediction_df

Unnamed: 0_level_0,Price,log_price,log_diff,CIP
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-10-17,1819.0,3.259833,-0.000906,4.3
2022-10-18,1762.0,3.246006,-0.013827,4.3
2022-10-19,1705.0,3.231724,-0.014282,4.3
2022-10-20,1648.0,3.216957,-0.014767,4.3
2022-10-21,1591.0,3.20167,-0.015287,4.3
2022-10-24,1534.0,3.185825,-0.015845,4.3
2022-10-25,1491.8,3.173711,-0.012115,4.3
2022-10-26,1449.6,3.161248,-0.012462,4.3
2022-10-27,1407.4,3.148418,-0.012831,4.3
2022-10-28,1365.2,3.135196,-0.013221,4.3


In [20]:
prediction_df = df.tail(21)
prediction_df["Date"] = pd.to_datetime(prediction_df["Date"])
prediction_df.set_index("Date", inplace=True)
prediction_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_df["Date"] = pd.to_datetime(prediction_df["Date"])


Unnamed: 0_level_0,Price,log_price,log_diff,CIP
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-10-17,1819.0,3.259833,-0.000906,4.3
2022-10-18,1762.0,3.246006,-0.013827,4.3
2022-10-19,1705.0,3.231724,-0.014282,4.3
2022-10-20,1648.0,3.216957,-0.014767,4.3
2022-10-21,1591.0,3.20167,-0.015287,4.3
2022-10-24,1534.0,3.185825,-0.015845,4.3
2022-10-25,1491.8,3.173711,-0.012115,4.3
2022-10-26,1449.6,3.161248,-0.012462,4.3
2022-10-27,1407.4,3.148418,-0.012831,4.3
2022-10-28,1365.2,3.135196,-0.013221,4.3


In [21]:
X_pred_prescaling = prediction_df[["Price", "CIP"]]
X_pred_input_scaled = scaler_X.transform(X_pred_prescaling)

y_true = np.array(prediction_df["log_diff"])

In [22]:
type(X_pred_input_scaled)

numpy.ndarray

In [23]:
type(y_true)

numpy.ndarray

In [24]:
predict_generator = TimeseriesGenerator(X_pred_input_scaled, y_true, length=20, batch_size=1, sampling_rate=1, stride=1)

In [25]:
for X_input, y_true in predict_generator:
    print(X_input.shape, y_true.shape)
    print(X_input[0], y_true[0])

(1, 20, 2) (1,)
[[0.13284646 0.09618118]
 [0.1278908  0.09618118]
 [0.12293514 0.09618118]
 [0.11797948 0.09618118]
 [0.11302382 0.09618118]
 [0.10806816 0.09618118]
 [0.10439923 0.09618118]
 [0.10073031 0.09618118]
 [0.09706138 0.09618118]
 [0.09339245 0.09618118]
 [0.08972353 0.09618118]
 [0.09027995 0.09618118]
 [0.09083638 0.09618118]
 [0.0913928  0.09618118]
 [0.09194923 0.09618118]
 [0.09250565 0.09618118]
 [0.0896192  0.09618118]
 [0.08673274 0.09618118]
 [0.08384629 0.09618118]
 [0.08095983 0.09618118]] -0.0119604247651161


In [26]:
model = models.load_model('/Users/justinrlawes/code/LeibnizianOptimist/bdi_predict/training_outputs/models/cip_model_best')

In [57]:
y_pred_log_diff = model.predict(predict_generator)
y_pred_log_diff

array([[-0.01014559]], dtype=float32)

In [58]:
X_pred_input_scaled.shape

(21, 2)

In [59]:
prediction_df

Unnamed: 0_level_0,Price,log_price,log_diff,CIP
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-10-17,1819.0,3.259833,-0.000906,4.3
2022-10-18,1762.0,3.246006,-0.013827,4.3
2022-10-19,1705.0,3.231724,-0.014282,4.3
2022-10-20,1648.0,3.216957,-0.014767,4.3
2022-10-21,1591.0,3.20167,-0.015287,4.3
2022-10-24,1534.0,3.185825,-0.015845,4.3
2022-10-25,1491.8,3.173711,-0.012115,4.3
2022-10-26,1449.6,3.161248,-0.012462,4.3
2022-10-27,1407.4,3.148418,-0.012831,4.3
2022-10-28,1365.2,3.135196,-0.013221,4.3


In [60]:
log_val_nov_11 = prediction_df.iloc[19, 1]
log_val_nov_11

3.087142279383808

In [61]:
float(y_pred_log_diff[0][0])

-0.01014559157192707

In [62]:
y_pred_log_diff = float(y_pred_log_diff[0][0])

In [63]:
#WORKING OUT COMMON LOG Y PRED, y_pred_log

y_pred_log = y_pred_log_diff + log_val_nov_11
print(y_pred_log)
print(type(y_pred_log))


3.076996687811881
<class 'numpy.float64'>


In [64]:
prev_day = prediction_df.iloc[19, 0]
prev_day

1222.2

In [65]:
#CONVERTING COMMON LOG y_pred_log to y_pred
y_pred = 10**y_pred_log
y_pred

1193.9789984353042

In [66]:
1193-1222

-29

# Data Cleaning

In [25]:
generator = TimeseriesGenerator(X_train_scaled, y_train, length=20, batch_size=8, sampling_rate=1, stride=1)

In [26]:
generator_val = TimeseriesGenerator(X_test_scaled, y_test, length=20, batch_size=8, sampling_rate=1, stride=1)

# The Model

In [27]:
model.evaluate(generator_val)

KeyError: 20