In [None]:
%load_ext autoreload
%autoreload 2

# The star import stays first so that the explicit imports below keep winning any name clash.
from FP567_Lib import *

import os
import pathlib
from datetime import datetime

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from tensorflow import keras

In [66]:
'''
We have all the files in resources/market_item_data/, which hold item info in the form of:
{
    item_id :
    [
        [date/time_n+1, price_at_time_n+1, amount_sold_at_time_n+1], [date/time_n+2, price_at_time_n+2, amount_sold_at_time_n+2], ..., [date/time_n+m, price_at_time_n+m, amount_sold_at_time_n+m]
    ]
}
for some amount of time.
The amount of time varies between items, as not all items have existed as long as others.

Let's make a Market object, which computes a bunch of the info we want.
'''
# Market is not defined in the cells shown here; presumably it is provided by the FP567_Lib star import.
# NOTE(review): presumably the constructor loads the files described above -- confirm in FP567_Lib.
market = Market()

In [3]:
'''
So, let's extend all the items that have less than highest_unit_time worth of info backwards in time,
so that every item ends up with highest_unit_time worth of info: the padded days keep their
time stamp, but get just 0, 0 as their values.

To do that, we can call the balance method of the Market object, using the
longest time span of unix times, and 0, 0 as the default amount sold and price.
'''
market.balance_as_is(0, 0)
# Last expression of the cell, so its value is the cell output (the recorded run shows True).
market.is_balanced()

True

In [114]:
'''
Now we have a balanced market and want to include updates into a matrix in the below form.
Notice how it is essentially m matrices, where each matrix represents a unit of time,
appended onto one another from left to right, and is n rows by k+2 cols = amount_sold col + price col + k embedded update cols
        | unix_time_0                                                                                                       | unix_time_1                                                                                                       |     | unix_time_m 
________| amount_sold_0 | price_0 | update_unix_time_0_feat_1 | update_unix_time_0_feat_2 | ... | update_unix_time_0_feat_k | amount_sold_1 | price_1 | update_unix_time_1_feat_1 | update_unix_time_1_feat_2 | ... | update_unix_time_1_feat_k | ... | amount_sold_m | price_m | update_unix_time_m_feat_1 | update_unix_time_m_feat_2 | ... | update_unix_time_m_feat_k |
item_1  |               |         |                           |                           | ... |                           |               |         |                           |                           | ... |                           | ... |               |         |                           |                           | ... |                           |
item_2  |               |         |                           |                           | ... |                           |               |         |                           |                           | ... |                           | ... |               |         |                           |                           | ... |                           |
.       |       .           .                   .                       .                   ...               .                    .             .                .                            .                ...               .                           .           .                  .                           .                ...               .             | 
.       |       .           .                   .                       .                   ...               .                    .             .                .                            .                ...               .                           .           .                  .                           .                ...               .             | 
.       |       .           .                   .                       .                   ...               .                    .             .                .                            .                ...               .                           .           .                  .                           .                ...               .             | 
item_n  |               |         |                           |                           | ... |                           |               |         |                           |                           | ... |                           |     |               |         |                           |                           | ... |                           |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Alot of them will be zeros.
There will be zeros for
    update_unix_time_i_feat_j for all j when there was no update for unix_time_i.
    an item's amount_sold_i and price_i when that item was not being sold for unix_time_i.

So, need to build a matrix of all the items info and tack on the embedded update.
Lets build it so that it goes embedded update cols, amount sold col, price col, so that later,
when we do forcasting and a day's worth of cols are forcasted, the price will be the last col
of the output and thus, easier to quickly spot the forcasted price.
'''
market.build_features_matrix()

In [None]:
'''
Save the features matrix so we don't have to keep making it.
'''
# PATH_TO_ASSEMBLED_FORCASTING_MATRIX is not defined in the cells shown here;
# presumably it is provided by the FP567_Lib star import.
market.save_market_with_updates_rep_as_csv(PATH_TO_ASSEMBLED_FORCASTING_MATRIX)

In [2]:
'''
Train a forecasting model.

First step: get the market DataFrame together with forcasted_day_len, the number of
columns that make up one day (later cells divide and multiply column counts by it).
'''
# NOTE(review): get_a_random_df=True looks like it asks for random stand-in data rather than
# the saved features matrix -- confirm before a real training run.
df, forcasted_day_len = get_forcasting_market_df(get_a_random_df=True)

In [None]:
'''
Here we work out how many whole days the market matrix holds, divvy the days up into
training (the first 80% of the days) and validation (the remaining days), and
standardize the data.
'''
# forcasted_day_len is the number of columns per day, so the total column count has to be
# an exact, non-zero multiple of it. divmod keeps the day count an integer, so every
# count derived from it below can be used directly as a slice bound.
total_days, leftover_cols = divmod(df.shape[1], forcasted_day_len)
if leftover_cols != 0 or total_days < 1:
    raise ValueError(
        f"The market matrix is malformed. Total days = {df.shape[1] / forcasted_day_len} "
        f"and forcasted day length = {forcasted_day_len} "
        f"and total cols in the market = {df.shape[1]}")
total_days = int(total_days)

num_training_days = int(total_days * 0.8)
num_validation_days = total_days - num_training_days
num_training_cols = num_training_days * forcasted_day_len
num_validation_cols = num_validation_days * forcasted_day_len

# Divide into training and validation.
# iloc slices by position and excludes the end bound. A label-based .loc slice includes its
# end label, which would hand the training split one extra column shared with validation.
training_days_df = df.iloc[:, :num_training_cols]
validation_days_df = df.iloc[:, num_training_cols:num_training_cols + num_validation_cols]

# Normalize the data and convert to numpy arrays.
# StandardScaler standardizes each column on its own, and the two splits hold different
# columns (different days), so the scaler is refit for the validation split.
# After this cell `scaler` holds the validation fit.
scaler = StandardScaler()
training_days_mat = scaler.fit_transform(training_days_df).astype(np.float16)
validation_days_mat = scaler.fit_transform(validation_days_df).astype(np.float16)
training_days_mat

In [4]:
'''
Find a days-at-a-time-to-predict value that divides evenly into both the validation
and training days (hopefully).
'''
initial_window_size_to_days_to_predict_scale = 12
days_at_a_time_to_predict = 7  # fallback used when no candidate divides both day counts
for i in range(3, 10):
    # i is usable when BOTH day counts are exact multiples of it. The day counts are the
    # left operands: `i % num_days` is just i whenever num_days > i, so it never hits 0.
    if num_validation_days % i == 0 and num_training_days % i == 0:
        days_at_a_time_to_predict = i
        break

# We want to say a window_size worth of columns maps to the next predict_size worth of
# columns, then slide the window predict_size columns and repeat over and over.
# Both sizes are in columns: days * forcasted_day_len (columns per day).
window_size = (days_at_a_time_to_predict * initial_window_size_to_days_to_predict_scale) * forcasted_day_len
predict_size = days_at_a_time_to_predict * forcasted_day_len
# (number of rows in the training matrix, columns per input window)
tensor_shape = (training_days_mat.shape[0], window_size)

In [None]:
'''
Build two parallel lists of tensors from the training matrix, with n = window_size
and p = predict_size (both measured in columns):
    X =
        [
            sparse_tensor(items_feats from time 0 to time n),
            sparse_tensor(items_feats from time p to time n + p),
            ...,
            sparse_tensor(items_feats from time i*p to time n + i*p)
        ]
    Y =
        [
            sparse_tensor(items_feats from time n to time n + p),
            sparse_tensor(items_feats from time (n + p) to time (n + p) + p),
            ...,
            sparse_tensor(items_feats from time (n + i*p) to time (n + i*p) + p)
        ]
The model is trained so that X[j] maps to Y[j].
'''
X_sparse_tensor_stack, Y_sparse_tensor_stack = [], []
i = 0
while True:
    # Input window i covers columns [window_start, window_end);
    # its target covers the columns right after it, [window_end, predict_window_end).
    window_start = i * predict_size
    window_end = window_size + window_start
    predict_window_end = predict_size + window_end
    if training_days_mat.shape[1] < predict_window_end:
        # the target would run past the last training column: no more full windows
        break

    X = training_days_mat[:, window_start:window_end]
    Y = training_days_mat[:, window_end:predict_window_end]

    # Convert the input slice first, then the target slice.
    X_tensor, Y_tensor = (
        make_numpy_mat_into_tf_sparse_tensor(cols, make_practice_sparse=True)
        for cols in (X, Y))

    if not (X_tensor is not None and Y_tensor is not None):
        # A conversion returned None: dump the offending window and stop building.
        print("i =", i)
        print("X =\n", X)
        print("Y =\n", Y)
        break

    X_sparse_tensor_stack.append(X_tensor)
    Y_sparse_tensor_stack.append(Y_tensor)
    i += 1

In [40]:
'''
Build the same kind of sliding-window X / Y tensor lists, this time from the validation
matrix: each X entry is window_size columns, each Y entry is the predict_size columns
that follow it, and the window advances predict_size columns per step.
'''
X_sparse_tensor_stack_validation, Y_sparse_tensor_stack_validation = [], []
i = 0
while True:
    # Input window i covers columns [window_start, window_end);
    # its target covers the columns right after it, [window_end, predict_window_end).
    window_start = i * predict_size
    window_end = window_size + window_start
    predict_window_end = predict_size + window_end
    if validation_days_mat.shape[1] < predict_window_end:
        # the target would run past the last validation column: no more full windows
        break

    X = validation_days_mat[:, window_start:window_end]
    Y = validation_days_mat[:, window_end:predict_window_end]

    # Convert the input slice first, then the target slice.
    X_tensor, Y_tensor = (
        make_numpy_mat_into_tf_sparse_tensor(cols, make_practice_sparse=True)
        for cols in (X, Y))

    if not (X_tensor is not None and Y_tensor is not None):
        # A conversion returned None: dump the offending window and stop building.
        print("i =", i)
        print("X =\n", X)
        print("Y =\n", Y)
        break

    X_sparse_tensor_stack_validation.append(X_tensor)
    Y_sparse_tensor_stack_validation.append(Y_tensor)
    i += 1

In [44]:
# Build the forecasting model and print its layer summary.
# batch_size is handed to get_model; in the recorded summary it shows up as the leading
# dimension of every layer's output shape.
batch_size = 32
# tensor_shape is (rows of the training matrix, window_size); predict_size is the number
# of columns in one target slice.
model = get_model(tensor_shape, batch_size, predict_size)
model.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_16 (LSTM)              (32, 3500, 2520)          50813280  
                                                                 
 dropout (Dropout)           (32, 3500, 2520)          0         
                                                                 
 lstm_17 (LSTM)              (32, 3500, 1260)          19056240  
                                                                 
 dropout_1 (Dropout)         (32, 3500, 1260)          0         
                                                                 
 lstm_18 (LSTM)              (32, 3500, 630)           4765320   
                                                                 
 dropout_2 (Dropout)         (32, 3500, 630)           0         
                                                                 
 time_distributed_1 (TimeDis  (32, 3500, 210)        

In [None]:
# Set up a unique folder for this model and set up checkpoints.
# NOTE(review): the "/" separators in the timestamp turn it into nested month/day/time
# directories (created by mkdir(parents=True)), there is no year component, and ":" is not
# a legal file-name character on Windows -- confirm this layout is intended.
checkpoint_dir_x = os.path.join(
    PATH_TO_MODELS_DIRECTORY,
    datetime.now().strftime("%m/%d/%H:%M:%S"))
pathlib.Path(checkpoint_dir_x).mkdir(parents=True, exist_ok=True)

call_backs = [
    # write the weights (not the full model) to a per-epoch file in the run folder
    keras.callbacks.ModelCheckpoint(
        os.path.join(checkpoint_dir_x, "save_at_{epoch}.h5"),
        save_weights_only=True),
    # stop once val_loss has gone `patience` epochs in a row without improving
    keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=10),
]

# Train!
# NOTE(review): fit receives Python lists of per-window tensors -- confirm that Keras and
# get_model treat each list element as one sample rather than as a separate model input.
epochs = 30
history = model.fit(
    X_sparse_tensor_stack,
    Y_sparse_tensor_stack,
    # The model was built for this exact batch size, so pass it explicitly instead of
    # relying on fit's default happening to match.
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_sparse_tensor_stack_validation, Y_sparse_tensor_stack_validation),
    verbose=1, # 0 is silent, 1 for loading bar, 2 for stats each epoch
    callbacks=call_backs)

In [None]:
# Record this training run in its checkpoint folder (checkpoint_dir_x):
#   - the model summary,
#   - the training parameters passed below (batch size, row count of the training matrix,
#     days predicted at a time, window-to-prediction scale, predict size, window size),
#   - plots of accuracy and of loss taken from the fit history.
# NOTE(review): the original plan also mentioned saving the epoch count and optimizer type to
# a json and plotting a confusion matrix using the validation data; none of the calls
# visible here is handed those -- confirm whether that is still to be added.
print_model_summary_to_file(model, checkpoint_dir_x)
save_training_params(
    batch_size,
    training_days_mat.shape[0],
    days_at_a_time_to_predict,
    initial_window_size_to_days_to_predict_scale,
    predict_size,
    window_size,
    checkpoint_dir_x)
plot_accuracy(plt, history, checkpoint_dir_x)
plot_loss(plt, history, checkpoint_dir_x)