In [2]:
import os
import sys

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.initializers import Constant
from tensorflow.keras.callbacks import CSVLogger


import numpy as np
import csv
import pandas as pd

## Prepare data

In [3]:
with open ("../data/ml-100k/u.data", "r") as f:
    data = list(csv.reader(f, delimiter="\t"))
data = np.array(data)
film_dim = np.amax(np.array(data[:,1]).astype(np.float))
user_dim = np.amax(np.array(data[:,0]).astype(np.float))
print("film size", film_dim)
print("user size", user_dim)
data = data.astype(np.int)
data = pd.DataFrame(data)
data.sort_values([0,3],inplace=True) ## Sort data
data.rename(columns= {0: 'user_id',
                      1: 'movie_id',
                      2: 'rating',
                      3: 'time'},
            inplace=True)
data_x = []
data_y = []
max_len = 0
for i in range(1, user_dim.astype(np.int)+1):
    user = data[data.user_id == i]
    if len(user) > max_len:
        max_len = len(user)
    data_x.append(user['movie_id'])
    data_y.append(user['rating'])

data_x =  tf.keras.preprocessing.sequence.pad_sequences(data_x,padding='post',maxlen=max_len)
data_y =  tf.keras.preprocessing.sequence.pad_sequences(data_y,padding='post',maxlen=max_len)

train_x = data_x[:700]
test_x = data_x[700:]
train_y = data_y[:700]
test_y = data_y[700:]

film size 1682.0
user size 943.0


### Embeddings size
While LSTM units = 64 and movies were not order by time of rating

| Hyperparamater-value | Accuracy | Test Accuracy |
| -------------------- | -------- | ------------- |
| 64                   | 46.94%   | 35.22%        |
| 128                  | 50.09%   | 34.63%        |
| 256                  | 53.71%   | 34.13%        |
| 512                  | 58.82%   | 34.34%        |

### LSTM Units
While embedding size was 300 and movies were not order by time of rating

| Hyperparamater-value | Accuracy | Test Accuracy |
| -------------------- | -------- | ------------- |
| lstm_units (32)      | 51.75%   | 33.73%        |
| lstm_units (64)      | 53.68%   | 33.22%        |
| lstm_units (128)     | 53.52%   | 33.76%        |

### Sort data by time

LSTM unit = 64
Embeddings size = 512
Accuracy: 63.40%
Test Accuracy: 32.43%

### Adding droput layer

We have tried to add droput layer for our model to combat some overtraining in case there is some. But with parameters of LSTM units (64) and embbedings size (128) we didnt see any improvement:

Accuracy was 51.64% and validadtion accuracy: was 34.92%.
Therefore there are no significant improvements by adding Dropout(0.5) layer 

In [5]:
class POSTagger(keras.Model):

    def __init__(self):
        super(POSTagger, self).__init__()
        lstm_units = 64
        num_classes = 6
        self.emb = Embedding(
            input_dim=1683 + 1,
            output_dim=128,
            mask_zero=True,
            trainable=True)

        self.lstm = Bidirectional(LSTM(
            units=lstm_units, return_sequences=True))
        
        self.dropout = Dropout(0.5)
        
        self.dense = Dense(
            units=num_classes,
            activation='softmax')
        
        

    def call(self, x):
        mask = self.emb.compute_mask(x)
        x = self.emb(x)
        x = self.lstm(x, mask=mask)
        x = self.dropout(x)
        x = self.dense(x)
        return x


model = POSTagger()

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'])

csv_logger = CSVLogger('../logs/LSTM_log.csv', append=True, separator=';')

model.fit(
    x=train_x,
    y=train_y,
    batch_size=10,
    epochs=30,
    validation_data=(test_x, test_y),
    callbacks=[csv_logger])

Train on 700 samples, validate on 243 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f42a4971198>

Save weights of the model

In [45]:
model.save_weights("../models/LSTM", save_format="tf")

Since subclassed keras models canot be saved first we have to create and compile one and then we can load weights:

In [46]:
sample_model = POSTagger()
sample_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'])

sample_model.load_weights("../models/LSTM")
pred = sample_model.predict(test_x[0], verbose=0)

### Example of prediction of our model

we can see that for first user we are able to predict rating for each of this users movies. Since there is 5 star rating (+ 1 for not rated movie) we get array with prediction of 6 of those ratings.

In [49]:
print(test_x[0][:5])
print(pred[:5])
print(test_y[0][:5])

[269 286 690 300 313]
[[[0.01448555 0.05996639 0.05869912 0.21839991 0.3288063  0.31964275]]

 [[0.03245169 0.10657497 0.14628832 0.17322175 0.26685464 0.27460858]]

 [[0.07072053 0.11664139 0.15948306 0.19422805 0.23257437 0.22635266]]

 [[0.0188613  0.0640035  0.11775716 0.26862097 0.31216374 0.21859339]]

 [[0.0063609  0.02451925 0.07248284 0.11742138 0.2614604  0.51775527]]]
[5 4 4 3 4]


## Removing the classification

From predicted results we see that we are predicting label instead of numerical value. We tried to adjust this model for predicting continous values in range from (0,5>. We realised that using `sparse_categorical_crossentropy` is not the right choice and instead we tried with somethin more commonly used for regression like `mse`. Alas we were not able to adjust the layers of our model som out shape and dimensions would mathch with train_y.
And other aproach lead to model which was prediciting values as if our vector of movie_ids was just list of x values for continuos function `f(x) = y` and all information about the movie and user preferences were lost. 