In [51]:
import os
import sys

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.initializers import Constant

import numpy as np
import csv
import pandas as pd

import tensorflow as tf
import tensorflow.keras as keras

from IPython.display import SVG
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Reshape, Dot
from tensorflow.keras.layers import Embedding
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Add, Activation, Lambda
from tensorflow.keras.layers import Concatenate, Dense, Dropout

from sklearn.preprocessing import LabelEncoder

## Prepare data

In [95]:
with open ("../data/ml-100k/u.data", "r") as f:
    data = list(csv.reader(f, delimiter="\t"))
data = np.array(data)
film_dim = np.amax(np.array(data[:,1]).astype(np.float))
user_dim = np.amax(np.array(data[:,0]).astype(np.float))
print("film size", film_dim)
print("user size", user_dim)
data = data.astype(np.int)
data = pd.DataFrame(data)
data.sort_values([0,3],inplace=True) ## Sort data
data.rename(columns= {0: 'user_id',
                      1: 'movie_id',
                      2: 'rating',
                      3: 'time'},
            inplace=True)
data_x = []
data_y = []
max_len = 0
for i in range(1, user_dim.astype(np.int)+1):
    user = data[data.user_id == i]
    if len(user) > max_len:
        max_len = len(user)
    data_x.append(user['movie_id'])
    data_y.append(user['rating'])

data_x =  tf.keras.preprocessing.sequence.pad_sequences(data_x,padding='post',maxlen=max_len)
data_y =  tf.keras.preprocessing.sequence.pad_sequences(data_y,padding='post',maxlen=max_len)

train_x = data_x[:700]
test_x = data_x[700:]
train_y = data_y[:700]
test_y = data_y[700:]

film size 1682.0
user size 943.0


In [97]:
users = np.array(data.user_id.astype(np.float))
movies = np.array(data.movie_id.astype(np.float))
ratings = np.array(data.rating.astype(np.float))

users.shape

(100000,)

In [24]:
n_latent_factors = 3

In [83]:
def RecommenderV1(n_users, n_movies, n_factors):
    user = Input(shape=(1,))
    u = Embedding(n_users+1, n_factors, embeddings_initializer='he_normal',
                  embeddings_regularizer=l2(1e-6))(user)
    u = Reshape((n_factors,))(u)
    
    movie = Input(shape=(1,))
    m = Embedding(n_movies+1, n_factors, embeddings_initializer='he_normal',
                  embeddings_regularizer=l2(1e-6))(movie)
    m = Reshape((n_factors,))(m)
    
    x = Dot(axes=1)([u, m])

    model = Model(inputs=[user, movie], outputs=x)
    opt = Adam(lr=0.001)
    model.compile(loss='mean_squared_error', optimizer=opt, metrics=['accuracy'])

    return model

In [84]:
n_factors = 50
model = RecommenderV1(n_users, n_movies, n_factors)
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_10 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_14 (Embedding)        (None, 1, 50)        47200       input_9[0][0]                    
__________________________________________________________________________________________________
embedding_15 (Embedding)        (None, 1, 50)        84150       input_10[0][0]                   
____________________________________________________________________________________________

In [25]:
X_train_array = [users[:90000], movies[:90000]]
X_test_array = [users[90000:], movies[90000:]]

y_train = ratings[:90000]
y_test = ratings[90000:]

In [26]:
X_train_array

[array([  0.,   0.,   0., ..., 848., 848., 848.]),
 array([167., 171., 164., ..., 297., 120., 405.])]

In [85]:
trained = model.fit(x=X_train_array, y=y_train, batch_size=64, epochs=5, verbose=1, validation_data=(X_test_array, y_test))

Train on 90000 samples, validate on 10000 samples
Epoch 1/5


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [30]:
class EmbeddingLayer:
    def __init__(self, n_items, n_factors):
        self.n_items = n_items
        self.n_factors = n_factors
    
    def __call__(self, x):
        x = Embedding(self.n_items, self.n_factors, embeddings_initializer='he_normal',
                      embeddings_regularizer=l2(1e-6))(x)
        x = Reshape((self.n_factors,))(x)
        return x

def RecommenderV2(n_users, n_movies, n_factors, min_rating, max_rating):
    user = Input(shape=(1,))
    u = EmbeddingLayer(n_users, n_factors)(user)
    ub = EmbeddingLayer(n_users, 1)(user)
    
    movie = Input(shape=(1,))
    m = EmbeddingLayer(n_movies, n_factors)(movie)
    mb = EmbeddingLayer(n_movies, 1)(movie)

    x = Dot(axes=1)([u, m])
    x = Add()([x, ub, mb])
    x = Activation('softmax')(x)
    x = Lambda(lambda x: x * (max_rating - min_rating) + min_rating)(x)

    model = Model(inputs=[user, movie], outputs=x)
    opt = Adam(lr=0.001)
    model.compile(loss='mean_squared_error', optimizer=opt, metrics=['accuracy'])

    return model

In [93]:
model = RecommenderV2(n_users+1, n_movies+1, n_factors, 0, 5)
model.summary()
history = model.fit(x=X_train_array, y=y_train, batch_size=64, epochs=5, verbose=1, validation_data=(X_test_array, y_test))

Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_15 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_16 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_24 (Embedding)        (None, 1, 50)        47200       input_15[0][0]                   
__________________________________________________________________________________________________
embedding_26 (Embedding)        (None, 1, 50)        84150       input_16[0][0]                   
____________________________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [41]:
predictions = model.predict(X_test_array)

array([5.], dtype=float32)

In [102]:
def RecommenderNet(n_users, n_movies, n_factors, min_rating, max_rating):
    
    user = Input(shape=(1,))
    u = EmbeddingLayer(n_users+1, n_factors)(user)
    
    movie = Input(shape=(1,))
    m = EmbeddingLayer(n_movies+1, n_factors)(movie)
    
    x = Concatenate()([u, m])
    x = Dropout(0.05)(x)
        
    x = Dense(128)(x)
    x = Activation('relu')(x)
    x = Dropout(0.75)(x)
    x = Dense(128)(x)
    x = Activation('relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(128)(x)
    x = Activation('relu')(x)
    x = Dropout(0.25)(x)
    
    x = Dense(1)(x)
    x = Activation('sigmoid')(x)
    x = Lambda(lambda x: x * (max_rating - min_rating) + min_rating)(x)

    model = Model(inputs=[user, movie], outputs=x)
    opt = Adam(lr=0.001)
    model.compile(loss='mean_squared_error', optimizer=opt, metrics=['accuracy'])

    return model

In [103]:
model = RecommenderNet(n_users, n_movies, n_factors, min(ratings), max(ratings))
model.summary()
history = model.fit(x=X_train_array, y=y_train, batch_size=64, epochs=20,
                    verbose=1, validation_data=(X_test_array, y_test))

Model: "model_24"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_50 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_51 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_48 (Embedding)        (None, 1, 50)        47200       input_50[0][0]                   
__________________________________________________________________________________________________
embedding_49 (Embedding)        (None, 1, 50)        84150       input_51[0][0]                   
___________________________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "




  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "




KeyboardInterrupt: 

In [None]:
book_input = Input(shape=[1], name="Book-Input")
book_embedding = Embedding(n_books+1, 5, name="Book-Embedding")(book_input)
book_vec = Flatten(name="Flatten-Books")(book_embedding)
user_input = Input(shape=[1], name="User-Input")
user_embedding = Embedding(n_users+1, 5, name="User-Embedding")(user_input)
user_vec = Flatten(name="Flatten-Users")(user_embedding)
prod = Dot(name="Dot-Product", axes=1)([book_vec, user_vec])
model = Model([user_input, book_input], prod)
model.compile('adam', 'mean_squared_error')

### Embeddings size
While LSTM units = 64 and movies were not order by time of rating

| Hyperparamater-value | Accuracy | Test Accuracy |
| -------------------- | -------- | ------------- |
| 64                   | 46.94%   | 35.22%        |
| 128                  | 50.09%   | 34.63%        |
| 256                  | 53.71%   | 34.13%        |
| 512                  | 58.82%   | 34.34%        |

### LSTM Units
While embedding size was 300 and movies were not order by time of rating

| Hyperparamater-value | Accuracy | Test Accuracy |
| -------------------- | -------- | ------------- |
| lstm_units (32)      | 51.75%   | 33.73%        |
| lstm_units (64)      | 53.68%   | 33.22%        |
| lstm_units (128)     | 53.52%   | 33.76%        |

### Sort data by time

LSTM unit = 64
Embeddings size = 512
Accuracy: 63.40%
Test Accuracy: 32.43%

In [None]:
class POSTagger(keras.Model):

    def __init__(self):
        super(POSTagger, self).__init__()
        # FIXME
        lstm_units = 64
        num_classes = 6
        self.emb = Embedding(
            input_dim=1683 + 1,
            output_dim=512,
            mask_zero=True,
            trainable=True)

        self.lstm = Bidirectional(LSTM(
            units=lstm_units, return_sequences=True))

        self.dense = Dense(
            units=num_classes,
            activation='softmax')

    def call(self, x):
        # FIXME
        mask = self.emb.compute_mask(x)
        x = self.emb(x)
        x = self.lstm(x, mask=mask)
        x = self.dense(x)
        return x


model = POSTagger()

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'])

model.fit(
    x=train_x,
    y=train_y,
    batch_size=10,
    epochs=30,
    validation_data=(test_x, test_y))

In [22]:
train_y

array([[5, 5, 5, ..., 0, 0, 0],
       [4, 3, 3, ..., 0, 0, 0],
       [2, 2, 2, ..., 0, 0, 0],
       ...,
       [3, 3, 4, ..., 0, 0, 0],
       [3, 3, 3, ..., 0, 0, 0],
       [5, 3, 4, ..., 0, 0, 0]], dtype=int32)

In [2]:
links= pd.read_csv('../data/small/links.csv')
movies=pd.read_csv('../data/small/movies.csv')
tags=pd.read_csv('../data/small/tags.csv')
ratings=pd.read_csv('../data/small/ratings.csv')

dataset=movies.merge(ratings,on='movieId').merge(tags,on='movieId').merge(links,on='movieId')

to_drop=['title','genres','timestamp_x','timestamp_y','userId_y','imdbId','tmdbId']

dataset.drop(columns=to_drop,inplace=True)

dataset=pd.get_dummies(dataset)

In [3]:
dataset

Unnamed: 0,movieId,userId_x,rating,"tag_""artsy""",tag_06 Oscar Nominated Best Movie - Animation,tag_1900s,tag_1920s,tag_1950s,tag_1960s,tag_1970s,...,tag_women,tag_wonderwoman,tag_workplace,tag_writing,tag_wrongful imprisonment,tag_wry,tag_younger men,tag_zither,tag_zoe kazan,tag_zombies
0,1,1,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,5,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,5,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233208,187595,586,5.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
233209,193565,184,3.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
233210,193565,184,3.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
233211,193565,184,3.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
labels = dataset.filter('rating')
dataset = dataset.drop(columns='rating')

train_dataset = dataset[:1600]
train_labels = labels[:1600]
validation_dataset = dataset[1600:2000]
validation_labels = labels[1600:2000]

In [4]:
labels = None
dataset = None

In [19]:
from tensorflow.keras.models import Sequential

model = Sequential()
model.add(Dense(13, input_dim=1586, kernel_initializer='zero', activation='relu'))
model.add(Dense(6, kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal',activation='linear'))

model.compile(loss='mean_squared_error', optimizer='adam',metrics=['mse','mae'])

model.summary()
history = model.fit(train_dataset,train_labels,batch_size=30, epochs=10,verbose=1, validation_split=0.3)
score = model.evaluate(validation_dataset,validation_labels)
print("Test score:", score)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 13)                20631     
_________________________________________________________________
dense_10 (Dense)             (None, 6)                 84        
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 7         
Total params: 20,722
Trainable params: 20,722
Non-trainable params: 0
_________________________________________________________________


ValueError: A target array with shape (15000, 0) was passed for an output of shape (None, 1) while using as loss `mean_squared_error`. This loss expects targets to have the same shape as the output.

In [6]:
train_labels

0
1
2
3
4
...
1595
1596
1597
1598
1599


In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout

max_features = 1684

model = Sequential()
model.add(Embedding(max_features, output_dim=256, mask_zero=True, trainable=True))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(1, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(train_x, train_y, batch_size=16, epochs=1, validation_data=(test_x, test_y))
#score = model.evaluate(x_test, y_test, batch_size=16)

Train on 700 samples, validate on 243 samples
 16/700 [..............................] - ETA: 8:53

InvalidArgumentError:  logits and labels must have the same first dimension, got logits shape [16,1] and labels shape [11792]
	 [[node loss/dense_5_loss/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits (defined at /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1751) ]] [Op:__inference_distributed_function_26551]

Function call stack:
distributed_function


In [15]:
test_y.shape

(243, 737)