In [117]:
import tensorflow as tf
#gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3333)
#sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

import os
os.environ["CUDA_DEVICE_ORDER"]='PCI_BUS_ID'
os.environ["CUDA_VISIBLE_DEVICES"]='2'

import keras
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.layers.core import Dense, Lambda, Flatten, Dropout
from keras.optimizers import Adam, sgd
from keras.layers import Input, Embedding, Reshape, merge, LSTM, Bidirectional
from keras.layers.embeddings import Embedding
from keras.regularizers import l1, l2

import pandas as pd
import numpy as np

#if 'session' in locals() and session is not None:
#    print('Close interactive session')
#    session.close()

path = "D:\\jtownend\\fast.ai\\data\\movielens\\"

In [28]:
ratings = pd.read_csv(path+'ratings.csv')
movie_names = pd.read_csv(path+'movies.csv').set_index('movieId')['title'].to_dict()

users = ratings.userId.unique()
movies = ratings.movieId.unique()
n_users = ratings.userId.nunique()
n_movies = ratings.movieId.nunique()

#replace the user and movie IDs with contiguous integers
userid2idx = {o:i for i,o in enumerate(users)}
movieid2idx = {o:i for i,o in enumerate(movies)}
ratings.movieId = ratings.movieId.apply(lambda x: movieid2idx[x])
ratings.userId = ratings.userId.apply(lambda x: userid2idx[x])

#divide randomly into train and validation sets
np.random.seed = 42
msk = np.random.rand(len(ratings)) < 0.8
trn = ratings[msk]
val = ratings[~msk]

#set number of latent factors
n_factors = 50

## Dot product

In [58]:
user_in = Input(shape=(1,), dtype='int64', name='user_in')
u = Embedding(n_users, n_factors, input_length=1, W_regularizer=l2(1e-4))(user_in)
movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
m = Embedding(n_movies, n_factors, input_length=1, W_regularizer=l2(1e-4))(movie_in)

x = merge([u, m], mode='dot')
x = Flatten()(x)
model = Model([user_in, movie_in], x)
model.compile(loss='mse', optimizer=Adam(0.01), metrics=['accuracy'])

In [60]:
model.fit([trn.userId, trn.movieId], trn.rating, batch_size=64, nb_epoch=4, 
          validation_data=([val.userId, val.movieId], val.rating))

Train on 80084 samples, validate on 19920 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x247e54a9dd8>

In [61]:
model.optimizer.lr=0.001

In [62]:
model.fit([trn.userId, trn.movieId], trn.rating, batch_size=64, nb_epoch=3, 
          validation_data=([val.userId, val.movieId], val.rating))

Train on 80084 samples, validate on 19920 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x247e54da390>

## Introduce bias to improve result

In [63]:
def embedding_input(name, n_in, n_out, reg):
    inp = Input(shape=(1,), dtype='int64', name=name)
    return inp, Embedding(n_in, n_out, input_length=1, W_regularizer=l2(reg))(inp)

def create_bias(inp, n_in):
    x = Embedding(n_in, 1, input_length=1)(inp)
    return Flatten()(x)

In [64]:
user_in, u = embedding_input('user_in', n_users, n_factors, 1e-4)
movie_in, m = embedding_input('movie_in', n_movies, n_factors, 1e-4)

ub = create_bias(user_in, n_users)
mb = create_bias(movie_in, n_movies)

In [65]:
x = merge([u, m], mode='dot')
x = Flatten()(x)
x = merge([x, ub], mode='sum')
x = merge([x, mb], mode='sum')
model = Model([user_in, movie_in], x)
model.compile(Adam(0.001), loss='mse')

In [66]:
model.fit([trn.userId, trn.movieId], trn.rating, batch_size=64, nb_epoch=1, 
          validation_data=([val.userId, val.movieId], val.rating))

Train on 80084 samples, validate on 19920 samples
Epoch 1/1


<keras.callbacks.History at 0x247e9bcb320>

In [72]:
model.optimizer.lr=0.02

In [74]:
model.fit([trn.userId, trn.movieId], trn.rating, batch_size=64, nb_epoch=6, 
          validation_data=([val.userId, val.movieId], val.rating))

Train on 80084 samples, validate on 19920 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x247eaefe8d0>

In [75]:
model.optimizer.lr=0.001

In [76]:
model.fit([trn.userId, trn.movieId], trn.rating, batch_size=64, nb_epoch=6, 
          validation_data=([val.userId, val.movieId], val.rating))

Train on 80084 samples, validate on 19920 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x247eaefe860>

In [77]:
model.save_weights("D:\\jtownend\\fast.ai\\data\\movielens\\movielens-bias.h5")

In [None]:
model.load_weights("D:\\jtownend\\fast.ai\\data\\movielens\\movielens-bias.h5")

In [78]:
model.predict([np.array([3]), np.array([6])])

array([[ 4.78690624]], dtype=float32)

## Analyze results

In [79]:
#get top 2000 most popular movies
g=ratings.groupby('movieId')['rating'].count()
topMovies=g.sort_values(ascending=False)[:2000]
topMovies = np.array(topMovies.index)

In [93]:
#set up another "model" with input of a movie and output of the movie bias (not sure how this works! where are the biases from?)
get_movie_bias = Model(movie_in, mb)
movie_bias = get_movie_bias.predict(topMovies)
movie_ratings = [(b[0], movie_names[movies[i]]) for i,b in zip(topMovies,movie_bias)]

In [95]:
movie_ratings

[(1.4195137, 'Forrest Gump (1994)'),
 (1.4857801, 'Pulp Fiction (1994)'),
 (1.8113035, 'Shawshank Redemption, The (1994)'),
 (1.5332462, 'Silence of the Lambs, The (1991)'),
 (1.4902164, 'Star Wars: Episode IV - A New Hope (1977)'),
 (1.1075996, 'Jurassic Park (1993)'),
 (1.4778115, 'Matrix, The (1999)'),
 (1.2142898, 'Toy Story (1995)'),
 (1.6278839, "Schindler's List (1993)"),
 (1.2836015, 'Terminator 2: Judgment Day (1991)'),
 (1.4696461, 'Star Wars: Episode V - The Empire Strikes Back (1980)'),
 (1.3274693, 'Braveheart (1995)'),
 (1.307214, 'Back to the Future (1985)'),
 (1.5374744, 'Fargo (1996)'),
 (1.4403865,
  'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)'),
 (1.4808617, 'American Beauty (1999)'),
 (0.86855781, 'Independence Day (a.k.a. ID4) (1996)'),
 (1.2844898, 'Star Wars: Episode VI - Return of the Jedi (1983)'),
 (0.97603464, 'Aladdin (1992)'),
 (1.3020918, 'Fugitive, The (1993)'),
 (1.4781756, 'Fight Club (1999)'),
 (1.061716, 'Dances wit

In [100]:
#same thing but for the embeddings
get_movie_emb = Model(movie_in, m)
movie_emb = np.squeeze(get_movie_emb.predict([topMovies]))
movie_emb.shape

(2000, 50)

In [101]:
#reduce dimensionality with PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
movie_pca = pca.fit(movie_emb.T).components_

In [110]:
#the first component seems to be "lighthearted"
fac0 = movie_pca[0]
movie_comp = [(f, movie_names[movies[i]]) for f,i in zip(fac0, topMovies)]
sorted(movie_comp, reverse=True)[:10]

[(0.11966001, 'Independence Day (a.k.a. ID4) (1996)'),
 (0.10379191, 'Armageddon (1998)'),
 (0.068023458, 'Shrek (2001)'),
 (0.06713897, 'Stargate (1994)'),
 (0.065725878, 'Star Wars: Episode I - The Phantom Menace (1999)'),
 (0.065280564, 'Pearl Harbor (2001)'),
 (0.063906938, 'Pay It Forward (2000)'),
 (0.061858326, 'Con Air (1997)'),
 (0.061536286, 'Ace Ventura: Pet Detective (1994)'),
 (0.060085941, 'Three Musketeers, The (1993)')]

In [111]:
sorted(movie_comp, reverse=True)[-10:]

[(-0.074374177, 'Leaving Las Vegas (1995)'),
 (-0.075018689, 'Godfather, The (1972)'),
 (-0.075442955, 'Sideways (2004)'),
 (-0.077937499, 'Blade Runner (1982)'),
 (-0.083713144, '2001: A Space Odyssey (1968)'),
 (-0.089220598, 'Fargo (1996)'),
 (-0.0892848, 'Big Lebowski, The (1998)'),
 (-0.090143129, 'Clockwork Orange, A (1971)'),
 (-0.093870796, 'Taxi Driver (1976)'),
 (-0.11106043, 'Pulp Fiction (1994)')]

In [112]:
#the second component seems to be "serious epic"
fac1 = movie_pca[1]
movie_comp = [(f, movie_names[movies[i]]) for f,i in zip(fac1, topMovies)]
sorted(movie_comp, reverse=True)[:10]

[(0.16545823, 'Lord of the Rings: The Two Towers, The (2002)'),
 (0.15565476, 'Lord of the Rings: The Fellowship of the Ring, The (2001)'),
 (0.13412994, 'Lord of the Rings: The Return of the King, The (2003)'),
 (0.13293108, 'Matrix, The (1999)'),
 (0.12311188, 'Jurassic Park (1993)'),
 (0.11891862,
  'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)'),
 (0.11722182, 'Gladiator (2000)'),
 (0.11457396, 'Star Wars: Episode V - The Empire Strikes Back (1980)'),
 (0.11172211, 'Star Wars: Episode VI - Return of the Jedi (1983)'),
 (0.1085334, 'Dances with Wolves (1990)')]

In [113]:
sorted(movie_comp, reverse=True)[-10:]

[(-0.043559909, 'Hoop Dreams (1994)'),
 (-0.044469871, 'Judge Dredd (1995)'),
 (-0.044979084, 'Four Weddings and a Funeral (1994)'),
 (-0.045300372, 'Ransom (1996)'),
 (-0.045804527, 'Alien: Resurrection (1997)'),
 (-0.047873728, 'Lives of Others, The (Das leben der Anderen) (2006)'),
 (-0.050127681, 'Room with a View, A (1986)'),
 (-0.051874865, 'Island of Dr. Moreau, The (1996)'),
 (-0.054059375, 'Annie Hall (1977)'),
 (-0.054278318, 'Little Miss Sunshine (2006)')]

In [114]:
#the third component seems to be "violent/adult"
fac2 = movie_pca[2]
movie_comp = [(f, movie_names[movies[i]]) for f,i in zip(fac2, topMovies)]
sorted(movie_comp, reverse=True)[:10]

[(0.099625774, 'Seven (a.k.a. Se7en) (1995)'),
 (0.097437553, 'Natural Born Killers (1994)'),
 (0.089570835, 'Stargate (1994)'),
 (0.081196181, 'Happy Gilmore (1996)'),
 (0.076776825, 'American Psycho (2000)'),
 (0.076167397, 'Dogma (1999)'),
 (0.075688913, 'Die Hard: With a Vengeance (1995)'),
 (0.075214759, 'Eyes Wide Shut (1999)'),
 (0.072266996, 'Armageddon (1998)'),
 (0.070500925,
  'Léon: The Professional (a.k.a. The Professional) (Léon) (1994)')]

In [115]:
sorted(movie_comp, reverse=True)[-10:]

[(-0.072909266, "Bug's Life, A (1998)"),
 (-0.075818278, 'Aladdin (1992)'),
 (-0.076405525, 'Grease (1978)'),
 (-0.076853089, 'Chicken Run (2000)'),
 (-0.081125103, 'Dances with Wolves (1990)'),
 (-0.082340948, 'Apollo 13 (1995)'),
 (-0.083156765, 'Sound of Music, The (1965)'),
 (-0.096660517, 'Babe (1995)'),
 (-0.098518029, 'Sling Blade (1996)'),
 (-0.10521671, 'Beauty and the Beast (1991)')]

## Nueral net

In [118]:
#often easier and more accurate to use a NN instead of special built architecture (like a dot product)
user_in, u = embedding_input('user_in', n_users, n_factors, 1e-4)
movie_in, m = embedding_input('movie_in', n_movies, n_factors, 1e-4)

x = merge([u, m], mode='concat')
x = Flatten()(x)
x = Dropout(0.3)(x)
x = Dense(70, activation='relu')(x)
x = Dropout(0.75)(x)
x = Dense(1)(x)
nn = Model([user_in, movie_in], x)
nn.compile(Adam(0.001), loss='mse')

In [119]:
nn.fit([trn.userId, trn.movieId], trn.rating, batch_size=64, nb_epoch=8, 
          validation_data=([val.userId, val.movieId], val.rating))

Train on 80084 samples, validate on 19920 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x247ed0bd518>