In [None]:
#CODE TO SETUP WANDB
!pip install wandb
 
import wandb
from wandb.keras import WandbCallback

wandb.login()

In [None]:
%matplotlib inline
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing

import matplotlib.image as mpimg
from skimage.io import imread, imshow
from skimage.color import rgb2gray
from skimage import data, color, io, filters, morphology,transform, exposure, feature, util
from scipy import ndimage


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Input, Dense, Dropout, Flatten, BatchNormalization,concatenate, InputLayer
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPooling2D, UpSampling2D, Conv2DTranspose, Reshape
from tensorflow.keras import backend as K
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import image
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping

import random
import cv2
from PIL import Image

pd.options.mode.chained_assignment = None

### Loading the dataset.

The file **ratings.dat** consists of:

- Column 0: ID of the reviewer.
- Column 1: ID of the movie.
- Column 2: Movie rating (from 1 to 5).
- Column 3: Timestamp of the review.

Each reviewer reviewed at least 20 movies.
One row in the dataset stands for exactly one review.


In [None]:
# ratings.dat has no header row and uses '::' between fields, so the columns
# come back labelled 0..3 and the python parser engine is requested explicitly.
dataset = pd.read_csv(
    'ratings.dat',
    sep='::',
    header=None,
    engine='python',
)

# Peek at the first few raw reviews.
dataset.head()

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [None]:
# Reshape the long review list into a reviewer x movie rating matrix:
# one row per reviewer, one column per movie ID, cell value = score.
# Movies a reviewer did not rate become NaN in the pivot and are filled with 0.
dataset.columns = dataset.columns.map(str)
dataset = (
    dataset
    .rename(columns={'0': 'reviewer', '1': 'movie', '2': 'score', '3': 'time'})
    .pivot(index='reviewer', columns="movie", values="score")
    .reset_index()
    .fillna(0)
)

In [None]:
# Show the pivoted matrix: a 'reviewer' column followed by one column per
# movie ID; 0.0 marks a movie the reviewer did not rate.
dataset

movie,reviewer,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,3913,3914,3915,3916,3917,3918,3919,3920,3921,3922,3923,3924,3925,3926,3927,3928,3929,3930,3931,3932,3933,3934,3935,3936,3937,3938,3939,3940,3941,3942,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,0.0,4.0,0.0,3.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,6036,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,3.0,4.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,4.0,3.0,0.0,4.0,3.0,4.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6036,6037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6037,6038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6038,6039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Build the training set and the test set.
#
# For each reviewer, HOLDOUT_PER_REVIEWER of the movies they rated are held out
# for the test set; all their other rated movies go to the training set.
#
# In the training row the held-out ratings are set to zero (i.e. treated as
# "not reviewed"); in the test row every rating except the held-out ones is
# set to zero. Both rows keep the 'reviewer' ID in position 0.

HOLDOUT_PER_REVIEWER = 10   # number of ratings moved to the test set per reviewer
SPLIT_SEED = 42             # fixed seed so the split is reproducible on re-run
rng = np.random.default_rng(SPLIT_SEED)

train = []
test = []
for _, row in dataset.iterrows():
    # Positional indexes of the movies this reviewer rated. Position 0 holds
    # the 'reviewer' ID rather than a rating, so it is skipped explicitly
    # (the +1 converts back to positions within the full row).
    rated_pos = np.flatnonzero(row.values[1:] > 0.0) + 1

    # Raises ValueError if a reviewer has fewer than HOLDOUT_PER_REVIEWER ratings.
    test_inds = rng.choice(rated_pos, size=HOLDOUT_PER_REVIEWER, replace=False)
    train_inds = np.setdiff1d(rated_pos, test_inds)

    # Work on explicit copies so neither `dataset` nor the sibling row is
    # modified by the masking below.
    train_row = row.copy()
    train_row.iloc[test_inds] = 0
    train.append(train_row)

    test_row = row.copy()
    test_row.iloc[train_inds] = 0
    test.append(test_row)

In [None]:
# Stack the per-reviewer training rows back into a frame with the same
# columns as `dataset`.
train_df = pd.DataFrame(train, columns=dataset.columns)

In [None]:
# Stack the per-reviewer test rows back into a frame with the same
# columns as `dataset`.
test_df = pd.DataFrame(test, columns=dataset.columns)

In [None]:
# The reviewer ID is an identifier, not a rating: remove it so it is not fed
# to the model as if it were a feature.
train_df = train_df.drop(columns=['reviewer'])
test_df = test_df.drop(columns=['reviewer'])

In [None]:
# Normalize the data: cast to float32 and divide by the maximum rating (5) so
# ratings land in [0, 1]; unrated entries stay at 0.
# NOTE: this cell rebinds train_df/test_df, so running it twice divides twice.
train_df = train_df.astype('float32').div(5)
test_df = test_df.astype('float32').div(5)

In [None]:
# Custom loss function that masks unrated movies
def custom_loss(y_true, y_pred):
    """Squared error averaged over the last axis, ignoring unrated entries.

    The mask is built from ``y_true`` itself: clipping to [0, 0.01] and
    multiplying by 100 gives 0 where ``y_true`` is 0 and 1 where ``y_true``
    is at least 0.01 (values in between would get a fractional weight).
    The mean is taken over all positions of the last axis, masked ones
    included.
    """
    rated_mask = 100 * K.clip(y_true, 0, 0.01)
    masked_error = rated_mask * (y_pred - y_true)
    return K.mean(K.square(masked_error), axis=-1)

In [None]:
# Building the autoencoder; this is an undercomplete autoencoder
def train():
    """Build, compile and fit the rating autoencoder on ``train_df``.

    ``train_df`` (read from the notebook's global scope) is used both as the
    input and as the reconstruction target. The network narrows to a
    120-unit bottleneck and ends in a sigmoid layer with one unit per movie
    column. It is compiled with Adam (learning rate 0.01) and ``custom_loss``.

    Note: defining this function rebinds the global name ``train``, which the
    train/test split cell uses for its list of training rows.

    Returns
    -------
    tuple
        ``(model, history)`` - the fitted Keras model and the object returned
        by ``model.fit``.
    """
    # Layer widths must follow the number of features (movie columns), not
    # the number of samples: shape[1], not shape[0].
    n_movies = train_df.shape[1]

    adam = tf.keras.optimizers.Adam(0.01)
    # NOTE(review): early stopping watches the training loss although a
    # validation split is configured below - confirm 'loss' (rather than
    # 'val_loss') is intended.
    es = EarlyStopping(monitor='loss', mode='min', verbose=1, patience=20)

    # model
    model = Sequential()
    model.add(Dense(n_movies, activation='relu'))
    model.add(Dense(1000, activation='relu'))
    model.add(Dense(120, activation='relu'))     # bottleneck
    model.add(Dense(1000, activation='relu'))
    model.add(Dense(n_movies, activation='sigmoid'))

    model.compile(optimizer=adam, loss=custom_loss)

    history = model.fit(train_df, train_df,
                        epochs=100,
                        batch_size=64,
                        validation_split=0.2,
                        shuffle=True, callbacks=[es])
    return model, history

In [None]:
# Fit the autoencoder; keeps the fitted model and the value returned by model.fit.
model, history=train()

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

### Making recommendations.

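Movies used as the basis for the recommendations. Each entry is `ID::Title::Genres`, and the number after the final colon is the rating (1 to 5) given to that movie: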

- 47::Seven (Se7en) (1995)::Crime|Thriller : 5
- 70::From Dusk Till Dawn (1996)::Action|Comedy|Crime|Horror|Thriller : 5
- 145::Bad Boys (1995)::Action: 4
- 165::Die Hard: With a Vengeance (1995)::Action|Thriller: 4
- 193::Showgirls (1995)::Drama: 2
- 318::Shawshank Redemption, The (1994)::Drama: 4
- 480::Jurassic Park (1993)::Action|Adventure|Sci-Fi: 3
- 488::M. Butterfly (1993)::Drama: 2
- 527::Schindler's List (1993)::Drama|War: 4
- 539::Sleepless in Seattle (1993)::Comedy|Romance: 3
- 540::Sliver (1993)::Thriller: 2
- 597::Pretty Woman (1990)::Comedy|Romance: 3
- 608::Fargo (1996)::Crime|Drama|Thriller: 5 
- 648::Mission: Impossible (1996)::Action|Adventure|Mystery: 3
- 778::Trainspotting (1996)::Drama : 4
- 858::Godfather, The (1972)::Action|Crime|Drama: 4
- 1407::Scream (1996)::Horror|Thriller: 1
- 2324::Life Is Beautiful (La Vita è bella) (1997)::Comedy|Drama: 5
- 2346::Stepford Wives, The (1975)::Sci-Fi|Thriller: 1
- 3827::Space Cowboys (2000)::Action|Sci-Fi: 1 


In [None]:
# One-row rating profile with the same movie columns as the training data,
# initialised to 0 (= not rated). Built directly because DataFrame.append
# is no longer available in pandas 2.x.
recomm_df = pd.DataFrame(0, index=[0], columns=train_df.columns)

In [None]:
# Show the (still all-zero) one-row profile.
recomm_df

movie,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,3913,3914,3915,3916,3917,3918,3919,3920,3921,3922,3923,3924,3925,3926,3927,3928,3929,3930,3931,3932,3933,3934,3935,3936,3937,3938,3939,3940,3941,3942,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Fill in the ratings of the profile the recommendations are based on.
# Keys are the rating given, values are the IDs of the movies that got it.
my_ratings = {
    1: [1407, 2346, 3827],
    2: [193, 488, 540],
    3: [480, 539, 597, 648],
    4: [145, 165, 318, 527, 778, 858],
    5: [47, 70, 608, 2324],
}
for score, movie_ids in my_ratings.items():
    # Fail loudly on an unknown movie ID instead of risking a new column
    # being added to the profile.
    unknown = [m for m in movie_ids if m not in recomm_df.columns]
    if unknown:
        raise KeyError(f"movie IDs not present in the training data: {unknown}")
    # Single .loc call with (row, columns): writes into recomm_df itself.
    # The chained form recomm_df.loc[0].loc[...] = v assigns to a temporary
    # Series and may leave recomm_df unchanged.
    recomm_df.loc[0, movie_ids] = score

In [None]:
# Apply the same scaling as the training set: float32, ratings divided by 5.
recomm_df = recomm_df.astype('float32').div(5)

In [None]:
# Run the one-row profile through the trained autoencoder.
pred = model.predict(recomm_df)

In [None]:
# Load the movie catalogue so that movie IDs can be shown with title and genres.
# movies.dat has no header row; the positional column labels are turned into
# strings and then given readable names.
movies_df = (
    pd.read_csv('movies.dat', sep='::', header=None, engine='python')
    .rename(columns=str)
    .rename(columns={'0': 'ID', '1': 'movie', '2': 'genres'})
)

In [None]:
# 10 recommended movies
# `top_10_idx` was used here without being defined in any cell, so it is
# derived from the prediction: pair each predicted score with its movie ID
# (pred has a single row, for the single profile in recomm_df) ...
predicted_scores = pd.Series(pred[0], index=recomm_df.columns)
# ... skip the movies the profile has already rated ...
unseen_scores = predicted_scores[recomm_df.iloc[0] == 0]
# ... and keep the IDs of the 10 highest remaining scores.
top_10_idx = unseen_scores.nlargest(10).index

# Rows come back in movies_df order, not ranked by predicted score.
movies_df.loc[movies_df['ID'].isin(top_10_idx)]

Unnamed: 0,ID,movie,genres
48,49,When Night Is Falling (1995),Drama|Romance
306,309,"Red Firecracker, Green Firecracker (1994)",Drama
509,513,Radioland Murders (1994),Comedy|Mystery|Romance
699,708,"Truth About Cats & Dogs, The (1996)",Comedy|Romance
734,744,Brothers in Trouble (1995),Drama
792,802,Phenomenon (1996),Drama|Romance
850,861,Supercop (1992),Action|Thriller
1770,1839,My Giant (1998),Comedy
2629,2698,Zone 39 (1997),Sci-Fi
3042,3111,Places in the Heart (1984),Drama
