In [1]:
from google.colab import drive
# Mount Google Drive (Colab only); the data files read below live under /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install lightfm



In [3]:
import tensorflow as tf
from tensorflow import keras 


import numpy as np
import pandas as pd
from lightfm import LightFM
from scipy.sparse import coo_matrix, csr_matrix, identity, hstack
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default plot theme to all matplotlib figures.
sns.set_theme()
# Fix the TensorFlow and NumPy global seeds so weight initialisation and shuffling are repeatable.
tf.random.set_seed(12)
np.random.seed(12)

In [4]:
# Let TensorFlow allocate GPU memory on demand instead of reserving it all up front.
# Memory growth must be configured identically on every visible GPU, so set it on each
# device rather than only the first one; with no GPU the loop is simply a no-op.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

### Preprocessing

In [5]:
# Text embeddings of the movie overviews, one row per movie.
overview_embeddings = np.loadtxt(
    '/content/drive/My Drive/Colab Notebooks/Course Work/overview_embeddings.csv',
    delimiter=',',
)
# Movie ids stored alongside the embeddings, read as 32-bit integers.
movies_with_descriprions_ids = np.loadtxt(
    '/content/drive/My Drive/Colab Notebooks/Course Work/movies_with_descriprions_ids.csv',
    delimiter=',',
    dtype=np.int32,
)

In [6]:
# Deduplicate the movie ids: np.unique returns them sorted, and `ids` holds the position of
# the first occurrence of each id in the original array.
movies_with_descriprions_ids, ids = np.unique(movies_with_descriprions_ids, return_index=True)

In [7]:
# Keep the embedding rows at the first-occurrence positions so they line up with the
# deduplicated ids. NOTE: not idempotent - re-running this cell alone re-indexes the
# already filtered array.
overview_embeddings = overview_embeddings[ids]

In [8]:
overview_embeddings.shape, movies_with_descriprions_ids.shape

((44478, 512), (44478,))

In [9]:
# Load the ratings table (userId, movieId, rating, timestamp) straight from the zipped CSV.
rating_frame = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Course Work/ratings.csv.zip', compression='zip')

In [10]:
rating_frame.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [11]:
rating_frame.shape

(26024289, 4)

In [12]:
# Keep only ratings of movies that have a description embedding; the timestamp is not used.
rating_frame_cut = (
    rating_frame
    .loc[rating_frame['movieId'].isin(movies_with_descriprions_ids)]
    .drop(columns=['timestamp'])
)

In [13]:
def encode_binary(x):
    """Map a rating to a binary label: 1 when the rating is 4 or higher, otherwise 0."""
    liked = x >= 4
    return int(liked)

# Binarise the ratings with the same rule as encode_binary (>= 4 -> 1, else 0), but as a
# single vectorised comparison instead of a Python-level call per row.
# NOTE: this overwrites the column in place, so re-running the cell maps every label to 0.
rating_frame_cut['rating'] = (rating_frame_cut['rating'] >= 4).astype('int64')

In [14]:
rating_frame_cut[rating_frame_cut.userId < 200_000].shape

(8433072, 3)

In [15]:
# Number of rated movies per user, most active users first.
users_watches_df = (
    rating_frame_cut
    .drop(columns=['rating'])
    .groupby('userId')
    .count()
    .rename(columns={'movieId': 'movies_watched'})
    .sort_values(by='movies_watched', ascending=False)
    .reset_index()
)

In [16]:
min_watched = 5

# Ids of users with at least `min_watched` rated movies, in ascending id order.
popular_users_ids = list(
    users_watches_df
    .loc[users_watches_df['movies_watched'] >= min_watched, 'userId']
    .sort_values()
)

In [17]:
len(popular_users_ids)

226831

In [18]:
rating_frame_cut.head()

Unnamed: 0,userId,movieId,rating
0,1,110,0
1,1,147,1
2,1,858,1
4,1,1246,1
5,1,1968,1


In [19]:
rating_frame_cut.shape

(11395911, 3)

#### Map Film Ids

In [20]:
# Number of ratings per movie, most rated movies first.
count_watchs_df = (
    rating_frame_cut
    .drop(columns=['rating'])
    .groupby('movieId')
    .count()
    .rename(columns={'userId': 'count_watches'})
    .sort_values(by='count_watches', ascending=False)
    .reset_index()
)

In [21]:
count_watchs_df.shape

(7437, 2)

In [22]:
min_watches = 100

# Ids of movies with at least `min_watches` ratings, in ascending id order.
popular_movies_ids = list(
    count_watchs_df
    .loc[count_watchs_df['count_watches'] >= min_watches, 'movieId']
    .sort_values()
)

In [23]:
rating_frame_cut[rating_frame_cut.movieId.isin(popular_movies_ids)].shape

(11305102, 3)

In [24]:
popular_movies_count = len(popular_movies_ids)
popular_movies_count

3047

In [25]:
# Keep only ratings of popular movies. The mask is built from rating_frame_cut itself so
# its index matches the frame being filtered; a mask taken from the larger rating_frame
# has to be reindexed by pandas and raises a "Boolean Series key will be reindexed" warning.
rating_frame_popular = rating_frame_cut[rating_frame_cut.movieId.isin(popular_movies_ids)]

  """Entry point for launching an IPython kernel.


### NN to predict embeddings for rare movies

In [26]:
# Select the embedding rows of the popular movies; rows keep the order of
# movies_with_descriprions_ids.
overview_embeddings_cut = overview_embeddings[np.isin(movies_with_descriprions_ids, popular_movies_ids)]
overview_embeddings_cut.shape

(3047, 512)

In [27]:
# Index the selected embeddings by movie id. The rows line up with popular_movies_ids only
# because both sequences are in ascending id order (np.unique sorts the description ids and
# popular_movies_ids is sorted by movieId) and every popular id has a description embedding.
popular_movies_df = pd.DataFrame(data=overview_embeddings_cut, index=popular_movies_ids)
popular_movies_df.shape 

(3047, 512)

In [28]:
popular_movies_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511
2,-0.063632,-0.021689,-0.001672,-0.056707,0.023280,-0.028782,0.079365,0.014391,0.033346,-0.047571,-0.048074,0.039464,-0.022523,0.060883,-0.009820,-0.048013,0.123393,-0.038303,-0.049405,0.053534,-0.034999,-0.040403,-0.022911,-0.032337,0.024984,0.086244,-0.042168,0.059111,0.013760,0.040173,-0.105198,-0.030170,0.073989,0.075087,0.037670,0.028474,0.046803,0.091315,-0.099410,-0.029462,...,0.002564,-0.016918,-0.087870,-0.004313,0.015471,-0.099055,-0.083354,0.015796,0.027837,0.054338,-0.073930,-0.026379,-0.033141,-0.075928,0.051716,-0.034912,-0.083465,-0.037052,0.067410,0.002467,-0.016237,0.002955,-0.007692,0.055967,-0.063561,-0.061162,-0.051129,0.037840,-0.021997,0.070783,0.005627,-0.040736,0.045244,-0.033222,-0.066562,0.016495,-0.018865,0.044875,0.009732,-0.011618
3,-0.052403,0.048977,0.073582,-0.038233,0.079737,0.028199,-0.008878,-0.034516,0.078757,-0.020848,-0.062129,0.007152,-0.026222,0.070052,0.038264,-0.047463,0.149617,-0.003784,-0.006646,0.096226,-0.042537,-0.003140,0.027999,0.080535,0.070568,0.005424,-0.044746,-0.020116,-0.011342,0.044311,-0.157764,0.067190,0.015806,0.054057,0.010995,-0.004820,0.009314,0.025173,-0.052741,-0.027638,...,0.019108,0.020216,0.036468,0.003494,-0.061052,-0.117819,-0.075828,-0.005846,0.059614,0.048441,-0.029708,0.001129,0.064793,-0.046923,0.029877,-0.043802,-0.084264,-0.068776,0.014481,0.007714,0.032785,-0.061673,-0.029677,0.076203,-0.027136,-0.097708,-0.018777,0.005804,-0.011055,0.050350,-0.065242,-0.032147,0.013144,-0.024119,-0.005807,-0.037398,-0.009252,0.045188,0.013856,-0.015382
5,0.004734,0.052584,-0.052721,-0.031953,0.117899,0.038494,-0.057737,0.013634,-0.060796,0.018252,-0.015995,0.061810,-0.046597,0.067901,0.049254,0.019261,0.131938,0.020937,-0.027062,0.063947,-0.041787,0.025571,0.071683,-0.050329,0.075886,0.093710,0.007726,-0.054145,-0.016001,-0.003046,-0.097116,0.007729,-0.025882,0.024662,0.002176,0.017157,0.014345,0.003879,-0.031765,-0.003057,...,0.001340,-0.065171,0.046681,-0.015996,-0.001690,-0.084468,-0.057414,-0.009645,-0.006576,0.062917,-0.019382,0.005151,0.006326,-0.018794,0.051200,0.023577,0.028245,-0.018197,0.004145,0.028543,0.016395,-0.068586,0.003437,0.089179,-0.035923,-0.120468,0.015506,-0.031504,0.000394,0.007437,-0.015473,0.063158,0.043150,-0.023848,-0.003147,0.039977,-0.003568,0.054965,0.037221,-0.042485
6,0.005913,0.035986,0.008065,-0.014503,0.028662,0.101780,0.016314,0.033582,-0.020211,0.029238,-0.046743,0.084002,-0.014333,0.024695,0.044216,-0.040117,0.115840,-0.036511,0.005959,0.002694,-0.030620,0.045682,-0.063498,-0.078795,-0.006398,0.006857,0.005875,0.046946,-0.017120,-0.003248,-0.127906,-0.007111,-0.050460,-0.076789,-0.053699,-0.035127,-0.015497,0.034961,0.014101,0.036683,...,0.010717,0.025140,-0.024463,0.000428,-0.032382,-0.102055,-0.063975,0.020699,-0.046925,0.009384,-0.102195,0.004018,-0.017689,0.003124,0.060046,-0.029222,-0.002081,0.000775,-0.007453,0.030261,-0.039343,-0.047788,0.007379,0.090456,0.007967,-0.075084,-0.051626,0.003171,-0.006301,0.075703,-0.046565,-0.019694,0.003513,-0.020538,-0.056601,-0.077374,0.011168,0.001649,-0.001520,-0.067955
11,-0.067423,0.035942,0.051897,-0.025577,0.060747,-0.097099,0.029158,0.042724,0.080296,-0.037233,0.008157,-0.008215,-0.030769,0.062885,0.055624,-0.008581,0.142265,-0.018248,0.008496,0.054594,-0.042948,-0.051012,0.017305,0.021228,0.095397,0.008064,0.012265,0.097218,0.013129,0.011034,0.037482,-0.000963,0.013578,-0.051612,-0.015247,0.030348,0.118344,0.045575,-0.086810,-0.010888,...,0.029151,0.053464,-0.041842,0.070768,-0.035127,-0.077306,-0.083497,0.007017,-0.039241,0.024325,-0.091565,-0.037558,-0.001871,0.083271,0.037757,-0.084064,-0.010256,0.043028,-0.058954,-0.047549,0.027913,0.058033,-0.047417,0.001452,-0.008247,-0.021142,0.046553,0.048840,0.054827,0.069924,-0.065519,0.002277,-0.002972,-0.006613,-0.004477,-0.041207,0.003670,0.051337,-0.003209,0.017595
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163937,0.009728,0.022843,0.040420,-0.055978,0.092938,-0.004441,-0.068977,-0.005165,0.036155,-0.016967,-0.027344,-0.011312,-0.045359,0.021757,0.012788,-0.018282,0.163860,-0.010107,-0.038649,0.066767,-0.031565,-0.107054,0.054912,-0.031669,-0.026909,0.084513,-0.076899,-0.030658,-0.043234,-0.001018,-0.135971,0.046454,-0.063856,0.097778,-0.076993,-0.057303,-0.000332,-0.003401,-0.032345,0.058890,...,-0.016122,-0.021533,0.080262,-0.003283,-0.012840,-0.100431,-0.071441,0.023816,0.058872,0.028622,0.007676,0.039511,0.010540,-0.044541,0.010997,-0.070534,-0.043114,-0.044307,0.013761,-0.036215,0.004312,-0.016155,-0.036134,0.104068,-0.031398,-0.087163,-0.041278,-0.017524,0.051168,-0.025539,-0.061431,0.011782,-0.000518,-0.032259,-0.092533,0.006351,0.035024,-0.004971,0.010187,-0.035458
166643,-0.021708,0.007756,0.016179,0.033954,0.075572,-0.121282,-0.036368,-0.057719,-0.054387,0.046796,-0.030757,0.019260,-0.088566,0.069082,0.071398,-0.011714,0.052791,0.011678,-0.051900,0.015414,0.008993,-0.087251,0.011464,0.071990,-0.039721,-0.005445,0.029939,0.054132,-0.059611,-0.023549,-0.106763,0.010794,-0.007392,0.012250,-0.067783,0.016953,-0.009126,-0.065409,0.010359,0.012145,...,0.010465,0.017086,0.026838,0.053806,-0.031488,0.031671,-0.069945,-0.010258,-0.024474,0.042919,0.044448,0.022135,-0.049923,-0.009423,0.011740,-0.047495,0.037320,0.038478,-0.002560,0.012583,-0.031313,0.059100,0.031675,0.008647,-0.036069,-0.056817,-0.095877,0.020688,0.029545,0.068882,-0.046878,-0.040988,0.048547,0.020358,-0.072326,-0.056472,-0.009098,0.036748,0.017291,-0.004366
167738,-0.032329,0.029226,0.036259,-0.033300,-0.023697,0.023915,0.014966,0.015199,0.007761,-0.012645,0.025234,-0.045038,-0.014592,0.030158,0.010082,0.008639,0.081441,-0.047034,-0.014088,0.091253,-0.038807,-0.009682,-0.000167,0.054354,0.029982,0.008566,-0.061552,-0.057223,-0.004625,0.010995,-0.131024,-0.020143,-0.053044,0.027755,-0.029745,-0.031050,-0.004434,-0.019159,-0.035222,0.041916,...,-0.032118,-0.020527,0.063227,0.028228,-0.012385,-0.024522,-0.074301,0.016974,-0.006905,0.076404,-0.053561,-0.012629,0.060770,-0.058154,-0.009555,0.022777,0.001614,0.014908,-0.005218,-0.006930,0.029061,0.039158,-0.012188,-0.011179,0.011874,0.070760,-0.044680,0.004387,0.060342,0.009355,-0.035206,-0.003299,-0.009440,0.003033,-0.035812,0.087556,-0.032788,0.081692,-0.085572,0.016406
168712,0.035538,0.017640,0.016940,-0.001332,0.027325,-0.041204,0.049524,0.022233,-0.042354,0.017703,0.029351,0.046796,-0.015422,0.047791,0.033364,0.084864,0.015988,-0.056436,-0.036689,-0.003970,-0.006908,-0.022019,-0.055093,0.019031,0.016638,0.051863,-0.006982,-0.042420,0.021382,-0.044136,-0.079877,-0.015535,0.030862,-0.004397,0.055675,0.012752,-0.035660,-0.027057,-0.001277,-0.115704,...,-0.025161,-0.015355,0.002818,0.036008,-0.002646,-0.075562,-0.069607,0.034122,-0.043531,-0.011007,-0.042169,-0.075574,0.012369,-0.045008,0.019756,-0.008057,-0.051215,0.043275,0.025248,0.009451,-0.032837,-0.032927,-0.006162,-0.027089,0.025353,-0.038125,0.007189,0.013676,0.034563,0.007220,-0.014726,0.003279,-0.046556,0.009737,-0.152109,-0.010054,-0.038873,0.020512,-0.007220,0.021138


In [29]:
rating_frame_popular.shape

(11305102, 3)

In [30]:
from sklearn.model_selection import train_test_split


# Hold out 40% of the popular-movie ratings as the test set; the split is seeded.
rating_frame_popular_train_main, rating_frame_popular_test = train_test_split(
    rating_frame_popular,
    test_size=.4,
    random_state=125,
    shuffle=True,
)

In [31]:
class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that builds (features, labels) batches from a ratings frame.

    Each batch takes a slice of rows from ``df``, joins the per-movie embedding
    row from ``embeddings_df`` on ``movieId`` and returns every column except
    ``rating`` as the features and ``rating`` as the labels.

    Args:
        df: Ratings frame with at least ``movieId`` and ``rating`` columns.
            The feature column order follows the frame's column order.
        embeddings_df: Frame indexed by movie id holding the embedding columns.
        batch_size: Number of rating rows per batch.
        shuffle: Whether to reshuffle the row order at the start and after every epoch.
    """

    def __init__(self, df, embeddings_df, batch_size=1_000, shuffle=True):
        """Store the data and prepare the first epoch's row order."""
        # Newer Keras versions expect Sequence subclasses to call the base initialiser.
        super().__init__()
        self.batch_size = batch_size
        self.df = df
        self.embeddings_df = embeddings_df

        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch, including a final partial batch."""
        # Rounding up keeps the trailing rows in the epoch and avoids a zero-length
        # sequence when the frame has fewer rows than batch_size.
        return int(np.ceil(self.df.shape[0] / self.batch_size))

    def __getitem__(self, index):
        """Return batch number ``index`` as an ``(X, y)`` pair of tensors."""
        start = index * self.batch_size
        batch_positions = self.indexes[start:start + self.batch_size]
        return self.__data_generation(batch_positions)

    def on_epoch_end(self):
        """Reset the row order and reshuffle it when shuffling is enabled."""
        self.indexes = np.arange(self.df.shape[0])
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """Build the feature and label tensors for the given positional row indices."""
        cur_df = self.df.iloc[list_IDs_temp]

        # Inner join: rows whose movieId has no embedding are dropped from the batch.
        cur_df = cur_df.join(self.embeddings_df, on='movieId', how='inner')
        X = tf.convert_to_tensor(cur_df.drop(columns=['rating']))
        y = tf.convert_to_tensor(cur_df['rating'])
        return X, y
            

In [32]:
from tensorflow.keras import layers


class TextEmbeddingsRecommenderNet(keras.Model):
    """Matrix-factorisation recommender with an optional text-embedding tower.

    The score of a (user, movie) pair is
    ``sigmoid(<user_vector, movie_vector> + user_bias + movie_bias)``.
    When ``text_embedding_size > 0`` the movie vector is the learned id embedding
    plus the output of a small dense network applied to the movie's text embedding.

    Input layout (one row per rating): column 0 is the user id, column 1 is the
    movie id, and columns 2 onward are the text embedding (read only when
    ``text_embedding_size > 0``).

    Args:
        num_users: Number of rows of the user embedding tables (max user id + 1).
        num_movies: Number of rows of the movie embedding tables (max movie id + 1).
        embedding_size: Dimension of the user and movie latent vectors.
        text_embedding_size: Width of the text embedding input; 0 disables the tower.
        **kwargs: Forwarded to ``keras.Model``.
    """

    def __init__(self, num_users, num_movies,
                 embedding_size=32, text_embedding_size=0, **kwargs):
        super().__init__(**kwargs)

        self.text_embedding_size = text_embedding_size
        act_func = 'relu'

        # Only build the text tower when it is used; otherwise the model would carry
        # trainable weights that never receive a gradient.
        if text_embedding_size > 0:
            self.qi_model = keras.Sequential([
                keras.Input(shape=(text_embedding_size,)),
                keras.layers.Dense(512, activation=act_func),
                keras.layers.Dropout(.1, seed=12),
                keras.layers.Dense(256, activation=act_func),
                keras.layers.Dropout(.1, seed=12),
                keras.layers.Dense(128, activation=act_func),
                keras.layers.Dense(64, activation=act_func),
                keras.layers.Dense(embedding_size, activation='linear')])
        else:
            self.qi_model = None

        self.user_embedding = layers.Embedding(
            num_users,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.user_bias = layers.Embedding(num_users, 1)

        self.movie_embedding = layers.Embedding(
            num_movies,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.movie_bias = layers.Embedding(num_movies, 1)

    def call(self, inputs):
        """Return the predicted like-probability for every row, shape ``(batch, 1)``."""
        user_vector = self.user_embedding(inputs[:, 0])
        user_bias = self.user_bias(inputs[:, 0])

        movie_vector = self.movie_embedding(inputs[:, 1])
        if self.text_embedding_size > 0:
            movie_vector += self.qi_model(inputs[:, 2:])

        movie_bias = self.movie_bias(inputs[:, 1])

        # Per-sample dot product. tf.tensordot(..., 2) would contract both the batch and
        # the embedding axis and give one scalar shared by every row of the batch.
        dot_user_movie = tf.reduce_sum(user_vector * movie_vector, axis=1, keepdims=True)
        x = dot_user_movie + user_bias + movie_bias
        return tf.nn.sigmoid(x)

In [33]:
from sklearn.model_selection import train_test_split

def run_rec_models_popular(epochs=100, val_freq=10, verbose=0):
    """Train the text-embedding recommender and a plain baseline, then plot both.

    Reads the notebook-level frames ``rating_frame_popular``,
    ``rating_frame_popular_train_main`` and ``popular_movies_df``.

    Args:
        epochs: Maximum number of training epochs for each model.
        val_freq: Validation is run every ``val_freq`` epochs.
        verbose: Verbosity level forwarded to ``Model.fit``.

    Returns:
        None. Prints a progress line per model and draws the loss and
        validation-AUC curves on a two-panel figure.
    """
    # The embedding tables are indexed by raw ids, so they need max(id) + 1 rows.
    num_users = int(rating_frame_popular['userId'].max()) + 1
    num_movies = int(rating_frame_popular['movieId'].max()) + 1

    feature_cols = ['userId', 'movieId']
    label_col = 'rating'

    def compile_model(net):
        """Compile ``net`` with the loss, optimiser and metric shared by both models."""
        net.compile(
            loss=tf.keras.losses.BinaryCrossentropy(),
            optimizer=keras.optimizers.Adam(learning_rate=0.0001),
            # An explicit name keeps the history keys at 'auc' / 'val_auc'; the string
            # shortcut 'AUC' is auto-numbered ('auc_1', ...) for every further model.
            metrics=[keras.metrics.AUC(name='auc')],
        )
        return net

    model = compile_model(
        TextEmbeddingsRecommenderNet(num_users, num_movies, text_embedding_size=512))

    rating_frame_popular_train, rating_frame_popular_val = \
        train_test_split(rating_frame_popular_train_main,
                         test_size=0.33,
                         random_state=42,
                         shuffle=True)

    batch_size = 30_000

    generator_train = DataGenerator(rating_frame_popular_train,
                                    popular_movies_df,
                                    batch_size=batch_size)
    generator_val = DataGenerator(rating_frame_popular_val,
                                  popular_movies_df,
                                  batch_size=batch_size)

    # Monitors the training loss: validation metrics only exist every val_freq epochs.
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)

    hist1 = model.fit(x=generator_train,
                      validation_data=generator_val,
                      validation_freq=val_freq,
                      epochs=epochs,
                      callbacks=[early_stopping],
                      verbose=verbose,
                      workers=32,
                      )

    print(f'First model is ready, {len(hist1.history["loss"])} epochs passed')

    model_base = compile_model(TextEmbeddingsRecommenderNet(num_users, num_movies))

    # NOTE(review): the baseline is trained without early stopping, unlike the first
    # model - confirm this asymmetry is intended.
    hist2 = model_base.fit(
        x=tf.convert_to_tensor(rating_frame_popular_train[feature_cols].to_numpy()),
        y=tf.convert_to_tensor(rating_frame_popular_train[label_col].to_numpy()),
        batch_size=batch_size,
        epochs=epochs,
        verbose=verbose,
        validation_data=(
            tf.convert_to_tensor(rating_frame_popular_val[feature_cols].to_numpy()),
            tf.convert_to_tensor(rating_frame_popular_val[label_col].to_numpy()),
        ),
        validation_freq=val_freq,
        validation_batch_size=batch_size,
        workers=32,
    )

    print(f'Second model is ready, {len(hist2.history["loss"])} epochs passed')

    fig, (ax11, ax12) = plt.subplots(1, 2, figsize=(18, 8))

    def plot_loss_and_metric(hist, label, col):
        """Plot train/validation loss and validation AUC at the validated epochs."""
        val_auc = hist.history.get('val_auc', [])
        if not val_auc:
            # Training stopped before the first validation epoch: nothing to plot.
            print(f'{label}: no validation results recorded, skipping plot')
            return

        # 1-based epoch numbers at which validation ran.
        plotting_range = np.arange(val_freq, val_freq * len(val_auc) + 1, val_freq)

        ax11.plot(plotting_range,
                  hist.history['val_loss'],
                  linestyle='dashed',
                  color=col,
                  label=label + ' validation')

        ax11.plot(plotting_range,
                  np.array(hist.history['loss'])[plotting_range - 1],
                  color=col,
                  label=label + ' train')
        ax11.set_title('Loss')

        ax11.legend()

        ax12.plot(plotting_range,
                  val_auc,
                  color=col,
                  label=label)
        ax12.set_title('Validation AUC ROC')
        ax12.legend()

    plot_loss_and_metric(hist1, 'Text embeddings', 'tab:blue')
    plot_loss_and_metric(hist2, 'Baseline', 'tab:orange')

In [None]:
 run_rec_models_popular(epochs=100, val_freq=10, verbose=1)  # train both models, validating every 10th epoch