In this notebook, we explore how factorization machines can be used for recommender systems and try to improve on the results of the best existing model.
We start with basic setup and data exploration to better understand the dataset before moving on to the analysis.

In [1]:
# necessary imports

import os
import csv
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
from pathlib import Path
from zipfile import ZipFile
from google.colab import files
import matplotlib.pyplot as plt
import tensorflow.keras.backend as K
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

!mkdir model

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# mounting drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# reading in data

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Recomenders/train.csv')
df.head()

Unnamed: 0,user_id,title,movie_id,rating,release_date,sex,age
0,2592,Top Gun (1986),1101,4,Action|Romance,50,M
1,4318,12 Angry Men (1957),1203,4,Drama,25,M
2,2756,Robocop 2 (1990),2986,2,Action|Crime|Sci-Fi,18,M
3,1706,Modern Times (1936),3462,5,Comedy,25,M
4,4813,Milk Money (1994),276,3,Comedy|Romance,35,F


In [None]:
# swap the values between 'sex' and 'age' columns
df['sex'], df['age'] = df['age'], df['sex']

# rename the 'release_date' column to 'genre'
df = df.rename(columns={'release_date': 'genre'})

print(df.head())

   user_id                title  movie_id  rating                genre sex  \
0     2592       Top Gun (1986)      1101       4       Action|Romance   M   
1     4318  12 Angry Men (1957)      1203       4                Drama   M   
2     2756     Robocop 2 (1990)      2986       2  Action|Crime|Sci-Fi   M   
3     1706  Modern Times (1936)      3462       5               Comedy   M   
4     4813    Milk Money (1994)       276       3       Comedy|Romance   F   

   age  
0   50  
1   25  
2   18  
3   25  
4   35  


In [None]:
import os

path = "/content/drive/MyDrive/Colab Notebooks/Recomenders/kaggle_baseline.csv"
print("File exists:", os.path.exists(path))


File exists: True


In [None]:
# reading in sample solution
test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Recomenders/kaggle_baseline.csv')
test.head()

Unnamed: 0,user_id,prediction
0,850,260 1210 480 2028 589 1270 593 1580 110 2396 1...
1,2012,2858 2028 593 608 2762 110 2396 1617 527 2997 ...
2,1511,2858 260 480 589 1270 593 1580 608 2762 110 23...
3,5846,1196 1210 480 2028 589 2571 1270 1580 1198 276...
4,4344,593 2716 1 3578 2987 919 3175 924 1387 2355 10...


In [None]:
# extract the genre column
genres_column = df['genre']

# split the genres for each entry
genre_lists = genres_column.str.split('|')

# flatten the list of genres
genres_flat = [genre for sublist in genre_lists for genre in sublist]

# count the unique genres
unique_genres = set(genres_flat)
num_unique_genres = len(unique_genres)
print("Number of unique genres:", num_unique_genres)

Number of unique genres: 18


In [None]:
# text to sequence function for genre
def text2seq(text, n_genre):
    """ using tokenizer to encoded the multi-level categorical feature
    """
    tokenizer = Tokenizer(lower=True, split='|',filters='', num_words=n_genre)
    tokenizer.fit_on_texts(text)
    seq = tokenizer.texts_to_sequences(text)
    seq = pad_sequences(seq, maxlen=3,padding='post')
    return seq

df['genre'] = text2seq(df.genre.values, n_genre=num_unique_genres).tolist()

In [None]:
# confirming genre transformation
df.head()

Unnamed: 0,user_id,title,movie_id,rating,genre,sex,age
0,2592,Top Gun (1986),1101,4,"[3, 6, 0]",M,50
1,4318,12 Angry Men (1957),1203,4,"[2, 0, 0]",M,25
2,2756,Robocop 2 (1990),2986,2,"[3, 8, 5]",M,18
3,1706,Modern Times (1936),3462,5,"[1, 0, 0]",M,25
4,4813,Milk Money (1994),276,3,"[1, 6, 0]",F,35


In [None]:
# train/validation split
train, val = train_test_split(df, test_size=0.2, random_state=7)

In [None]:
# x and y split for training and validation
def df2xy(ratings):
    x = [
        df.age.values,
        df.user_id.values,
        df.movie_id.values,
        np.concatenate(df.genre.values).reshape(-1, 3)]
    y = df.rating.values
    return x, y

train_x, train_y = df2xy(train)
valid_x, valid_y = df2xy(val)

## Factorization Machine

In [None]:
def define_input_layers():
    # numerical features
    age_input = Input((1,), name='input_age')
    num_inputs = [age_input]

    # single-level categorical features
    user_id_input = Input((1,), name='input_user_id')
    movie_id_input = Input((1,), name='input_movie_id')
    cat_sl_inputs = [user_id_input, movie_id_input]

    # multi-level categorical feature
    genre_input = Input((3,), name='input_genre')
    cat_ml_inputs = [genre_input]

    inputs = num_inputs + cat_sl_inputs + cat_ml_inputs

    return inputs

inputs = define_input_layers()

In [None]:
def Tensor_Mean_Pooling(name='mean_pooling', keepdims=False):
    return Lambda(lambda x: K.mean(x, axis=1, keepdims=keepdims), name=name)

def fm_1d(inputs, n_uid, n_mid, n_genre):
    # unpack input layers
    age_input, user_id_input, movie_id_input, genre_input = inputs

    # numerical feature: age
    num_dense_1d = [Dense(1, name='num_dense_1d_age')(age_input)]

    # categorical features: user_id & movie_id
    cat_sl_embed_1d = [Embedding(n_uid + 1, 1, name='cat_embed_1d_user_id')(user_id_input),
                       Embedding(n_mid + 1, 1, name='cat_embed_1d_movie_id')(movie_id_input)]

    # categorical feature: genre
    cat_ml_embed_1d = [Embedding(n_genre + 1, 1, mask_zero=True, name='cat_embed_1d_genre')(genre_input)]

    # reshape categorical embeddings
    cat_sl_embed_1d = [Reshape((1,))(i) for i in cat_sl_embed_1d]
    cat_ml_embed_1d = [Tensor_Mean_Pooling(name='embed_1d_mean')(i) for i in cat_ml_embed_1d]

    # add all tensors
    y_fm_1d = Add(name='fm_1d_output')(num_dense_1d + cat_sl_embed_1d + cat_ml_embed_1d)

    return y_fm_1d

y_1d = fm_1d(inputs, n_uid=10, n_mid=10, n_genre=10)



In [None]:
def fm_2d(inputs, n_uid, n_mid, n_genre, k):
    # unpack input layers
    age_input, user_id_input, movie_id_input, genre_input = inputs

    # numeric feature: age
    num_dense_2d = [Dense(k, name='num_dense_2d_age')(age_input)]  # shape (None, k)
    num_dense_2d = [Reshape((1, k))(i) for i in num_dense_2d]  # shape (None, 1, k)

    # categorical features: user_id & movie_id
    cat_sl_embed_2d = [Embedding(n_uid + 1, k, name='cat_embed_2d_user_id')(user_id_input),
                       Embedding(n_mid + 1, k, name='cat_embed_2d_movie_id')(movie_id_input)]  # shape (None, 1, k)

    # categorical feature: genre
    cat_ml_embed_2d = [Embedding(n_genre + 1, k, name='cat_embed_2d_genre')(genre_input)]  # shape (None, 3, k)
    cat_ml_embed_2d = [Tensor_Mean_Pooling(name='cat_embed_2d_genre_mean', keepdims=True)(i) for i in
                       cat_ml_embed_2d]  # shape (None, 1, k)

    # concatenate all 2d embedding layers => (None, ?, k)
    embed_2d = Concatenate(axis=1, name='concat_embed_2d')(num_dense_2d + cat_sl_embed_2d + cat_ml_embed_2d)

    # calculate the interactions by simplification
    # sum of (x1*x2) = sum of (0.5*[(xi)^2 - (xi^2)])
    tensor_sum = Lambda(lambda x: K.sum(x, axis=1), name='sum_of_tensors')
    tensor_square = Lambda(lambda x: K.square(x), name='square_of_tensors')

    sum_of_embed = tensor_sum(embed_2d)
    square_of_embed = tensor_square(embed_2d)

    square_of_sum = Multiply()([sum_of_embed, sum_of_embed])
    sum_of_square = tensor_sum(square_of_embed)

    sub = Subtract()([square_of_sum, sum_of_square])
    sub = Lambda(lambda x: x * 0.5)(sub)
    y_fm_2d = Reshape((1,), name='fm_2d_output')(tensor_sum(sub))

    return y_fm_2d, embed_2d

y_fm2_d, embed_2d = fm_2d(inputs, n_uid=10, n_mid=10, n_genre=10, k=5)

In [None]:
def fm_model(n_uid, n_mid, n_genre, k, dnn_dr):
    inputs = define_input_layers()

    y_fm_1d = fm_1d(inputs, n_uid, n_mid, n_genre)
    y_fm_2d, embed_2d = fm_2d(inputs, n_uid, n_mid, n_genre, k)

    # combine first-order and second-order outputs
    y = Concatenate()([y_fm_1d, y_fm_2d])
    y = Dense(1, name='fm_output')(y)

    # define models for first-order, second-order, and full FM
    fm_model_1d = Model(inputs, y_fm_1d)
    fm_model_2d = Model(inputs, y_fm_2d)
    fm_model = Model(inputs, y)

    return fm_model_1d, fm_model_2d, fm_model

In [None]:
params = {'n_uid': df['user_id'].max(),
          'n_mid': df['movie_id'].max(),
          'n_genre': num_unique_genres,  # using the previously calculated number of unique genres
          'k': 20,
          'dnn_dr': 0.5}

fm_model_1d, fm_model_2d, fm_model = fm_model(**params)

In [None]:
params

{'n_uid': 6040, 'n_mid': 3952, 'n_genre': 18, 'k': 20, 'dnn_dr': 0.5}

In [None]:
# compile the model
fm_model.compile(loss=tf.keras.losses.MeanSquaredError(),
                  optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))

# define callbacks
early_stop = EarlyStopping(monitor='val_loss', patience=3)
model_ckp = ModelCheckpoint(filepath='./models/deepfm_weights.weights.h5',  # Updated filename
                            monitor='val_loss',
                            save_weights_only=True,
                            save_best_only=True)

callbacks = [model_ckp, early_stop]

# train the model
train_history = fm_model.fit(train_x, train_y,
                              epochs=30,
                              batch_size=2048,
                              validation_data=(valid_x, valid_y),
                              callbacks=callbacks)

Epoch 1/30




[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 19ms/step - loss: 33.4735 - val_loss: 0.9869
Epoch 2/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - loss: 0.9608 - val_loss: 0.8934
Epoch 3/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 18ms/step - loss: 0.8958 - val_loss: 0.8685
Epoch 4/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 14ms/step - loss: 0.8781 - val_loss: 0.8585
Epoch 5/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 16ms/step - loss: 0.8699 - val_loss: 0.8537
Epoch 6/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 17ms/step - loss: 0.8667 - val_loss: 0.8510
Epoch 7/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 18ms/step - loss: 0.8631 - val_loss: 0.8480
Epoch 8/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 16ms/step - loss: 0.8602 - val_loss: 0.8452
Epoch 9/30
[1m391/391[0m [32m━━━━━

In [None]:
class CustomRecommender:
    def __init__(self, model, train_df):
        self.model = model
        self.train_df = train_df

    def predict_top(self, user_id, at=5, remove_seen=True):
        if remove_seen:
            seen_items = self.train_df[self.train_df['user_id'] == user_id]['movie_id'].values
            unseen_items_mask = np.in1d(self.train_df['movie_id'].unique(), seen_items, invert=True)
            unseen_items = self.train_df['movie_id'].unique()[unseen_items_mask]
            unseen_items = unseen_items[:at]  # limit to top 'at' unseen items
            return unseen_items
        else:
            return self.train_df['movie_id'].unique()[:at]  # return top 'at' popular items if 'remove_seen' is false

custom_recommender = CustomRecommender(fm_model, train)

# open the file in the write mode
with open('solution.csv', 'w', encoding='UTF8', newline='') as f:
    # create the csv writer
    writer = csv.writer(f)
    # write a row to the csv file
    writer.writerow(['user_id', 'prediction'])

    # iterate over unique user IDs using tqdm for progress bar
    for user_id in tqdm(test.user_id.unique(), desc="Generating predictions"):
        relevant_items = custom_recommender.predict_top(user_id, at=25)
        list_relevants = ' '.join(map(str, relevant_items))
        writer.writerow([str(user_id), list_relevants])

  unseen_items_mask = np.in1d(self.train_df['movie_id'].unique(), seen_items, invert=True)
Generating predictions: 100%|██████████| 6037/6037 [01:15<00:00, 79.72it/s]


In [None]:
# specify the file path for download
file_path = 'solution.csv'

# download the file
files.download(file_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

The submission to Kaggle resulted in a lower-than-expected score, falling short of surpassing the benchmark set by the top-performing recommender system. Consequently, we shifted our strategy and explored a Singular Value Decomposition (SVD)-based model as an alternative approach.

## SVD

In [None]:
from sklearn.decomposition import TruncatedSVD

class SVDRecommender:
    def __init__(self, n_components=22, n_iter=10):
        self.svd = TruncatedSVD(n_components=n_components, n_iter=n_iter, random_state=42)
        self.user_movie_matrix = None

    def fit(self, train_df):
        # create a user-movie matrix
        self.user_movie_matrix = pd.pivot_table(train_df, values='rating', index='user_id', columns='movie_id', fill_value=0)

        # fit SVD on the user-movie matrix
        self.svd.fit(self.user_movie_matrix)

    def predict_top(self, user_id, at=5, remove_seen=True):
        # get all movie IDs
        all_movie_ids = self.user_movie_matrix.columns

        # predict ratings for the specified user
        user_ratings = self.svd.inverse_transform(self.svd.transform(self.user_movie_matrix.loc[[user_id]]))

        # sort movie IDs by predicted ratings
        top_movie_ids = all_movie_ids[np.argsort(user_ratings)[0, ::-1]]

        # optionally remove seen movies
        if remove_seen:
            seen_movies = train[train['user_id'] == user_id]['movie_id'].values
            top_movie_ids = [movie_id for movie_id in top_movie_ids if movie_id not in seen_movies]

        # limit to top 'at' movies
        top_movie_ids = top_movie_ids[:at]

        return top_movie_ids

In [None]:
# create an instance of the SVDRecommender class
svd_recommender = SVDRecommender()

# fit the model to the training data
svd_recommender.fit(train)

In [None]:
import csv

with open('solution.csv', 'w',encoding='UTF8', newline='') as f:
    # create the csv writer
    writer = csv.writer(f)
    # write a row to the csv file
    writer.writerow(['user_id', 'prediction'])
    for user_id in tqdm(test.user_id.unique(), desc="Generating predictions"):
        relevant_items = svd_recommender.predict_top(user_id, at=25)
        list_relevants = ' '.join([str(elem) for elem in relevant_items])
        writer.writerow([str(user_id),list_relevants])

Generating predictions: 100%|██████████| 6037/6037 [04:01<00:00, 25.01it/s]


In [None]:
# specify the file path for download
file_path = 'solution.csv'

# download the file
files.download(file_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

After several iterations, we chose to stick with the previously demonstrated method, as it successfully outperformed the top recommender's score of 0.06923. Our final submission achieved a score of 0.12048.