#### Data Visualization

In [3]:
import pandas as pd
import matplotlib.pyplot as pyplot

In [4]:
URM_all_dataframe = pd.read_csv('data_train.csv', 
                                sep=",", 
                                header= 0, 
                                dtype={0:int, 1:int, 2:float},
                                engine='python')

URM_all_dataframe.columns = ["UserID", "ItemID", "Interaction"]

In [5]:
URM_all_dataframe.head(n=100)

Unnamed: 0,UserID,ItemID,Interaction
0,1,7,1.0
1,1,15,1.0
2,1,16,1.0
3,1,133,1.0
4,1,161,1.0
...,...,...,...
95,4,47,1.0
96,4,70,1.0
97,4,79,1.0
98,4,119,1.0


In [6]:
print ("The number of interactions is {}".format(len(URM_all_dataframe)))

The number of interactions is 478730


In [7]:
# Extract the list of unique user id and item id 
userID_unique = URM_all_dataframe["UserID"].unique()
itemID_unique = URM_all_dataframe["ItemID"].unique()

In [8]:
# Display some statistics
n_users = len(userID_unique)
n_items = len(itemID_unique)
n_interactions = len(URM_all_dataframe)

print ("Number of items\t {}, Number of users\t {}".format(n_items, n_users))
print ("Max ID items\t {}, Max Id users\t {}\n".format(max(itemID_unique), max(userID_unique)))

Number of items	 22222, Number of users	 12638
Max ID items	 22347, Max Id users	 13024



In [9]:
# Move to sparse format
import scipy.sparse as sps

URM_train = sps.coo_matrix((URM_all_dataframe["Interaction"].values, 
                          (URM_all_dataframe["UserID"].values, URM_all_dataframe["ItemID"].values))) ## .values --> numpy array, df[..] --> pd series

# Go to CSR format
URM_train.tocsr()

<13025x22348 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in Compressed Sparse Row format>

In [10]:
N_USERS_TRAIN, N_ITEMS_TRAIN = URM_train.shape
print ("Training set: Number of items\t {}, Number of users\t {}".format(N_ITEMS_TRAIN, N_USERS_TRAIN))

Training set: Number of items	 22348, Number of users	 13025


Maybe removing all the items with no interaction (or no very positive interaction, in our case) can be useful!

### Create the RecommenderObject for the FunkSVD model

In [11]:
%load_ext Cython

In [12]:
%%cython
import numpy as np
import pandas as pd
import time

from libc.stdlib cimport rand, srand, RAND_MAX

class FunkSVDRecommender(object):
    def __init__(self, n_epochs= 100, n_factors= 10, learning_rate= 1e-4, regularization_factor= 1e-5):
        self.n_factors = n_factors
        self.learning_rate = learning_rate
        self.regularization_factor = regularization_factor
        self.n_epochs = n_epochs

        self.user_factors = None
        self.item_factors = None

        self.model_name = "FunkSVD"

    def fit(self, URM_train, n_factors= 10, l_rate= 1e-3, regularization_factor= 1e-5):
        '''
        Fit the FunkSVD model to the training data.

        Parameters:
        - URM_train: Sparse user-item interaction matrix (training data)
        - n_factors: Number of latent factors
        - l_rate: Learning rate for the optimization algorithm
        - regularization_factor: Regularization factor to prevent overfitting
        '''

        # Initialize the model with the training data
        self.URM_train = URM_train

        # Convert the sparse URM_train matrix to a coordinate format for efficient access
        URM_train_coo = URM_train.tocoo()

        # Extract the number of users, items, and interactions from the URM_train matrix
        self.n_users, self.n_items = URM_train_coo.shape
        self.n_interactions = URM_train.nnz

        # Declare variables using Cython for faster execution
        cdef int n_interactions = URM_train.nnz
        cdef int sample_num, sample_index, user_id, item_id, factor_index
        cdef double rating, predicted_rating, prediction_error
        cdef int num_factors = self.n_factors
        cdef double lr = self.learning_rate
        cdef double alpha = self.regularization_factor

        # Extract data from the URM_train_coo matrix
        cdef int[:] URM_train_coo_row = URM_train_coo.row
        cdef int[:] URM_train_coo_col = URM_train_coo.col
        cdef double[:] URM_train_coo_data = URM_train_coo.data

        # Initialize user and item factors with random values
        cdef double[:,:] user_factors = np.random.random((self.n_users, num_factors))
        cdef double[:,:] item_factors = np.random.random((self.n_items, num_factors))
        cdef double H_i, W_u
        cdef double item_factors_update, user_factors_update
        cdef double loss

        for epoch in range(self.n_epochs):
            loss = 0.0

            for sample_num in range(n_interactions):
                # Randomly choose an interaction index
                sample_index = rand() % n_interactions
                
                # Extract user, item, and rating information for the chosen interaction
                user_id = URM_train_coo_row[sample_index]
                item_id = URM_train_coo_col[sample_index]
                rating = URM_train_coo_data[sample_index]

                predicted_rating = 0.0
                
                # Calculate the predicted rating using user and item factors
                for factor_index in range(num_factors):
                    predicted_rating += user_factors[user_id, factor_index] * item_factors[item_id, factor_index]

                # Calculate prediction error and update the loss
                prediction_error = rating - predicted_rating
                loss += prediction_error**2

                # Update user and item factors based on the prediction error and regularization
                for factor_index in range(num_factors):
                    
                    H_i = item_factors[item_id, factor_index]
                    W_u = user_factors[user_id, factor_index]  

                    user_factors_update = prediction_error * H_i - alpha * W_u
                    item_factors_update = prediction_error * W_u - alpha * H_i

                    user_factors[user_id, factor_index] += lr * user_factors_update 
                    item_factors[item_id, factor_index] += lr * item_factors_update    

        # Save train loss
        self.train_loss = loss

        # Save factorization
        self.user_factors = user_factors
        self.item_factors = item_factors

    def predict(self, user_id, item_id):
        '''
        Predict the rating for a single user-item pair based on the learned factors.
        '''
        # Predict the rating for a user-item pair based on the learned factors
        cdef double[:] user_vector = self.user_factors[user_id, :]
        cdef double[:] item_vector = self.item_factors[item_id, :]
        cdef double predicted_rating = np.dot(user_vector, item_vector)

        return predicted_rating
        
    
    def recommend(self, user_id, at=10, remove_seen=True):
        '''
        Predict the ratings for all the items available for a single user.
        '''
        cdef int[:] item_ids = np.arange(self.n_items, dtype=np.int32)
        cdef int[:] seen_items = np.empty(self.n_items, dtype=np.int32)
        cdef int[:] recommended_indices

        # Initialize list to store recommended items
        recommended_items = []

        # Predict ratings for all items
        predicted_ratings = [np.float32(self.predict(user_id, item_id)) for item_id in item_ids]

        if remove_seen:
            # Get the items already seen by the user
            try:
                row_user_id = self.URM_train.getrow(user_id)
                seen_items = row_user_id.nonzero()[1]
                for seen_item in seen_items:
                    predicted_ratings[seen_item] = -np.inf

            except IndexError:
                # If the row does not exist, do nothing or handle the case accordingly
                pass

        recommended_indices = np.argsort(predicted_ratings)[-at:][::-1].astype(np.int32)
        recommended_items.extend(item_ids[i] for i in recommended_indices)
        
        elapsed_time = time.time() - start_time
        samples_per_second = sample_num/elapsed_time
        print("Epoch {} complete in in {:.2f} seconds, loss is {:.3E}. Samples per second {:.2f}".format(n_epoch+1, time.time() - start_time, loss/sample_num, samples_per_second))

        return np.array(recommended_items)

    def write_predictions(self, at=10):
        prediction_df = pd.read_csv('data_target_users_test.csv', sep= ",",
                                    header=0, 
                                    dtype={0:int},
                                    engine='python')
        #prediction_df=prediction_df[:10]

        cdef int[:] user_array = np.array(prediction_df['user_id'])
        cdef int[:,:] rec_array

        rec_array = np.asarray([self.recommend(user_id, at=at) for user_id in np.asarray(user_array)]).reshape(len(user_array), at)
        
        print("Predictions done.")
        def transform_items_to_string(item_list):
            return ' '.join(map(str, item_list))

        prediction_df['item_list'] = np.asarray([transform_items_to_string(item) for item in rec_array])
        print("Predictions successfully converted to string.")
        print(prediction_df.head(10))

        prediction_df.to_csv('submission.csv',index=False)
        print("Predictions successfully written to csv.")

        return

Content of stdout:
_cython_magic_89fcdd45de0bbcfa63b104f2e4dcad77b09592b9.c
   Creating library C:\Users\melan\.ipython\cython\Users\melan\.ipython\cython\_cython_magic_89fcdd45de0bbcfa63b104f2e4dcad77b09592b9.cp311-win_amd64.lib and object C:\Users\melan\.ipython\cython\Users\melan\.ipython\cython\_cython_magic_89fcdd45de0bbcfa63b104f2e4dcad77b09592b9.cp311-win_amd64.exp
Generating code
Finished generating code

### Train the FunkSVD on our dataset and create recommendations

In [13]:
# Initialize the model
recommender = FunkSVDRecommender(n_epochs= 1000, n_factors= 20)

# Fit the model for our dataset
recommender.fit(URM_train)

print(recommender.model_name, " model has reached a ", recommender.train_loss, " loss on the train set.")

FunkSVD  model has reached a  321.13163825963665  loss on the train set.


### Make predictions

In [14]:
# Import users ids for prediction
prediction_df = pd.read_csv('data_target_users_test.csv', sep= ",",
                                header=0, 
                                dtype={0:int},
                                engine='python')
prediction_df.tail(10)

Unnamed: 0,user_id
10872,13010
10873,13012
10874,13013
10875,13015
10876,13019
10877,13020
10878,13021
10879,13022
10880,13023
10881,13024


In [15]:
recommender.recommend(13024)

array([10175, 10620, 21205, 15653, 21965, 10832, 16263,  3861,  7900,
       12153])

In [16]:
recommender.write_predictions()

Predictions done.
Predictions successfully converted to string.
   user_id                                          item_list
0        1  13353 11233 6573 14929 10463 1811 11621 14566 ...
1        2  725 12729 7935 18265 8132 4110 21101 14436 495...
2        3  5162 21965 7662 15639 11133 20940 15064 13553 ...
3        4  1295 6521 1186 4819 17657 6225 22289 15092 193...
4        5  11038 17834 9377 14545 19916 4196 15793 22030 ...
5        6  12149 22103 17325 14863 15434 12630 15462 1297...
6        8  12063 12649 12460 13510 17379 5929 2466 13134 ...
7        9  10175 21540 10620 8092 20118 11845 4492 14176 ...
8       10  8167 21614 11994 18945 9816 16453 7649 18822 6...
9       11  18265 9065 22336 21971 6225 9580 3400 13456 37...
Predictions successfully written to csv.
