# Install and load necesary packages

In [1]:
# Please don't change this cell

import pandas as pd
import numpy as np  

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Please don't change this cell
df = pd.read_csv('ml-100k/u.data', names=['user_id', 'item_id', 'rating', 'timestamp'], sep='\t')

df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
# Set column names based on the MovieLens dataset
column_names = ['movie_id', 'movie_title', 'release_date', 'video_release_date', 
                'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 
                'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 
                'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
                'Thriller', 'War', 'Western']

# Use pandas to read the csv file
df_movies = pd.read_csv('ml-100k/u.item', sep='|', names=column_names, encoding='latin-1')

df_movies

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Split dataset
## Random Train and Test Split

In [4]:
from sklearn.model_selection import train_test_split

n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print(str(n_users) + ' users')
print(str(n_items) + ' items')

train_df, test_df = train_test_split(df, test_size=0.2, random_state = 10)
train_df, test_df

# Training Dataset
train_ds = np.zeros((n_users, n_items))
for row in train_df.itertuples():
    train_ds[row[1]-1, row[2]-1] = row[3]
train_ds = pd.DataFrame(train_ds)

# Testing Dataset
test_ds = np.zeros((n_users, n_items))
for row in test_df.itertuples():
    test_ds[row[1]-1, row[2]-1] = row[3]
test_ds = pd.DataFrame(test_ds)

print("Construct the rating matrix based on train_df:")
print(train_ds)

print("Construct the rating matrix based on test_df:")
print(test_ds)

943 users
1682 items
Construct the rating matrix based on train_df:
     0     1     2     3     4     5     6     7     8     9     ...  1672  \
0     0.0   3.0   4.0   3.0   3.0   5.0   4.0   1.0   5.0   3.0  ...   0.0   
1     4.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   2.0  ...   0.0   
2     0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
3     0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
4     4.0   3.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
..    ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   ...   
938   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   5.0   0.0  ...   0.0   
939   0.0   0.0   0.0   2.0   0.0   0.0   4.0   5.0   3.0   0.0  ...   0.0   
940   5.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
941   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
942   0.0   5.0   0.0   0.0   0.0   0.0   0.0   0.0   3.0   0.0  ...   0.0

# Utils

In [5]:
# Please don't change this cell
EPSILON = 1e-9

def evaluate(test_ds, predicted_ds):
    '''
    Function for evaluating on MAE and RMSE
    '''
    # MAE
    mask_test_ds = test_ds > 0
    MAE = np.sum(np.abs(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32))

    # RMSE
    RMSE = np.sqrt(np.sum(np.square(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32)))

    return MAE, RMSE

# Adjusted Weighted Slope Scheme

In [6]:
class AWSS:
    """
    This class is to implement the AWSS algorithm which stand for Adjusted Weighted Slope Scheme.
    The class has three methods:
    1. fit: To train the model by calculating the deviation matrix
    2. predict: To predict the ratings for the test dataset
    3. recommend_movies: To recommend movies for a given user
    """
    def __init__(self, train_ds, n_users = n_users, n_items = n_items, limbda = 0.05, GAMMA = 30, EPSILON = 1e-6):
        """
        self.train: The training dataset
        self.n_users: The number of users
        self.n_items: The number of items
        self.limbda: Model hyperparameter to control the two parts of the equation. First: Σu∈Sj,i(χ) ( (uj − ui) / card(Sj,i(χ)) ) and Second: Σ u∈Sj,i(χ) ((uj − ui) · exp(sim(u, u′))) / Σ u∈Sj,i(χ) (exp(sim(u, u′)) · card(Sj,i(χ)))
        self.GAMMA: Model hyperparameter to control the contribution of similarity between users. A higher GAMMA would give more weight to the users who are more similar.
        self.EPSILON: A small value to avoid division by zero
        self.devs: The deviation matrix
        self.counts: The number of users who rated both items
        self.preds: The prediction matrix
        """
        self.train_ds = train_ds
        self.n_users = n_users
        self.n_items = n_items
        self.limbda = limbda
        self.GAMMA = GAMMA
        self.EPSILON = EPSILON
        self.devs = None
        self.counts = None
        self.preds = None
        
    def fit(self):
        """
        This method is to train the model by calculating the deviation matrix.
        Based on the following equation:
        devj,i = λ · (Σu∈Sj,i(χ) ( (uj − ui) / card(Sj,i(χ)) )) + (1 − λ) · (Σ u∈Sj,i(χ) ((uj − ui) · exp(sim(u, u′))) / Σ u∈Sj,i(χ) (exp(sim(u, u′)) · card(Sj,i(χ))))
        The method are divided into three parts:
        PART 1: Calculate the similarity between every pair of users using centered cosine similarity for the second part of equation.
        PART 2: Calculate the first part of the equation Calculate the first part of the equation which is the average difference in ratings between two items (j and i) given by users who have rated both items.
        PART 3: Calculate the second part of the equation which is the new term that represents the influence of the similarity between users who rated both items.
        PART 4: Combining the first and second parts of the equation to calculate the deviation matrix.
        """
        
        # PART 1: Calculate the similarity between every pair of users using centered cosine similarity for the second part of equation.
        # ----------------

        # sim(u, u′) = Σi∈I(u)∩I(u′) ( (rui − ru) · (rui′ − ru′) ) / ( sqrt(Σi∈I(u)∩I(u′) (rui − ru)^2) · sqrt(Σi∈I(u)∩I(u′) (rui′ − ru′)^2) )
        
        # Matrix to hold user similarities
        sim_matrix = np.zeros((self.n_users,self.n_users))

        # Calculate the similarity between every pair of users
        for u, ratings_u in enumerate(train_ds.values):
            # u: represents a user index
            # ratings_u: represents the ratings of all items for user u
            for u_prime, ratings_u_prime in enumerate(train_ds.values):
                # u_prime: represents a user index (the active user)
                # ratings_u_prime: represents the ratings of all items for user u_prime

                # Taking the mean of ratings for u and u_prime after removing the unrated items
                mean_u = np.mean(ratings_u[np.where(ratings_u > 0)])
                mean_u_prime = np.mean(ratings_u_prime[np.where(ratings_u_prime > 0)])
                
                # Normalize the ratings for each user. As this is a centered cosine similarity, we need to center the ratings around the mean rating of each user.
                # Different users may use the rating scale differently. 
                # For example, some users might rate items more harshly than others, giving mostly 2 and 3 ratings even if they like the items, 
                # while others might give 4 and 5 ratings to items they like. By centering each user's ratings around their mean rating, 
                # we account for these differences in rating scale between users. This makes ratings more comparable across users.
                ratings_u_centered = ratings_u - mean_u
                ratings_u_prime_centered = ratings_u_prime - mean_u_prime
                
                # Calculate the numerator and denominator of the equation
                r_ui_sub_r_i_sq = np.square(ratings_u_centered)
                r_uj_sub_r_j_sq = np.square(ratings_u_prime_centered)
                r_ui_sum_sqrt = np.sqrt(np.sum(r_ui_sub_r_i_sq))
                r_uj_sum_sqrt = np.sqrt(np.sum(r_uj_sub_r_j_sq))
                sim_u_u_prime = np.sum(ratings_u_centered * ratings_u_prime_centered) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)

                # significance weighting
                weighted_sim = (min(len(ratings_u[np.where(ratings_u > 0)]), self.GAMMA) / self.GAMMA) * sim_u_u_prime
                sim_matrix[u][u_prime] = weighted_sim
        
        # Matrix to hold the deviations and counts
        devs = np.zeros((self.n_items, self.n_items))
        counts = np.zeros((self.n_items, self.n_items))

        # Loop over all item pairs
        for j in range(self.n_items):
            for i in range(self.n_items):
                # PART 2: Calculate the first part of the equation which is the average difference in ratings between two items (j and i) given by users who have rated both items.
                # ----------------

                # The first part of the equation represents the average difference in ratings between two items (j and i) given by users who have rated both items. 
                # In other words, only those ratings by users who have rated some common item with the predictee user.                
                # Σu∈Sj,i(χ) ( (uj − ui) / card(Sj,i(χ)) )  
                # Individual symbols meaning:
                # χ: The set of all evaluations. On other words, the dataset
                # u: represents a user
                # j: represents an item
                # i: represents an item
                # uj: represents the rating of user u for item j
                # ui: represents the rating of user u for item i
                # u∈Sj,i(χ): Filter users who have rated both movie j and movie i. On other words, Sj,i(χ) denotes the set of all evaluations u ∈ χ such that they contain item i and j
                # card(Sj,i (X)): The total number of users who have rated both movie j and movie i.
                # j: represents an item index
                # item_j_vec: represents the ratings of all users for item j
                
                # Get ratings for items i and j
                item_i_vec = train_ds.T.values[i]
                item_j_vec = train_ds.T.values[j]
                
                # Find the index of users who have rated both item j and item i
                mask = np.logical_and(item_i_vec>0, item_j_vec>0)
                
                # if none is rated by users then skip
                if np.sum(mask) == 0:
                    continue
                
                # Calculate the difference between ratings of item j and item i
                # Eg.          [user1,user2,user3,user4,user5]
                # item_j_vec = [1,2,3,4,5]
                # item_i_vec = [5,4,3,2,1]
                # Then ratings_diff = [4,2,0,-2,-4]    
                diff_ratings = item_j_vec[mask] - item_i_vec[mask]

                # Calculate the average difference in ratings between two items (j and i) given by users who have rated both items
                # In the previous example, the average_diff_ratings = (4+2+0-2-4)/5 = 0
                average_diff_ratings = np.mean(diff_ratings)

                # PART 3: Calculate the second part of the equation which is the new term that represents the influence of the similarity between users who rated both items.
                # ----------------

                # Σ u∈Sj,i(χ) ((uj − ui) · exp(sim(u, u′))) / Σ u∈Sj,i(χ) (exp(sim(u, u′)) · card(Sj,i(χ)))
                # Symbols meaning:
                # sim(u, u′): represents the similarity between user u and user u′
                # exp(sim(u, u′)): represents the exponential of the similarity between user u and user u′
                # Σ u∈Sj,i(χ) (exp(sim(u, u′)) · card(Sj,i(χ))): represents the sum of the exponential of the similarity between user u and user u′ multiplied by the number of users who rated both items
                # Σ u∈Sj,i(χ) ((uj − ui) · exp(sim(u, u′))): represents the sum of the difference of ratings between item j and item i multiplied by the exponential of the similarity between user u and user u′

                # Get the similarities for the corresponding users
                user_similarities = sim_matrix[mask, mask]
                
                # Exponentiate and sum the similarities
                # apply an exponential function to the similarities to accentuate the impact of highly similar users
                exp_user_similarities = np.exp(self.GAMMA * user_similarities)
                sum_exp_user_similarities = np.sum(exp_user_similarities)
                
                # Multiply the difference of ratings by the exponentiated similarities
                weighted_diff_ratings = diff_ratings * exp_user_similarities
                
                # Sum up the weighted difference of ratings
                sum_weighted_diff_ratings = np.sum(weighted_diff_ratings)
                
                # Calculate the second term
                second_term = sum_weighted_diff_ratings / (sum_exp_user_similarities + EPSILON)
                
                # PART 4: Combining the first and second parts of the equation to calculate the deviation matrix.
                # ----------------
                
                # Final calculation of the deviation
                dev = (self.limbda * average_diff_ratings) + ((1 - self.limbda) * second_term)
                
                # Store the calculated deviation in the matrix
                devs[j][i] = dev

                # Count the number of users who rated both items and store it
                counts[j][i] = np.sum(mask)

        return devs, counts

    def predict(self):
        """
        This method is to predict the ratings for the test dataset.
        Based on the following equation:
        prediction = Σ i∈S(u′)−{j}((devj,i + u′i)cj,i) / Σ i∈S(u′)−{j} cj,i
        """

        # Initialize the prediction matrix
        preds = np.zeros((self.n_users, self.n_items))

        # devs and counts are calculated in the fit method
        devs, counts = AWSS.fit(self)

        # Iterate over all users
        for u_prime, ratings_u_prime in enumerate(train_ds.values):
            # Calculate cj,i for all items
            # This represents the number of users who rated both item j and item i
            cj_i = counts

            # Calculate the set of items rated by u_prime, excluding item j
            S_u_prime = np.where(ratings_u_prime > 0)[0]

            # Calculate the predictions for each item j
            for j in range(n_items):
                # Removing item j from the set of items
                S_u_prime_minus_j = S_u_prime[S_u_prime != j]

                # Calculate the numerator Σ i∈S(u′)−{j}((devj,i + u′i)cj,i)
                numerator = np.sum((devs[j, S_u_prime_minus_j] + ratings_u_prime[S_u_prime_minus_j]) * cj_i[j, S_u_prime_minus_j])
                # Calculate the denominator Σ i∈S(u′)−{j} cj,i
                denominator = np.sum(cj_i[j, S_u_prime_minus_j])

                # Calculate the prediction and add it to the prediction matrix
                preds[u_prime, j] = numerator / (denominator + EPSILON)
        
        return preds

    def recommend_items(self,user_id, top_n=10):
        """
        This method is to recommend movies for a given user.
        """
        # Make sure predictions have been calculated
        preds = AWSS.predict(self)
        
        # Get the user's predicted ratings
        user_ratings = preds[user_id]
        
        # Get the indices of the user's top n ratings
        top_n_indices = np.argsort(user_ratings)[-top_n:]

        # Get the movie names corresponding to the top n indices
        movie_names = df_movies[df_movies['movie_id'].isin(top_n_indices)]['movie_title'].values

        return movie_names

In [7]:
# Initialize an instance of the class/model
awss = AWSS(train_ds, n_users, n_items, limbda = 0.05, GAMMA = 30, EPSILON = 1e-6)
awss.fit()
predictions = awss.predict()
recommended_movies = awss.recommend_items(1, 10)

In [10]:
# The 10 movies recommended for user 1
recommended_movies

array(['Flipper (1996)', 'Celluloid Closet, The (1995)',
       'Of Human Bondage (1934)', 'Grosse Fatigue (1994)',
       'Trial by Jury (1994)', 'City of Industry (1997)',
       'Lamerica (1994)', 'Normal Life (1996)',
       'Temptress Moon (Feng Yue) (1996)', 'Favor, The (1994)'],
      dtype=object)

In [11]:
MAE, RMSE = evaluate(test_ds.values, predictions)

In [12]:
# Please don't change this cell

print("===================== The MAE and RMSE of Your Implementation =====================")
print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0.7449454576770987, RMSE: 0.9537655442255649
