In [32]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt


def preprocess(datasetPath = './ratings.dat'):
    # PLEASE DELETE THIS LINE LATER: nrows took first 100000 rows for test 
    rating_dataset = pd.read_csv(datasetPath,sep='::', nrows = 1000000)
    print(rating_dataset.describe())
    rating_dataset.columns = ['user_id','item_id','rating','timestamp']

    mean_rating = rating_dataset['rating'].mean()
    min_rating = rating_dataset['rating'].min()
    max_rating = rating_dataset['rating'].max()

    # if the rating is greater or equal to 4, treat it as positive rating (1); 
    # otherwise treat it as negative rating (0)
    #rating_dataset.loc[:,'rating'] = rating_dataset['rating'].map(lambda x: 1 if x >= 4 else 0)
    rating_dataset.loc[:,'rating'] = rating_dataset['rating'].map(lambda x: (x - min_rating) / (max_rating - min_rating))
    
    # store the original user_id and item_id lists
    user_id_list = sorted(rating_dataset.loc[:,'user_id'].unique().tolist())
    item_id_list = sorted(rating_dataset.loc[:,'item_id'].unique().tolist())

    item_index = np.arange(len(item_id_list)).tolist()
    

    # set the initial value (rating) of all the entries as 0 (if the data is NA, leave it as 0)
    # user_item_matrix = np.zeros(shape = (len(user_id_list), len(item_id_list)))

    # # The (i row, j column) of user_item_matrix means the ith user's rating to the jth movies, 
    # # Here, i and j are the actual ids' indices in the user_id_list and item_id_list respectively.

    # for i in range(len(user_id_list)):
    #     for index, row in rating_dataset.groupby(['user_id']).get_group(user_id_list[i]).iterrows():
    #         item_id = row['item_id']
    #         rating = row ['rating']
    #         user_item_matrix[i, item_id_list.index(item_id)] = rating
    user_item_matrix = rating_dataset.pivot(index='user_id', columns='item_id', values='rating')
    #user_item_matrix = user_item_matrix.subtract(np.float16(user_item_matrix.mean(axis=1)), axis=0)
    #similarity_matrix = user_item_matrix.T.corr('pearson', min_periods=1).to_numpy()
    user_item_matrix = np.float32(user_item_matrix.fillna(0).to_numpy())
    print(user_item_matrix)
    print(user_item_matrix.shape)
    
    # show the sparsity of the user-item matrix
    
    sparsity = 1.0 - ( np.count_nonzero(user_item_matrix) / (user_item_matrix.shape[0] * user_item_matrix.shape[1]) )
    sparsity *= 100
    print('{:.2f}%'.format(sparsity))
    
    return rating_dataset, user_id_list, item_id_list, user_item_matrix

rating_dataset, user_id_list, item_id_list, user_item_matrix = preprocess(datasetPath = './ratings.dat')


  rating_dataset = pd.read_csv(datasetPath,sep='::', nrows = 1000000)


                    1             122               5     838985046
count  1000000.000000  1000000.000000  1000000.000000  1.000000e+06
mean      3806.056084     3906.421320        3.522126  1.026182e+09
std       2157.229469     8572.503172        1.058205  1.143512e+08
min          1.000000        1.000000        0.500000  8.280802e+08
25%       1936.000000      640.000000        3.000000  9.460384e+08
50%       3810.000000     1721.000000        4.000000  1.023814e+09
75%       5707.000000     3471.000000        4.000000  1.118983e+09
max       7521.000000    65133.000000        5.000000  1.231130e+09
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(7316, 9626)
98.59%


In [33]:
def train_test_split(ratings):
    train = ratings.copy()
    
    test_time = np.quantile(rating_dataset.timestamp, 0.90)
    test_ids = rating_dataset.loc[rating_dataset['timestamp'] > test_time, ['user_id', 'item_id']]\
            .groupby('user_id')
    
    
    test_index_list = []


    for uid, mids in test_ids.__iter__():
        u_index = user_id_list.index(uid)
        mid_list = mids.item_id.to_list()
        movie_indices = [item_id_list.index(mid) for mid in mid_list]
    
        test_index_list.append({u_index:movie_indices})
        
    
    test = np.zeros(ratings.shape)
    
    for i in test_index_list:
        for u_index, m_list in i.items():
            train[u_index, m_list] = 0
            test[u_index, m_list] = ratings[u_index, m_list]
    
   
    
   

    return train, test



In [34]:
train, test = train_test_split(user_item_matrix)
print("ratings shape", user_item_matrix.shape, " number of nonzero:", np.count_nonzero(user_item_matrix))
print("training shape", train.shape, " number of nonzero:", np.count_nonzero(train))
#print("validation shape", val.shape, " number of nonzero:", np.count_nonzero(val))
print("test shape", test.shape, " number of nonzero:", np.count_nonzero(test))


ratings shape (7316, 9626)  number of nonzero: 991749
training shape (7316, 9626)  number of nonzero: 893023
test shape (7316, 9626)  number of nonzero: 98726


In [35]:
# The Similarity matrix based on pearson correlation
#similarity_matirx = user_item_matrix.T.corr()
similarity_matrix = np.nan_to_num(np.corrcoef(train), nan=0)
similarity_matrix.shape

  c /= stddev[:, None]
  c /= stddev[None, :]


(7316, 7316)

In [36]:
from math import sqrt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))


In [43]:
top_n = 100
prediction_ratings = train.copy()
overall_mean = np.mean(train[train.nonzero()])
for user_index in range(prediction_ratings.shape[0]):
    predictions_indices = np.where(test[user_index, :] > 0)[0]
    rated_indices = np.where(train[user_index, :] > 0)[0]


        
    similarity_row = similarity_matrix[user_index, :]

    # sum of similarity in this row
    sum_s = sum(similarity_row)
    # get top-n similary neighbors except the first one
    neighbors = np.argsort(similarity_row)[::-1][:top_n][1:]
    #print(neighbors)
   
    for item_index in predictions_indices:
        # find the neighbors who have rated the movie
        rated_neighbors = []
        for neighbor in neighbors:
            if len(rated_neighbors) > 20:
                break
            if train[neighbor, item_index] > 0:
                rated_neighbors.append(neighbor)
            
        if len(rated_neighbors) > 20:
            rated_neighbors = np.array(rated_neighbors)
            # print(rated_neighbors)
            # print(train[rated_neighbors,item_index])
            # print(similarity_row[rated_neighbors])
            if sum_s != 0:
                prediction_ratings[user_index, item_index] = np.dot(similarity_row[rated_neighbors],\
                 prediction_ratings[rated_neighbors,item_index]) / sum_s
            else:
                prediction_ratings[user_index, item_index] = overall_mean
                
            
            
        elif len(rated_indices) > 0:
            # no rated_neighbors, use the mean score
            prediction_ratings[user_index, item_index] = train[user_index, rated_indices].mean()
        else:
            prediction_ratings[user_index, item_index] = overall_mean
            

    if user_index % 100 == 0:
        print('Finish the prediction for {:.2f}% users'.format(user_index/prediction_ratings.shape[0]*100))
        #rmse_validation = rmse(prediction_ratings, test)
        #print('rmse score is:', rmse_validation)

    
    

Finish the prediction for 0.00% users
Finish the prediction for 1.37% users
Finish the prediction for 2.73% users
Finish the prediction for 4.10% users
Finish the prediction for 5.47% users
Finish the prediction for 6.83% users
Finish the prediction for 8.20% users
Finish the prediction for 9.57% users
Finish the prediction for 10.93% users
Finish the prediction for 12.30% users
Finish the prediction for 13.67% users
Finish the prediction for 15.04% users
Finish the prediction for 16.40% users
Finish the prediction for 17.77% users
Finish the prediction for 19.14% users
Finish the prediction for 20.50% users
Finish the prediction for 21.87% users
Finish the prediction for 23.24% users
Finish the prediction for 24.60% users
Finish the prediction for 25.97% users
Finish the prediction for 27.34% users
Finish the prediction for 28.70% users
Finish the prediction for 30.07% users
Finish the prediction for 31.44% users
Finish the prediction for 32.80% users
Finish the prediction for 34.17% 

In [44]:
from sklearn.metrics import roc_auc_score
def map_to_CTR(matrix, nonzero):
    CTR = matrix[nonzero].flatten()
    thres_hold = 0.8*(CTR.max()-CTR.min())
    #print(thres_hold)
    for i in range(len(CTR)):
        if CTR[i] - CTR.min()  >= thres_hold: 
        #if CTR[i] >= 0.8:
            CTR[i] = 1

        else:
            CTR[i] = 0

    return CTR

print(prediction_ratings[test.nonzero()].mean())
print(test[test.nonzero()].mean())
CTR_predicted_rating = map_to_CTR(prediction_ratings, nonzero = test.nonzero())
CTR_validation_rating = map_to_CTR(test, nonzero = test.nonzero())


print(CTR_predicted_rating.shape)
print(CTR_validation_rating.shape)


auc_score =  roc_auc_score(CTR_validation_rating, CTR_predicted_rating)
print("The auc score is", auc_score)

0.6368183
0.6902268359395813
(98726,)
(98726,)
The auc score is 0.5053460733080465
