In [4]:
from datetime import datetime
import numpy as np
import random
from collections import defaultdict
from math import log2
import pandas as pd
from sklearn.model_selection import train_test_split
from cmfrec import CMF

Load the data

In [5]:
ratings = pd.read_csv("../preprocessed_data/ratings.csv")

Prepare data

In [6]:
rec_train, rec_test = train_test_split(ratings,test_size= 0.2, random_state= 42)
users_train,games_train = rec_train["user_id"].unique(), rec_train["app_id"].unique()
rec_test =rec_test.loc[rec_test["user_id"].isin(users_train) & rec_test["app_id"].isin(games_train)]
print(rec_train.shape)
print(rec_test.shape)

(3483739, 4)
(870400, 4)


Recode function for implicit feedbacks

In [11]:

def get_ratings(predictions):
    actual = np.array([pred[2] for pred in predictions])
    pred = np.array([pred[3] for pred in predictions])
    return actual, pred

def get_errors(predictions):
    actual, pred = get_ratings(predictions)
    rmse = np.sqrt(np.mean((pred - actual)**2))
    mape = np.mean(np.abs(actual - pred) / actual) * 100
    return rmse, mape

def evaluation(predictions, k=5):
    user_est_true = defaultdict(list)
    for uid, _, true_r, est in predictions:
        user_est_true[uid].append((est, true_r))
    
    precisions = defaultdict(float)
    recalls = defaultdict(float)
    ndcgs = defaultdict(float)
    f1_scores = defaultdict(float)
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        
        dcg = sum((true_r / log2(i + 2)) for i, (_, true_r) in enumerate(user_ratings[:k]) if true_r == 1)
        sorted_true = sorted(user_ratings, key=lambda x: x[1], reverse=True)
        idcg = sum((true_r / log2(i + 2)) for i, (_, true_r) in enumerate(sorted_true[:k]) if true_r == 1)
        
        n_rel = sum((true_r == 1) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= 0.5) for (est, _) in user_ratings[:10])
        n_rel_and_rec_k = sum((true_r == 1 and est >= 0.5) for (est, true_r) in user_ratings[:10])
        
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
        ndcgs[uid] = dcg / idcg if idcg != 0 else 0
        f1_scores[uid] = 2* precisions[uid] * recalls[uid] / (precisions[uid]+ recalls[uid]) if precisions[uid]+recalls[uid] !=0 else 0
    
    precision = np.mean(list(precisions.values()))
    recall = np.mean(list(recalls.values()))
    ndcg = np.mean(list(ndcgs.values()))
    f1_score = sum(f1 for f1 in f1_scores.values()) / len(f1_scores)
    
    return precision, recall, f1_score, ndcg

def run_implicit(algo, train_data, test_data, verbose=True, U=None, I=None):
    start = datetime.now()
    train = dict()
    test = dict()
    train_data = train_data.rename(columns={'app_id': 'ItemId', 'user_id': 'UserId', 'implicit_rating': 'Rating'})
    test_data = test_data.rename(columns={'app_id': 'ItemId', 'user_id': 'UserId', 'implicit_rating': 'Rating'})
    if U is not None:
        U = U.rename(columns={'user_id': 'UserId'})
    if I is not None:
        I = I.rename(columns={'app_id': 'ItemId'})
    
    st = datetime.now()
    print('Training the model...')
    algo.fit(X=train_data, U=U, I=I)
    print('Done. Time taken: {} \n'.format(datetime.now() - st))

    st = datetime.now()
    print('Evaluating the model with train data...')
    preds = algo.predict(train_data['UserId'].tolist(), train_data['ItemId'].tolist())
    train_preds = [(u, i, r, pred) for u, i, r, pred in zip(train_data['UserId'], train_data['ItemId'], train_data['Rating'], preds)]
    precision, recall, f1, ndcg = evaluation(train_preds)
    print('Time taken: {}'.format(datetime.now() - st))
    
    if verbose:
        print('-' * 15)
        print('Train Data')
        print('-' * 15)
        print("Precision: {}\nRecall: {}\nF1: {}\nNDCG: {}\n".format(precision, recall, f1, ndcg))
    
    if verbose:
        print('Adding train results to the dictionary...')
    train['precision'] = precision
    train['recall'] = recall
    train['f1'] = f1
    train['ndcg'] = ndcg

    st = datetime.now()
    print('\nEvaluating the model with test data...')
    preds = algo.predict(test_data['UserId'].tolist(), test_data['ItemId'].tolist())
    test_preds = [(u, i, r, pred) for u, i, r, pred in zip(test_data['UserId'], test_data['ItemId'], test_data['Rating'], preds)]
    precision, recall, f1, ndcg = evaluation(test_preds)
   
    print('Time taken: {}'.format(datetime.now() - st))
    if verbose:
        print('-' * 15)
        print('Test Data')
        print('-' * 15)
        print("Precision: {}\nRecall: {}\nF1: {}\nNDCG: {}\n".format(precision, recall, f1, ndcg))
    
    if verbose:
        print('Storing the test results in the test dictionary...')
    test['precision'] = precision
    test['recall'] = recall
    test['f1'] = f1
    test['ndcg'] = ndcg

    print('\n' + '-' * 45)
    print('Total time taken to run this algorithm:', datetime.now() - start)
    
    return train, test

In [12]:
als = CMF(k= 50 , method = 'als', lambda_ = 35)
train, test = run_implicit(als , rec_train, rec_test)

Training the model...
Done. Time taken: 0:00:18.767879 

Evaluating the model with train data...
Time taken: 0:00:22.770834
---------------
Train Data
---------------
Precision: 0.9342407070176895
Recall: 0.24780574702330174
F1: 0.3797450134129907
NDCG: 0.9492527609130151

Adding train results to the dictionary...

Evaluating the model with test data...
Time taken: 0:00:09.845453
---------------
Test Data
---------------
Precision: 0.8614880445841946
Recall: 0.8020874606771221
F1: 0.7980313378592722
NDCG: 0.9234581719383568

Storing the test results in the test dictionary...

---------------------------------------------
Total time taken to run this algorithm: 0:00:51.442951
