# Testing collaborative filtering (CF) performance

### Dependencies

In [84]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score

### Import CF module

In [33]:
import sys, os
sys.path.insert(0, os.path.abspath(".."))
from Modules.CF import Similarity, CF, PreProcessing

### Testing on the MovieLens dataset

Dataset URL: https://grouplens.org/datasets/movielens/100k/
- Base: https://files.grouplens.org/datasets/movielens/ml-100k/ub.base
- Test: https://files.grouplens.org/datasets/movielens/ml-100k/ub.test

#### Import MovieLens data

In [3]:
# Both MovieLens splits are read with the same parser settings: tab-separated,
# no header row, latin-1 encoded.
tsv_options = {"header": None, "sep": "\t", "encoding": "latin-1"}
base_movie = pd.read_csv(
    "https://files.grouplens.org/datasets/movielens/ml-100k/ub.base", **tsv_options
)
test_movie = pd.read_csv(
    "https://files.grouplens.org/datasets/movielens/ml-100k/ub.test", **tsv_options
)

# Columns 0 and 1 form the model inputs (printed below as users and movies);
# column 2 is the prediction target.
pair_columns = [0, 1]
X_movie_train, X_movie_test, ui_movie_shape = PreProcessing.data_from_matrix(
    base_movie[pair_columns].to_numpy(),
    test_movie[pair_columns].to_numpy(),
)
y_movie_train, y_movie_test = base_movie[2].to_numpy(), test_movie[2].to_numpy()

In [4]:
# Coverage check: distinct users / movies that occur in the training split,
# compared with the totals recorded in ui_movie_shape.
print(f"The train data set has {len(np.unique(X_movie_train[:, 0]))} users out of {ui_movie_shape[0]} recorded users")
print(f"The train data set has {len(np.unique(X_movie_train[:, 1]))} movies out of {ui_movie_shape[1]} recorded movies")

The train data set has 943 users out of 943 recorded users
The train data set has 1675 movies out of 1682 recorded moviess


In [5]:
# Coverage check: distinct users / movies that occur in the test split,
# compared with the totals recorded in ui_movie_shape.
print(f"The test data set has {len(np.unique(X_movie_test[:, 0]))} users out of {ui_movie_shape[0]} recorded users")
print(f"The test data set has {len(np.unique(X_movie_test[:, 1]))} movies out of {ui_movie_shape[1]} recorded movies")

The test data set has 943 users out of 943 recorded users
The test data set has 1145 movies out of 1682 recorded moviess


#### User-based

In [85]:
# User-based run (mode="uucf") with cosine similarity and 20 neighbours:
# fit on the MovieLens training split, then score the held-out test split.
cf = CF(Similarity.cosine, mode="uucf", n_neighbors=20)
cf.fit(X_movie_train, y_movie_train, ui_movie_shape)
y_movie_pred = cf.predict(X_movie_test)
print("R2  :", r2_score(y_movie_test, y_movie_pred))
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# scikit-learn deprecated and later removed; this form works on all versions.
print("RMSE:", np.sqrt(mean_squared_error(y_movie_test, y_movie_pred)))

R2  : 0.24081869865680194
RMSE: 0.9791266307426343


#### Item-based

In [86]:
# Item-based run (mode="iicf") with cosine similarity and 20 neighbours:
# fit on the MovieLens training split, then score the held-out test split.
cf = CF(Similarity.cosine, mode="iicf", n_neighbors=20)
cf.fit(X_movie_train, y_movie_train, ui_movie_shape)
y_movie_pred = cf.predict(X_movie_test)
print("R2  :", r2_score(y_movie_test, y_movie_pred))
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# scikit-learn deprecated and later removed; this form works on all versions.
print("RMSE:", np.sqrt(mean_squared_error(y_movie_test, y_movie_pred)))

R2  : 0.2780339832815857
RMSE: 0.954826555944863


### Testing on sample Steam dataset

In [19]:
# Load the Steam sample and split it 0.8 / by="user" with shuffling enabled.
# NOTE(review): no seed is passed alongside shuffle=True here, so the split may
# differ between runs - confirm whether PreProcessing.split can be seeded.
steam_csv_path = "../data/steam-90k/steam-90k.csv"
X_steam, y_steam, M, ui_steam_shape = PreProcessing.data_from_csv(steam_csv_path)
steam_split = PreProcessing.split(X_steam, y_steam, 0.8, by="user", shuffle=True)
X_steam_train, y_steam_train, X_steam_test, y_steam_test = steam_split

In [32]:
# Distinct users (column 0) and games (column 1) present in the training split.
steam_train_users = np.unique(X_steam_train[:, 0]).size
steam_train_games = np.unique(X_steam_train[:, 1]).size
print(f"The train data set has {steam_train_users} users out of {ui_steam_shape[0]} recorded users")
print(f"The train data set has {steam_train_games} games out of {ui_steam_shape[1]} recorded games")

The train data set has 5840 users out of 5840 recorded users
The train data set has 5360 games out of 5360 recorded games


In [21]:
# Distinct users (column 0) and games (column 1) present in the test split.
steam_test_users = np.unique(X_steam_test[:, 0]).size
steam_test_games = np.unique(X_steam_test[:, 1]).size
print(f"The test data set has {steam_test_users} users out of {ui_steam_shape[0]} recorded users")
print(f"The test data set has {steam_test_games} games out of {ui_steam_shape[1]} recorded games")

The test data set has 5125 users out of 5840 recorded users
The test data set has 5022 games out of 5360 recorded games


### Evaluation

##### Accuracy

In [87]:
# Item-based run (mode="iicf") on the Steam split, using the
# inverse-euclidean-squared similarity and 10 neighbours.
cf = CF(Similarity.inverse_euclidean_squared, mode="iicf", n_neighbors=10)
cf.fit(X_steam_train, y_steam_train, ui_steam_shape)
y_pred = cf.predict(X_steam_test)
print("True:", y_steam_test)
print("Pred:", y_pred)
print("R2  :", r2_score(y_steam_test, y_pred))
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# scikit-learn deprecated and later removed; this form works on all versions.
print("RMSE:", np.sqrt(mean_squared_error(y_steam_test, y_pred)))

True: [1. 1. 1. ... 1. 1. 1.]
Pred: [0.9052505  0.68836456 0.64305076 ... 1.01515152 0.9769645  0.94098695]
R2  : 0.26875403381752616
RMSE: 0.36712221924198407


In [88]:
# User-based run (mode="uucf") on the Steam split, using the
# inverse-euclidean-squared similarity and 3 neighbours.
cf = CF(Similarity.inverse_euclidean_squared, mode="uucf", n_neighbors=3)
cf.fit(X_steam_train, y_steam_train, ui_steam_shape)
y_pred = cf.predict(X_steam_test)
print("True:", y_steam_test)
print("Pred:", y_pred)
print("R2  :", r2_score(y_steam_test, y_pred))
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# scikit-learn deprecated and later removed; this form works on all versions.
print("RMSE:", np.sqrt(mean_squared_error(y_steam_test, y_pred)))

True: [1. 1. 1. ... 1. 1. 1.]
Pred: [1.         0.92307692 0.94444444 ... 1.         1.         1.        ]
R2  : 0.3014720223430454
RMSE: 0.35881519897918834


##### Implicit feedback

In [82]:
def hit_rate(cf: "CF", X_test):
    """Return the average number of recommendation hits per test user.

    For every distinct user id in column 0 of ``X_test``, the first element
    of ``cf.get_recommendation(user)`` is intersected with that user's items
    (column 1 of ``X_test``). The per-user counts of common items are summed
    and divided by the number of distinct test users.

    Parameters
    ----------
    cf : CF
        Model exposing ``get_recommendation(user)``. Only element ``[0]`` of
        its return value is used (presumably the recommended item ids -
        confirm against Modules.CF).
    X_test : array-like
        2-D array; column 0 is read as user ids and column 1 as item ids.

    Returns
    -------
    float
        Total hits divided by the number of distinct test users, or ``0.0``
        when ``X_test`` contains no rows.

    Notes
    -----
    The value is an average hit *count* per user, so it can exceed 1 when a
    user has several recommended items in the test set.
    """
    X_test = np.asarray(X_test)
    # An empty test set has no users; avoid dividing by zero below.
    if X_test.size == 0:
        return 0.0

    # Distinct users are computed once and reused for the loop and the mean.
    users = np.unique(X_test[:, 0])
    total_hit = 0
    for user in users:
        rated = X_test[X_test[:, 0] == user, 1]
        rec = cf.get_recommendation(user)[0]
        # intersect1d returns the unique items present in both arrays.
        total_hit += len(np.intersect1d(rec, rated))
    return total_hit / users.size