# Testing CF performance

### Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score

### Import CF module

In [3]:
import sys, os
sys.path.insert(0, os.path.abspath(".."))
from Modules.CF import Similarity, CF, PreProcessing

### Testing on sample Steam dataset

In [4]:
from zipfile import ZipFile

# Unpack the 2m ratings file next to its archive.
# The handle is named `archive` rather than `zip` so the builtin `zip()`
# is not shadowed in the notebook's global namespace for later cells.
with ZipFile("../data/steam-2m/steam_2m.zip", 'r') as archive:
    archive.printdir()  # list the archive contents in the cell output
    archive.extract("steam_2m.txt", "../data/steam-2m")

File Name                                             Modified             Size
steam_2m.txt                                   2024-05-10 00:22:16     33454467


Use the 90k dataset (≥10 ratings per user and per game)

In [35]:
# Load the 90k ratings CSV and split it into train/test sets.
# NOTE: the 2m-dataset cell assigns the same variable names; whichever of the
# two cells ran last determines what the evaluation cells below operate on.
steam_90k = PreProcessing.data_from_csv("../data/steam-90k/steam-90k.csv")
X_steam, y_steam, M, ui_steam_shape = steam_90k

# 0.8 is presumably the train fraction, split per user — confirm in PreProcessing.split.
steam_90k_split = PreProcessing.split(X_steam, y_steam, 0.8, by="user", shuffle=True)
X_steam_train, y_steam_train, X_steam_test, y_steam_test = steam_90k_split

Use the 2m dataset (≥60 ratings per user and per game)

In [10]:
# Load the extracted 2m ratings file as an integer matrix and split it.
# NOTE: the 90k-dataset cell assigns the same variable names; whichever of the
# two cells ran last determines what the evaluation cells below operate on.
matrix = np.loadtxt("../data/steam-2m/steam_2m.txt", dtype="int")
X_steam, ui_steam_shape = PreProcessing.data_from_matrix(matrix)
y_steam = matrix[:, -1]  # the last column of the matrix is used as the target

# 0.8 is presumably the train fraction, split per user — confirm in PreProcessing.split.
steam_2m_split = PreProcessing.split(X_steam, y_steam, 0.8, by="user", shuffle=True)
X_steam_train, y_steam_train, X_steam_test, y_steam_test = steam_2m_split

In [11]:
# Coverage of the training split: distinct ids in column 0 (users) and
# column 1 (games) compared with the totals stored in ui_steam_shape.
n_train_users = len(np.unique(X_steam_train[:, 0]))
n_train_games = len(np.unique(X_steam_train[:, 1]))
print(f"The train data set has {n_train_users} users out of {ui_steam_shape[0]} recorded users")
print(f"The train data set has {n_train_games} games out of {ui_steam_shape[1]} recorded games")

The train data set has 25217 users out of 25217 recorded users
The train data set has 15651 games out of 15660 recorded games


In [12]:
# Coverage of the test split: distinct ids in column 0 (users) and
# column 1 (games) compared with the totals stored in ui_steam_shape.
n_test_users = len(np.unique(X_steam_test[:, 0]))
n_test_games = len(np.unique(X_steam_test[:, 1]))
print(f"The test data set has {n_test_users} users out of {ui_steam_shape[0]} recorded users")
print(f"The test data set has {n_test_games} games out of {ui_steam_shape[1]} recorded games")

The test data set has 25215 users out of 25217 recorded users
The test data set has 15373 games out of 15660 recorded games


### Evaluation

##### Accuracy

In [14]:
# Fit CF in "iicf" mode with 40 neighbours and score its predictions on the test split.
cf = CF(Similarity.inverse_euclidean_squared, mode="iicf", n_neighbors=40)
cf.fit(X_steam_train, y_steam_train, ui_steam_shape)
y_pred = cf.predict(X_steam_test)
print("True:", y_steam_test)
print("Pred:", y_pred)
print("R2  :", r2_score(y_steam_test, y_pred))
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE gives the same RMSE on every version.
print("RMSE:", np.sqrt(mean_squared_error(y_steam_test, y_pred)))

KeyboardInterrupt: 

In [None]:
# Fit CF in "uucf" mode with 40 neighbours and score its predictions on the test split.
cf = CF(Similarity.inverse_euclidean_squared, mode="uucf", n_neighbors=40)
cf.fit(X_steam_train, y_steam_train, ui_steam_shape)
y_pred = cf.predict(X_steam_test)
print("True:", y_steam_test)
print("Pred:", y_pred)
print("R2  :", r2_score(y_steam_test, y_pred))
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE gives the same RMSE on every version.
print("RMSE:", np.sqrt(mean_squared_error(y_steam_test, y_pred)))

True: [0 0 0 ... 1 1 0]
Pred: [ 0.04793103 -0.01323559 -0.33481333 ...  0.41666667  0.41666667
  0.2485831 ]
R2  : 0.18767564743860665
RMSE: 0.3594152850341306


##### Implicit feedback

In [40]:
def hit_rate(cf: "CF", X_test):
    """Average number of recommended items per test user that appear in the test set.

    For each distinct user id in column 0 of ``X_test``, the first element of
    ``cf.get_recommendation(user)`` is intersected with the item ids (column 1)
    that the user has in ``X_test``. The sizes of those intersections are summed
    and divided by the number of distinct users, so the result can exceed 1
    when a user gets several hits.

    Parameters
    ----------
    cf : CF
        Object exposing ``get_recommendation(user)``; element ``[0]`` of its
        return value is taken as the recommended item ids.
    X_test : array-like
        2-D data whose first column holds user ids and second column item ids.

    Returns
    -------
    float
        Mean hits per user, or ``0.0`` when ``X_test`` contains no rows
        (instead of raising ``ZeroDivisionError``).
    """
    X_test = np.asarray(X_test)
    if X_test.size == 0:
        return 0.0

    user_col = X_test[:, 0]
    users = np.unique(user_col)  # computed once and reused for the loop and the divisor
    total_hit = 0
    for user in users:
        rated = X_test[user_col == user, 1]
        rec = cf.get_recommendation(user)[0]
        # intersect1d de-duplicates, so each recommended item counts at most once
        total_hit += len(np.intersect1d(rec, rated))
    return total_hit / len(users)

##### Comparison (using the 90k dataset)

In [43]:
# Neighbour counts evaluated for every (mode, similarity) combination.
neighbors = [1, 3, 5, 7, 9]

# One row per experiment: (report label, CF mode, similarity function).
# Keeping label and configuration side by side prevents the two derived
# lists below from drifting out of order.
experiments = [
    ("UUCF-Cosine", "uucf", Similarity.cosine),
    ("IICF-Cosine", "iicf", Similarity.cosine),
    ("UUCF-Pearson", "uucf", Similarity.pearson),
    ("IICF-Pearson", "iicf", Similarity.pearson),
    # Manhattan distance d is turned into a similarity via 1 / (1 + d).
    ("UUCF-Inverse Manhattan", "uucf", Similarity.from_distance(metric="manhattan", tosim=lambda x: 1 / (1 + x))),
    ("IICF-Inverse Manhattan", "iicf", Similarity.from_distance(metric="manhattan", tosim=lambda x: 1 / (1 + x))),
    ("UUCF-Inverse Euclidean Squared", "uucf", Similarity.inverse_euclidean_squared),
    ("IICF-Inverse Euclidean Squared", "iicf", Similarity.inverse_euclidean_squared),
]

# Row labels for the report tables.
func_names = [label for label, _, _ in experiments]
# (mode, similarity) pairs consumed by the evaluation loops.
mode_funcs = [(mode, sim) for _, mode, sim in experiments]

  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]


RMSE

In [57]:
# RMSE for every neighbour count (rows) and (mode, similarity) pair (columns).
data = np.zeros((len(neighbors), len(mode_funcs)))
for i, n in enumerate(neighbors):
    for j, (mode, func) in enumerate(mode_funcs):
        cf = CF(func, mode, n_neighbors=n)
        cf.fit(X_steam_train, y_steam_train, ui_steam_shape)
        y_pred = cf.predict(X_steam_test)
        # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
        # the square root of the MSE is the same value on every version.
        rmse = np.sqrt(mean_squared_error(y_steam_test, y_pred))
        data[i, j] = rmse
# Transpose so each similarity function is a row and each neighbour count a column.
rmse_report = pd.DataFrame(data.T, index=func_names, columns=neighbors)

  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]


In [71]:
# Order rows by RMSE at 9 neighbours, highest first, then write the table
# to disk with six decimal places and display it.
rmse_report = rmse_report.sort_values(by=9, ascending=False)
rmse_report.to_csv(
    "../Reports/CF_rmse_report.csv",
    float_format="%.6f",
)
rmse_report

Unnamed: 0,1,3,5,7,9
UUCF-Cosine,0.444125,0.404688,0.400858,0.399773,0.399567
UUCF-Pearson,0.444118,0.404685,0.400858,0.399773,0.39956
IICF-Pearson,0.443976,0.401202,0.394935,0.394061,0.39326
IICF-Cosine,0.443845,0.401162,0.394935,0.394055,0.393248
IICF-Inverse Euclidean Squared,0.41059,0.387236,0.379747,0.374808,0.371812
IICF-Inverse Manhattan,0.410333,0.386891,0.380018,0.374742,0.371786
UUCF-Inverse Euclidean Squared,0.367295,0.362319,0.360241,0.356814,0.354349
UUCF-Inverse Manhattan,0.366978,0.361924,0.359939,0.356119,0.35387


R2-Score

In [59]:
# R2 score for every neighbour count (rows) and (mode, similarity) pair (columns).
grid_shape = (len(neighbors), len(mode_funcs))
data = np.zeros(grid_shape)
for i, n in enumerate(neighbors):
    for j, (mode, func) in enumerate(mode_funcs):
        cf = CF(func, mode, n_neighbors=n)
        cf.fit(X_steam_train, y_steam_train, ui_steam_shape)
        y_pred = cf.predict(X_steam_test)
        data[i, j] = r2_score(y_steam_test, y_pred)
# Transpose so each similarity function is a row and each neighbour count a column.
r2_report = pd.DataFrame(data.T, index=func_names, columns=neighbors)

  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]


In [72]:
# Order rows by R2 at 9 neighbours, lowest first, then write the table
# to disk with six decimal places and display it.
r2_report = r2_report.sort_values(by=9)
r2_report.to_csv(
    "../Reports/CF_r2_report.csv",
    float_format="%.6f",
)
r2_report

Unnamed: 0,1,3,5,7,9
UUCF-Cosine,-0.072936,0.109152,0.125933,0.130658,0.131556
UUCF-Pearson,-0.072901,0.109164,0.125933,0.130657,0.131583
IICF-Pearson,-0.072215,0.124434,0.151575,0.155324,0.158754
IICF-Cosine,-0.071583,0.124605,0.151572,0.15535,0.158806
IICF-Inverse Euclidean Squared,0.082976,0.18433,0.215574,0.235848,0.248014
IICF-Inverse Manhattan,0.084123,0.18578,0.214454,0.236117,0.248118
UUCF-Inverse Euclidean Squared,0.266173,0.285921,0.29409,0.307456,0.316992
UUCF-Inverse Manhattan,0.267441,0.287477,0.295271,0.310151,0.318836
