# Testing CF performance

### Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score

### Import modules

In [1]:
from preprocessing import Preprocessor
from topk_cf import TopKNeighborCF
from similarity import Similarity

### Testing on sample Steam dataset

Use the steam_60 dataset (users with ≥ 60 ratings each).

In [3]:
# Load the steam_60 user-item ratings. `ui_steam_shape` is indexed below as
# (recorded users, recorded games).
X_steam, y_steam, ui_steam_shape = Preprocessor.load_data(
    "../data/steam_60/user_item_data.txt"
)

# Shuffled split with ratio 0.8, grouped by user (by="user").
# NOTE(review): no seed is passed, so the split (and every metric computed from
# it) may differ between runs -- confirm whether Preprocessor.split takes a seed.
steam_split = Preprocessor.split(X_steam, y_steam, 0.8, by="user", shuffle=True)
X_steam_train, y_steam_train, X_steam_test, y_steam_test = steam_split

In [6]:
# Coverage of the training split: distinct ids in column 0 (reported as users)
# and column 1 (reported as games) versus the totals in the user-item shape.
train_users = np.unique(X_steam_train[:, 0])
train_games = np.unique(X_steam_train[:, 1])
print(f"The train data set has {len(train_users)} users out of {ui_steam_shape[0]} recorded users")
print(f"The train data set has {len(train_games)} games out of {ui_steam_shape[1]} recorded games")

The train data set has 25217 users out of 25217 recorded users
The train data set has 35476 games out of 36014 recorded games


In [7]:
# Coverage of the test split: distinct ids in column 0 (reported as users)
# and column 1 (reported as games) versus the totals in the user-item shape.
test_users = np.unique(X_steam_test[:, 0])
test_games = np.unique(X_steam_test[:, 1])
print(f"The test data set has {len(test_users)} users out of {ui_steam_shape[0]} recorded users")
print(f"The test data set has {len(test_games)} games out of {ui_steam_shape[1]} recorded games")

The test data set has 25217 users out of 25217 recorded users
The test data set has 29108 games out of 36014 recorded games


### Evaluation

##### Accuracy

In [8]:
# Top-k neighbour CF in "uucf" mode with cosine similarity and 20 neighbours.
uucf = TopKNeighborCF(Similarity.cosine, mode="uucf", neighbors=20)

# end="\r" moves the cursor back to the start of the line so the next status
# message overwrites this one instead of piling up in the output.
print("Fitting data", end="\r")
uucf.fit(X_steam_train, y_steam_train, ui_steam_shape)
print("Predicting  ", end="\r")
y_pred = uucf.predict(X_steam_test)

print("True:", y_steam_test)
print("Pred:", y_pred)
print("R2  :", r2_score(y_steam_test, y_pred))
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
print("RMSE:", np.sqrt(mean_squared_error(y_steam_test, y_pred)))

True: [3.5 4.5 3.  ... 3.5 3.  3. ]
Pred: [3.54086136 4.1050531  2.99464538 ... 3.20206782 2.89554248 2.95552147]
R2  : 0.32428553637342095
RMSE: 0.9635053568812305


In [9]:
# Top-k neighbour CF in "iicf" mode with cosine similarity and 20 neighbours.
iicf = TopKNeighborCF(Similarity.cosine, mode="iicf", neighbors=20)

# end="\r" moves the cursor back to the start of the line so the next status
# message overwrites this one instead of piling up in the output.
print("Fitting data", end="\r")
iicf.fit(X_steam_train, y_steam_train, ui_steam_shape)
print("Predicting  ", end="\r")
y_pred = iicf.predict(X_steam_test)

print("True:", y_steam_test)
print("Pred:", y_pred)
print("R2  :", r2_score(y_steam_test, y_pred))
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
print("RMSE:", np.sqrt(mean_squared_error(y_steam_test, y_pred)))

True: [3.5 4.5 3.  ... 3.5 3.  3. ]
Pred: [3.80594098 3.94851691 2.4653895  ... 3.48494484 2.78244471 3.25524955]
R2  : 0.3634519969128486
RMSE: 0.9351647023281325


##### Comparison

Effect of the number of neighbours (`uucf` mode, cosine similarity)

In [10]:
# Sweep the neighbourhood size for "uucf" mode with cosine similarity and record
# R2 / RMSE on the test split for each setting. `r2[i]` and `rmse[i]` correspond
# to `neighbors[i]`.
neighbors = [5, 10, 15, 20, 25]
func = Similarity.cosine
r2 = []
rmse = []
for n in neighbors:
    # Keyword arguments, matching the constructor calls in the accuracy cells.
    cf = TopKNeighborCF(func, mode="uucf", neighbors=n)
    cf.fit(X_steam_train, y_steam_train, ui_steam_shape)
    y_pred = cf.predict(X_steam_test)
    r2.append(r2_score(y_steam_test, y_pred))
    # sqrt(MSE) instead of `squared=False`, which is deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    rmse.append(np.sqrt(mean_squared_error(y_steam_test, y_pred)))

In [14]:
# One column per neighbourhood size, one row per metric. The labels are passed
# as a flat list: wrapping them in another list (`columns=[neighbors]`) makes
# pandas build a one-level MultiIndex instead of a plain column index.
pd.DataFrame([r2, rmse], columns=neighbors, index=["R2", "RMSE"])

Unnamed: 0,5,10,15,20,25
R2,0.255295,0.30289,0.317478,0.324286,0.327863
RMSE,1.011497,0.97864,0.968347,0.963505,0.960951
