# Modelling

In [None]:
%reset

In [1]:
import numpy as np
from scipy.sparse import csr_array, lil_array
import matplotlib.pyplot as plt
import pandas as pd
import math
import pickle
import time
import datetime
import random
from pprint import pprint
from lightgbm import LGBMRanker
from sklearn.model_selection import train_test_split

from SVD import Metrics, RatingSVD, LogisticSVD
from RecData import RecData

## Data Preprocessing

In [2]:
# recs = pd.read_csv('data/recommendations.csv')
# recs = pd.read_csv('data/pruned.csv')
# recs = pd.read_csv('data/2plus.csv')
recs = pd.read_csv('data/full_pruned.csv')

# recs = recs[:5]


recs = recs.sort_values(by='timestamp')
recs = recs.drop_duplicates(subset=['user_id', 'app_id'], keep='last')

USED_COLS = ['app_id', 'is_recommended', 'user_id']
recs = recs[USED_COLS]

item_data = pd.read_csv('data/games.csv')
titles = item_data[['app_id', 'title']]

print("Shape:", recs.shape)
recs.sort_values(by=['user_id', 'app_id']).head()


Shape: (1482464, 3)


Unnamed: 0,app_id,is_recommended,user_id
420790,12210,True,240
675882,22380,True,240
246231,239140,True,240
539676,251570,True,240
521289,270880,True,240


In [3]:
# random.seed(42)
# np.random.seed(42)

# rec_data = RecData()
# rec_data.create_from_dataframe(recs)
# rec_data.set_titles(titles)

# # del recs

# start_time = time.time()
# print("Creating splits...")
# train_data, test = rec_data.train_test_split(test_size=0.2)
# print("First split done.")
# train_data, val = train_data.train_test_split(test_size=0.2)
# print("Done creating splits in", time.time() - start_time, "seconds")

In [4]:
random.seed(42)
np.random.seed(42)
rec_data = RecData()
rec_data.create_from_dataframe(recs)
rec_data.set_titles(titles)

# del recs

print("Creating splits...")
train_data, val = rec_data.leave_k_out_split(k=1, create_val=False)
print("Done creating splits.")

Creating utility matrix...
Done utility matrix.
Creating splits...
Done user 1 / 63175
Done user 10001 / 63175
Done user 20001 / 63175


KeyboardInterrupt: 

In [None]:
del rec_data
del item_data
del titles

## Ranker

This was used as a postprocessing step after using item-knn for candidate generation. Was not able to get good results.

In [None]:
# train = train_data.generate_dataframe()
# X_train = train[['app_id', 'user_id']]
# y_train = train['is_recommended'].to_numpy()

# X_train = X_train.sort_values(by='user_id')
# counts = X_train['user_id'].value_counts(sort=False)
# X_train = X_train[['app_id']]

In [None]:
# ranker = LGBMRanker(n_estimators=300, random_state=42, objective='lambdarank')
# ranker = ranker.fit(X_train, y_train, group = counts.values, categorical_feature=['app_id'],
#                     eval_set=[(X_train, y_train)], eval_group=[counts.values], 
#                     eval_at=20)

## Training SVD

In [None]:
# for pruned, epoch: 80, lr:0.01, reg:0.02, k=100
# for 2plus, epoch: 80, lr:0.01, reg:0.02, k=100

random.seed(42)
np.random.seed(42)

# Fit and predict with svd
svd_predictor = RatingSVD(
    train_data.get_num_users(), train_data.get_num_items(), 2, k=100, 
    learning_rate=0.001, C=0.02)
svd_predictor.fit(train_data.get_matrix(), 50, validation_set=val, early_stop=False)
# svd_predictor = LogisticSVD(
#     train_data.get_num_users(), train_data.get_num_items(), 2, k=100, 
#     learning_rate=0.01, C=0.04)
# svd_predictor.fit(train_data.get_matrix(), 40, validation_set=val, early_stop=False)

# Plot error
errors = svd_predictor.get_train_errors()
val_errors = svd_predictor.get_val_errors()
plt.plot(range(len(errors)), errors, label='Training loss')
plt.plot(range(len(val_errors)), val_errors, label='Test loss')
plt.xlabel('Epoch')
plt.ylabel('Error')
plt.title('Error vs Epoch')
plt.legend()
plt.show()

# Compute rmse
predictions = svd_predictor.predict_pairs([(user, item) for user, item, _ in val])
predictions = [prediction + (val[i][2],) for i, prediction in enumerate(predictions)]
metrics = Metrics()
rmse = metrics.rmse(predictions)
print("Validation rmse", rmse)

We can now continue to train if we believe the model is still underfitting.

In [None]:
svd_predictor.continue_fit(5, early_stop=False)

In [None]:
# Plot error
errors = svd_predictor.get_train_errors()
val_errors = svd_predictor.get_val_errors()
plt.plot(range(len(errors)), errors, label='Training error')
plt.plot(range(len(val_errors)), val_errors, label='Test error')
plt.xlabel('Epoch')
plt.ylabel('Error')
plt.title('Error vs Epoch')
plt.legend()
plt.show()

# Compute rmse
predictions = svd_predictor.predict_pairs([(user, item) for user, item, _ in val])
predictions = [prediction + (val[i][2],) for i, prediction in enumerate(predictions)]
metrics = Metrics()
rmse = metrics.rmse(predictions)
print("Test rmse", rmse)

In [None]:
values, edges = np.histogram(svd_predictor._user_features)
plt.stairs(values, edges)
plt.show()

values, edges = np.histogram(svd_predictor._item_features)
plt.stairs(values, edges)
plt.show()

values, edges = np.histogram(svd_predictor._user_biases)
plt.stairs(values, edges)
plt.show()

values, edges = np.histogram(svd_predictor._item_biases)
plt.stairs(values, edges)
plt.show()

In [None]:
svd_predictor._item_biases[1539]

In [None]:
svd_predictor._user_features[5]

We compute similarities for item-knn.

In [None]:
svd_predictor.compute_sims()

## Preparing for Production

In [None]:
svd_predictor.prep_for_item_knn()
train_data.prep_for_item_knn()

## Saving the model

In [None]:
# Ensure file exists
model_dir = model_dir = "saved_models/{}-{}-{}-{}-{}-{}_{}".format(
    *(time.localtime()[:6] + (round(rmse, 4), ))).replace(".", "-") + ".pkl" 
file = open(model_dir, 'a')
file.close()

# Save model
print("Saving model...")
with open(model_dir, 'wb') as file:
    pickle.dump([train_data, svd_predictor], file)
print("Done saving model.")

## Tests

In [None]:
with open('saved_models/2023-7-8-12-15-13_0-37.pkl', 'rb') as file:
    train_data, svd_predictor = pickle.load(file)

### Testing Rerank

In [None]:
# data.search_title('forza')

In [None]:
# filtered = svd_predictor.items_top_n([(286, 1)])
# app_ids = []
# for item_index in filtered:
#     app_ids.append(data.index_to_item_id(item_index))
# df = pd.DataFrame({'app_id': app_ids})
# predictions = ranker.predict(df)
# df['relevance'] = predictions
# df.head()
# df = df.sort_values(by='relevance', ascending=False)
# df.head()

In [None]:
# pprint([[data.index_to_title(data.item_id_to_index(i)) for _, i, _ in df.itertuples()]][0][:20])

### Testing Performance

Checking recall.

In [None]:
start_time = time.time()
print("Computing recall...")
svd_predictor.compute_recall(val)
print("Done computing recall in", time.time() - start_time, "seconds")

Checking item-based knn.

In [None]:
train_data.search_title('turing')

In [None]:
top = svd_predictor.items_knn([(0, 1)], n=10)
pprint([(sim, train_data.index_to_title(index)) for sim, index in top])

Checking collaborative filtering.

In [None]:
svd_predictor._mu

In [None]:
j = 400

# Test top-n list for user j
top_n = svd_predictor.top_n(j, 1000, remove_bias=False)
print("Predicted top n:")
print(top_n[:10])
pprint([index for r, index in top_n][0])
pprint([(r, train_data.index_to_title(index)) for r, index in top_n][:10])

# User j true top n list
print("True top n:")
true_top = train_data.top_n(j, 5000)
pprint([train_data.index_to_title(index) for _, index in true_top])

## Item KNN

In [8]:
from KNN import ItemKNN

In [None]:
knn = ItemKNN(k=40, mean_centered=True)
knn.fit(train_data.get_matrix())

In [17]:
train_data.search_title('pixel')

[('Pixel Puzzles 2: Space', 272),
 ('Pixel Survivors : Roguelike', 818),
 ('McPixel 3', 668)]

In [18]:
prefs = train_data.create_prefs([(278, 1), (0, 0), (577, 1), (272, 0)])
top = knn.top_n(13, 10, prefs=prefs)
pprint(top)
pprint([train_data.index_to_title(i) for r, i in top])

[(1.0, 18),
 (1.0, 138),
 (1.0, 203),
 (1.0, 213),
 (1.0, 230),
 (1.0, 256),
 (1.0, 265),
 (1.0, 333),
 (1.0, 362),
 (1.0, 388)]
['Coral Island',
 'STALCRAFT',
 'BONELAB',
 'Made in Abyss: Binary Star Falling into Darkness',
 'Forts',
 'BLUE REFLECTION: Second Light',
 'SIGNALIS',
 'MELTY BLOOD: TYPE LUMINA',
 'Tower of Fantasy',
 'Sphinx and the Cursed Mummy']


In [32]:
knn._item_means[800]

1.614065180102916

In [35]:
# Ensure file exists
model_dir = "saved_models/knn/sim1.pkl" 
file = open(model_dir, 'a')
file.close()

# Save model
print("Saving model...")
with open(model_dir, 'wb') as file:
    pickle.dump([train_data, knn._sims, dict(knn._item_means)], file)
print("Done saving model.")

Saving model...
Done saving model.


In [9]:
import numba as nb
from numba import jit
from KNN import ItemKNN

model_dir = "saved_models/knn/sim1.pkl" 

with open(model_dir, 'rb') as file:
    train_data, sims, means = pickle.load(file)

@jit
def make_dict(items):
    return {k: v for k,v in items}

means_prime = make_dict(tuple(means.items()))
knn = ItemKNN(k=40, mean_centered=True)
knn._sims = sims
knn._item_means = means_prime
knn._M = train_data.get_matrix()
knn._num_users, knn._num_items = train_data.get_matrix().shape


  @jit
[1m
File "..\..\..\AppData\Local\Temp\ipykernel_24508\2972409351.py", line 6:[0m
[1m<source missing, REPL/exec in use?>[0m
[0m
Fall-back from the nopython compilation path to the object mode compilation path has been detected. This is deprecated behaviour that will be removed in Numba 0.59.0.

For more information visit https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit
[1m
File "..\..\..\AppData\Local\Temp\ipykernel_24508\2972409351.py", line 6:[0m
[1m<source missing, REPL/exec in use?>[0m
[0m
