# Batch architecture in Recommendation Systems

In this notebook we explain how to use the batch architecture to deploy a recommendation system solution.


In [1]:
import os
import json
import requests
import numpy as np
import logging
import psycopg2

from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.models.sar import SAR

# NOTE: MODIFY THE FILE secrets.template.py
from secrets import DATABASE, USER, PASS

In [2]:
# --- Recommendation settings ---
TOP_K = 10  # number of items to recommend per user

# --- Dataset ---
MOVIELENS_DATA_SIZE = "100k"  # one of: 100k, 1m, 10m, 20m
SPLIT_RATIO = 0.75  # train/test split ratio

# --- Column names used in the data frames ---
USER_COL, ITEM_COL = "userID", "itemID"
RATING_COL, TIMESTAMP_COL = "rating", "timestamp"
PREDICTION_COL = "prediction"

# --- Model settings ---
SIMILARITY_TYPE = "jaccard"
TIME_DECAY = 30  # number of days until the weight of a rating has decayed by 1/2

SEED = 42

# Emit DEBUG-and-above log records with a timestamp and a padded level name
logging.basicConfig(
    format="%(asctime)s %(levelname)-8s %(message)s",
    level=logging.DEBUG,
)

In [3]:
# Load the MovieLens ratings of the configured size as a pandas DataFrame
data = movielens.load_pandas_df(size=MOVIELENS_DATA_SIZE)

# Keep ratings as 32-bit floats in order to reduce memory consumption
data = data.astype({RATING_COL: np.float32})

data.head()

2023-10-23 11:52:11,877 DEBUG    Starting new HTTPS connection (1): files.grouplens.org:443
2023-10-23 11:52:12,461 DEBUG    https://files.grouplens.org:443 "GET /datasets/movielens/ml-100k.zip HTTP/1.1" 200 4924029
2023-10-23 11:52:12,461 INFO     Downloading https://files.grouplens.org/datasets/movielens/ml-100k.zip
100%|█████████████████████████████████████| 4.81k/4.81k [00:01<00:00, 4.00kKB/s]


Unnamed: 0,userID,itemID,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [4]:
# Build the SAR recommender from the settings defined in the configuration cell.
# TIME_DECAY is passed through (instead of a literal 30) so that editing the
# constant actually changes the model.
model = SAR(
    col_user=USER_COL,
    col_item=ITEM_COL,
    col_rating=RATING_COL,
    col_timestamp=TIMESTAMP_COL,
    similarity_type=SIMILARITY_TYPE,
    time_decay_coefficient=TIME_DECAY,
    timedecay_formula=True,
    normalize=True,
)

In [5]:
# Fit the model on the full ratings frame; the Timer context records how long
# the fit call takes so it can be reported below.
with Timer() as train_time:
    model.fit(data)

print(f"Took {train_time.interval} seconds for training.")

2023-10-23 11:52:14,107 INFO     Collecting user affinity matrix
2023-10-23 11:52:14,113 INFO     Calculating time-decayed affinities
2023-10-23 11:52:14,171 INFO     Creating index columns
2023-10-23 11:52:14,239 INFO     Calculating normalization factors
2023-10-23 11:52:14,278 INFO     Building user affinity sparse matrix
2023-10-23 11:52:14,284 INFO     Calculating item co-occurrence
2023-10-23 11:52:14,483 INFO     Calculating item similarity
2023-10-23 11:52:14,484 INFO     Using jaccard based similarity
2023-10-23 11:52:14,572 INFO     Done training


Took 0.49327490001451224 seconds for training.


In [6]:
# Score every user in `data` and keep the TOP_K best items per user.
# remove_seen=True drops items the user has already rated from the results.
with Timer() as scoring_time:
    top_k = model.recommend_k_items(data, top_k=TOP_K, remove_seen=True)

# f-string used for consistency with the training cell; the printed text is unchanged
print(f"Took {scoring_time.interval} seconds for scoring.")

2023-10-23 11:52:14,580 INFO     Calculating recommendation scores
2023-10-23 11:52:14,837 INFO     Removing seen items


Took 0.3269488000078127 seconds for scoring.


In [9]:
# Preview the recommendations: one row per (userID, itemID) pair with its predicted score
top_k.head()

Unnamed: 0,userID,itemID,prediction
0,196,204,3.664532
1,196,216,3.627478
2,196,88,3.461649
3,196,69,3.456071
4,196,168,3.429427


In [7]:
# Now let's look at the results for a specific user.
# The following cells filter both the ratings and the recommendations by this userID.
user_id = 54

In [8]:
# Every rating row in the data that belongs to the selected user
items_seen = data.loc[data[USER_COL].eq(user_id)]
items_seen

Unnamed: 0,userID,itemID,rating,timestamp
232,54,106,3.0,880937882
336,54,595,3.0,880937813
512,54,742,5.0,880934806
806,54,302,4.0,880928519
1352,54,676,5.0,880935294
...,...,...,...,...
68542,54,634,1.0,892681013
70980,54,250,4.0,880933834
74116,54,823,2.0,880938088
78663,54,405,4.0,880934806


In [12]:
# Recommendations for the selected user, highest predicted score first
items_predicted = (
    top_k.loc[top_k[USER_COL].eq(user_id)]
    .sort_values(PREDICTION_COL, ascending=False)
)
items_predicted

Unnamed: 0,userID,itemID,prediction
1300,54,300,2.784323
1301,54,294,2.601673
1302,54,248,2.548543
1303,54,286,2.458506
1304,54,282,2.436808
1305,54,271,2.433754
1306,54,293,2.3683
1307,54,315,2.367518
1308,54,222,2.357715
1309,54,301,2.354047


In [None]:
# Create the table that stores the batch recommendations.
# Establish a connection to the PostgreSQL database
conn = psycopg2.connect(database=DATABASE, user=USER, password=PASS, host="localhost", port="5432")

# IF NOT EXISTS keeps this cell re-runnable (Restart & Run All) once the table exists
create_table_query = """
CREATE TABLE IF NOT EXISTS user_items (
    userID INTEGER,
    itemID INTEGER,
    prediction FLOAT
);
"""

try:
    # `with conn` commits on success and rolls back on error;
    # `with conn.cursor()` closes the cursor when the block exits.
    with conn, conn.cursor() as cur:
        cur.execute(create_table_query)
finally:
    # The connection context manager does not close the connection, so do it
    # explicitly, even when the query fails.
    conn.close()

In [None]:
# Store the recommendations in the user_items table.
# NOTE: this cell appends rows; running it more than once inserts duplicates.

# Establish a connection to the PostgreSQL database
conn = psycopg2.connect(database=DATABASE, user=USER, password=PASS, host="localhost", port="5432")

# Define the parameterized INSERT query
insert_query = "INSERT INTO user_items (userID, itemID, prediction) VALUES (%s, %s, %s);"

# Convert the DataFrame to a list of tuples of native Python types.
# itertuples avoids the per-row Series that iterrows builds (which also upcasts
# the integer IDs to floats), and int()/float() hand the driver plain Python
# values instead of NumPy scalars.
data_to_insert = [
    (int(user), int(item), float(score))
    for user, item, score in top_k[[USER_COL, ITEM_COL, PREDICTION_COL]].itertuples(index=False, name=None)
]

try:
    # `with conn` commits on success and rolls back on error;
    # `with conn.cursor()` closes the cursor when the block exits.
    with conn, conn.cursor() as cur:
        # executemany runs the statement once per tuple; for much larger batches
        # consider psycopg2.extras.execute_values instead.
        cur.executemany(insert_query, data_to_insert)
finally:
    # Always release the connection, even when the insert fails
    conn.close()