## Assignment 3 Pattern Mining and Recommender Systems: Individual Code

### Task 2: Collaborative Filtering

### Ky Cuong Pham, 1906313, Version 02

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [5]:
# Load the data
data = pd.read_csv("data/Groceries data train.csv")
data = data.dropna()

# Convert columns to appropriate types
data['User_id'] = data['User_id'].astype('int')
data['year'] = data['year'].astype('int')
data['month'] = data['month'].astype('int')
data['day'] = data['day'].astype('int')
data['day_of_week'] = data['day_of_week'].astype('int')

data.head()

Unnamed: 0,User_id,Date,itemDescription,year,month,day,day_of_week
0,2351,1/01/2014,cleaner,2014,1,1,2
1,2226,1/01/2014,sausage,2014,1,1,2
2,1922,1/01/2014,tropical fruit,2014,1,1,2
3,2943,1/01/2014,whole milk,2014,1,1,2
4,1249,1/01/2014,citrus fruit,2014,1,1,2


# Non-model-based approach

In [6]:
user_item_matrix = data.groupby(['User_id', 'itemDescription']).size().unstack(fill_value=0)

print(user_item_matrix.shape)
user_item_matrix.head()

(3493, 167)


itemDescription,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
User_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
1002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,0


In [7]:
from scipy.sparse import csr_matrix

# Convert to sparse matrix for efficiency
matrix = csr_matrix(user_item_matrix.values)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 18288 stored elements and shape (3493, 167)>

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

 # Calculate cosine similarity between users
user_similarity = cosine_similarity(matrix) #user and user
# Create user similarity DataFrame
user_similarity_df = pd.DataFrame(
    user_similarity,
    index=user_item_matrix.index,
    columns=user_item_matrix.index
)

user_similarity_df.head()

User_id,1000,1001,1002,1003,1004,1005,1006,1009,1010,1011,...,4988,4989,4990,4991,4992,4993,4995,4997,4999,5000
User_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000,1.0,0.348155,0.288675,0.0,0.428845,0.0,0.218218,0.0,0.0,0.235702,...,0.0,0.0,0.0,0.154303,0.0,0.0,0.0,0.408248,0.0,0.0
1001,0.348155,1.0,0.301511,0.213201,0.447914,0.246183,0.341882,0.0,0.090909,0.123091,...,0.0,0.213201,0.0,0.161165,0.123091,0.123091,0.0,0.426401,0.0,0.0
1002,0.288675,0.301511,1.0,0.0,0.371391,0.0,0.188982,0.0,0.0,0.0,...,0.447214,0.353553,0.0,0.400892,0.0,0.0,0.0,0.353553,0.25,0.25
1003,0.0,0.213201,0.0,1.0,0.328266,0.57735,0.267261,0.0,0.0,0.288675,...,0.0,0.5,0.0,0.188982,0.0,0.433013,0.0,0.0,0.176777,0.0
1004,0.428845,0.447914,0.371391,0.328266,1.0,0.303239,0.421117,0.0,0.055989,0.227429,...,0.0,0.262613,0.0,0.397033,0.07581,0.303239,0.092848,0.525226,0.092848,0.092848


In [9]:
user_id = 3247
n_neighbors = 10
top_n = 10

# Get similar users (excluding the target user)
user_idx = user_item_matrix.index.get_loc(user_id)
similar_users = user_similarity_df.iloc[user_idx].sort_values(ascending=False)[1:n_neighbors+1]

print(f"Top {n_neighbors} similar users to User {user_id}:")
for sim_user_id, similarity in similar_users.items():
    print(f"User {sim_user_id}: Similarity = {similarity:.4f}")

Top 10 similar users to User 3247:
User 4959: Similarity = 0.6629
User 2658: Similarity = 0.6250
User 2850: Similarity = 0.6250
User 3725: Similarity = 0.6187
User 3146: Similarity = 0.6187
User 4723: Similarity = 0.6187
User 3660: Similarity = 0.6124
User 1040: Similarity = 0.5893
User 4206: Similarity = 0.5784
User 2617: Similarity = 0.5745


In [10]:
# Get items that the target user hasn't purchased
user_items = set(user_item_matrix.columns[user_item_matrix.loc[user_id] > 0])
items_to_recommend = set(user_item_matrix.columns) - user_items
items_to_recommend

{'Instant food products',
 'abrasive cleaner',
 'artif. sweetener',
 'baby cosmetics',
 'bags',
 'baking powder',
 'bathroom cleaner',
 'beef',
 'berries',
 'beverages',
 'bottled beer',
 'bottled water',
 'brandy',
 'brown bread',
 'butter',
 'butter milk',
 'cake bar',
 'candles',
 'canned fish',
 'canned fruit',
 'canned vegetables',
 'cereals',
 'chewing gum',
 'chicken',
 'chocolate',
 'chocolate marshmallow',
 'citrus fruit',
 'cleaner',
 'cling film/bags',
 'cocoa drinks',
 'condensed milk',
 'cooking chocolate',
 'cookware',
 'cream',
 'cream cheese ',
 'curd',
 'curd cheese',
 'decalcifier',
 'dental care',
 'dessert',
 'detergent',
 'dish cleaner',
 'dog food',
 'domestic eggs',
 'female sanitary products',
 'finished products',
 'fish',
 'flour',
 'flower (seeds)',
 'flower soil/fertilizer',
 'frankfurter',
 'frozen chicken',
 'frozen dessert',
 'frozen fish',
 'frozen fruits',
 'frozen meals',
 'frozen potato products',
 'grapes',
 'hair spray',
 'ham',
 'hamburger meat',
 

In [11]:
# Calculate recommendation scores
recommendations = {}

for item in items_to_recommend:
    score = 0
    total_similarity = 0
    
    for sim_user_id, similarity in similar_users.items():
        # If similar user has purchased this item
        if user_item_matrix.loc[sim_user_id, item] > 0:
            score += similarity * user_item_matrix.loc[sim_user_id, item]
            total_similarity += similarity
    
    # Normalize score by total similarity if possible
    if total_similarity > 0:
        recommendations[item] = score / total_similarity # sum(similarity * rating)/sum(similarity)

# Sort recommendations by score
sorted_recommendations = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)

# Return top N recommendations
sorted_recommendations[:top_n]

[('chewing gum', 1.0),
 ('bottled beer', 1.0),
 ('white bread', 1.0),
 ('pasta', 1.0),
 ('oil', 1.0),
 ('ham', 1.0),
 ('margarine', 1.0),
 ('frozen meals', 1.0),
 ('rubbing alcohol', 1.0),
 ('liquor', 1.0)]

# 2. Model based approach

In [33]:

# Import necessary libraries from Surprise
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
import pandas as pd

# Load the data
data = pd.read_csv("data/Groceries data train.csv")
data = data.dropna()

# Convert the 'Date' column to datetime
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')

# Create a binary column for purchase (1 if the user bought the item, else 0)
data['purchase'] = 1  # Assuming each row represents a purchase

# Create the user-item interaction matrix
user_item_matrix = data.groupby(['User_id', 'itemDescription'])['purchase'].max().unstack(fill_value=0)

# Prepare the data for Surprise library
reader = Reader(rating_scale=(0, 1))  # Binary ratings (0 or 1)
data_surprise = Dataset.load_from_df(data[['User_id', 'itemDescription', 'purchase']], reader)

# Split data into training and testing sets (80% training, 20% testing)
trainset, testset = train_test_split(data_surprise, test_size=0.2)

# Build and train the SVD model
svd = SVD()
svd.fit(trainset)

# Make predictions on the testset
predictions = svd.test(testset)

# Evaluate the model performance using RMSE (Root Mean Squared Error)
rmse = accuracy.rmse(predictions)
print(f"RMSE of the SVD model: {rmse}")

# Get items that the target user hasn't purchased
user_items = set(user_item_matrix.columns[user_item_matrix.loc[user_id] > 0])
items_to_recommend = set(user_item_matrix.columns) - user_items

# Calculate recommendation scores
recommendations = {}
for item in items_to_recommend:
    # Predict if the user will buy the item
    pred = svd.predict(user_id, item)
    recommendations[item] = pred.est

# Sort recommendations by the predicted score
sorted_recommendations = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)

# Show the top N recommendations
print(f"Top {top_n} recommendations for User {user_id}:")
for item, score in sorted_recommendations[:top_n]:
    print(f"{item}: Predicted score = {score:.4f}")



RMSE: 0.0415
RMSE of the SVD model: 0.041528792726574386
Top 10 recommendations for User 3247:
chewing gum: Predicted score = 1.0000
bottled beer: Predicted score = 1.0000
liqueur: Predicted score = 1.0000
curd: Predicted score = 1.0000
newspapers: Predicted score = 1.0000
cocoa drinks: Predicted score = 1.0000
dog food: Predicted score = 1.0000
photo/film: Predicted score = 1.0000
condensed milk: Predicted score = 1.0000
tropical fruit: Predicted score = 1.0000


In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load data
df = pd.read_csv('data/Groceries data train.csv')  # or use pd.read_csv(io.StringIO(...)) if you're pasting it

# Encode user_id and itemDescription
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

df['user'] = user_encoder.fit_transform(df['User_id'])
df['item'] = item_encoder.fit_transform(df['itemDescription'])

# Preview
df[['User_id', 'itemDescription', 'user', 'item']].head()


Unnamed: 0,User_id,itemDescription,user,item
0,2351.0,cleaner,1170,31
1,2226.0,sausage,1062,130
2,1922.0,tropical fruit,807,156
3,2943.0,whole milk,1688,164
4,1249.0,citrus fruit,215,30


In [2]:
interactions = list(zip(df['user'], df['item']))
num_users = df['user'].nunique()
num_items = df['item'].nunique()

# Show a few examples
print("Sample interactions:", interactions[:5])
print("Users:", num_users, "Items:", num_items)

Sample interactions: [(1170, 31), (1062, 130), (807, 156), (1688, 164), (215, 30)]
Users: 3494 Items: 168


In [4]:
from sklearn.model_selection import train_test_split

# Split the interactions list into a training set and a test set, but in a user-aware way — meaning:

# Each user gets their own data split

# We ensure at least one test interaction per user (if possible)

# This is important for recommender systems to fairly evaluate generalization per user



# Convert to DataFrame for easier splitting
interactions_df = pd.DataFrame(interactions, columns=['user', 'item'])

# Group by user and split each user's items
train_interactions = []
test_interactions = []

for user_id, user_data in interactions_df.groupby('user'):
    items = user_data['item'].tolist()
    if len(items) < 2: # If a user has less than 2 items, there's no point in splitting — we keep all their items in training
        train_interactions.extend([(user_id, i) for i in items])
        continue
    train_items, test_items = train_test_split(items, test_size=1) # 1 item for testing
    train_interactions.extend([(user_id, i) for i in train_items])
    test_interactions.extend([(user_id, i) for i in test_items])

print(f"Train: {len(train_interactions)}, Test: {len(test_interactions)}")


Train: 23493, Test: 3492


In [6]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random
from tqdm import tqdm

class MLPDataset(Dataset):
    def __init__(self, interactions, num_users, num_items, num_negatives=4):
        self.data = []
        self.num_users = num_users
        self.num_items = num_items
        self.interactions = set(interactions)
        for (u, i) in interactions:
            self.data.append((u, i, 1))  # positive
            for _ in range(num_negatives):
                j = random.randint(0, num_items - 1)
                while (u, j) in self.interactions:
                    j = random.randint(0, num_items - 1)
                self.data.append((u, j, 0))  # negative

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        user, item, label = self.data[idx]
        return torch.tensor(user), torch.tensor(item), torch.tensor(label, dtype=torch.float32)

class MLPRec(nn.Module):
    def __init__(self, num_users, num_items, layers=[64,32,16,8]):
        super(MLPRec, self).__init__()
        self.embedding_user = nn.Embedding(num_users, layers[0] // 2)
        self.embedding_item = nn.Embedding(num_items, layers[0] // 2)
        
        mlp_layers = []
        input_size = layers[0]
        for layer_size in layers[1:]:
            mlp_layers.append(nn.Linear(input_size, layer_size))
            mlp_layers.append(nn.ReLU())
            input_size = layer_size
            
        self.mlp = nn.Sequential(*mlp_layers)
        self.output = nn.Linear(layers[-1], 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, user, item):
        user_emb = self.embedding_user(user)
        item_emb = self.embedding_item(item)
        x = torch.cat([user_emb, item_emb], dim=-1)
        x = self.mlp(x)
        x = self.output(x)
        return self.sigmoid(x).squeeze()


In [7]:
train_dataset = MLPDataset(train_interactions, num_users, num_items)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)

model = MLPRec(num_users, num_items)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

history = []

epochs = 10
for epoch in tqdm(range(epochs)):
    model.train()
    total_loss = 0
    for user, item, label in train_loader:
        optimizer.zero_grad()
        output = model(user, item)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    history.append(total_loss)


100%|██████████| 10/10 [00:44<00:00,  4.50s/it]


In [8]:
from sklearn.metrics import mean_squared_error
import numpy as np

model.eval()
true_labels = []
pred_scores = []

with torch.no_grad():
    for (u, i) in train_interactions:
        user_tensor = torch.tensor([u])
        item_tensor = torch.tensor([i])
        pred = model(user_tensor, item_tensor).item()
        pred_scores.append(pred)
        true_labels.append(1.0)  # All test interactions are positive

rmse = np.sqrt(mean_squared_error(true_labels, pred_scores))
print(f"Train RMSE: {rmse:.4f}")

Train RMSE: 0.5108


In [9]:
model.eval()
true_labels = []
pred_scores = []

with torch.no_grad():
    for (u, i) in test_interactions:
        user_tensor = torch.tensor([u])
        item_tensor = torch.tensor([i])
        pred = model(user_tensor, item_tensor).item()
        pred_scores.append(pred)
        true_labels.append(1.0)  # All test interactions are positive

rmse = np.sqrt(mean_squared_error(true_labels, pred_scores))
print(f"Test RMSE: {rmse:.4f}")

Test RMSE: 0.6664
