In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'movielens-20m-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F339%2F77759%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240908%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240908T041443Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Db8e086afecdbd3416ede04217779ee00526fc1ec42810d4c921e373a4d4e148e1810e6914e5abb6192627c1e4383d5d162dbf45c5b9ba56af4428346366b7a52c573b54dfd6fce8b51e2a361512abf4fb73485562c4373b384a1cafbf97cd52483016cc9bdc80ce8187bb0dae0330e19845e8bfe0b45a53086a9f0dec2d44bd2605145df3db51fa61d251d394234be70452637b9fdca666fcfce2835255066472d4d805d6c8251063f8774fb948aba69c2dcc2907e128b6e829f1528965f6fa5f3fae66f6315199e6ddad5896ec96413cdb0b05273e23f48d6e36c95e5f7d49246ec5f56ba7687b7267591fb50d897c4ee23a3baead625fe3ee0febc681a2e62,d-recommendation/pytorch/v.1.0/1:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-models-data%2F72017%2F85725%2Fbundle%2Farchive.tar.gz%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240908%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240908T041443Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D59eacb17ab1cc2844f8c123402f920a21373b2761fe3e9a650486f45da8e3f0f534206b6a428b1efdd40719fb9be27330080da2a8105b44fa7918827129e0c09e7f4de9758c4d47ce85bd4e95a24d3110205200e22961bfc5cdf890b207a1de02fba17bf17957bdc46ba119686d1ece59434c81a7ae796a2adf52ff642685387871e1781fb9b7af54537d75dae6d37288ac41eb84849b98c4af468d19008fa382817a6681da5b7cc9ba97beb500de645fedc6cd3cff8afd7f883fe01db7f90c6d2e4d0eb43be941221b30e1ad7b2b09fd3dd0668c474c233b9590c07027d6b210c83c6cc4b562867420b48459a605def51cf0b3dc56cfe7a3dfc4299d709225e'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
# Start from a clean input directory, then (re)create the input and working
# directories. exist_ok=True makes re-running this cell harmless.
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Create ../input and ../working symlinks pointing at the directories above.
# A link left over from a previous run is tolerated rather than treated as an error.
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into KAGGLE_INPUT_PATH and unpack it.
# Each DATA_SOURCE_MAPPING entry has the form "<directory>:<percent-encoded URL>".
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so the directory/URL pair is always two parts.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                # The server may omit Content-Length; in that case keep the
                # byte counter but leave the progress bar empty instead of crashing.
                done = int(50 * dl / int(total_length)) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk and rewind so the archive readers see
            # the complete download from its first byte.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              # Read from the already-open handle and bind the archive to its own
              # name so the `tarfile` module is not shadowed for later iterations.
              with tarfile.open(fileobj=tfile) as tar_archive:
                tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
# !wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.9.2-Linux-x86_64.sh
# !chmod +x Miniconda3-py37_4.9.2-Linux-x86_64.sh
# !bash ./Miniconda3-py37_4.9.2-Linux-x86_64.sh -b -f -p /usr/local

In [None]:
# !conda install pytorch cudatoolkit=11.3 -c pytorch-nightly -y

In [None]:
# !pip3 install torchrec-nightly torchrec

In [None]:
# !pip install sklearn

In [None]:
# !cp /usr/local/lib/lib* /usr/lib/

In [None]:
# #Restart Colab runtime before executing this
# import sys
# sys.path = ['', '/env/python', '/usr/local/lib/python37.zip', '/usr/local/lib/python3.7',
#             '/usr/local/lib/python3.7/lib-dynload', '/usr/local/lib/python3.7/site-packages']


In [None]:
# !nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Mon_Apr__3_17:16:06_PDT_2023
Cuda compilation tools, release 12.1, V12.1.105
Build cuda_12.1.r12.1/compiler.32688072_0


In [None]:
# !pip install -q torch fbgemm-gpu torchrec

In [None]:
# import os
# import torch
# import torchrec
# import torch.distributed as dist

# os.environ["RANK"] = "0"
# os.environ["WORLD_SIZE"] = "1"
# os.environ["MASTER_ADDR"] = "localhost"
# os.environ["MASTER_PORT"] = "29500"

# dist.init_process_group(backend="gloo")


# Using Torchrec to retrieve embeddings from a DistributedParallelModel using the KJT minibatch

https://colab.research.google.com/gist/dhruvrnaik/2acd7289df0885184e7e96d38eb153e7/torchrec-on-movie-rating-dataset.ipynb

Dataset : https://www.kaggle.com/datasets/grouplens/movielens-20m-dataset

Source : https://www.kaggle.com/code/jamesloy/deep-learning-based-recommender-systems#Deep-Learning-based-Recommender-System

Source : https://www.kaggle.com/code/willkoehrsen/neural-network-embedding-recommendation-system#Introduction:-Book-Recommendation-System

In [None]:


import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import torch
import torch.nn as nn


np.random.seed(123)


In [None]:
import psutil
import os

# Function to check memory usage
def memory_usage_of_df(df):
    """Return the deep memory footprint of ``df`` (index included) in MiB."""
    total_bytes = df.memory_usage(deep=True).sum()
    bytes_per_mib = 1024**2
    return total_bytes / bytes_per_mib

# Display memory usage of the DataFrame
def print_memory_usage():
    """Print the resident set size (RSS) of the current process in MB."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    print(f"Memory usage: {rss_bytes / (1024**2):.2f} MB")

# Load data in chunks and sample userIds
# Two-pass load: pass 1 reads only the userId column to find every user,
# pass 2 re-reads the file and keeps only the rows of a 10% user sample.
# Both passes stream the CSV in chunks so the full file is never held in memory.
chunk_size = 10**6  # Adjust chunk size according to system's memory capacity
user_ids = set()

print("Memory usage before loading user IDs:")
print_memory_usage()

# Iterate through chunks to collect unique user IDs
for chunk in pd.read_csv('/kaggle/input/movielens-20m-dataset/rating.csv',
                         chunksize=chunk_size, usecols=['userId']):
    user_ids.update(chunk['userId'].unique())

# Sample 10% of unique userIds (without replacement).
# NOTE(review): `user_ids` changes type here (set -> list); the sample drawn
# depends on the global NumPy RNG state at the time this cell runs.
user_ids = list(user_ids)
sampled_user_ids = np.random.choice(user_ids, size=int(len(user_ids) * 0.1), replace=False)

print("Memory usage after sampling user IDs:")
print_memory_usage()

# Load data again, this time only for sampled userIds.
# With chunksize set, read_csv returns an iterator of DataFrames, not a DataFrame.
ratings = pd.read_csv('/kaggle/input/movielens-20m-dataset/rating.csv',
                      parse_dates=['timestamp'],
                      chunksize=chunk_size)

# Filter data based on sampled userIds and concatenate chunks
filtered_chunks = []

print("Memory usage before filtering data:")
print_memory_usage()

# Keep only the rows whose userId is in the sample; collect the per-chunk
# results in a list so they can be concatenated once at the end.
for chunk in ratings:
    filtered_chunk = chunk[chunk['userId'].isin(sampled_user_ids)]
    filtered_chunks.append(filtered_chunk)

# Combine all chunks into a single DataFrame
filtered_ratings = pd.concat(filtered_chunks, ignore_index=True)

print("Memory usage after concatenating filtered data:")
print_memory_usage()

# Print the number of rows and unique users
print(f'There are {len(filtered_ratings)} rows of data from {len(sampled_user_ids)} users')

# Display the first 5 rows
print(filtered_ratings.head())

# Memory usage of the final DataFrame
print(f"Memory usage of filtered_ratings DataFrame: {memory_usage_of_df(filtered_ratings):.2f} MB")


Memory usage before loading user IDs:
Memory usage: 322.25 MB
Memory usage after sampling user IDs:
Memory usage: 356.51 MB
Memory usage before filtering data:
Memory usage: 357.43 MB
Memory usage after concatenating filtered data:
Memory usage: 532.67 MB
There are 2030571 rows of data from 13849 users
   userId  movieId  rating           timestamp
0       3        1     4.0 1999-12-11 13:36:47
1       3       24     3.0 1999-12-14 12:54:08
2       3       32     4.0 1999-12-11 13:14:07
3       3       50     5.0 1999-12-11 13:13:38
4       3      160     3.0 1999-12-14 12:54:08
Memory usage of filtered_ratings DataFrame: 61.97 MB


In [None]:
# Work on an independent deep copy so later in-place edits to `ratings`
# (e.g. adding columns) never touch `filtered_ratings`.
ratings = filtered_ratings.copy(deep=True)

## Train-test split

We use a time-based split: earlier ratings are used for training and the latest rating is held out for testing or validation. **For each user, the most recent review is used as the test set (i.e. leave one out), while the rest will be used as training data.**

## Converting the dataset into an implicit feedback dataset

As discussed earlier, we will train a recommender system using implicit feedback. However, the MovieLens dataset that we're using is based on explicit feedback. To convert this dataset into an implicit feedback dataset, we'll simply binarize the ratings such that they are all '1' (i.e. positive class). **The value of '1' represents that the user has interacted with the item.**

It is important to note that using implicit feedback reframes the problem that our recommender is trying to solve. **Instead of trying to predict movie ratings (when using explicit feedback), we are trying to predict whether the user will interact (i.e. click/buy/watch) with each movie, with the aim of presenting to users the movies with the highest interaction likelihood.**


In [None]:
# Rank each user's ratings from newest (1) to oldest; method='first' breaks
# timestamp ties by row order so exactly one row per user gets rank 1.
ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'].rank(method='first', ascending=False)

# Leave-one-out split: the newest rating per user is the test row, the rest train.
# Only the needed columns are kept, and explicit copies are taken so the
# assignment below writes to an independent frame (no SettingWithCopyWarning).
kept_columns = ['userId', 'movieId', 'rating']
train_ratings = ratings.loc[ratings['rank_latest'] != 1, kept_columns].copy()
test_ratings = ratings.loc[ratings['rank_latest'] == 1, kept_columns].copy()

# Binarize: every observed training interaction becomes the positive class.
train_ratings.loc[:, 'rating'] = 1

train_ratings.sample(5)

Unnamed: 0,userId,movieId,rating
287750,19107,1466,1.0
1696350,115574,6127,1.0
1545058,105607,422,1.0
294224,19624,28,1.0
351198,23188,1225,1.0




We do have a problem now though. After binarizing our dataset, we see that every sample in the dataset now belongs to the positive class. However, we also require negative samples to train our models, to indicate movies that the user has not interacted with. We assume that such movies are those that the user is not interested in - even though this is a sweeping assumption that may not be true, it usually works out rather well in practice.

The `MovieLensTrainDataset` class defined later in this notebook generates 4 negative samples for each row of data. In other words, the ratio of negative to positive samples is 4:1.

In [None]:
test_ratings.head(5)

Unnamed: 0,userId,movieId,rating
5,3,173,2.0
458,11,5971,5.0
704,22,303,3.0
849,30,6378,2.5
874,36,58293,3.5


# Building dataset for Torchrec

In [None]:
# Find common userIds present in both dataframes
common_user_ids = set(train_ratings["userId"]).intersection(set(test_ratings["userId"]))


def _movies_by_user(frame, user_ids):
    """Map each userId in `user_ids` to the list of its movieIds, in row order.

    A single groupby pass replaces one full boolean scan of `frame` per user.
    Users appear in the dict in order of first appearance in `frame`.
    """
    kept = frame[frame["userId"].isin(list(user_ids))]
    return kept.groupby("userId", sort=False)["movieId"].apply(list).to_dict()


train_data = _movies_by_user(train_ratings, common_user_ids)
test_data = _movies_by_user(test_ratings, common_user_ids)

print(len(train_data))
# Peek at the first 10 held-out (user, [movie]) entries.
for k, v in list(test_data.items())[:10]:
    print(k, v)

  0%|          | 0/13849 [00:00<?, ?it/s]

13849


65537 [1214]
32771 [54281]
3 [173]
32773 [12]
32774 [5]
131082 [4996]
11 [5971]
65556 [3238]
98324 [88672]
22 [303]


In [None]:
# embedding_collection = torchrec.EmbeddingBagCollection(
#     device="meta",
#     tables=[
#         torchrec.EmbeddingBagConfig(
#             name="userId_table",
#             embedding_dim=64,
#             num_embeddings=MAX_USERS,
#             feature_names=["userId"],
#             pooling=torchrec.PoolingType.SUM,
#         ),
#         torchrec.EmbeddingBagConfig(
#             name="movieId_table",
#             embedding_dim=64,
#             num_embeddings=len(movie_encoder.classes_),
#             feature_names=["movieId"],
#             pooling=torchrec.PoolingType.SUM,
#         )
#     ]
# )

In [None]:
# model = torchrec.distributed.DistributedModelParallel(embedding_collection, device=torch.device("cpu"))
# print(model)
# print(model.plan)

In [None]:
# Take the last three (userId, [movieIds]) entries as a small demo batch.
sample = list(train_data.items())[-3:]
userids = [user_id for user_id, _ in sample]
movies_seen = [movie_list for _, movie_list in sample]

# Flatten into the (values, lengths) pair used by the KeyedJaggedTensor sketch:
# `values` holds the user ids first, followed by each user's movie ids;
# `lengths` holds a 1 per user id, followed by the length of each movie list.
values = list(userids)  # shallow copy is enough: the elements are plain ids
lengths = [1] * len(values)
for movie_list in movies_seen:
    values.extend(movie_list)
    lengths.append(len(movie_list))


# kjt = torchrec.KeyedJaggedTensor(
#     keys = ["userId","movieId"],
#     values = torch.tensor(values).cpu(),
#     lengths = torch.tensor(lengths, dtype=torch.int64).cpu(),
# )

# print(kjt.to(torch.device("cpu")))

# pooled_embeddings = model(kjt)
# print(pooled_embeddings)

In [None]:
# from tqdm import tqdm
# from multiprocessing import Pool, cpu_count
# from functools import partial

# # Convert train_ratings to a set for fast lookup
# user_item_set = set(zip(train_ratings['userId'], train_ratings['movieId']))

# # Function to generate negative samples for a single user
# def generate_negative_samples(user, user_item_set, all_movieIds, num_negatives):
#     negative_samples = []
#     positives = set(train_ratings[train_ratings['userId'] == user]['movieId'])

#     while len(negative_samples) < num_negatives:
#         negative_item = np.random.choice(all_movieIds)
#         if negative_item not in positives and (user, negative_item) not in user_item_set:
#             negative_samples.append(negative_item)

#     return [(user, neg_item, 0) for neg_item in negative_samples]

# # Generate negative samples using multiprocessing
# num_negatives = 4
# all_movieIds = ratings['movieId'].unique()

# with Pool(cpu_count()) as p:
#     negative_samples_list = list(tqdm(p.imap(partial(generate_negative_samples,
#                                                       user_item_set=user_item_set,
#                                                       all_movieIds=all_movieIds,
#                                                       num_negatives=num_negatives),
#                                               set(train_ratings['userId'])), total=len(set(train_ratings['userId']))))

# # Flatten negative_samples_list
# negative_samples = [item for sublist in negative_samples_list for item in sublist]

# # Generate training data
# train_data = list(user_item_set)
# train_data.extend(negative_samples)

# users, items, labels = zip(*train_data)
# users = np.array(users)
# items = np.array(items)
# labels = np.array(labels)


# Neural Collaborative Filtering (NCF)

## User Embeddings

In this embedding, users with similar movie preferences are placed near to each other, and vice versa.

## Learned Embeddings

Similarly, we will use a separate item embedding layer to represent the traits of the items (i.e. movies) in a lower dimensional space.

**How can we learn the weights of the embedding layer, such that it provides an accurate representation of users and items?**

> **Collaborative Filtering** - by using the ratings dataset, we can identify similar users and movies, creating user and item embeddings learned from existing ratings.

## Model Architecture


| userId | movieId | interacted |
|--------|---------|------------|
| 3      |  1      |   1        |

The inputs to the model are the one-hot encoded user and item vectors for userId = 3 and movieId = 1. The user input vector and item input vector are fed to the user embedding and item embedding respectively, which results in smaller, denser user and item vectors.

The embedded user and item vectors are concatenated before passing through a series of fully connected layers, which maps the concatenated embeddings into a prediction vector as output. Finally, we apply a Sigmoid function to obtain the most probable class. In the example above, the most probable class is 1 (positive class), since 0.8 > 0.2.

![NCF](https://i.imgur.com/cNWbIce.png)



In [None]:
from torch.utils.data import Dataset, DataLoader

class MovieLensTrainDataset(Dataset):
    """Training dataset of (user, item, label) triples with sampled negatives.

    Every observed (userId, movieId) pair yields one positive example
    (label 1) followed by four randomly drawn negatives (label 0).

    Args:
        ratings (pd.DataFrame): Dataframe with ``userId`` and ``movieId`` columns
        all_movieIds (list): Candidate movieIds to draw negatives from

    """

    def __init__(self, ratings, all_movieIds):
        triples = self.get_dataset(ratings, all_movieIds)
        self.users, self.items, self.labels = triples

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings, all_movieIds):
        """Build the user, item and label tensors, negatives included."""
        positives = set(zip(ratings['userId'], ratings['movieId']))
        negatives_per_positive = 4

        user_col, item_col, label_col = [], [], []
        for user, movie in tqdm(positives):
            user_col.append(user)
            item_col.append(movie)
            label_col.append(1)

            accepted = 0
            while accepted < negatives_per_positive:
                candidate = np.random.choice(all_movieIds)
                if (user, candidate) in positives:
                    # Already an observed interaction: draw again.
                    continue
                user_col.append(user)
                item_col.append(candidate)
                label_col.append(0)
                accepted += 1

        return torch.tensor(user_col), torch.tensor(item_col), torch.tensor(label_col)

In [None]:
class NCF(nn.Module):
    """Neural Collaborative Filtering model.

    User and item ids are embedded into 8-dimensional vectors, concatenated,
    passed through two ReLU hidden layers (64 and 32 units) and squashed by a
    sigmoid into one interaction probability per (user, item) pair.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        ratings: Stored on the module as ``self.ratings``; not used by ``forward``.
        all_movieIds: Stored as ``self.all_movieIds``; not used by ``forward``.
    """

    def __init__(self, num_users, num_items, ratings, all_movieIds):
        super().__init__()
        embedding_dim = 8
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        self.fc1 = nn.Linear(2 * embedding_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.output = nn.Linear(32, 1)
        self.ratings = ratings
        self.all_movieIds = all_movieIds

    def forward(self, user_input, item_input):
        """Return the predicted interaction probability for each id pair."""
        joint = torch.cat(
            (self.user_embedding(user_input), self.item_embedding(item_input)),
            dim=-1,
        )
        hidden = self.fc1(joint).relu()
        hidden = self.fc2(hidden).relu()
        return self.output(hidden).sigmoid()

In [None]:
import torch.optim as optim

# Raw ids are used directly as embedding indices, so each table needs
# (largest id + 1) rows for the largest id to be a valid index.
num_users = ratings['userId'].max() + 1
num_items = ratings['movieId'].max() + 1
# Candidate pool for negative sampling and for evaluation.
all_movieIds = ratings['movieId'].unique()

# Initialize the model
model = NCF(num_users, num_items, ratings, all_movieIds)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# BCELoss expects probabilities, which matches the sigmoid at the end of NCF.forward.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters())
model

NCF(
  (user_embedding): Embedding(138492, 8)
  (item_embedding): Embedding(131159, 8)
  (fc1): Linear(in_features=16, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (output): Linear(in_features=32, out_features=1, bias=True)
)

In [None]:

from torch.utils.data import DataLoader

# Prepare data loaders
# Build the dataset from train_ratings only: `ratings` also contains each
# user's held-out test interaction, which must not be seen during training.
train_dataset = MovieLensTrainDataset(train_ratings, all_movieIds)
# train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=4)

# train_loader

  0%|          | 0/2030571 [00:00<?, ?it/s]

In [None]:
import logging


# Setup logging
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger('TrainingLogger')


# Log model architecture
def log_model_summary(model):
    """Log the model's repr and its total parameter count (in millions)."""
    logger.info(f"Model Architecture:\n{model}")
    param_count = 0
    for tensor in model.parameters():
        param_count += tensor.numel()
    logger.info(f"Total Parameters: {param_count / 1e6:.2f} M")

log_model_summary(model)

In [None]:
# Training loop
num_epochs = 3

for epoch in range(num_epochs):
    # Build a fresh dataset every epoch so a new set of negatives is sampled.
    # Train on train_ratings only: `ratings` also holds each user's held-out
    # test interaction, and training on it would leak the evaluation target.
    train_loader = DataLoader(MovieLensTrainDataset(train_ratings, all_movieIds), batch_size=512, shuffle=True, num_workers=4)

    model.train()
    running_loss = 0.0

    for user_input, item_input, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        user_input, item_input, labels = user_input.to(device), item_input.to(device), labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(user_input, item_input)

        # Calculate loss; labels are reshaped to (batch, 1) floats to match the
        # model output expected by BCELoss.
        loss = criterion(outputs, labels.view(-1, 1).float())
        loss.backward()

        # Optimize
        optimizer.step()

        # Weight the batch-mean loss by batch size so the epoch average is per-sample.
        running_loss += loss.item() * user_input.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')
    logger.info(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')

  0%|          | 0/2030571 [00:00<?, ?it/s]

Epoch 1/3:   0%|          | 0/19830 [00:00<?, ?it/s]

Epoch 1/3, Loss: 0.2393


  0%|          | 0/2030571 [00:00<?, ?it/s]

Epoch 2/3:   0%|          | 0/19830 [00:00<?, ?it/s]

Epoch 2/3, Loss: 0.2329


  0%|          | 0/2030571 [00:00<?, ?it/s]

Epoch 3/3:   0%|          | 0/19830 [00:00<?, ?it/s]

Epoch 3/3, Loss: 0.2285


In [None]:
# Save model checkpoint: only the parameters (state_dict) are written, to
# 'ncf_model.pth' in the current working directory.
torch.save(model.state_dict(), 'ncf_model.pth')


# Evaluating Recommender System

The key here is that we don't need the user to interact with every single item in the list of recommendations. Instead, we just need the user to interact with at least one item on the list - as long as the user does that, the recommendations have worked.

To simulate this, let's run the following evaluation protocol to generate a list of 10 recommended items for each user.

        For each user, randomly select 99 items that the user has not interacted with
        Combine these 99 items with the test item (the actual item that the user interacted with). We now have 100 items.
        Run the model on these 100 items, and rank them according to their predicted probabilities
        Select the top 10 items from the list of 100 items. If the test item is present within the top 10 items, then we say that this is a hit.
        Repeat the process for all users. The Hit Ratio is then the average hits.
        This evaluation protocol is known as Hit Ratio @ 10, and it is commonly used to evaluate recommender systems.


**Hit Ratio @ 10 - What % of the users were recommended the actual item (among a list of 10 items) that they eventually interacted with**


In [None]:
# Load model checkpoint
# NOTE(review): this loads a checkpoint from the Kaggle input directory, not the
# 'ncf_model.pth' written by the save cell above, so the weights just trained in
# this session are replaced - confirm that is intended.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source (or consider weights_only=True if the installed torch supports it).
model.load_state_dict(torch.load('/kaggle/input/d-recommendation/pytorch/v.1.0/1/ncf_model.pth',map_location=torch.device('cpu')))

# Tensors are first mapped to CPU on load, then the model is moved to the GPU
# when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

NCF(
  (user_embedding): Embedding(138492, 8)
  (item_embedding): Embedding(131159, 8)
  (fc1): Linear(in_features=16, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (output): Linear(in_features=32, out_features=1, bias=True)
)

In [None]:



# Convert test ratings to a set of user-item pairs
test_user_item_set = set(zip(test_ratings['userId'], test_ratings['movieId']))

# Dict of all items interacted with by each user
user_interacted_items = ratings.groupby('userId')['movieId'].apply(list).to_dict()

# The candidate pool is the same for every user: build the set once rather
# than once per loop iteration.
all_movie_id_set = set(all_movieIds)

# Inference only: put the model in evaluation mode.
model.eval()

hits = []
for (u, i) in tqdm(test_user_item_set):
    # Get the items interacted with by user u
    interacted_items = user_interacted_items.get(u, [])
    not_interacted_items = all_movie_id_set - set(interacted_items)

    # Randomly sample 99 non-interacted items and append the held-out item,
    # giving 100 candidates to rank.
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99, replace=False))
    test_items = selected_not_interacted + [i]

    # Prepare inputs for the model: the same user paired with every candidate
    user_tensor = torch.tensor([u] * len(test_items)).to(device)
    item_tensor = torch.tensor(test_items).to(device)

    # Model inference
    with torch.no_grad():
        predicted_labels = model(user_tensor, item_tensor).cpu().numpy().flatten()

    # Get top 10 items based on predicted labels (argsort is ascending, so reverse)
    top10_items = [test_items[j] for j in np.argsort(predicted_labels)[::-1][:10]]

    # Check if the true item is in the top 10 recommendations
    hits.append(1 if i in top10_items else 0)

# Calculate and print Hit Ratio @ 10
hit_ratio_at_10 = np.mean(hits)
print(f"The Hit Ratio @ 10 is {hit_ratio_at_10:.2f}")


  0%|          | 0/13849 [00:00<?, ?it/s]

The Hit Ratio @ 10 is 0.77


# Get similar movie recommendation

In [None]:
def get_item_embeddings(model):
    """Extract the item embedding table from the model as a NumPy array.

    Args:
        model: Module exposing an ``item_embedding`` layer with a ``weight`` tensor.

    Returns:
        np.ndarray: A snapshot of the embedding weights, one row per item index.

    Side effects:
        Switches ``model`` to evaluation mode.
    """
    # Set model to evaluation mode
    model.eval()

    # detach() is required: the weight is a Parameter that requires grad, and
    # when the model already lives on the CPU `.cpu()` returns that same tensor,
    # on which `.numpy()` raises. copy() decouples the result from the live
    # weights (on CPU the array would otherwise share their memory).
    item_embeddings = model.item_embedding.weight.detach().cpu().numpy().copy()

    return item_embeddings

# Get item embeddings
item_embeddings = get_item_embeddings(model)


## Approximate Nearest Neighbour Search

In [None]:
!pip install -qU faiss-cpu

In [None]:
import faiss

def compute_similarities_faiss(item_embeddings, k=10):
    """Find each item's nearest neighbours by L2 distance with a Faiss flat index.

    Every embedding is used as a query against an ``IndexFlatL2`` built from
    the same embeddings (a flat index performs an exhaustive search).

    Args:
        item_embeddings: 2-D array-like of item vectors, one row per item.
        k: Number of neighbours wanted per item; ``k + 1`` are retrieved so
            the caller can drop the query item itself.

    Returns:
        tuple: ``(distances, indices)`` as returned by ``index.search``.
    """
    vectors = np.array(item_embeddings, dtype=np.float32)

    # Flat (brute-force) L2 index over the embedding dimension.
    l2_index = faiss.IndexFlatL2(vectors.shape[1])
    l2_index.add(vectors)

    # One extra neighbour per query leaves room for the item itself.
    return l2_index.search(vectors, k + 1)

# Compute approximate nearest neighbors with Faiss
k = 10
distances, indices = compute_similarities_faiss(item_embeddings, k=k)


In [None]:
# from sklearn.metrics.pairwise import cosine_similarity

# def compute_similarities(item_embeddings):
#     """Compute similarity matrix from item embeddings."""
#     similarity_matrix = cosine_similarity(item_embeddings)
#     return similarity_matrix

# # Compute similarity matrix
# similarity_matrix = compute_similarities(item_embeddings)

In [None]:
def recommend_similar_movies(movie_id, movie_ids, distances, indices, k=10):
    """Recommend similar movies to a given movie.

    Args:
        movie_id: Id of the query movie; must be present in ``movie_ids``.
        movie_ids (list): Movie ids, where position ``p`` corresponds to row
            ``p`` of ``indices``.
        distances: Neighbour distances; accepted for interface compatibility
            but not used for the lookup.
        indices: 2-D array of neighbour row indices, one row per movie,
            ordered from nearest to farthest.
        k: Maximum number of similar movies to return.

    Returns:
        list: Up to ``k`` movie ids, nearest first, never including ``movie_id``.

    Raises:
        ValueError: If ``movie_id`` is not in ``movie_ids``.
    """
    if movie_id not in movie_ids:
        raise ValueError(f"Movie ID {movie_id} is not in the movie IDs list.")

    # Get the index of the movie
    movie_index = movie_ids.index(movie_id)

    # Drop the query by value rather than by position: with tied distances
    # (e.g. identical embeddings) the query is not guaranteed to come first.
    # Negative entries are "no neighbour" padding and must not be treated as
    # list offsets from the end of movie_ids.
    neighbour_rows = [
        int(row)
        for row in indices[movie_index]
        if row != movie_index and row >= 0
    ]

    # Map the nearest k remaining rows back to movie ids
    return [movie_ids[row] for row in neighbour_rows[:k]]




In [None]:
# Example usage
# Position p in movie_ids stands for row p of the embedding table, so the
# "movie id" here is the embedding row index itself.
movie_ids = list(range(len(item_embeddings)))  # Create a list of movie IDs (adjust as needed)
movie_id_to_recommend = 1  # Example movie ID

# Get similar movies
similar_movies = recommend_similar_movies(movie_id_to_recommend, movie_ids, distances, indices, k=10)

print(f"Movies similar to movie ID {movie_id_to_recommend}: {similar_movies}")

Movies similar to movie ID 1: [377, 29742, 173, 1198, 44129, 1197, 1240, 91078, 90331, 42945]


In [None]:
# Example usage (same lookup as the previous cell, for a different query movie)
# NOTE(review): movie_ids is rebuilt identically to the previous cell; a small
# loop over the query ids would avoid the duplicated cell.
movie_ids = list(range(len(item_embeddings)))  # Create a list of movie IDs (adjust as needed)
movie_id_to_recommend = 3  # Example movie ID

# Get similar movies
similar_movies = recommend_similar_movies(movie_id_to_recommend, movie_ids, distances, indices, k=10)

print(f"Movies similar to movie ID {movie_id_to_recommend}: {similar_movies}")

Movies similar to movie ID 3: [71469, 76370, 91984, 22908, 126212, 112703, 96427, 46499, 88673, 130188]


In [None]:
#