In [55]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

np.random.seed(123)

In [56]:
user = pd.read_csv('/Users/salonivora/Documents/Machine Learning/Music Recommendation System/dataSet/ydata-ymusic-user-artist-ratings-v1_0.txt',sep='\s+',header=None)
user.columns=['user_id','artist_id','rating']

user.count()

#user.drop(user.tail(1155794).index,
#        inplace = True)
        
#user.head()

user_id      115579440
artist_id    115579440
rating       115579440
dtype: int64

In [57]:
rand_userIds = np.random.choice(user['user_id'].unique(), 
                                size=int(len(user['user_id'].unique())*0.005), 
                                replace=False)

user = user.loc[user['user_id'].isin(rand_userIds)]

print('There are {} rows of data from {} users'.format(len(user), len(rand_userIds)))

There are 584452 rows of data from 9744 users


In [58]:
user['rank_latest'] = user.groupby(['user_id'])['rating'].rank(method='first', ascending=False)


In [59]:
train_ratings = user[user['rank_latest'] != 1]
test_ratings = user[user['rank_latest'] == 1]

In [60]:
train_ratings = train_ratings[['user_id', 'artist_id', 'rating']]
test_ratings = test_ratings[['user_id', 'artist_id', 'rating']]

In [61]:
train_ratings.loc[:, 'rating'] <=20

train_ratings.sample(5)

Unnamed: 0,user_id,artist_id,rating
43751338,735525,1010620,0
39525774,664358,1097613,90
6067607,100631,1026648,21
1242467,20775,1009352,30
51746840,869227,1008451,90


In [62]:
# Get a list of all movie IDs
all_artistIds = user['artist_id'].unique()

# Placeholders that will hold the training data
users, items, labels = [], [], []

# This is the set of items that each user has interaction with
user_item_set = set(zip(train_ratings['user_id'], train_ratings['artist_id']))

# 4:1 ratio of negative to positive samples
num_negatives = 4

for (u, i) in tqdm(user_item_set):
    users.append(u)
    items.append(i)
    labels.append(1) # items that the user has interacted with are positive
    for _ in range(num_negatives):
        # randomly select an item
        negative_item = np.random.choice(all_artistIds) 
        # check that the user has not interacted with this item
        while (u, negative_item) in user_item_set:
            negative_item = np.random.choice(all_artistIds)
        users.append(u)
        items.append(negative_item)
        labels.append(0) # items not interacted with are negative

  0%|          | 0/574708 [00:00<?, ?it/s]

In [63]:
class MovieLensTrainDataset(Dataset):
    """MovieLens PyTorch Dataset for Training
    
    Args:
        ratings (pd.DataFrame): Dataframe containing the movie ratings
        all_artistIdss (list): List containing all movieIds
    
    """

    def __init__(self, ratings, all_artistIdss):
        self.users, self.items, self.labels = self.get_dataset(ratings, all_artistIdss)

    def __len__(self):
        return len(self.users)
  
    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings, all_artistIdss):
        users, items, labels = [], [], []
        user_item_set = set(zip(ratings['user_id'], ratings['artist_id']))

        num_negatives = 4
        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
            for _ in range(num_negatives):
                negative_item = np.random.choice(all_artistIdss)
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_artistIdss)
                users.append(u)
                items.append(negative_item)
                labels.append(0)

        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

In [64]:
class NCF(pl.LightningModule):
    """ Neural Collaborative Filtering (NCF)
    
        Args:
            num_users (int): Number of unique users
            num_items (int): Number of unique items
            ratings (pd.DataFrame): Dataframe containing the movie ratings for training
            all_artistIds (list): List containing all movieIds (train + test)
    """
    
    def __init__(self, num_users, num_items, ratings, all_artistIds):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=8)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=8)
        self.fc1 = nn.Linear(in_features=16, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.ratings = ratings
        self.all_artistIds = all_artistIds
        
    def forward(self, user_input, item_input):
        
        # Pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        # Concat the two embedding layers
        vector = torch.cat([user_embedded, item_embedded], dim=-1)

        # Pass through dense layer
        vector = nn.ReLU()(self.fc1(vector))
        vector = nn.ReLU()(self.fc2(vector))

        # Output layer
        pred = nn.Sigmoid()(self.output(vector))

        return pred

    def training_step(self, batch, batch_idx):
        user_input, item_input, labels = batch
        predicted_labels = self(user_input, item_input)
        loss = nn.BCELoss()(predicted_labels, labels.view(-1, 1).float())
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        return DataLoader(MovieLensTrainDataset(self.ratings, self.all_artistIds),
                          batch_size=512, num_workers=0)

In [65]:
num_users = user['user_id'].max()+1
num_items = user['artist_id'].max()+1

all_artistIds = user['artist_id'].unique()

model = NCF(num_users, num_items, train_ratings, all_artistIds)

In [66]:
import pickle

# Save the Modle to file in the current working directory

Pkl_Filename = "/Users/salonivora/Documents/Machine Learning/Music Recommendation System/Pickle_NCF_Model.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(model, file)


In [67]:
trainer = pl.Trainer(max_epochs=5,gpus=None ,reload_dataloaders_every_n_epochs=True, progress_bar_refresh_rate=50, logger=False, checkpoint_callback=False)

trainer.fit(model)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name           | Type      | Params
---------------------------------------------
0 | user_embedding | Embedding | 15.6 M
1 | item_embedding | Embedding | 8.8 M 
2 | fc1            | Linear    | 1.1 K 
3 | fc2            | Linear    | 2.1 K 
4 | output         | Linear    | 33    
---------------------------------------------
24.4 M    Trainable params
0         Non-trainable params
24.4 M    Total params
97.606    Total estimated model params size (MB)
  rank_zero_warn(


Training: -1it [00:00, ?it/s]

In [68]:
# Load the Model back from file
with open(Pkl_Filename, 'rb') as file:  
    Pickled_Model = pickle.load(file)

Pickled_Model

NCF(
  (user_embedding): Embedding(1948069, 8)
  (item_embedding): Embedding(1101720, 8)
  (fc1): Linear(in_features=16, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (output): Linear(in_features=32, out_features=1, bias=True)
)

In [69]:
# User-item pairs for testing
test_user_item_set = set(zip(test_ratings['user_id'], test_ratings['artist_id']))

# Dict of all items that are interacted with by each user
user_interacted_items = user.groupby('user_id')['artist_id'].apply(list).to_dict()

hits = []
for (u,i) in tqdm(test_user_item_set):
    interacted_items = user_interacted_items[u]
    not_interacted_items = set(all_artistIds) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    test_items = selected_not_interacted + [i]
    
    predicted_labels = np.squeeze(Pickled_Model(torch.tensor([u]*100), 
                                        torch.tensor(test_items)).detach().numpy())
    
    top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    
    if i in top10_items:
        hits.append(1)
    else:
        hits.append(0)

print("The Hit Ratio @ 10 is {:.2f}".format(np.average(hits)))

  0%|          | 0/9744 [00:00<?, ?it/s]

The Hit Ratio @ 10 is 0.09


In [70]:
artist = pd.read_csv("/Users/salonivora/Documents/Machine Learning/Music Recommendation System/dataSet/filtered_artist.csv")

#artist.head(10)
art=list()

for i in top10_items:    
    a = artist[(artist['id']==i)]
    art.append(a.name)

print(art)

[3246    slum village
Name: name, dtype: object, Series([], Name: name, dtype: object), 13369    digger
Name: name, dtype: object, 6863    oxymoron
Name: name, dtype: object, Series([], Name: name, dtype: object), 5321    newsboys
Name: name, dtype: object, 3890    eddie harris
Name: name, dtype: object, 10417    trendz of culture
Name: name, dtype: object, 4419    yoko ono
Name: name, dtype: object, 4075    sun ra
Name: name, dtype: object]
