<a href="https://colab.research.google.com/github/Katrin-Leberfinger/Hybrid-gender-debiased-music-recommendation/blob/main/validation_on_musiclen_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model validations

Code Source:
https://gist.github.com/shubhamagarwal92/37ccb747f7130a35a8e76aa66d60e014

Interesting articles
https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/#31-running-bert-on-our-text
https://www.kaggle.com/hassanamin/bert-pytorch-cola-classification


In [1]:
!pip install transformers



In [2]:
import torch
import numpy as np
import torch.nn as nn
from torch.autograd import Variable
from transformers import BertConfig, BertPreTrainedModel, BertModel, BertForSequenceClassification
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel, AdamW
from torch import nn
from transformers import BertModel
from sklearn.metrics import f1_score
import ast


# Read Data

In [3]:
# data: track_artist -	track_name -	track_tag - track_id

def read_items_data(data, id_col, text_col):
  items = dict()
  item2pos = dict()
  pos2item = dict()
  token2id = dict()
  num_items = 1
  num_tokens = 1
  max_item_len = 0

  for index, row in data.iterrows():
    # for each item:
    item_id = row[id_col]
    item_words = []
    item2pos[item_id] = num_items
    pos2item[num_items] = item_id
    
    for word in row[text_col].split(' '): 
    #  for each word:
      try:
        token2id[word]
      except:
        token2id[word] = num_tokens
        num_tokens = num_tokens + 1
    

      item_words.append(token2id[word])

    if len(item_words) > max_item_len:
        max_item_len = len(item_words)

    items[item_id] = item_words
    num_items = num_items + 1

  return items, item2pos, pos2item, token2id, max_item_len

In [4]:
# data: track_artist -	track_name -	track_tag - track_id
# NEW VERSION USING BERT

def read_items_data_bert(data, id_col, text_col, max_length=512):
  tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-tiny') 
  items = torch.Tensor([])
  attentions = torch.Tensor([])
  item2pos = dict()
  pos2item = dict()
  token2id = dict()
  num_items = 0
  num_tokens = 0
  max_item_len = 0

  for index, row in data.iterrows():
    # for each item:
    item_id = row[id_col]
    item_words = []
    item2pos[item_id] = num_items
    pos2item[num_items] = item_id


    inputs = tokenizer.encode_plus(
        row[text_col], 
        None,
        add_special_tokens=True,
        max_length = max_length,
        pad_to_max_length=True
        
    )

    tokens = inputs["input_ids"]
    attention = inputs['attention_mask']
    tokens=torch.tensor(tokens, dtype=torch.long).unsqueeze(0)
    attention=torch.tensor(attention, dtype=torch.long).unsqueeze(0)

    items = torch.concat((items, tokens))
    attentions = torch.concat((attentions, attention))
    num_items = num_items + 1

  return items, attentions, item2pos, pos2item, token2id, max_item_len

In [5]:
# data: user_id - item_id - rating
# playcount: normalize/ binary (>1:1 or <=1:0)

def read_ratings_data(data, item2pos, id_col, rating_col, user_col):
    ratings = torch.Tensor(len(data), 3)
    user2id = dict()
    id2user = dict()
    num_users = 0
    i = 0
    for _, row in data.iterrows():
        raw_user = row[user_col] 

        try:
          user2id[raw_user]
        except:
          user2id[raw_user] = num_users
          id2user[num_users] = raw_user
          num_users = num_users + 1

        user = user2id[raw_user]
        item = int(item2pos[row[id_col]]) #track_id
        rating = int(row[rating_col]) # count

        ratings[i][0] = user
        ratings[i][1] = item
        ratings[i][2] = rating / data[rating_col].max()
        i = i+1


    return ratings, user2id, id2user


In [6]:
def pad_items_data(items, item2pos, max_item_len):
    data = torch.zeros(len(items), max_item_len)

    for item_id, tokens in enumerate(items):
      token = items[tokens]
      for i, t in enumerate(token):
        data[item_id,i] = t


    return data

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
%cd /content/drive/MyDrive/Master\ Thesis/data

/content/drive/MyDrive/Master Thesis/data


## Read Data: MovieLen

In [9]:
data_movies = pd.read_csv("movies/movies_abstracts.csv").drop(['Unnamed: 0'],axis=1).dropna()
data_interaction = pd.read_csv("movies/ratings.csv").drop(['timestamp'],axis=1)

data_movies = pd.merge(data_movies, data_interaction[['movieId']], 'inner').drop_duplicates()
data_interaction = pd.merge(data_interaction, data_movies[['movieId']], 'inner')

rating_col = 'rating'
item_col = 'movieId'
user_col = 'userId'
text_col = 'abstract'
data_items_eval = data_movies

data_interaction.loc[data_interaction.rating<4., 'rating'] = 0.
data_interaction.loc[data_interaction.rating>=4., 'rating'] = 1.

## Read Data: Lyrics 

In [10]:
# data_tracks_lyrics = pd.read_csv("music/data_tracks_lyrics.txt", sep="\t").drop(['Unnamed: 0'],axis=1).dropna()
# data_tracks_tags_lyrics = pd.read_csv("music/data_tracks_tags_lyrics.txt", sep="\t").drop(['Unnamed: 0'],axis=1).dropna()
# #data_tracks_tags_lyrics=data_tracks_tags_lyrics.iloc[:-1,:]
# data_interaction = pd.read_csv("music/data_user_track_interaction.txt", sep="\t").drop(['Unnamed: 0'],axis=1)
# data_user = pd.read_csv("music/data_user.txt", sep="\t").drop(['Unnamed: 0'],axis=1)

# data_interaction.loc[data_interaction['count']<2., 'count'] = 0.
# data_interaction.loc[data_interaction['count']>=2., 'count'] = 1.

# rating_col = 'count'
# item_col = 'track_id'
# user_col = 'user_id'
# text_col = 'tags'
# data_items_eval = data_tracks_tags_lyrics

In [11]:
# import random
# selected_user = random.sample(list(data_interaction.user_id.unique()), 1000)
# data_interaction = data_interaction.loc[data_interaction.user_id.isin(selected_user)]

In [12]:
l = 0
i = 0
for d in data_items_eval[text_col]:
  l = l+ len(d.split())
  i = i+1
l/i

115.02414486921529

## CV and balance data set

In [13]:
data_interaction['fold'] = np.random.randint(1, 6, data_interaction.shape[0])

import random
random.seed(123)
idx = []
for _,df in data_interaction.groupby('fold'):
  len_pos = len(df[df[rating_col]==1])
  len_neg = len(df[df[rating_col]==0])
  if len_pos < len_neg:
    df = df[df[rating_col]==0].sample(len_pos).append(df[df[rating_col]==1])
  elif len_pos > len_neg:
    df = df[df[rating_col]==1].sample(len_pos).append(df[df[rating_col]==0])
  idx.extend(df.index.values.tolist())

test_fold = 1

data_interaction_train = data_interaction.loc[data_interaction.fold != test_fold, [user_col, item_col, rating_col]]
data_interaction_test = data_interaction.loc[data_interaction.fold == test_fold, [user_col, item_col, rating_col]]

In [14]:
data_interaction_train

Unnamed: 0,userId,movieId,rating
0,1,1,1.0
2,7,1,1.0
3,15,1,0.0
5,18,1,0.0
6,19,1,1.0
...,...,...,...
70190,610,160341,0.0
70191,610,160527,1.0
70192,610,160836,0.0
70193,610,163937,0.0


In [15]:
data_interaction_test = data_interaction_test[data_interaction_test.userId.isin([7, 15, 17])]

# Ask Me Anything Rating

Code source: https://github.com/nlp-deepcbrs/amar

## **BERT Model**

In [16]:
items, attentions, item2pos, pos2item, token2id, max_item_len = read_items_data_bert(data_items_eval, item_col, text_col, 128)
ratings_train, user2id_train, id2user_train = read_ratings_data(data_interaction_train, item2pos,  item_col, rating_col, user_col)

ratings_test, user2id_test, id2user_test = read_ratings_data(data_interaction_test, item2pos,  item_col, rating_col, user_col)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [17]:
class AMARBert(nn.Module):
    def __init__(self, hidden_dense_layer_size, item_embeddings_size, num_users):
        super(AMARBert, self).__init__()
        #self.model1_layer2 = BertModel.from_pretrained('prajjwal1/bert-tiny')#('bert-base-uncased')
        self.model1_layer2 = BertModel.from_pretrained('prajjwal1/bert-tiny')
        self.model1_layer3 = nn.Dropout(p=0.2)

        self.model2_layer1 = nn.Embedding(num_users, user_embeddings_size)
        
        self.linear = nn.Linear(hidden_dense_layer_size, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        #_, y1 = self.model1_layer2(x[0], attention_mask = x[2],  return_dict=False)
        output, y1 = self.model1_layer2(x[0],  return_dict=False)
        #y1 = output[:, 0, :]
        # pooled_output (=y1) is the output of the CLS token
        # "Since BERT is transformer based contextual model, the idea is [CLS] token would have captured the entire context and would be sufficient for simple downstream tasks such as classification."
        # https://stackoverflow.com/questions/63673511/how-to-use-the-outputs-of-bert-model?rq=1
        # https://towardsdatascience.com/bert-to-the-rescue-17671379687f
        #y1 = self.model1_layer3(y1)
        
        y2 = self.model2_layer1(x[1])

        y = torch.cat([y1, y2], 1)
        y = self.linear(y)
        return self.sigmoid(y)

In [18]:
# Params: items_data, ratings_data, genres_data
device = 'cuda'

import numpy as np
num_examples=ratings_train.size(0)
item_embeddings_size = 128
user_embeddings_size = 100
genre_embeddings_size = 128
hidden_dense_layer_size = item_embeddings_size + user_embeddings_size
num_tokens = 128
num_users = len(data_interaction[user_col].drop_duplicates())

model = AMARBert(hidden_dense_layer_size, user_embeddings_size, num_users)

Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
from transformers import get_linear_schedule_with_warmup
criterion = nn.BCEWithLogitsLoss() 
# # https://huggingface.co/transformers/v1.0.0/migration.html
lr = 1e-3
num_total_steps = 1000
num_warmup_steps = 100
warmup_proportion = float(num_warmup_steps) / float(num_total_steps)  
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  
#optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

scheduler = get_linear_schedule_with_warmup(
   optimizer,
   num_warmup_steps=num_warmup_steps,
   num_training_steps=num_total_steps
)


num_epochs=5
batch_size=256 
#batch_size=32
num_examples=ratings_train.shape[0]

cost_per_epoch = []

for e in range(num_epochs):
    # shuffle and split training examples in batches
    indices = torch.randperm(num_examples).split(batch_size)

    #remove last element so that all the batches have equal size
    indices = indices[:len(indices)-1] 

    average_cost = 0

    for t, v in enumerate(indices):
        #items positions
        curr_items_ids_batch = ratings_train[v, 1]

        # items descriptions
        curr_items_batch = items[curr_items_ids_batch.numpy(),:].to(device)
        curr_attentions_batch = attentions[curr_items_ids_batch.numpy(),:].to(device)
        
        # users ids
        curr_users_batch = ratings_train[v, 0].to(device)

        # model inputs
        inputs = [ curr_items_batch.type(torch.LongTensor), curr_users_batch.type(torch.LongTensor), curr_attentions_batch.type(torch.LongTensor)]

        # model targets
        targets = ratings_train[v, 2]

        # callback that does a single batch optimization step
        optimizer.zero_grad()

        # backward propagation
        outputs = model(inputs)
        
      
        outputs = outputs.reshape(-1,)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        # evaluate current loss function value
        average_cost = average_cost + loss
        

    # evaluate average cost per epoch
    average_cost = average_cost / len(indices)
    cost_per_epoch.append(average_cost)
    print("Average cost per epoch: ", average_cost)




Average cost per epoch:  tensor(0.6948, grad_fn=<DivBackward0>)


KeyboardInterrupt: ignored

In [None]:
outputs.max()

## **BERT**: Get predictions

In [None]:
indices = torch.arange(0, ratings_test.shape[0]).split(1)
predictions = {}
feature_vectors = {}
weights = {}

indices = indices[:len(indices)-1] 

for t, v in enumerate(indices):

  curr_items_ids_batch = ratings_test[v.numpy(), 1]

  curr_items_batch = items[curr_items_ids_batch.numpy(),:].to(device)
  curr_attentions_batch = attentions[curr_items_ids_batch.numpy(),:].to(device)

  curr_users_batch = ratings_test[v.numpy(), 0].to(device)

  inputs = [ curr_items_batch.type(torch.LongTensor), curr_users_batch.type(torch.LongTensor), curr_attentions_batch.type(torch.LongTensor)]
  

  targets = model(inputs)

  

  # save prediction for each user
  real_user_id = id2user_test[curr_users_batch[0].item()]
  
  try:
    predictions[real_user_id] 
  except:
    predictions[real_user_id] = []

  predictions[real_user_id].append([pos2item[curr_items_ids_batch.item()], 
                                    targets[0].item()])
  
        



In [None]:
topn=10
results = []
results_df = pd.DataFrame(columns = [user_col, item_col, rating_col])

for user in predictions:
    item_list = []
    user_prediction = predictions[user]

    user_prediction = sorted(user_prediction,key=lambda x: (x[1]), reverse=True)
    n = 1
    for item, rating in user_prediction:
        if item  not in item_list:
          item_list.append(item)
          results_df = pd.concat((results_df, pd.DataFrame(data={user_col:[user],item_col:[item],rating_col:[rating]})))
          results.append([user, item, rating])
          if n >= topn:
              break
          n = n + 1

## Models from AMAR paper **SeqLSTM**

In [20]:
items, item2pos, pos2item, token2id, max_item_len = read_items_data(data_items_eval, item_col, text_col)
items = pad_items_data(items, item2pos, max_item_len)

ratings_train, user2id_train, id2user_train = read_ratings_data(data_interaction_train, item2pos,  item_col, rating_col, user_col)

ratings_test, user2id_test, id2user_test = read_ratings_data(data_interaction_test, item2pos, item_col, rating_col, user_col)



In [63]:
class AMARSeqLSTM(nn.Module):
    def __init__(self, hidden_dense_layer_size, num_tokens, item_embeddings_size, num_users):
        super(AMARSeqLSTM, self).__init__()
        self.model1_layer1 = nn.EmbeddingBag(num_tokens + 1, item_embeddings_size)
        self.model1_layer2 = nn.LSTM(input_size=item_embeddings_size, hidden_size= item_embeddings_size,
                                num_layers=5, dropout=0.2, batch_first=True)
        #self.model1_layer3 = nn.AvgPool1d(2) 
        self.model1_layer3 = nn.Dropout(0.2) 

        self.model2_layer1 = nn.Embedding(num_users, user_embeddings_size)
        
        self.linear = nn.Linear(hidden_dense_layer_size, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self,x, hidden):
        y1 = self.model1_layer1(x[0].type(torch.LongTensor))
        y1, hidden  = self.model1_layer2(y1.reshape(y1.shape[0], 1, y1.shape[1]), hidden)
        y1 = self.model1_layer3(y1)
        y1 = y1.reshape(y1.shape[0], y1.shape[-1])
        
        y2 = self.model2_layer1(x[1])
        y = torch.cat([y1, y2], 1)
        y = self.linear(y)
        return self.sigmoid(y), hidden

    def initialize_hidden_state(self,batch_size):
      weight=next(self.parameters()).data
      if(torch.cuda.is_available()):
       hidden=(weight.new(5,batch_size,item_embeddings_size).zero_().cuda(),weight.new(5,batch_size,item_embeddings_size).zero_().cuda())
      else:
        hidden=(weight.new(5,batch_size,item_embeddings_size).zero_(),weight.new(5,batch_size,item_embeddings_size).zero_())
        
      return hidden
        

In [64]:
# Params: items_data, ratings_data, genres_data, batch_size
device = 'cuda'

import numpy as np
num_examples=ratings_train.size(0)
item_embeddings_size = 128
user_embeddings_size = 10
genre_embeddings_size = 128
hidden_dense_layer_size = item_embeddings_size + user_embeddings_size
num_tokens = len(token2id)
num_users = len(data_interaction[user_col].drop_duplicates())

# ADD genre and tags

model = AMARSeqLSTM( hidden_dense_layer_size, num_tokens, item_embeddings_size, num_users).to(device)

## Train model

In [None]:
#criterion = nn.BCEWithLogitsLoss() 
criterion = nn.BCELoss()
#criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr= 1e-3, alpha=0.9)
#optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# remove later
ratings_train = ratings_train[ratings_train[:,1]<len(items),]
ratings_train = ratings_train[ratings_train[:,0]<num_users,]

num_epochs=10
#batch_size=256 
batch_size=32
num_examples=ratings_train.shape[0]

cost_per_epoch = []

for e in range(num_epochs):
    hidden_state=model.initialize_hidden_state(batch_size)
    # shuffle and split training examples in batches
    indices = torch.randperm(num_examples).split(batch_size)

    #remove last element so that all the batches have equal size
    indices = indices[:len(indices)-1] 

    average_cost = 0

    for t, v in enumerate(indices):
        hidden_state=tuple([element.data for element in hidden_state])
        #items positions
        curr_items_ids_batch = ratings_train[v, 1]

        # items descriptions
        curr_items_batch = items[curr_items_ids_batch.numpy(),:].to(device)
        
        # users ids
        curr_users_batch = ratings_train[v, 0].to(device)

        # model inputs
        inputs = [ curr_items_batch.type(torch.LongTensor), curr_users_batch.type(torch.LongTensor)]

        # model targets
        targets = ratings_train[v, 2]

        # callback that does a single batch optimization step
        optimizer.zero_grad()

        # backward propagation
        outputs,hidden_state=model(inputs,hidden_state)
        outputs = outputs.reshape(-1,)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # evaluate current loss function value
        average_cost = average_cost + loss
        

    # evaluate average cost per epoch
    average_cost = average_cost / len(indices)
    cost_per_epoch.append(average_cost)
    print("Average cost per epoch: ", average_cost)


In [40]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

## **SeqLSTM**: Get predictions

In [None]:
# # Version from github

# # remove later
# #ratings_test = ratings_test[ratings_test[:,1]<len(items),]
# #ratings_test = ratings_test[ratings_test[:,0]<num_users,]
# topn = 10

# indices = torch.arange(0, ratings_test.shape[0]).split(batch_size)
# predictions = {}

# indices = indices[:len(indices)-1] 

# for t, v in enumerate(indices):

#   curr_items_ids_batch = ratings_test[v.numpy(), 1]

#   curr_items_batch = items[curr_items_ids_batch.numpy(),:].to(device)

#   curr_users_batch = ratings_test[v.numpy(), 0].to(device)

#   inputs = [ curr_items_batch.type(torch.LongTensor), curr_users_batch.type(torch.LongTensor)]
  
#   #if t == len(indices):
#   #  last_batch_size = v.shape[0]
#   #  curr_users_batch = torch.cat(curr_users_batch, torch.zeros(batch_size-v.shape[0], 0).to(device), 0)
#   #  curr_items_batch = torch.cat(curr_items_batch, torch.zeros(batch_size-v.shape[0], max_item_len).to(device), 0)


#   targets = model(inputs)

#   # save prediction for each user

#   for index in range(targets.shape[0]):
#       real_user_id = id2user_test[curr_users_batch[index].item()]
#       try:
#          predictions[real_user_id] 
#       except:
#         predictions[real_user_id] = []
#       predictions[real_user_id].append([pos2item[curr_items_ids_batch[index].item()], 
#                                        targets[index].item()])
      




In [76]:
# New version

indices = ratings_test[:,0].unique()
predictions = {}
hidden_state=model.initialize_hidden_state(1)
for curr_users_batch in indices:
  

  curr_users_batch = curr_users_batch.to(device).reshape(1,)
 # indices_items = ratings_test[ratings_test[:,0] == curr_users_batch.item()][:,1].unique()
  indices_items = ratings_train[:,1].unique()[:2500]
  for curr_items_ids_batch in indices_items:
    
    curr_items_ids_batch = int(curr_items_ids_batch.item())

    curr_items_batch = items[curr_items_ids_batch,:].to(device).reshape(1,-1)

    
    inputs = [ curr_items_batch.type(torch.LongTensor), curr_users_batch.type(torch.LongTensor)]
    
    outputs, hidden_state = model(inputs, hidden_state)
    outputs = outputs.reshape(-1,)
    item_list = []

    real_user_id = id2user_test[curr_users_batch[0].item()]
    
    try:
      predictions[real_user_id] 
    except:
      predictions[real_user_id] = []

    predictions[real_user_id].append([pos2item[curr_items_ids_batch], 
                                      outputs[0].item()])
        




In [77]:
results = []
results_df = pd.DataFrame(columns = [user_col, item_col, rating_col])

for user in predictions:
    user_prediction = predictions[user]

    user_prediction = sorted(user_prediction,key=lambda x: (x[1]), reverse=True)
    n = 1
    for item, rating in user_prediction:
        results_df = pd.concat((results_df, pd.DataFrame(data={user_col:[user],item_col:[item],rating_col:[rating]})))
        results.append([user, item, rating])
     #   if n >= topn:
     #       break
        n = n + 1

In [78]:
results_df[results_df.userId == 7].head(10)

Unnamed: 0,userId,movieId,rating
0,7.0,1203,0.829153
0,7.0,924,0.811689
0,7.0,2066,0.78892
0,7.0,1208,0.785936
0,7.0,1204,0.782513
0,7.0,926,0.782363
0,7.0,588,0.782172
0,7.0,356,0.780328
0,7.0,16,0.780216
0,7.0,1277,0.777891


In [79]:
results_df[results_df.userId == 17].head(10)

Unnamed: 0,userId,movieId,rating
0,17.0,1207,0.933986
0,17.0,1227,0.92795
0,17.0,1208,0.927367
0,17.0,1214,0.925842
0,17.0,912,0.920312
0,17.0,1213,0.919956
0,17.0,1246,0.917216
0,17.0,588,0.916404
0,17.0,1941,0.91442
0,17.0,1023,0.913637


In [81]:
results_df[results_df.userId == 15].head(10)

Unnamed: 0,userId,movieId,rating
0,15.0,1213,0.612235
0,15.0,1246,0.584999
0,15.0,926,0.570376
0,15.0,897,0.565124
0,15.0,1227,0.565048
0,15.0,924,0.552918
0,15.0,1214,0.552261
0,15.0,1200,0.550493
0,15.0,904,0.550368
0,15.0,1228,0.543278


## **Most Popular**: Get Predictions

https://www.kaggle.com/gspmoreira/recommender-systems-in-python-101#Popularity-model

In [None]:
topn = 10

item_popularity_df = data_interaction.groupby(item_col)[rating_col].sum().sort_values(ascending=False).reset_index()

predictions = {}

for real_user_id in data_interaction_test.userId:
    predictions[real_user_id] = []
    items_to_ignore = data_interaction_train[data_interaction_train[user_col] == real_user_id]['movieId'].values
    item_popularity_df_user = item_popularity_df[~item_popularity_df[item_col].isin(items_to_ignore)]
    predictions[real_user_id].append(item_popularity_df_user.head(topn)[item_col].values.tolist())
      

KeyboardInterrupt: ignored

In [None]:
results = []
results_df = pd.DataFrame(columns = [user_col, item_col, rating_col])

for user in predictions:
    user_prediction = predictions[user][0]
    for item in user_prediction:
        rating = 1.
        results_df = pd.concat((results_df, pd.DataFrame(data={user_col:[user],item_col:[item],rating_col:[rating]})))
        results.append([user, item, rating])

# Evaluate predictions (for all methods) 

Scores: F1

In [None]:
scores = []
for user, df in results_df.groupby(user_col):
  df = df.drop_duplicates(subset=item_col)
  y_true_sorted = data_interaction_test.loc[data_interaction_test[user_col] == user].sort_values(rating_col, ascending=False)
  y_true_df = pd.merge(data_items_eval[[item_col]], y_true_sorted[[item_col, rating_col]], 'left').fillna(0)
  y_true_ndcg = y_true_df[rating_col].values
  y_true_df.loc[y_true_df[rating_col] > 0, rating_col] = 1
  y_true = y_true_df[rating_col].values

  y_pred_df = pd.merge(data_items_eval[[item_col]], df[[item_col, rating_col]], 'left').fillna(0)
  y_pred_ndcg = y_pred_df[rating_col].values
  y_pred_df.loc[y_pred_df[rating_col] > 0, rating_col] = 1
  y_pred = y_pred_df[rating_col].values

  if y_true.sum() >= 1:
    score = f1_score(y_true, y_pred)
    scores.append(score)

print("F1 Score: ", np.mean(scores))

F1 Score:  0.6116116466860201


# Interpretability of Bias

- Find correlations between user gender and item gender (Pearsons correlation)
- For each track get proportion of female/male user => compare train and recommendataion data
- Check if genderness increased/decreased in recommendations (in comparision to training data) 
- Compare distribution of genderness between history and recommendations 
  - "Delta Metric of Genderness" (https://arxiv.org/pdf/2108.06973.pdf)
  - Proportion tests: Fisher exact test, Chi-Square



In [None]:
data_artists = pd.read_csv("data_artists.txt", sep="\t").drop(['Unnamed: 0'],axis=1).dropna()
data_artists.columns = ['track_artist', 'type', 'gender_artist']
data_user_track_interaction = pd.read_csv("data_user_track_interaction.txt", sep="\t").drop(['Unnamed: 0'],axis=1)
data_user = pd.read_csv("data_user.txt", sep="\t").drop(['Unnamed: 0'],axis=1)
data_user.columns = ['user_id', 'gender_user']

a = []
for i in  data_artists.track_artist.values:
  a.append(i.lower())
data_artists['track_artist'] = a

In [None]:
df_tmp = pd.merge(data_user_track_interaction, data_user, 'inner')
df_all = pd.merge(df_tmp, data_artists, on = 'track_artist').drop_duplicates()
df_all = df_all.loc[(df_all['gender_artist'] != 'Unknown') & (df_all['gender_artist'] != 'other')]

replace_dict1 = {'m' : 0, 'f' : 1}
replace_dict2 = {'male' : 0, 'female' : 1}
df_all['gender_user'] = df_all['gender_user'].replace(replace_dict1)
df_all['gender_artist'] = df_all['gender_artist'].replace(replace_dict2)

## Correlation between user and item gender

In [None]:
# Proportion of female items in all female user
prop_female = 0
# Proportion of female items in all male user
prop_male = 0
for group, df in df_all.groupby('gender_user'):
  if group == 0:
    prop_male = df.gender_artist.sum() / len(df)
  else:
    prop_female = df.gender_artist.sum() / len(df)

print(f'In the history data, all female users listen to {round(prop_female*100, 2)} % female itmes.')
print(f'In the history data, all male users listen to {round(prop_male*100, 2)} % female itmes.')

# Pearson correlation
print('The attributes gender_user and gender_artist show a correlation of ', df_all.gender_user.corr(df_all.gender_artist))

In the history data, all female users listen to 8.3 % female itmes.
In the history data, all male users listen to 2.67 % female itmes.
The attributes gender_user and gender_artist show a correlation of  0.10665398232086548


In [None]:
from scipy.stats import fisher_exact
tab = pd.crosstab(df_all.gender_user, df_all.gender_artist)
stats, p_value = fisher_exact(tab)
print(p_value)
# if p-value <= 0.05 => gender_user and gender_artist are independent

2.1905224050321245e-05


## Delta metric of genderness

 For user u_i: (prop_female(rec) - prop_female(history)) / prop_female(history)

 If positive: more female tracks are recommended to the user

In [None]:
df_tmp = pd.merge(results_df, data_user, 'inner')
df_artists_tmp = pd.merge(df_tmp, data_user_track_interaction[['track_artist', 'track_id']])
df_rec = pd.merge(df_artists_tmp, data_artists, on = 'track_artist').drop_duplicates()
df_rec = df_rec.loc[(df_rec['gender_artist'] != 'Unknown') & (df_rec['gender_artist'] != 'other')]
df_rec['gender_user'] = df_rec['gender_user'].replace(replace_dict1)
df_rec['gender_artist'] = df_rec['gender_artist'].replace(replace_dict2)

In [None]:
prop_female_rec = df_rec.gender_artist.sum()
prop_female_history = df_all.gender_artist.sum()
delta = (prop_female_rec - prop_female_history) / prop_female_history

if delta > 0:
  print(f'The value of delta is {delta} and therefore more female tracks are recommended to user.')
else:
  print(f'The value of delta is {delta} and therefore more male tracks are recommended to user.')

The value of delta is -1.0 and therefore more male tracks are recommended to user.


## Proportion test

In [None]:
# Fisher exact test

from scipy.stats import fisher_exact
tab = pd.crosstab(df_rec.gender_artist, df_all.gender_artist)
stats, p_value = fisher_exact(tab)
if p_value <= 0.05:
  print(f'The attributes gender_artist of the history and recommendation data are significantly independent.')

# if p-value <= 0.05 => gender_x and gender_y are independent

In [None]:
# z-test

from statsmodels.stats.proportion import proportions_ztest
significance = 0.005
sample_size_hist = df_all.gender_artist.count()
sample_success_hist = df_all.gender_artist.sum()
sample_success_rec = df_rec.gender_artist.count()
sample_size_rec = df_rec.gender_artist.sum()
successes = np.array([sample_success_hist, sample_success_rec])
samples = np.array([sample_size_hist, sample_size_rec])
stat, p_value = proportions_ztest(count=successes, nobs=samples,  alternative='two-sided')
if p_value <= 0.05:
  print(f'The proportions of gender_artist of the history and recommendation data are significantly different.')
# if p-value <= 0.05 => the proportions are significantly different

  prop = count * 1. / nobs
  nobs_fact = np.sum(1. / nobs)
  zstat = value / std_diff


# Predict artist gender

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [None]:
data_all_info = pd.read_csv("music/data_all_info.txt", sep="\t").drop(['Unnamed: 0'],axis=1).dropna()
data_all_info = data_all_info.loc[data_all_info.artist_gender != 'other' ]

In [None]:
input = feature_vectors
#input = weights

In [None]:
df_features = pd.DataFrame()
id_col = []

for id in data_interaction_test[item_col].unique():
  id_col.append(id)
  df_features = df_features.append(pd.DataFrame(input[id][0].detach().numpy()))

df_features[item_col] = id_col

In [None]:
df_features = pd.merge(df_features, data_all_info)

In [None]:
df_features.loc[df_features.artist_gender == 'female', 'artist_gender'] = 0
df_features.loc[df_features.artist_gender == 'male', 'artist_gender'] = 1
df_features.artist_gender = df_features.artist_gender.astype('int')

In [None]:
X = df_features.iloc[:,:128]
y = df_features.artist_gender
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)

In [None]:
#clf = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test.values, y_pred))
print("Balanced Accuracy:", metrics.balanced_accuracy_score(y_test.values, y_pred))
print("Recall: ", metrics.recall_score(y_test.values, y_pred))
print("Precision: ", metrics.precision_score(y_test.values, y_pred))

Accuracy: 0.7887323943661971
Balanced Accuracy: 0.7465686274509804
Recall:  0.8431372549019608
Precision:  0.86


In [None]:
import pandas as pd
feature_imp = pd.Series(clf.feature_importances_,index=X.columns).sort_values(ascending=False)
feature_imp.head(10)

104    0.226804
98     0.140306
17     0.131687
81     0.114796
127    0.111745
62     0.091315
5      0.083488
111    0.051955
3      0.047903
69     0.000000
dtype: float64

---------------------------

# Next steps

[x] Train test split 

[x] Implement CV

[x] Balance data set (use as many listened as unlistened tracks - randomly chosing them)

[x] Use Popularity model as baseline

[x] Check index behaviour!!! (pos2item,...)

[x] Test/train split (https://colab.research.google.com/github/d2l-ai/d2l-en-colab/blob/master/chapter_recommender-systems/movielens.ipynb)

[x] Verify BERT Tokenizer

[x] Verify data reading with new movie data

[x] Try out gender interpretability

[ ] Add tags data (instead of lyrics)

[x] Get more data

[x] Clean lyrics data (remove non-english ones and repeated parts)

[x] Use embedding of model to predict gender of artist

[x] Try to find song writer gender (not priority) => tried this out but could not find a general solution

[ ] Implement ndcg (https://github.com/Jenniferz28/Collaborative-Filtering-Recommendation/blob/69a2400736a628de34620318abe861cc9a19e621/ndcg.py#L77)

[ ] Clean up code and move to repository

[ ] Implement evaluation with using all items in dataset

[ ] BERT make sure this is correct!

[ ] Try out correlation interpretability things (e.g. "Exploring Artist Gender Bias in Music Recommendation"

[ ] Remove 'important' feature vector from recommendataion model and compare recommendation performance + prediction perforamcne

