# Collaborative Filtering for Implicit Feedback Datasets

## Import libraries

In [1]:
from pymongo import MongoClient
client = MongoClient()
db = client.whosampled
import numpy as np
import pandas as pd

import implicit
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import scipy.sparse as sparse
from scipy.sparse import csr_matrix

import os, sys
os.environ["OPENBLAS_NUM_THREADS"]="1"

import random

np.set_printoptions(threshold=sys.maxsize)
pd.options.display.max_seq_items = 10000
from src.test_ranking import *

%reload_ext autoreload

## Entire Process to make model on all data.

We turn the data into a pandas dataframe.
Then turn into a utility matrix. 
I did all of my grid searching (testing different factors, iterations, lambdas, # of artists, # of producers) on the producer vs. artist utility matrix. However, the recommender has to be from producer to song. We have to be specific to these people- they're too dumb to do otherwise. So I am going to make the producer-song utility matrix using the same hyperparameters as before because I am too lazy to check all my hyperparameters again. 

## Read in the data from the Mongo collection

In [2]:
df = clean_up_mongo_coll(db.main_redo)

## Turn df to utility matrix

In [3]:
item_user = turn_df_to_util_mat(
    df, 'sampled_artist_song', 'new_song_producer')

# Trying steps to increase the difference between recommendations

Replace any numbers above 1 with 1.

This will not have an effect on BPR but may help with scoring.

Original Sparsity: 99.98228189634969

I got this as sparsity when I filter by requisite interactions:
99.940

99.87635658576808
3,3: 99.72889
3,0: 99.711?
4,0: 99.643
0,5: 99.429
0,10:99.04
1.10:99.02
4,10:98.76

In [4]:
item_user = item_user.mask(user_song>1, 1)

In [54]:
evalu = trainModelAndEvaluateIt(item_user)
evalu.make_train_set_and_test_set(5)

In [52]:
evalu.filter_dataset_by_requisite_interactions(0,0)

In [55]:
evalu.get_sparsity_of_mat(evalu.train)

99.98228189634969

In [56]:
evalu.train_model()
rank_score, pop_score = evalu.get_rank_and_pop_score_from_train_test_model()

100%|██████████| 100/100 [00:06<00:00, 14.32it/s, correct=90.46%, skipped=3.42%]


In [57]:
rank_score

0.31763491256460463

In [58]:
pop_score

0.17559389191346764

In [61]:
evalu.get_top_recommends_for_user("DJ Premier")

KeyError: "None of [Int64Index([4635], dtype='int64', name='new_song_producer')] are in the [columns]"

# Attempting Bayersian Personalized Ranking

BPR apparently only uses Binary values, yes or no's. So any value above 1 is just seen as a positive 1. 

So it doesn't matter if I use the masked or unmasked data (the values above 1 or not).

In [61]:
#Attempting Bayersian Personalized Ranking
model = implicit.bpr.BayesianPersonalizedRanking()

sparse_song_user = csr_matrix(user_song1s)

model.fit(sparse_song_user)

predictions_user_song = model.user_factors.dot(model.item_factors.T)

100%|██████████| 100/100 [00:06<00:00, 15.16it/s, correct=90.39%, skipped=3.66%]


In [67]:
evalu.model.recommend(9117, csr_matrix(evalu.test.T))

[(3347, 2.0356562),
 (6222, 1.7531835),
 (29646, 1.7252755),
 (7854, 1.6887385),
 (3332, 1.558776),
 (11667, 1.511463),
 (6513, 1.5052252),
 (7012, 1.4785868),
 (31460, 1.4712963),
 (3628, 1.4509379)]

In [64]:
evalu.train.columns.get_loc("J Dilla")

9117

In [45]:
sparse_song_user

<21379x33719 sparse matrix of type '<class 'numpy.int64'>'
	with 134449 stored elements in Compressed Sparse Row format>

In [73]:
def get_top_rec_using_BPR(prod, N =20):

    userID = evalu.train.columns.get_loc(prod)
    recommends = evalu.model.recommend(userid = userID, user_items = csr_matrix(evalu.test.T), N = N)
    recommends = [rec[0] for rec in recommends]
    return list(item_user.index[recommends])
    


In [74]:
get_top_rec_using_BPR("Madlib", 30)

['Gang Starr - full clip',
 'Tom Scott - love poem',
 'Barrington Levy - under mi sensi',
 'Mos Def - body rock',
 'Bill Deal & the Rhondels - tucks theme',
 'Roy Ayers Ubiquity - aint got time',
 'Sly & the Family Stone - brave & strong',
 'Jimmy Spicer - money dollar bill yall',
 'Bob James - i feel a song in my heart',
 'Brethren - outside love',
 'Iron Butterfly - real fright',
 'Grandmaster Flash - freelance',
 'George Benson - california dreaming',
 'Herbie Hancock - people music',
 'Johnny Harris - light my fire',
 'The Cannonball Adderley Quintet - cannon raps',
 'Hubert Laws - cymbaline',
 'Eric Gale - forecast',
 'The 24-Carat Black - 24 carat black theme',
 'Hirokazu Tanaka - title',
 'Al Green - something',
 'Ramsey Lewis - tambura',
 'Billy Paul - war of the gods',
 'Cold Blood - kissing my love',
 'Ahmad Jamal Trio - the awakening',
 'Rotary Connection - memory band',
 'David Bowie - soul love',
 'The Notorious B.I.G. - the garden freestyle',
 'Little Boy Blues - seed of 

In [75]:
get_top_rec_using_BPR("J Dilla")

['Black Oak Arkansas - hot and nasty',
 'Curtis Mayfield - dont worry if theres a hell below, were all going to go',
 'The Mad Lads - get out of my life, woman',
 'Dizzy Gillespie - matrix',
 'Black Moon - how many mcs...',
 'Grandmaster Flash - freelance',
 'DJ Grand Wizard Theodore - live convention 82 side b',
 'David Axelrod - a divine image',
 'Tom Dissevelt and Kid Baltan - song of the second moon',
 'Bob James - feel like making love',
 'EPMD - get off the bandwagon remix',
 'Ramsey Lewis - dreams',
 'The Electric Prunes - general confessional',
 'George Semper - get out of my life, woman',
 'Love Unlimited Orchestra - strange games & things',
 'The Meters - here comes the meter man',
 'Grassella Oliphant - get out of my life woman',
 'Keith Murray - the most beautifullest thing in this world',
 'Les McCann - north carolina',
 'Archie Bell & the Drells - dont let love get you down']

In [31]:
# Train model for producer- sampled song utility matrix




# train the model on a sparse matrix of item/user/confidence weights
sparse_user_song_filt = csr_matrix(user_song1s)
model.fit(sparse_user_song_filt)

# calculate predictions

predictions_user_song = model.item_factors.dot(model.user_factors.T)

100%|██████████| 45.0/45 [00:33<00:00,  2.03it/s]


In [53]:
prod = "Madlib"

pd.DataFrame(get_top_recommends_not_yet_sampled_for_user(prod,
    predictions_user_song,
    user_song1s,
    filter=False),
             columns = ['Recommended Songs for {}'.format(prod)])



Unnamed: 0,Recommended Songs for Madlib
0,Hugo Winterhalter - granada
1,Juicy J - slob on my knob
2,Buddy Guy - my mother
3,Detroit Emeralds - baby let me take you in my ...
4,Def Jef - droppin rhymes on drums
5,Nico Fidenco - venice reportage
6,Kyle MacLachlan - the weirding way
7,David Wise - dk yell
8,Joe Jackson - you cant get what you want till ...
9,Jamiroquai - blow your mind part 1 & 2


In [54]:
prod = "Kanye West"

pd.DataFrame(get_top_recommends_not_yet_sampled_for_user(prod,
    predictions_user_song,
    user_song1s,
    filter=False),
             columns = ['Recommended Songs for {}'.format(prod)])



Unnamed: 0,Recommended Songs for Kanye West
0,Eminem - just lose it
1,Ideal J - si je rappe ici
2,2Pac - pain
3,Cannonball Adderley - aries
4,Cameo - candy
5,Leon Russell - rainbow in your eyes
6,"Lee ""Scratch"" Perry - city too hot"
7,Melvin Van Peebles - love thats america
8,Galt MacDermot - wurly
9,Bobby Caldwell - my flame


In [56]:
def get_similar_to_prod(user, n_similar, model, artist_user):
    
    # Get index of user 
    index_of_user = artist_user.index.get_loc(user)
    
    similar_users = model.similar_items(index_of_user, N= n_similar)
    
    similar_user_inds = [sim[0] for sim in similar_users]
    
    return [artist_user.index[n] for n in similar_user_inds]

In [56]:
prod = "Pete Rock"

get_similar_to_prod(prod, 10, model, user_song1s)

['Pete Rock',
 'C.L. Smooth',
 'Flipout',
 'Grap Luva',
 'DJ Mass',
 'Kevin Geeda',
 'Vern Large',
 'DJ Parker Lee',
 'Onyx',
 'Teddy Blend']

## Score song model

I am using the best paramaters from the grid search from producer-artist. The score is .26 and .16 for model and popularity. 
When I filter to producers with at least 5 samples, let's see. It's more accurate - 0.15 and 0.08. 

In [57]:
user_inds, item_inds = get_indices_of_test_set_values(user_song, 5)
train, test = make_train_set_and_test_set(user_inds, item_inds, user_song)

In [57]:
train_lim, test_lim, user_inds_lim, item_inds_lim = filter_dataset_by_requisite_interactions(
    train, test, 5, 0)

In [58]:
rank_score, pop_rank_score = get_rank_and_pop_score_from_train_test_model(
    train_lim, test_lim, user_inds_lim, item_inds_lim, best_factors, best_lambda, best_iters)

print(
"Model rank score: {} \n\
Popularity rank score: {}".format(rank_score, pop_rank_score))



100%|██████████| 45.0/45 [00:13<00:00,  3.22it/s]


Model rank score: 0.1524071729776384 
Popularity rank score: 0.08880592165645203


In [71]:
yo = pd.DataFrame([[0,1,0,1,0,1,1,0,1],
               [1,0,0,1,1,0,0,1,1],
               [0,0,0,0,1,0,1,1,1],
               [1,1,1,1,0,1,0,1,0], 
               [0,1,0,0,1,1,1,0,1]])

In [81]:
yo.values[,1]

SyntaxError: invalid syntax (<ipython-input-81-0b3bcc89e259>, line 1)