# Collaborative Filtering for Implicit Feedback Datasets

## Import libraries

In [2]:
from pymongo import MongoClient
client = MongoClient()
db = client.whosampled
import numpy as np
import pandas as pd

import implicit
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import scipy.sparse as sparse
from scipy.sparse import csr_matrix

import os, sys
os.environ["OPENBLAS_NUM_THREADS"]="1"

import random

np.set_printoptions(threshold=sys.maxsize)
pd.options.display.max_seq_items = 10000
from src.test_ranking import *
from src.explore_model import *

%reload_ext autoreload

## Entire Process to make model on all data.

We turn the data into a pandas DataFrame.
Then we turn it into a utility matrix.
I did all of my grid searching (testing different factors, iterations, lambdas, # of artists, # of producers) on the producer vs. artist utility matrix. However, the recommender has to go from producer to song: end users need specific song recommendations rather than artist-level ones. So I build the producer-song utility matrix using the same hyperparameters as before, since I have not re-run the grid search on the new matrix.

## Read in the data from the Mongo collection

In [3]:
# Load the `main_redo` Mongo collection into a DataFrame through the project
# helper (not defined in this notebook; presumably from the src.* star-imports).
df = clean_up_mongo_coll(db.main_redo)

## Turn df to utility matrix

In [4]:
# Build the utility matrix from df: rows are labelled by 'new_song_producer',
# columns by 'sampled_artist_song' (see the .head() preview in the next cell).
user_song = turn_df_to_util_mat(
    df, 'new_song_producer', 'sampled_artist_song')

In [30]:
# Preview the first rows of the producer x sampled-song matrix.
user_song.head()

sampled_artist_song,"""Haare"" Ensemble - wo geh ich hin","""Sleepy"" John Estes - milk cow blues","""Sweet"" Charles Sherrell - hang out & hustle","""Sweet"" Charles Sherrell - outa sight outa mind","""Sweet"" Charles Sherrell - soul man","""Sweet"" Charles Sherrell - strangers in the night","""Sweet"" Charles Sherrell - yes its you",$Ugga & Spice - yes we can remix,$uicideboy$ - black beard,$uicideboy$ - gorilla warfare,...,Юрий Антонов - маки,פרחי ניו יורק - russian interlude,ゴジラ 地球最大の決戦 - venusian talks about ghidorah,姚蘇蓉 - 夜歸人,山根麻以 - emi,新必殺仕事人 - 対決,方瑞娥 - 青蚵仔嫂,童丽 - 伶人歌,鄧麗君 - 我還是永遠愛著你,마음과 마음 - 밤의찬가2
new_song_producer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Buttnaked"" Tim Dawg",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""Coach"" Frank Johnson",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""Dope Mix"" Dave",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""House Master"" Baldwin",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""Spinnin' Wheel"" Bill",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Get best value from grid search, and use for prod_lim/ art_lim search 

In [5]:
# Hyperparameters are hard-coded here; the grid-search lookup that originally
# selected them is left disabled below.
#min_idx = grid_search_three_params.rank_score.idxmin()

# Passed as `factors` to the ALS model.
best_factors = 20

# Passed as `regularization` to the ALS model.
best_lambda = 30

# Passed as `iterations` to the ALS model.
best_iters = 45

## Train model on entire dataset

In [6]:
# Train the ALS model on the producer x sampled-song utility matrix.

# Filter with thresholds 5 (producers) and 0 (songs). The row-sum output
# further down bottoms out at 6, so the thresholds are presumably exclusive
# (more than 5 samples per producer, more than 0 per song) -- TODO confirm
# against filter_dataset_by_requisite_interactions.
# The same frame is passed twice; only the first returned value is kept.
user_song_filt, _, _, _ = filter_dataset_by_requisite_interactions(
        user_song, user_song, 5, 0)

# NOTE(review): the transpose to item x user below is disabled, so the matrix
# handed to fit() has producers as rows. Which axis implicit treats as "items"
# depends on the installed implicit version -- confirm before upgrading.
#song_user_filt = user_song_filt.T 

model = implicit.als.AlternatingLeastSquares(
        factors=best_factors, iterations=best_iters, regularization= best_lambda)

# Fit on the sparse (CSR) form of the filtered matrix, rows = producers.
sparse_user_song_filt = csr_matrix(user_song_filt)
model.fit(sparse_user_song_filt)

# Dense score matrix: one row per row of model.item_factors, one column per
# row of model.user_factors. Later cells pass its transpose to the
# recommendation helper.

predictions_user_song = model.item_factors.dot(model.user_factors.T)

100%|██████████| 45.0/45 [00:13<00:00,  3.42it/s]


In [40]:
# Column sums of the filtered matrix (one total per sampled song), largest first.
user_song_filt.sum(axis = 0).sort_values(ascending = False)

sampled_artist_song
Beside - change the beat                                   1152
James Brown - funky drummer                                1017
Lyn Collins - think about it                                826
James Brown - funky president people its bad                649
Doug E. Fresh - la di da di                                 643
The Winstons - amen, brother                                628
The Honey Drippers - impeach the president                  510
Melvin Bliss - synthetic substitution                       477
Public Enemy - bring the noise                              474
Mountain - long red                                         457
Run-DMC - here we go live at the funhouse                   448
The Mohawks - the champ                                     387
Funk, Inc. - kool is back                                   383
Sly & the Family Stone - sing a simple song                 329
Bobby Byrd - hot pants bonus beats                          310
Skull Snaps - its a 

In [41]:
# Row sums of the filtered matrix (one total per producer), largest first.
user_song_filt.sum(axis = 1).sort_values(ascending = False)

new_song_producer
Madlib              1621
DJ Premier          1408
J Dilla             1229
9th Wonder          1125
Pete Rock           1067
The Alchemist       1008
DJ Paul              973
Dr. Dre              912
Juicy J              850
Marley Marl          832
Kanye West           759
Erick Sermon         709
RZA                  678
Prince Paul          666
Statik Selektah      606
Knxwledge.           603
Buckwild             539
Diamond D            535
DJ Muggs             475
Guru                 444
Jermaine Dupri       437
Ice Cube             425
Timbaland            415
Teddy Riley          404
Simon Harris         390
Large Professor      382
The Beatnuts         378
Sir Jinx             375
De La Soul           356
DJ Yella             347
                    ... 
Dave Ogilvie           6
Carnage                6
O.Gee                  6
OB O'Brien             6
Olaf Dieckmann         6
Carlos Bonell          6
Ollie Brown            6
Nasty Habits           6
Chad Ja

In [49]:
# Top-10 song recommendations for one producer, shown as a one-column table.
prod = "Madlib"

# The helper receives the transposed prediction matrix and the filtered
# utility matrix. NOTE(review): the meaning of filter=False is defined in the
# project helper and is not visible here -- confirm.
pd.DataFrame(get_top_recommends_not_yet_sampled_for_user(prod,
    10,
    predictions_user_song.T,
    user_song_filt,
    filter=False),
             columns = ['Recommended Songs for {}'.format(prod)])



Unnamed: 0,Recommended Songs for Madlib
0,Mountain - long red
1,Beside - change the beat
2,The Mohawks - the champ
3,Ronnie Gee - raptivity
4,James Brown - funky president people its bad
5,Big Daddy Kane - just rhymin with biz
6,Joe Tex - papa was too
7,Doug E. Fresh - la di da di
8,Djamel Allam - rani lah
9,Malcolm McLaren - buffalo gals


In [59]:
# Top-10 song recommendations for a second producer; same call as the Madlib
# cell with only `prod` changed.
prod = "Kanye West"

# NOTE(review): the meaning of filter=False is defined in the project helper
# and is not visible here -- confirm.
pd.DataFrame(get_top_recommends_not_yet_sampled_for_user(prod,
    10,
    predictions_user_song.T,
    user_song_filt,
    filter=False),
             columns = ['Recommended Songs for {}'.format(prod)])



Unnamed: 0,Recommended Songs for Kanye West
0,Mountain - long red
1,Doug E. Fresh - la di da di
2,James Brown - funky drummer
3,James Brown - funky president people its bad
4,Lyn Collins - think about it
5,Melvin Bliss - synthetic substitution
6,Young Thug - some more
7,The Mohawks - the champ
8,Beside - change the beat
9,Drake - jumpman


In [15]:
def get_similar_to_prod(user, n_similar, model, user_item):
    """Return the index labels of ``user_item`` most similar to ``user``.

    Parameters
    ----------
    user : label
        A label present in ``user_item.index``.
    n_similar : int
        Number of neighbours requested from the model (forwarded as ``N``).
    model : object
        Fitted model exposing ``similar_items(index, N=...)``. Two return
        shapes are accepted: a sequence of ``(id, score)`` pairs, or an
        ``(ids, scores)`` tuple of parallel arrays.
    user_item : pandas.DataFrame
        Frame whose index positions are assumed to line up with the ids the
        model returns (i.e. the model was fitted with these rows as its
        "items") -- verify against how ``model`` was trained.

    Returns
    -------
    list
        Index labels in the order the model returned them. The model decides
        whether the queried label itself is part of the result.

    Raises
    ------
    KeyError
        If ``user`` is not in ``user_item.index``.
    """
    # Positional id of the requested label.
    index_of_user = user_item.index.get_loc(user)

    similar = model.similar_items(index_of_user, N=n_similar)

    # An (ids, scores) tuple of arrays must not be iterated pairwise: doing so
    # would yield [ids[0], scores[0]] instead of the neighbour ids.
    is_parallel_arrays = (
        isinstance(similar, tuple)
        and len(similar) == 2
        and not isinstance(similar[0], tuple)
    )
    if is_parallel_arrays:
        neighbour_ids = similar[0]
    else:
        neighbour_ids = [pair[0] for pair in similar]

    # int() normalises numpy integer ids before positional lookup.
    return [user_item.index[int(n)] for n in neighbour_ids]

In [56]:
# Producers the model considers closest to the chosen producer.
prod = "Kanye West"

# Ten neighbours are requested and the first is dropped with [1:] (presumably
# the queried producer itself), so nine rows are displayed.
pd.DataFrame(get_similar_to_prod(prod, 10, model, user_song_filt)[1:],
             columns = ['Producers Most Similar {}'.format(prod)])

Unnamed: 0,Producers Most Similar Kanye West
0,Charlie Bisharat
1,John Tesh
2,Illangelo
3,Jon Brion
4,Dave Sardy
5,Sir Nolan
6,Hollywood Hank
7,Larry Anschell
8,The Mekanics


## Score song model

I am using the best parameters from the producer-artist grid search. Without filtering, the rank score is .26 for the model and .16 for the popularity baseline. 
When I filter to producers with at least 5 samples, the scores are more accurate: 0.15 for the model and 0.08 for the popularity baseline. 

In [23]:
# Choose the held-out (user, item) positions from the unfiltered utility
# matrix, then build the train and test matrices from them.
# NOTE(review): the meaning of the second argument (5) is defined in the
# project helper and is not visible here -- confirm.
user_inds, item_inds = get_indices_of_test_set_values(user_song, 5)
train, test = make_train_set_and_test_set(user_inds, item_inds, user_song)

NameError: name 'user_art' is not defined

In [57]:
# Apply the same interaction thresholds (5, 0) used for the full-data model to
# the train/test pair; the helper also returns the matching user/item indices.
train_lim, test_lim, user_inds_lim, item_inds_lim = filter_dataset_by_requisite_interactions(
    train, test, 5, 0)

In [58]:
# Score a model built with the chosen hyperparameters on the filtered split;
# the helper returns the model's rank score and a popularity-baseline score.
rank_score, pop_rank_score = get_rank_and_pop_score_from_train_test_model(
    train_lim, test_lim, user_inds_lim, item_inds_lim, best_factors, best_lambda, best_iters)

# Report both scores, one per line.
print(
"Model rank score: {} \n\
Popularity rank score: {}".format(rank_score, pop_rank_score))



100%|██████████| 45.0/45 [00:13<00:00,  3.22it/s]


Model rank score: 0.1524071729776384 
Popularity rank score: 0.08880592165645203
