# Collaborative Filtering for Implicit Feedback Datasets

## Import libraries

In [1]:
from pymongo import MongoClient
client = MongoClient()
db = client.whosampled
import numpy as np
import pandas as pd

import implicit
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import scipy.sparse as sparse
from scipy.sparse import csr_matrix

import os, sys
os.environ["OPENBLAS_NUM_THREADS"]="1"

import random

np.set_printoptions(threshold=sys.maxsize)
pd.options.display.max_seq_items = 10000
from src.test_ranking import *

%reload_ext autoreload

## Read in the data from the Mongo collection

In [2]:
# Build the working DataFrame from the `main_redo` collection of the
# `whosampled` database using the project's clean-up helper.
df = clean_up_mongo_coll(db["main_redo"])

## Turn df into a utility matrix

In [3]:
# Utility matrix built from df by the project helper, keyed on the
# 'new_song_producer' and 'sampled_artist' fields.
# NOTE(review): presumably producers end up as rows and sampled artists as
# columns -- confirm against turn_df_to_util_mat.
user_art = turn_df_to_util_mat(df, "new_song_producer", "sampled_artist")

## Get the best values from the grid search, and use them for the prod_lim / art_lim search

In [4]:
# ALS hyperparameters, hard-coded here. An earlier revision looked up the best
# row with grid_search_three_params.rank_score.idxmin(); that lookup is no
# longer executed in this notebook.
#   best_factors -> `factors`        (passed to the ALS model below)
#   best_lambda  -> `regularization`
#   best_iters   -> `iterations`
best_factors, best_lambda, best_iters = 20, 30, 45

## Explore Model

In [5]:
# Fit the ALS model on the whole (filtered) dataset so its learned factors can
# be inspected.

# The same matrix is supplied for both matrix arguments, with thresholds 5 and 0.
# NOTE(review): the original note read "at least 6 samples per producer and at
# least 1 sample per artist", which suggests the thresholds are exclusive --
# confirm against filter_dataset_by_requisite_interactions.
user_art1, _, user_inds_lim, item_inds_lim = (
    filter_dataset_by_requisite_interactions(user_art, user_art, 5, 0)
)

# Transpose to item x user orientation, which is what gets handed to fit()
# (and what the model's recommend method expects, per the original note).
artist_user = user_art1.transpose()

model = implicit.als.AlternatingLeastSquares(
    factors=best_factors,
    iterations=best_iters,
    regularization=best_lambda,
)

# The model is trained on a sparse (CSR) matrix of item/user/confidence weights.
sparse_art_user = sparse.csr_matrix(artist_user)
model.fit(sparse_art_user)

# Predicted score matrix: entry [i, j] is the dot product of item factor i and
# user factor j (the user x item product, transposed).
predictions_art_user = np.transpose(
    model.user_factors.dot(model.item_factors.T)
)

100%|██████████| 45.0/45 [00:07<00:00,  5.77it/s]


In [6]:
def get_similar_to_prod(user, n_similar, model, artist_user):
    """Return the column labels of the users the model deems closest to `user`.

    Parameters
    ----------
    user : label
        A label present in ``artist_user.columns``.
    n_similar : int
        Forwarded to ``model.similar_users`` as ``N``.
    model : object
        Must expose ``similar_users(index, N=...)`` returning an iterable whose
        elements carry the neighbour's positional index at position 0.
    artist_user : object
        Must expose ``.columns`` supporting ``get_loc`` and positional
        indexing (a pandas DataFrame in this notebook).

    Returns
    -------
    list
        Labels from ``artist_user.columns``, in the order the model returned
        the neighbours. Nothing is filtered out here, so if the model includes
        the query user among its results, so does this list.
    """
    labels = artist_user.columns

    # The model works with positional indices, not labels.
    query_position = labels.get_loc(user)
    neighbours = model.similar_users(query_position, N=n_similar)

    # Keep only the index from each result and map it back to its label.
    matched_labels = []
    for neighbour in neighbours:
        matched_labels.append(labels[neighbour[0]])
    return matched_labels

In [7]:
# Producer to look up. The column header is derived from this value so the
# label always names the producer that was actually queried (the header
# previously said "Timbaland" while the query was "The Alchemist").
query_producer = "The Alchemist"

# Ask for 10 neighbours and drop the first one, leaving 9 rows.
# NOTE(review): the [1:] presumably removes the query producer itself, assuming
# the model ranks it first -- confirm.
pd.DataFrame(
    get_similar_to_prod(query_producer, 10, model, artist_user)[1:],
    columns=["Producers Most Similar to " + query_producer],
)

Unnamed: 0,Producers Most Similar to Timbaland
0,Sebb
1,Stu Bangas
2,Evidence
3,Hordatoj
4,"Davel ""Bo"" McKenzie"
5,Mastafive
6,Chinky P
7,Guts
8,SBe Audiologist
