# Recommender System Small Scale Testing

## Import Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
import graphlab as gl
import random

## Upload User Data

Since the complete song-user matrix is extremely large, I will randomly sample rows from the locally stored CSV file for training and testing, rather than loading the entire file.

In [None]:
#Set random seed so the sampled rows are reproducible

random.seed(21)

#Find cleaned user song matrix

file = ('/Users/ivettetapia 1/Symbolic Link Seagate Drive/Springboard/Capstone 1_Mus_Recomend/Data/msd_song_user_matrix.csv')

#Count number of records: total lines in the file minus the header line.
#The file is opened in a `with` block so the handle is closed after counting.

with open(file) as csv_lines:
    n = sum(1 for line in csv_lines) - 1 #number of records in file (excludes header)

s = 20000 # sample size to draw

# Create random skip pattern for pandas.
# Line 0 is the header and is never skipped; lines 1..n are the data rows.
# Skipping n-s of them leaves exactly s rows to be read.
# max(..., 0) keeps every row when the file holds fewer than s records
# (random.sample raises on a negative sample size).

skip = sorted(random.sample(xrange(1,n+1),max(n-s, 0)))

In [None]:
# Read the CSV while passing over every line number listed in `skip`,
# so only the randomly retained rows end up in the DataFrame.
read_options = dict(na_values='NaN', index_col=0, skiprows=skip)
user_song_matrix = pd.read_csv(file, **read_options)

# Preview the first rows of the sample.
user_song_matrix.head()

In [None]:
# Print the column dtypes and non-null counts of the sampled frame.
user_song_matrix.info()

Convert object columns to strings and float columns to ints. This facilitates conversion to GraphLab's SFrame format: an SFrame will not accept a pandas column whose dtype is `object`.

In [None]:
# Identifier and text columns, cast together to fixed-width byte strings ('|S250').
text_columns = ['artist_id','artist_name','release','song_id','title','user_id']

user_song_matrix[text_columns] = user_song_matrix[text_columns].astype('|S250')

In [None]:
# Play indicator and play count columns, cast from float to int.
count_columns = ['play','play_count']

user_song_matrix[count_columns] = user_song_matrix[count_columns].astype('int')

## Convert Pandas DataFrame to SFrame

Convert the pandas DataFrame to GraphLab's SFrame format in order to use the library's recommendation algorithms.

In [None]:
# Build a GraphLab SFrame from the cleaned pandas frame; the split and models below use it.
sample_sf = gl.SFrame(data=user_song_matrix)

# Display the SFrame.
sample_sf

## Create random 70%/30% train - test split

In [None]:
# Randomly split the rows with fraction .7; the fixed seed makes the split repeatable.
train, test = sample_sf.random_split(.7, seed=5)

# Report the number of rows in each split.
print(len(train), len(test))

## Create Baseline: Song Popularity Based Recommender

This model is used as a frame of reference for the subsequent approaches. This recommender does not have any personalization at all: it recommends the most popular items to users.

Model Documentation: https://turi.com/products/create/docs/generated/graphlab.recommender.popularity_recommender.create.html#graphlab.recommender.popularity_recommender.create

*Create Song Popularity Recommender*

In [None]:
# Baseline model: popularity recommender over songs, fitted on the training split.
song_popularity = gl.popularity_recommender.create(
    train,
    user_id='user_id',
    item_id='song_id',
)

**Evaluate Song Popularity Recommender Precision and Recall on Test Data**

In [None]:
# Compute precision and recall of the song popularity model against the test split.
eval_song_pop = song_popularity.evaluate_precision_recall(test)

# Display the evaluation result.
eval_song_pop

See top 5 recommended songs.

In [None]:
# Request k=5 recommendations from the song popularity model.
# No users are passed — presumably this covers every user seen in training (confirm).
song_popularity.recommend(k=5)

## Create Baseline: Artist Popularity Based Recommender

In [None]:
# Keep only rows whose artist_id is not the literal string 'nan'.
# NOTE(review): presumably missing artist ids became 'nan' in the earlier string cast — confirm.
has_artist_id = user_song_matrix['artist_id'] != 'nan'
user_song_matrix_art = user_song_matrix[has_artist_id]

# Convert the filtered frame to an SFrame and split it with the same
# fraction and seed used for the song-level split.
sample2_sf = gl.SFrame(data=user_song_matrix_art)
train_art, test_art = sample2_sf.random_split(.7, seed=5)

print(len(train_art), len(test_art))

This model is used as a frame of reference for the subsequent approaches. This recommender does not have any personalization at all: it recommends the most popular artists to users.

Model Documentation: https://turi.com/products/create/docs/generated/graphlab.recommender.popularity_recommender.create.html#graphlab.recommender.popularity_recommender.create

*Create Artist Popularity Recommender*

In [None]:
# Baseline model: popularity recommender where the recommended "item" is the artist.
artist_popularity = gl.popularity_recommender.create(
    train_art,
    item_id='artist_id',
    user_id='user_id',
)

**Evaluate Artist Popularity Recommender Precision and Recall**

In [None]:
# Compute precision and recall of the artist popularity model against the artist test split.
eval_artist_pop = artist_popularity.evaluate_precision_recall(test_art)

# Display the evaluation result.
eval_artist_pop

*Find top 5 artist recommendations*

In [None]:
# Request k=5 artist recommendations from the artist popularity model.
# No users are passed — presumably this covers every user seen in training (confirm).
artist_popularity.recommend(k=5)

## Content - Based Recommender

Model Documentation:

https://turi.com/products/create/docs/generated/graphlab.recommender.item_content_recommender.create.html#graphlab.recommender.item_content_recommender.create

In [None]:
# Columns describing each song (artist, release and audio attributes), used as item content.
item_content_columns = [
    'artist_familiarity', 'artist_hotttnesss', 'artist_id', 'artist_name',
    'release', 'song_hotttnesss', 'song_id', 'title', 'duration', 'key',
    'loudness', 'mode', 'tempo', 'time_signature', 'year',
]
# NOTE(review): these rows come straight from the training split, so a song may
# appear on more than one row — confirm whether the item data should be de-duplicated.
train_item_data = train[item_content_columns]

# User-song interactions taken from the same training split.
interaction_columns = ['user_id', 'song_id', 'play']
obs_data = train[interaction_columns]

In [None]:
# Content-based model: built from the item content table plus the observed interactions.
content = gl.item_content_recommender.create(
    train_item_data,
    item_id='song_id',
    observation_data=obs_data,
    user_id='user_id',
)

**Evaluate Content-Based Recommender Precision and Recall**

In [None]:
# Compute precision and recall of the content-based model against the test split.
eval_content = content.evaluate_precision_recall(test)

# Display the evaluation result.
eval_content

*Find top 5 recommendations per user*

In [None]:
# Request k=5 recommendations from the content-based model.
# No users are passed — presumably this covers every user in the observation data (confirm).
content.recommend(k=5)

## Item - Item Recommender

Documentation for this model:

https://turi.com/products/create/docs/generated/graphlab.recommender.item_similarity_recommender.create.html#graphlab.recommender.item_similarity_recommender.create

*Create Item - Item Recommender*

In [None]:
# Item-item collaborative filtering model using Pearson similarity between songs.
# NOTE(review): no `target` column is passed — confirm Pearson similarity
# behaves as intended without one.
item_rec = gl.item_similarity_recommender.create(
    train,
    item_id='song_id',
    user_id='user_id',
    similarity_type='pearson',
)

**Evaluate Item - Item Recommender Precision and Recall on Test Data**

In [None]:
# Compute precision and recall of the item-item model against the test split.
# This must call item_rec: evaluating `content` here would make the
# "Item-Item" results a copy of the content-based ones.
eval_item = item_rec.evaluate_precision_recall(test)

# Display the evaluation result.
eval_item

*Top 5 recommendations per user*

In [None]:
# Request k=5 recommendations from the item-item model.
# No users are passed — presumably this covers every user seen in training (confirm).
item_rec.recommend(k=5)

## Ranking Matrix Factorization

Matrix factorization algorithm for implicit data. It produces a ranked list of the top-n items, since predicting 'ratings' does not make sense for implicit data.

Model documentation:

https://turi.com/products/create/docs/generated/graphlab.recommender.ranking_factorization_recommender.create.html#graphlab.recommender.ranking_factorization_recommender.create

In [None]:
# Restrict the training split to the user, song and play columns for the factorization model.
data_matfact = train[['user_id','song_id','play']]

In [None]:
# Ranking factorization model fitted on the user/song/play training columns.
# NOTE(review): 'play' is present in the data but not passed as `target` —
# confirm that this is intended.
matfac_rec = gl.ranking_factorization_recommender.create(
    data_matfact,
    item_id='song_id',
    user_id='user_id',
)

In [None]:
# Compute precision and recall of the ranking factorization model against the test split.
eval_matfact = matfac_rec.evaluate_precision_recall(test)

# Display the evaluation result.
eval_matfact

*Top 5 recommendations per user*

In [None]:
# Request k=5 recommendations from the ranking factorization model.
# No users are passed — presumably this covers every user seen in training (confirm).
matfac_rec.recommend(k=5)

## Graph Precision and Recall Curves

Explore how to get evaluation results out.

In [None]:
# Show what kind of object the evaluation call returned...
print(type(eval_song_pop))

# ...and which keys it exposes ('precision_recall_overall' is the one used below).
eval_song_pop.keys()

Create Columns

In [None]:
def _label_column(label, eval_result):
    """Return an SArray repeating `label` once per row of the overall precision-recall table.

    The length is taken from the table itself instead of a hard-coded row
    count, so the label column always matches the table it is added to.
    """
    row_count = len(eval_result['precision_recall_overall'])
    return gl.SArray([label] * row_count)


# One identifying label column per recommender, sized to that recommender's results.
c_songpop = _label_column('Song Popularity', eval_song_pop)
c_artpop = _label_column('Artist Popularity', eval_artist_pop)
c_cont = _label_column('Content-Based', eval_content)
c_item = _label_column('Item-Item', eval_item)
c_matfact = _label_column('Matrix Factorization (SVD)', eval_matfact)

Add Identifying Columns & Convert to DataFrame

In [None]:
def _labelled_precision_recall(eval_result, label):
    """Return the overall precision-recall table as a DataFrame tagged with `label`.

    The table is converted to pandas first and the 'Recommender Type' column
    is added to that copy. The evaluation result itself is left untouched, so
    this cell can be re-run, and the label needs no pre-sized column.
    """
    overall = eval_result['precision_recall_overall'].to_dataframe()
    overall['Recommender Type'] = label
    return overall


pres_recall1 = _labelled_precision_recall(eval_song_pop, 'Song Popularity')

pres_recall2 = _labelled_precision_recall(eval_artist_pop, 'Artist Popularity')

pres_recall3 = _labelled_precision_recall(eval_content, 'Content-Based')

pres_recall4 = _labelled_precision_recall(eval_item, 'Item-Item')

pres_recall5 = _labelled_precision_recall(eval_matfact, 'Matrix Factorization (SVD)')

Combine the per-recommender DataFrames into a single DataFrame

In [None]:
# Stack the five labelled precision-recall tables into one long frame.
frames = [pres_recall1, pres_recall2, pres_recall3, pres_recall4, pres_recall5]

# ignore_index=True gives the combined frame a fresh 0..N-1 index; without it
# every per-model frame contributes the same index labels, leaving duplicates.
precision_recall_overall = pd.concat(frames, ignore_index=True)

precision_recall_overall.head()

In [None]:
%matplotlib inline

sns.set_style("whitegrid")

sns.despine()

sns.lineplot(x='recall', y='precision', data = precision_recall_overall, 
             hue='Recommender Type', markers=True)

plt.title('Precision-Recall Comparisons \n')

plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

#plt.xticks(np.arange(0, 1.2, step=0.2))
#plt.yticks(np.arange(0, 1.2, step=0.2))

plt.show()

## IDEAS & NEXT STEPS

+ Add more data. Find out how much my machine can handle.
+ Parameter Tuning - *particularly item based, matrix factorization.*
+ Add interaction confidence? (matrix factorization)
+ Cross - Validation (kfolds = n)
+ Add other accuracy metrics
+ Bring other side information about items to improve performance (artist tags)?

# End of Notebook