## This notebook lays out the steps we have been using to run Hermes

### It saves intermediate products and then reads those products back into the context for the next step
### This helps prevent any losses during the run

### First set up your Spark context (if necessary) and load hermes.zip into the context
### hermes.zip can be found on GitHub at https://github.com/Lab41/hermes.git

In [None]:
#This block of code will set up a Spark context and SQL context if you are running locally
#If you are on a cluster or have deployed Spark a different way, you don't need this
from pyspark import SparkContext, SparkConf 
from pyspark.sql import SQLContext 

# Reuse the SparkContext that is already active in this process if there is
# one (for example when this cell is re-run); otherwise start a new one.
# getOrCreate() is the public API for this: unlike a bare try/except around
# SparkContext(), it does not hide genuine start-up errors.
sc = SparkContext.getOrCreate()

# SQL context built on top of the Spark context; used below to read the JSON data files
sqlCtx = SQLContext(sc)

In [None]:
# Add the hermes.zip archive to the Spark context.
# NOTE(review): the `src` package imported on the next line presumably comes from
# this archive, so the addPyFile call has to run before the import - confirm.
sc.addPyFile('hermes/hermes.zip')
from src import hermes_run_script
import pandas as pd

### Read in the data - for instance, the MovieLens data can be found at: http://grouplens.org/datasets/movielens/

### The data then needs to be transformed into JSON files. Each dataset has its own ETL folder in hermes/src/utils
### Once the data has been transformed, load the JSON files

In [None]:
# Read the gzipped JSON files produced by the MovieLens ETL step
movies = sqlCtx.read.json('movielens_20m_movies.json.gz')
ratings = sqlCtx.read.json('movielens_20m_ratings.json.gz')

# The best tag set we found is the one in MovieLens 20M, and it is usable for all MovieLens datasets
tags = sqlCtx.read.json('movielens_20m_tags.json.gz')

### Set up all of the parameters necessary for the runner

In [None]:
# Name of the dataset: used to get the correct vectorizer and when saving files
data_name = 'movielens_20m'

# Directory where intermediate files are saved, including user vectors,
# content vectors, and predictions. This can be HDFS.
directory = 'HDFS/movielens/data'
# Directory for the csv results files. This should not be HDFS.
results_directory = 'movielens/results'

# Types of user vectors to assess (each dataset offers different choices)
user_vector_types = [
    'ratings',
    'pos_ratings',
    'ratings_to_interact',
]
# Types of content vectors to assess (each dataset offers different choices)
content_vector_types = [
    'genre',
    'tags',
]

# Collaborative filtering algorithms to run
cf_predictions = [
    'cf_mllib',
    'cf_item',
    'cf_user',
]
# Content based algorithms to run
cb_predictions = [
    'cb_vect',
    'cb_kmeans_100',
    'cb_kmeans_1000',
]

# Number of predictions to give to a user
result_runs = [100, 1000]

# Extra items needed to build the content vectors.
# For MovieLens the tag content vector needs the user tags, so they are passed along here.
# NOTE(review): num_tags is presumably the number of tags to use - confirm against the vectorizer.
support_files = dict(num_tags=300, tag_rdd=tags)

### Pass all of the variables into the Hermes runner

In [None]:
# Create the Hermes runner from the loaded data, the Spark handles and the parameters set above.
# The entries of support_files are passed along as additional keyword arguments.
runner = hermes_run_script.hermes_run(
    ratings,
    movies,
    user_vector_types,
    content_vector_types,
    sqlCtx,
    sc,
    data_name,
    directory,
    results_directory,
    cf_predictions,
    cb_predictions,
    result_runs,
    num_partitions=30,
    **support_files
)

In [None]:
# Run the vectorizers.
# NOTE(review): presumably builds the user and content vectors configured on the
# runner and saves them under `directory` as intermediate files - confirm.
runner.run_vectorizer()

In [None]:
# Run the collaborative filtering algorithms.
# NOTE(review): presumably the ones listed in cf_predictions, with the predictions
# saved under `directory` as intermediate files - confirm.
runner.run_cf_predictions()

In [None]:
# Run the content based algorithms.
# NOTE(review): presumably the ones listed in cb_predictions, with the predictions
# saved under `directory` as intermediate files - confirm.
runner.run_cb_predictions()

In [None]:
# Get the results for the collaborative filtering predictions.
# NOTE(review): presumably written as csv files to results_directory - confirm.
runner.run_cf_results()

In [None]:
# Get the results for the content based predictions.
# NOTE(review): presumably written as csv files to results_directory - confirm.
runner.run_cb_results()

In [None]:
# Consolidate all of the results into a single csv file.
# The next cell reads that file back in with pandas.
runner.consolidate_results()

### View the results

In [10]:
# Build the path of the consolidated results csv and load it, using the first column as the index.
# NOTE(review): no path separator is inserted between results_directory and data_name;
# confirm this matches the location consolidate_results() writes to.
full_results_path = '{}{}_full_results.csv'.format(results_directory, data_name)
results = pd.read_csv(full_results_path, sep=',', index_col=0)

In [16]:
# Columns to display; edit this list to view a different part (or all) of the results
columns_to_show = [
    'user_vector',
    'content_vector',
    'N',
    'alg_type',
    'serendipity',
    'cat_coverage',
    'rmse',
]
results[columns_to_show]

Unnamed: 0,user_vector,content_vector,N,alg_type,serendipity,cat_coverage,rmse
0,pos_ratings,genre,1000,cb_vect,0.12797,28.304557,0.579861
1,ratings,genre,100,cb_kmeans_100,0.229742,2.698327,1.300742
2,pos_ratings,genre,1000,cf_user,0.047091,28.304557,0.474636
3,ratings_to_interact,genre,1000,cb_vect,0.127515,26.98327,0.791903
4,pos_ratings,genre,100,cf_user,0.018797,2.830456,0.454204
5,ratings,genre,1000,cb_vect,0.101428,26.98327,1.198814
6,ratings,genre,100,cf_mllib,0.075411,2.698327,0.757524
7,ratings_to_interact,genre,100,cb_kmeans_100,0.274341,2.698327,0.820626
8,pos_ratings,tags,1000,cb_vect,0.087713,28.304557,0.488214
9,ratings_to_interact,genre,1000,cf_user,0.057605,26.98327,0.474519
