In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise import KNNBasic
from surprise.model_selection import train_test_split
from surprise import dump
import csv
from surprise import accuracy
from pprint import pprint

<IPython.core.display.Javascript object>

In [3]:
csv_path = os.path.join("../data/csv/reviews_cleaned_reduced_500.csv")
TextFileReader = pd.read_csv(csv_path, chunksize=1000)  # the number of rows per chunk

dfList = []
for df in TextFileReader:
    dfList.append(df)

df = pd.concat(dfList,sort=False)


<IPython.core.display.Javascript object>

In [4]:
# load beers
csv_path = os.path.join("../data/csv/beers.csv")
beers_df = pd.read_csv(csv_path)

<IPython.core.display.Javascript object>

In [5]:
beers_df = beers_df.rename(columns={'id': 'beer_id'})

<IPython.core.display.Javascript object>

In [6]:
# Lets combine the dataframe
merge_df = pd.merge(df,
                 beers_df[['beer_id', 'name', 'style', 'brewery_id']],
                 on='beer_id')

<IPython.core.display.Javascript object>

In [7]:
# sample random trainset and testset method Using Cosine similarity
# test set is made of 25% of the ratings. we are looking at similarities between items (user_based=false)
reader=Reader(rating_scale=(0,5))
data = Dataset.load_from_df(merge_df[['username', 'beer_id', 'score']], reader)

trainset, testset = train_test_split(data, test_size=.25)

sim_options = {'name': 'cosine',
               'user_based': False
               }

# We'll use KNN.
algo = KNNBasic(min_k = 5, sim_options=sim_options)

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.fit(trainset).test(testset)

# Then compute RMSE
accuracy.rmse(predictions)
accuracy.mae(predictions)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.5578
MAE:  0.3934


0.39341180683491567

<IPython.core.display.Javascript object>

In [15]:
dumpfile = os.path.join('../data/dump/dump_knn_cosine_500dump_file1')
dump.dump(dumpfile, predictions, algo)

<IPython.core.display.Javascript object>

In [8]:
# Lets serialize and save this prediction algorithm
# Dump algorithm and reload it
file_name_algo = os.path.join('../data/dump/algo_knn_cosine_500dump_file')
dump.dump(file_name_algo, algo=algo)
file_name_pred = os.path.join('../data/dump/pred_knn_cosine_500dump_file')
dump.dump(file_name_pred, predictions=predictions)

<IPython.core.display.Javascript object>

In [9]:
# Code below identifes the top 10 best and worst predictions based upon code from this 
# notebook:https://nbviewer.jupyter.org/github/NicolasHug/Surprise/blob/master/examples/notebooks/KNNBasic_analysis.ipynb
def get_Iu(uid):
    """ return the number of items rated by given user
    args: 
      uid: the id of the user
    returns: 
      the number of items rated by the user
    """
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError: # user was not part of the trainset
        return 0
    
def get_Ui(iid):
    """ return number of users that have rated given item
    args:
      iid: the raw id of the item
    returns:
      the number of users that have rated the item.
    """
    try: 
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0

    
df_predict = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df_predict['Iu'] = df_predict.uid.apply(get_Iu)
df_predict['Ui'] = df_predict.iid.apply(get_Ui)
df_predict['err'] = abs(df_predict.est - df_predict.rui)
best_predictions = df_predict.sort_values(by='err')[:10]
worst_predictions = df_predict.sort_values(by='err')[-10:]   

<IPython.core.display.Javascript object>

In [10]:
# Best Predictions:
print(best_predictions)

                 uid     iid  rui  est  \
195564  DrunkyBuddha    1558  5.0  5.0   
18707   DrunkyBuddha   47658  5.0  5.0   
81658         CEDAMA    2093  5.0  5.0   
52336   DrunkyBuddha  128500  5.0  5.0   
17487         CEDAMA   35732  5.0  5.0   
33449      100200300     103  3.0  3.0   
44746   DrunkyBuddha  111616  5.0  5.0   
85084       bmcduff2     104  1.0  1.0   
134924      bmcduff2   29015  1.0  1.0   
9474       100200300    2318  3.0  3.0   

                                          details   Iu    Ui  err  
195564  {'actual_k': 40, 'was_impossible': False}   64  1436  0.0  
18707   {'actual_k': 40, 'was_impossible': False}   64  1321  0.0  
81658   {'actual_k': 40, 'was_impossible': False}  105  2811  0.0  
52336   {'actual_k': 40, 'was_impossible': False}   64  1047  0.0  
17487   {'actual_k': 40, 'was_impossible': False}  105   884  0.0  
33449   {'actual_k': 40, 'was_impossible': False}   44  1335  0.0  
44746   {'actual_k': 40, 'was_impossible': False}   64   897 

<IPython.core.display.Javascript object>

In [11]:
# Worst Predictions:
print(worst_predictions)

                    uid     iid   rui       est  \
226273       Mtnbeerman    1320  1.00  4.466261   
114238          bmur112    3734  1.00  4.482400   
217563  SteelCityHops89    5428  1.00  4.485264   
36981        JackOGreen  118987  1.04  4.531219   
215557         aliving1    1320  1.00  4.495263   
179874         dogfish7     639  1.00  4.499617   
746          Patlee3324   29015  1.00  4.509593   
171643     tat2dhllblly    1320  1.00  4.539816   
210135   captainbearcat      65  1.00  4.544256   
91491    SinjaminBentek   35036  1.00  4.683507   

                                          details   Iu    Ui       err  
226273  {'actual_k': 30, 'was_impossible': False}   30  1178  3.466261  
114238  {'actual_k': 40, 'was_impossible': False}  162   555  3.482400  
217563  {'actual_k': 34, 'was_impossible': False}   34  1375  3.485264  
36981   {'actual_k': 40, 'was_impossible': False}   46   377  3.491219  
215557  {'actual_k': 40, 'was_impossible': False}   57  1178  3.495263  


<IPython.core.display.Javascript object>

In [12]:
def get_beer_name (beer_raw_id):
    beer_name = beers_df.loc[beers_df.beer_id==beer_raw_id,'name'].values[0]
    return beer_name

def get_beer_style (beer_raw_id):
    beer_style = beers_df.loc[beers_df.beer_id==beer_raw_id,'style'].values[0]
    return beer_style

def get_beer_score_mean (beer_raw_id):
    score_mean = mean_score.loc[mean_score.beer_id==beer_raw_id,'score'].values[0]
    return score_mean

def get_beer_neighbors (beer_raw_id):
    beer_inner_id = algo.trainset.to_inner_iid(beer_raw_id)
    beer_neighbors = algo.get_neighbors(beer_inner_id, k=5)
    beer_neighbors = (algo.trainset.to_raw_iid(inner_id)
                  for inner_id in beer_neighbors)
    return(beer_neighbors)

def get_beer_recc_df (beer_raw_id):
    beer_inner_id = algo.trainset.to_inner_iid(beer_raw_id)
    beer_neighbors = algo.get_neighbors(beer_inner_id, k=10)
    beer_neighbors = (algo.trainset.to_raw_iid(inner_id)
                      for inner_id in beer_neighbors)
    beers_id_recc = []
    beer_name_recc =[]
    beer_style_recc = []
    beer_score_mean = []
    for beer in beer_neighbors:
        beers_id_recc.append(beer)
        beer_name_recc.append(get_beer_name(beer))
        beer_style_recc.append(get_beer_style(beer))
        beer_score_mean.append(get_beer_score_mean(beer))
    beer_reccomendations_df = pd.DataFrame(list(zip(beers_id_recc,beer_name_recc,beer_style_recc,beer_score_mean)),
                                       columns=['beer_id', 'name', 'style', 'score_mean'])
    return beer_reccomendations_df
    
def get_inner_ids(riids):
    inner_ids = []
    for riid in riids:
        inner_ids.append(trainset.to_inner_iid(riid))
    return inner_ids
        

<IPython.core.display.Javascript object>

In [13]:
#Create mean score dataframe and get riids (riids = raw beers ids)
mean_score = merge_df.groupby('beer_id', as_index=False)[['score']].mean()
riids = mean_score['beer_id']
riids = riids.to_list()
#Get the beer inner ids
inner_ids = get_inner_ids(riids)

<IPython.core.display.Javascript object>

In [14]:
# Save the df_predict and df_ids for later use
df_predict.to_csv("../data/csv/df_predict_cosine_500.csv",index=False)
df_ids = pd.DataFrame(list(zip(riids,inner_ids)), columns=['beer_id', 'inner_ids'])
df_ids.to_csv("../data/csv/df_ids_cosine_500.csv",index=False)

<IPython.core.display.Javascript object>