In [1]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib as plt
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import sys
import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType
from pyspark.sql.functions import col, explode
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
# Recall y precision
from reco_utils.evaluation.spark_evaluation import SparkRatingEvaluation, SparkRankingEvaluation


print("System version: {}".format(sys.version))
print("Spark version: {}".format(pyspark.__version__))

System version: 3.9.5 (default, May 18 2021, 12:31:01) 
[Clang 10.0.0 ]
Spark version: 3.1.2


In [2]:
#Load data

path='./DATA'
file = os.path.join(path, "users_bgs.csv")

In [3]:
df=pd.read_csv(file, usecols=['u_id','bgg_id','Game','category','Your Rating'], low_memory=False)
df.head()

Unnamed: 0,u_id,Game,Your Rating,bgg_id,category
0,0,1830: Railways & Robber Barons,-1,421,102110341011
1,0,18AL,7,2612,1021112010341011
2,0,2 de Mayo,6,36522,10511019
3,0,23,7,103651,1002
4,0,6 nimmt!,7,432,10021098


In [4]:
#Transform explicit values to implicit values
df['Rating'] = df['Your Rating'].apply(lambda x: 1 if x>0 else 0)

#### Remove users who only have one board game

In [5]:
#Group by 'user_id'
p=df.groupby('u_id').count()

In [6]:
# Visualise users with a single board game
(p['bgg_id']==1).sum()

6

In [7]:
# Remove the user_id with a single board game
bgu=list(p[p['bgg_id']==1].index)

dfm = df[~df.u_id.isin(bgu)]

#### Modified the id of games and users so that they are correlative

In [8]:
# Create a dictionary for games titles and ids

bgg_id=list(dfm['bgg_id'])
game=list(dfm['Game'])

item_dict={}

for key in bgg_id:
    for value in game:
        item_dict[key] = value
        game.remove(value)
        break 

In [9]:
print(len(set(item_dict.keys())))
print(len(set(item_dict.values())))

36535
36535


In [10]:
# Resample
resample_id_item_dict={}
for index, key in enumerate(item_dict.keys()):
    resample_id_item_dict[key]= index
    

In [11]:
resample_item_dict = {resample_id_item_dict[k]:v for k,v in item_dict.items()}
assert(len(set(resample_item_dict.keys())) == len(set(resample_item_dict.values())))

In [12]:
# Create a dictionary for resample users id

u_id=list(dfm['u_id'].unique())
resample_user_dict={}

for index,key in enumerate(u_id):
    resample_user_dict[key] = index

In [13]:
print(len(set(resample_user_dict.keys())))
print(len(set(resample_user_dict.values())))

2844
2844


In [14]:
# Copy and apply the changes
dfm_r=dfm.copy()
dfm_r['item_id']=dfm['bgg_id'].apply(lambda x: resample_id_item_dict[x])
dfm_r['user_id']=dfm['u_id'].apply(lambda x: resample_user_dict[x])
dfm_r.describe()

Unnamed: 0,u_id,Your Rating,bgg_id,Rating,item_id,user_id
count,838777.0,838777.0,838777.0,838777.0,838777.0,838777.0
mean,1404.570457,3.42304,91310.6183,0.545138,4722.867555,1403.031329
std,813.788285,4.159144,88829.182464,0.497959,6316.864111,812.113149
min,0.0,-1.0,1.0,0.0,0.0,0.0
25%,700.0,-1.0,9209.0,0.0,680.0,700.0
50%,1375.0,5.0,55952.0,1.0,2185.0,1374.0
75%,2087.0,7.0,163976.0,1.0,6332.0,2086.0
max,2851.0,10.0,332853.0,1.0,36534.0,2843.0


In [15]:
#dfm_r.to_csv('DATA/r_users_bgs.csv')
#path='./DATA'
#file = os.path.join(path, "r_users_bgs.csv")
#dfm_r=pd.read_csv(file, low_memory=False)
#dfm_r.head()

#### Format Data for models

In [16]:
# DF Ratings

dfratings = pd.DataFrame()

dfratings['user_id']=dfm_r['user_id']
dfratings['item_id']=dfm_r['item_id']
dfratings['rating']=dfm_r['Rating']

print(dfratings.shape)
dfratings.head()

(838777, 3)


Unnamed: 0,user_id,item_id,rating
0,0,0,0
1,0,1,1
2,0,2,1
3,0,3,1
4,0,4,1


In [17]:
# Prepare DF BGames

dfbg = dfm_r.groupby(by=['bgg_id','category'], as_index=False).first()
dfbg.drop(['bgg_id','u_id','Your Rating','Rating', 'user_id'],axis=1, inplace=True)

In [18]:
# DF Board Games

dfgames = pd.DataFrame()

dfgames['item_id']=dfbg['item_id']
dfgames['title']=dfbg['Game']
dfgames['category']=dfbg['category']

print(dfgames.shape)
dfgames.head()

(36535, 3)


Unnamed: 0,item_id,title,category
0,747,Die Macher,102110261001
1,5896,Dragonmaster,10021010
2,292,Samurai,10091035
3,6308,Tal der Könige,1050
4,15364,Mare Mediterraneum,10151008


## Content Based

A popular technique in recommender systems is content-based filtering. Content here refers to the attributes of products that a user likes. Thus, the idea in content-based filtering is to tag products with certain keywords, understand what the user likes, search for those keywords in the database, and recommend different products with the same attributes.

This algorithm is intended to create a recommendation system that helps identify games that are similar based on their features. The category of each board game will be used to perform the recommendation system. 


In [19]:
df_games=dfgames.copy()

In [20]:
df_games['category'] = df_games['category'].map(lambda x: x.replace(',', ' '))
print(dfgames['category'])

0        1021,1026,1001
1             1002,1010
2             1009,1035
3                  1050
4             1015,1008
              ...      
36530              1002
36531         1002,1030
36532         1002,2481
36533    1009,1028,1113
36534         1021,1013
Name: category, Length: 36535, dtype: object


In [21]:
df_games.set_index('title', inplace = True)
df_games.head()

Unnamed: 0_level_0,item_id,category
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Die Macher,747,1021 1026 1001
Dragonmaster,5896,1002 1010
Samurai,292,1009 1035
Tal der Könige,6308,1050
Mare Mediterraneum,15364,1015 1008


In [22]:
print(np.where(df_games.index=='Tal der Könige')[0])

[3]


In [23]:
count = CountVectorizer()
count_matrix = count.fit_transform(df_games['category'])

In [24]:
cosine = cosine_similarity(count_matrix, count_matrix)
cosine

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.40824829],
       [0.        , 1.        , 0.        , ..., 0.5       , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.40824829,
        0.        ],
       ...,
       [0.        , 0.5       , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.40824829, ..., 0.        , 1.        ,
        0.        ],
       [0.40824829, 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [25]:
def cb_recommendations(name, cosine = cosine):
    
    recommended = []
    indices = pd.Series(df_games.index)
    idx = indices[indices == name].index[0]
    score = pd.Series(cosine[idx]).sort_values(ascending = False)
    top_10 = list(score.iloc[0:11].index)
    for i in top_10:
        if idx != i:
            recommended.append(list(df_games.index)[i])
        
    return recommended

In [26]:
cb_recommendations('Azul')

['Rennaissance Chess',
 'Azul: Stained Glass of Sintra',
 'Three Musketeers Game: Chess Variant',
 'Jaleo',
 'Blinq',
 'Spark',
 'Interplay',
 '0·1 (Zero Point One)',
 'Genial Spezial',
 'Color Wheel']

#### Create the final dataset for streamlit implementation

In [27]:
dbgm=df_games.copy()

In [28]:
indgame=dbgm.index.tolist()
indgame

['Die Macher',
 'Dragonmaster',
 'Samurai',
 'Tal der Könige',
 'Mare Mediterraneum',
 'Cathedral',
 'Lords of Creation',
 'El Caballero',
 'Elfenland',
 'Bohnanza',
 'Ra',
 'Catan',
 'Basari',
 'Cosmic Encounter',
 'MarraCash',
 'Button Men',
 'RoboRally',
 'Wacky Wacky West',
 'Full Metal Planète',
 'Gateway to the Stars',
 'Magic Realm',
 'Divine Right',
 'Twilight Imperium',
 'Battlemist',
 'Age of Renaissance',
 'Supremacy: The Game of the Superpowers',
 'Illuminati',
 'Terrain Vague',
 'Dark Tower',
 'Dark World',
 'Buffalo Chess',
 'Arkham Horror',
 'Federation & Empire',
 'Dragon Masters',
 'Runes',
 'Darkover',
 'Borderlands',
 "Can't Stop",
 'Tigris & Euphrates',
 'Airlines',
 'David & Goliath',
 'Medici',
 'Chinatown',
 'Krieg und Frieden',
 'Mamma Mia!',
 'Lost Cities',
 'Ricochet Robots',
 'Mighty Empires',
 'Brauerei',
 'Tikal',
 "Sophie's World",
 'Schoko & Co.',
 'Kings & Things',
 'Giganten',
 'Vinci',
 'Fossil',
 "Curse of the Mummy's Tomb",
 'Samurai: Game of Politic

In [29]:
dict_rec={}
for idx,valor in enumerate (indgame):
    dict_rec[idx]=cb_recommendations(valor)
print(dict_rec)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [30]:
dict_rec

{0: ['Cold War',
  'Quebec Libre',
  'Tax & Spend',
  'Vertigo',
  'League of Nations',
  'The Great Canadian Pie',
  'Democracy: Majority Rules',
  'John Company',
  'Organized Crime',
  'Firsts and Thirds'],
 1: ["10 Minute Heist: The Wizard's Tower",
  'Unreal Estate',
  'Mystick Companion',
  'Summon the Dragon',
  'Hérois & Monstros',
  'Der Herr der Wichtel',
  'Khrysos Hunters',
  'Fairy Season',
  'I Signori dei Draghi',
  'Crazier Eights',
  'Raise The Ruins'],
 2: ['Canossa',
  'Small Samurai Empires',
  'TA‐KE',
  'The Duke',
  'Cohorts: Game of Roman Checkers',
  'Barbacan',
  'Turnier',
  "The Duke: Lord's Legacy",
  'Torres',
  'Legie',
  'Lancelot'],
 3: ['Philosophia: Dare to be Wise',
  'Caracalla',
  'Cleopatra and the Society of Architects',
  'Porto Carthago',
  'Giza: The Great Pyramid',
  'Imhotep: The Duel',
  "Xi'an",
  'Tawantinsuyu: The Inca Empire',
  'Horrible Histories: Rotten Romans',
  'Municipium',
  'Tribes of the Four Seasons'],
 4: ['SeaFall',
  'Tung

In [31]:
dbgm['recomendacion']=dbgm['item_id'].apply(lambda z: dict_rec[z])

In [32]:
dbgm.head()

Unnamed: 0_level_0,item_id,category,recomendacion
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Die Macher,747,1021 1026 1001,"[Code Breaker, Hanabi & Ikebana, Gangster City..."
Dragonmaster,5896,1002 1010,"[Bears&Bees, Vers: The Rap Game, GYÜMI, BAM!: ..."
Samurai,292,1009 1035,"[Marquis, Im Schatten des Sonnenkönigs, Coffee..."
Tal der Könige,6308,1050,"[Schneller Becher, Hühnerpfanne, Castle Flutte..."
Mare Mediterraneum,15364,1015 1008,"[Oonie Moonie Goonie!, Bla Bla Bla, Match Plus..."


In [33]:
#dbgm.to_csv("DATA/cb_recommendations.csv")

## Most popular Board Games

The basic idea of this recommender is that the most popular and critically acclaimed board games will have a higher probability of being liked by the average audience. This model does not give personalized recommendations based on the user.

The Recommender gives generalized recommendations to each user based on popularity. All we have to do is to sort the games based on popularity and display the best board games on our list.

In [34]:
#Split random into training and test datasets
train, test = train_test_split(dfratings, test_size = 0.20, random_state = 42) 

In [35]:
print('There are %s users, %s itmes and %s pairs in the train set' \
      %(train.user_id.unique().shape[0], train.item_id.unique().shape[0], train.shape[0]))
train.head()

There are 2844 users, 34067 itmes and 671021 pairs in the train set


Unnamed: 0,user_id,item_id,rating
32330,123,1060,0
399743,1313,1287,1
763101,2564,5918,0
79614,259,1340,1
719436,2418,727,1


In [36]:
print('There are %s users, %s itmes and %s pairs in the test set' \
      %(test.user_id.unique().shape[0], test.item_id.unique().shape[0], test.shape[0]))
test.head()

There are 2838 users, 20894 itmes and 167756 pairs in the test set


Unnamed: 0,user_id,item_id,rating
332615,1114,1151,1
489596,1619,2907,1
499690,1659,647,1
698079,2334,1283,1
288796,963,7547,1


In [37]:
# Group the train dataset by item and count the number of users
popular = train.groupby('item_id')['user_id'].count()

In [38]:
popular.head()

item_id
0    294
1     74
2    157
3     65
4    798
Name: user_id, dtype: int64

In [39]:
# Sort in descending order
popularsort = popular.sort_values(ascending=False)

In [40]:
popularsort.shape[0]

34067

In [41]:
popularsort.head()

item_id
51     1322
195    1296
70     1262
242    1179
55     1171
Name: user_id, dtype: int64

In [42]:
popularsort.index #id of the Board Game

Int64Index([   51,   195,    70,   242,    55,     5,    44,    15,    93,
              256,
            ...
            27193, 14744, 27180, 27181, 27182, 27183, 27185, 27188, 27190,
            36533],
           dtype='int64', name='item_id', length=34067)

In [43]:
# Get the most popular games
popularbg = np.zeros(shape=(popularsort.shape[0], 3), dtype=object)

for i, ind in enumerate(popularsort.index):
    idx = ind 
    freq = popularsort[idx]  
    title = resample_item_dict[idx]
    popularbg[i] = [idx, title, freq]
    

In [44]:
popularbg[:10,:]

array([[51, 'Carcassonne', 1322],
       [195, 'Love Letter', 1296],
       [70, 'Codenames', 1262],
       [242, 'Pandemic', 1179],
       [55, 'Catan', 1171],
       [5, '7 Wonders', 1160],
       [44, 'Bohnanza', 1155],
       [15, 'Agricola', 1129],
       [93, 'Dominion', 1113],
       [256, 'Power Grid', 1062]], dtype=object)

In [45]:
# Recall function

def recall_at_n(N, test, recommended, train=None):

    if train is not None: 
        
        rec_true = []
        for r in recommended:
            if r not in train:
                rec_true.append(r)
    else:
        rec_true = recommended    
    intersection = len(set(test) & set(rec_true[:N]))
    return intersection / float(np.minimum(N, len(test)))

In [46]:
# Get Board games in train per user
trainUsergby = (train.groupby('user_id')['item_id'].apply(list).reset_index())

In [47]:
trainUsergby.head()

Unnamed: 0,user_id,item_id
0,0,"[276, 308, 192, 300, 100, 6, 391, 388, 287, 10..."
1,1,"[513, 411, 258, 650, 615, 294, 202, 77, 171, 6..."
2,2,"[791, 751, 360, 795, 629, 799, 763, 758, 725, ..."
3,3,"[823, 409, 157, 5, 369, 843, 591, 876, 877, 84..."
4,4,"[375, 903, 941, 939, 227, 957, 232, 947, 148, ..."


In [48]:
# Get Board games in test per user
testUsergby = (test.groupby('user_id')['item_id'].apply(list).reset_index())
testUsergby.head()

Unnamed: 0,user_id,item_id
0,0,"[158, 317, 31, 122, 167, 63, 43, 18, 327, 256,..."
1,1,"[548, 661, 578, 276, 611, 544, 454, 472, 689, ..."
2,2,"[163, 768, 157, 329, 346, 802, 781, 797, 455, ..."
3,3,"[870, 155, 824, 432, 599, 888, 829, 156, 868, ..."
4,4,"[893, 479, 325, 944, 959, 951, 361, 917, 922, ..."


In [49]:
# Merge both df
joinedtt = pd.merge(trainUsergby, testUsergby, how='inner', on='user_id', suffixes=('_train', '_test'))
joinedtt.head()

Unnamed: 0,user_id,item_id_train,item_id_test
0,0,"[276, 308, 192, 300, 100, 6, 391, 388, 287, 10...","[158, 317, 31, 122, 167, 63, 43, 18, 327, 256,..."
1,1,"[513, 411, 258, 650, 615, 294, 202, 77, 171, 6...","[548, 661, 578, 276, 611, 544, 454, 472, 689, ..."
2,2,"[791, 751, 360, 795, 629, 799, 763, 758, 725, ...","[163, 768, 157, 329, 346, 802, 781, 797, 455, ..."
3,3,"[823, 409, 157, 5, 369, 843, 591, 876, 877, 84...","[870, 155, 824, 432, 599, 888, 829, 156, 868, ..."
4,4,"[375, 903, 941, 939, 227, 957, 232, 947, 148, ...","[893, 479, 325, 944, 959, 951, 361, 917, 922, ..."


In [None]:
# Evaluate the model for different number of recommended elements 

list_topN=[1,5,10,20]

for topN in list_topN:
    rec=popularbg[:, 0]
    r_u=joinedtt.apply(lambda l: recall_at_n(N=topN, test=l[2], recommended=rec, train=l[1]), axis=1)
    print('TopN:',topN,',','recall: ',r_u.mean())

In [None]:
# MAP Function

def apk(N, test, recommended, train=None):

    if train is not None: 
        rec_true = []
        for r in recommended:
            if r not in train:
                rec_true.append(r)
    else:
        rec_true = recommended    
    predicted = rec_true[:N] # top-k predictions
    
    score = 0.0 # This will store the numerator
    num_hits = 0.0 # This will store the sum of rel(i)

    for i,p in enumerate(predicted):
        if p in test and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits/(i+1.0)

    return score / min(len(test), N)

In [None]:
list_topN=[1,5,10,20]

for topN in list_topN:
    predictions = popularbg[:, 0]
    m = joinedtt.apply(lambda l: apk(topN, l[2], predictions, l[1]), axis=1)
    print('TopN:',topN,',','map: ',m.mean()) 

#### Create the final dataset for streamlit implementation

In [None]:
pop_rec=pd.DataFrame(popularbg, columns=['item_id','title','freq']).head(10)

In [None]:
pop_rec.head()

In [None]:
#pop_rec.to_csv('popular.csv')

## Collaborative Filtering 

CF is a method for making automatic predictions (filtering) about a user's interests by gathering information about preferences or likes from many users (collaboratively). 

We will apply two collaborative filtering algorithms and see the results of each one of them

### Matrix Co-Ocurrence

The idea is to recommend board games similar to those purchased by a user. The measure of similarity between items is obtained from the co-occurrence matrix. 

In [None]:
#From train and test data  

gamesPerUser = (train.groupby('user_id')['item_id']
                 .apply(np.array)
                 .to_dict()
                 )

In [None]:
# Calculate the number of items in train
n_items = len(resample_item_dict.keys())
n_items

In [None]:
# Co-occurrence matrix
coMatrix = np.zeros((n_items, n_items)) 
for user,movies in gamesPerUser.items():
    for m in movies:
        coMatrix[m, movies] += 1

In [None]:
coMatrix

In [None]:
# visualize the matrix
plt.matshow(coMatrix, fignum=1000, cmap=plt.cm.binary)
plt.gcf().set_size_inches(18.5, 10.5)
plt.show()

In [None]:
def co_occurrance_similarity(item_id, coocurrance, ntop=10):

    similarItems = coocurrance[item_id, :]
    # return indeces of most similar items in descendign order
    mostSimilar = np.argsort(similarItems)[::-1]
    # remove the first element, as it is the item itslef
    mostSimilar = mostSimilar[1:ntop+1]
    
    # return a numpy array with the index (first column) and the value (second column) of the most similar items
    return np.stack((mostSimilar, similarItems[mostSimilar])).T

In [None]:
def co_occurrance_recommendation(items_id, cooccurrance, ntop=10):

    # List similar items and its value
    list_sim_items = np.vstack([co_occurrance_similarity(id_, cooccurrance, ntop) for id_ in items_id])
    
    # Group by id and take the maximum frquency to remove duplicates
    largest_freq = pd.DataFrame(list_sim_items, columns=['id', 'freq']).groupby('id').agg(max).reset_index()
    
    # sort by value in descending order
    sorted_list = largest_freq.sort_values(by='freq', ascending=False)
    
    # get the top N
    out = sorted_list.values[:ntop, 0]
    return out

In [None]:
# Get users in train with their games
trainUsersGrouped = train.groupby('user_id')['item_id'].apply(list).reset_index()
trainUsersGrouped.head()

In [None]:
Ntop = 10
# Get the recommendations for all users 
predictions = trainUsersGrouped.item_id.apply(lambda x: co_occurrance_recommendation(x, coMatrix, Ntop))
predictions.head()

In [None]:
# Get users in test with their movies
testUsersGrouped = test.groupby('user_id')['item_id'].apply(list).reset_index()
testUsersGrouped.head()

In [None]:
for (seen, recom) in zip(testUsersGrouped.values[:5, 1], predictions[:5]):
    print("*"*6)
    print("Seen items: ")
    print([resample_item_dict[i] for i in seen])
    print("Recommended items: ")
    print([resample_item_dict[i] for i in recom]) 

In [None]:
topN=10
# Add a prediction column to train
trainUsersGrouped['prediction'] = trainUsersGrouped.item_id.apply(
    lambda x: co_occurrance_recommendation(x, coMatrix, topN)
)

In [None]:
trainUsersGrouped.head()

In [None]:
# Join the df with train and predictions with the test df
joined = pd.merge(trainUsersGrouped, testUsersGrouped, how='inner', on='user_id', suffixes=('_train', '_test'))

In [None]:
joined.head()

In [None]:
def evaluate_recall(topN, trainGrouped, testGrouped, coMatrix, popularity_baseline):
    # Add a prediction column to train
    trainUsersGrouped['prediction'] = trainUsersGrouped.item_id.apply(
        lambda x: co_occurrance_recommendation(x, coMatrix, topN))
    
    # Join with test data
    joined = pd.merge(trainUsersGrouped, testUsersGrouped, how='inner', on='user_id', suffixes=('_train', '_test'))
    
    # Calculate average recall
    recall = joined.apply(lambda l: 
                 recall_at_n(N=topN, test=l[3], recommended=l[2], train=l[1]), axis=1).mean()
    print("Co-occurance model: recall@%s=%.3f"%(topN, recall))
    
    # Calculate average recall for the baseline
    recall_bl = joined.apply(lambda l: 
                 recall_at_n(N=topN, test=l[3], recommended=popularity_baseline, train=l[1]), axis=1).mean()
    print("Popularity model: recall%s=%.3f"%(topN, recall_bl))    
    return recall, recall_baseline

In [None]:
for k in [5,10,20,30]:
    evaluate_recall(k, trainUsersGrouped, testUsersGrouped, coMatrix, popularbg[:, 0]);

### Alternating Least Square (ALS) 

In [53]:
#Create a spark session
spark = SparkSession.builder.getOrCreate()

In [54]:
#Load data

schema = StructType(
    (
        StructField("user_id", IntegerType()),
        StructField("item_id", IntegerType()),
        StructField("rating", IntegerType()),
    )
)

df_ratings = spark.createDataFrame(dfratings, schema=schema)
df_ratings.show(5)

schema = StructType(
    (
        StructField("item_id", IntegerType()),
        StructField("title", StringType()),
        StructField("category", StringType()),
    )
)


df_games = spark.createDataFrame(dfgames, schema=schema)
df_games.show(5, False)


+-------+-------+------+
|user_id|item_id|rating|
+-------+-------+------+
|      0|      0|     0|
|      0|      1|     1|
|      0|      2|     1|
|      0|      3|     1|
|      0|      4|     1|
+-------+-------+------+
only showing top 5 rows

+-------+------------------+--------------+
|item_id|title             |category      |
+-------+------------------+--------------+
|747    |Die Macher        |1021,1026,1001|
|5896   |Dragonmaster      |1002,1010     |
|292    |Samurai           |1009,1035     |
|6308   |Tal der Könige    |1050          |
|15364  |Mare Mediterraneum|1015,1008     |
+-------+------------------+--------------+
only showing top 5 rows



In [55]:
# Count the total number of ratings in the dataset
numerator = df_ratings.select("rating").count()

# Count the number of distinct user_id and distinct item_id
num_users = df_ratings.select("user_id").distinct().count()
num_movies = df_ratings.select("item_id").distinct().count()

# Set the denominator equal to the number of users multiplied by the number of movies
denominator = num_users * num_movies

# Divide the numerator by the denominator
sparsity = (1.0 - (numerator *1.0)/denominator)*100
print("The ratings dataframe is ", "%.2f" % sparsity + "% empty.")

The ratings dataframe is  99.19% empty.


In [56]:
#Split the data using the Spark random 
# Create test and train set
train, test = df_ratings.randomSplit([0.8, 0.2])
print ("N train: ", train.cache().count())
print ("N test: ", test.cache().count())

N train:  671195
N test:  167582


In [57]:
# Create ALS model

als = ALS(maxIter=10, regParam=0.01, implicitPrefs=True, userCol='user_id', itemCol='item_id', ratingCol='rating', nonnegative=True, coldStartStrategy="drop")

# Confirm that a model called "als" was created
type(als)

pyspark.ml.recommendation.ALS

In [58]:
# Train the model
model = als.fit(train)

In [72]:

model.write().overwrite().save("DATA/cfModel_bm")

In [59]:
# See the predictions
predictions = model.transform(test)
predictions.show(10)

+-------+-------+------+-----------+
|user_id|item_id|rating| prediction|
+-------+-------+------+-----------+
|   1903|    148|     1|  0.2472351|
|    580|    148|     1| 0.21274866|
|   2025|    148|     0| 0.18398985|
|   2542|    148|     0|        0.0|
|   1533|    148|     0| 0.07430235|
|   2625|    148|     0|0.032319084|
|    876|    148|     0|        0.0|
|   1417|    148|     1| 0.33603403|
|   2173|    148|     0|  0.7071229|
|   1823|    148|     1|  0.2610604|
+-------+-------+------+-----------+
only showing top 10 rows



In [60]:
# Pick a user and rank the predictions

user = 19

predictions.filter(F.col('user_id') == user)\
                 .orderBy(F.col('item_id')).show(5)  

predictions.filter(F.col('user_id') == user)\
                 .orderBy(F.col('prediction'), ascending=False).show(5)

+-------+-------+------+----------+
|user_id|item_id|rating|prediction|
+-------+-------+------+----------+
|     19|     17|     0|0.33470666|
|     19|     25|     1| 0.5029502|
|     19|     26|     1|0.33301276|
|     19|    190|     1| 0.5967275|
|     19|    195|     1| 0.6421389|
+-------+-------+------+----------+
only showing top 5 rows

+-------+-------+------+----------+
|user_id|item_id|rating|prediction|
+-------+-------+------+----------+
|     19|   1504|     1| 0.7592804|
|     19|   1464|     1|  0.679501|
|     19|    195|     1| 0.6421389|
|     19|    533|     1| 0.6058105|
|     19|    190|     1| 0.5967275|
+-------+-------+------+----------+
only showing top 5 rows



In [61]:
# Evaluation
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

In [62]:
# Account for the error
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.602846405298217


In [63]:
# Make predictions and evaluate Recall and MAP

predictions_mod=predictions.select(['user_id','item_id','prediction'])

In [64]:
TOP_K = 10
rank_eval = SparkRankingEvaluation(test, predictions_mod, k = TOP_K, col_user="user_id", col_item="item_id", 
                                    col_rating="rating", col_prediction="prediction", 
                                    relevancy_method="top_k")

In [65]:
print("Model:\tALS",
      "Top K:\t%d" % rank_eval.k,
      "MAP:\t%f" % rank_eval.map_at_k(),
      "Recall@K:\t%f" % rank_eval.recall_at_k(), sep='\n')

Model:	ALS
Top K:	10
MAP:	0.359470
Recall@K:	0.359470


In [71]:
# Make a recommendation

# Top 5 games for all users
userRecs = model.recommendForAllUsers(5)

# Filter for our specific user
userRecs.filter(F.col('user_id')==user).show(1,False)

+-------+-----------------------------------------------------------------------------------------+
|user_id|recommendations                                                                          |
+-------+-----------------------------------------------------------------------------------------+
|19     |[{213, 1.1072772}, {230, 1.0811012}, {117, 1.0587282}, {814, 1.031285}, {267, 1.0113207}]|
+-------+-----------------------------------------------------------------------------------------+



#### Create the final dataset for streamlit implementation

In [68]:
#Predictions with filter by rating=1
dataSet = df_ratings.select('user_id','item_id','rating').distinct()

In [69]:
gamelookfor = df_ratings.filter(df_ratings.rating == 1).select('item_id','user_id','rating')

In [70]:
predictions_r = model.transform(dataSet.subtract(gamelookfor)).orderBy('prediction', ascending=False).toPandas()

In [73]:
# Obtain the first highest predictions for each item
bgp=predictions_r.groupby(('item_id'), as_index=False).first()
bgp.head()

Unnamed: 0,item_id,user_id,rating,prediction
0,0,2086,1,1.275331
1,1,1253,1,0.714589
2,2,957,1,0.507767
3,3,1253,1,0.570925
4,4,1310,1,1.14677


In [74]:
# Generate n Recommendations for all users
userRecs = model.recommendForAllUsers(10)

In [75]:
nrecommendations = userRecs\
    .withColumn("rec_exp", explode("recommendations"))\
    .select('user_id', col("rec_exp.item_id"), col("rec_exp.rating"))

In [76]:
nrecommendations.show()

+-------+-------+----------+
|user_id|item_id|    rating|
+-------+-------+----------+
|   1580|    695| 0.9816303|
|   1580|    199|  0.781639|
|   1580|    268| 0.7541874|
|   1580|     55|0.73070264|
|   1580|    361| 0.7152955|
|   1580|    464| 0.6958098|
|   1580|    599|0.69279087|
|   1580|    576|0.69033056|
|   1580|    256|0.68909657|
|   1580|    469|  0.685916|
|    471|    242| 0.9197144|
|    471|     51|0.90704346|
|    471|     55|  0.872535|
|    471|    268| 0.8188705|
|    471|     93|0.80570585|
|    471|    361|0.80476296|
|    471|     63| 0.7802094|
|    471|     15| 0.7738608|
|    471|    303|0.77083135|
|    471|    199| 0.7252281|
+-------+-------+----------+
only showing top 20 rows



In [77]:
# Join with the df_games
nrecus=nrecommendations.join(df_games, on='item_id').toPandas()

In [79]:
nrecus.head()

Unnamed: 0,item_id,user_id,rating,title,category
0,148,463,0.661954,Hanabi,100210391045
1,148,1088,0.844733,Hanabi,100210391045
2,148,1959,0.522263,Hanabi,100210391045
3,148,2659,0.773872,Hanabi,100210391045
4,148,2580,0.508205,Hanabi,100210391045


In [80]:
# Groupby user_id and show the recommendations

nrecusgby = (nrecus.groupby('user_id')['title'].apply(list).reset_index())
nrecusgby.head()

Unnamed: 0,user_id,title
0,0,"[Hanabi, Stone Age, 7 Wonders Duel, 7 Wonders,..."
1,1,"[Lost Cities, Bohnanza, No Thanks!, 7 Wonders,..."
2,2,"[Hanabi, Bohnanza, No Thanks!, Hive, For Sale,..."
3,3,"[Bohnanza, Dominion, Race for the Galaxy, Agri..."
4,4,"[Isle of Skye: From Chieftain to King, 7 Wonde..."


In [81]:
# Join with the dF first highest predictions
sim_rec = pd.merge(bgp[['item_id','user_id']], nrecusgby, how='left', left_on=['user_id'], right_on=['user_id'])

In [82]:
sim_rec.head()

Unnamed: 0,item_id,user_id,title
0,0,2086,"[London, Power Grid: Factory Manager, Twilight..."
1,1,1253,"[Alea Iacta Est, Navegador, Glen More, Catan, ..."
2,2,957,"[Hannibal: Rome vs. Carthage, Combat Commander..."
3,3,1253,"[Alea Iacta Est, Navegador, Glen More, Catan, ..."
4,4,1310,"[No Thanks!, Ingenious, Samurai, For Sale, Col..."


In [83]:
dfgames.head()

Unnamed: 0,item_id,title,category
0,747,Die Macher,102110261001
1,5896,Dragonmaster,10021010
2,292,Samurai,10091035
3,6308,Tal der Könige,1050
4,15364,Mare Mediterraneum,10151008


In [84]:
# Add the Game board title
sim_rec2 = pd.merge(sim_rec[['item_id','title']], dfgames[['item_id', 'title']], how='left', left_on=['item_id'], right_on=['item_id'])

In [85]:
simrec = pd.DataFrame()

simrec['item_id']=sim_rec2['item_id']
simrec['title']=sim_rec2['title_y']
simrec['recommendation']=sim_rec2['title_x']

In [86]:
simrec.head()

Unnamed: 0,item_id,title,recommendation
0,0,1830: Railways & Robber Barons,"[London, Power Grid: Factory Manager, Twilight..."
1,1,18AL,"[Alea Iacta Est, Navegador, Glen More, Catan, ..."
2,2,2 de Mayo,"[Hannibal: Rome vs. Carthage, Combat Commander..."
3,3,23,"[Alea Iacta Est, Navegador, Glen More, Catan, ..."
4,4,6 nimmt!,"[No Thanks!, Ingenious, Samurai, For Sale, Col..."


In [87]:
simrec.set_index('title', inplace = True)
simrec.head()

Unnamed: 0_level_0,item_id,recommendation
title,Unnamed: 1_level_1,Unnamed: 2_level_1
1830: Railways & Robber Barons,0,"[London, Power Grid: Factory Manager, Twilight..."
18AL,1,"[Alea Iacta Est, Navegador, Glen More, Catan, ..."
2 de Mayo,2,"[Hannibal: Rome vs. Carthage, Combat Commander..."
23,3,"[Alea Iacta Est, Navegador, Glen More, Catan, ..."
6 nimmt!,4,"[No Thanks!, Ingenious, Samurai, For Sale, Col..."


In [88]:
#simrec.to_csv("DATA/cf_similars.csv")

In [None]:
# Add hyperparameters and their respective values to param_grid
param_grid = ParamGridBuilder() \
            .addGrid(als.rank, [10, 50, 100, 150]) \
            .addGrid(als.regParam, [.01, .05, .1, .15]) \
            .build()

# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") 
print ("Num models to be tested: ", len(param_grid))

In [None]:
# Build cross validation using CrossValidator
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)

# Confirm cv was built
print(cv)

In [None]:
#Fit cross validator to the 'train' dataset
model = cv.fit(train)

#Extract best model from the cv model above
best_model = model.bestModel

In [None]:
# Print best_model
print(type(best_model))

# Complete the code below to extract the ALS model parameters
print("**Best Model**")

# # Print "Rank"
print("  Rank:", best_model._java_obj.parent().getRank())

# Print "MaxIter"
print("  MaxIter:", best_model._java_obj.parent().getMaxIter())

# Print "RegParam"
print("  RegParam:", best_model._java_obj.parent().getRegParam())

In [None]:
# View the predictions
test_predictions = best_model.transform(test)
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

In [None]:
test_predictions.show()

In [None]:
# Generate n Recommendations for all users
nrecommendations = best_model.recommendForAllUsers(10)
nrecommendations.limit(10).show()



In [None]:
#Original
'''Este proceso nos permite encontrar los hiperparámetros que mejor se ajustan a una predicción.
   Este proceso, requiere ejecutar varias repeticiones del mismo modelo, por lo que consume tiempo 
   de computación.'''


from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

(trainingRatings, validationRatings) = df_ratings.randomSplit([80.0, 20.0])

evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')

paramGrid = ParamGridBuilder()\
                .addGrid(rec.rank, [5, 7, 10,20])\
                .addGrid(rec.maxIter, [20])\
                .addGrid(rec.regParam, [0.05, 0.1, 0.5]).build()

crossval = CrossValidator(estimator=rec, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=10)
cvModel = crossval.fit(trainingRatings)
predictions = cvModel.transform(validationRatings)

print('The root mean squared error for our model is: {}'.format(evaluator.evaluate(predictions.na.drop())))

In [None]:
best_model=cvModel.bestModel

In [None]:
path='/DATA'
best_model.write().overwrite().save("DATA/cvModel_bm")

In [None]:
predictions.show()

In [None]:
predictions = best_model.transform(validationRatings)
predictions_cv_mod=predictions.select(['user_id','item_id','prediction'])

In [None]:
# Generate top 10 digital music recommendations for each user
userRecs = cvModel.bestModel.recommendForAllUsers(10)
# Generate top 10 user recommendations for each digital music
boardGameRecs = cvModel.bestModel.recommendForAllItems(10)

userRecs.show(5)
boardGameRecs.show(5)

In [83]:
sameModel = cvModel.load("DATA/cvModel")

In [85]:

spark.stop()