# Motores de recomendación

In [1]:
%pylab
%matplotlib inline

%config InlineBackend.figure_format = 'retina'

Using matplotlib backend: MacOSX
Populating the interactive namespace from numpy and matplotlib


Se importan los datos de un página web de recomendaciones de cervezas

In [3]:
import pandas as pd

beer_reviews = pd.read_csv("beer_reviews.csv.bz2", compression = "bz2")

print(shape(beer_reviews))
beer_reviews.head()

(1586614, 13)


Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [4]:
beer_reviews['review_profilename'] = beer_reviews['review_profilename'].astype(str)

# Filtado de los datos

Para acelerar los cálculos se utilizan solamente las cervezas que tienen más de 1800 evaluaciones y los usuarios con más de 5 evaluaciones

In [5]:
reviews_by_beer = beer_reviews.groupby(['beer_name']).apply(lambda x: len(x))
reviews_by_beer = reviews_by_beer[reviews_by_beer > 1800].index.tolist()

reviews_by_user = beer_reviews.groupby(['review_profilename']).apply(lambda x: len(x))
reviews_by_user = reviews_by_user[reviews_by_user > 5].index.tolist()

mask = (beer_reviews.beer_name.isin(reviews_by_beer)) & (beer_reviews.review_profilename.isin(reviews_by_user))

beer_reviews = beer_reviews[['beer_name', 'review_profilename', 'review_overall', 'review_aroma', 'review_palate', 'review_taste', 'review_appearance']][mask]
beer_reviews.head()

Unnamed: 0,beer_name,review_profilename,review_overall,review_aroma,review_palate,review_taste,review_appearance
798,Imperial Stout,grumpy,4.5,4.5,4.0,4.5,4.0
1559,Pale Ale,blitheringidiot,4.0,3.5,3.5,3.5,4.0
1560,Pale Ale,NeroFiddled,4.0,2.5,4.0,3.5,4.0
2225,Pale Ale,DaPeculierDane,3.0,1.5,2.0,1.5,4.5
2382,Pale Ale,brewerburgundy,3.0,3.0,3.0,3.5,4.0


## Funciones

### Obtención de la evaluación de cervezas

In [6]:
def get_beer_reviews(beer, users):
    reviews = beer_reviews[(beer_reviews.review_profilename.isin(users)) & (beer_reviews.beer_name == beer)]
    reviews = reviews[reviews.review_profilename.duplicated() == False]
    return reviews

get_beer_reviews('Pale Ale', ['blitheringidiot', 'NeroFiddled'])

Unnamed: 0,beer_name,review_profilename,review_overall,review_aroma,review_palate,review_taste,review_appearance
1559,Pale Ale,blitheringidiot,4.0,3.5,3.5,3.5,4.0
1560,Pale Ale,NeroFiddled,4.0,2.5,4.0,3.5,4.0


### Similaritud

In [8]:
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import cosine_distances
from scipy.stats.stats import pearsonr

def calculate_similarity(beer1, beer2, features = ['review_overall'], distance = 'euclidean'):
    beer_1_reviewers = beer_reviews[beer_reviews.beer_name==beer1].review_profilename.unique()
    beer_2_reviewers = beer_reviews[beer_reviews.beer_name==beer2].review_profilename.unique()
    common_reviewers = set(beer_1_reviewers).intersection(beer_2_reviewers)
    
    beer_1_reviews = get_beer_reviews(beer1, common_reviewers)
    beer_2_reviews = get_beer_reviews(beer2, common_reviewers)
    
    dists = []
    for f in features:
        review_1 = beer_1_reviews[f].values.reshape(1, -1)
        review_2 = beer_2_reviews[f].values.reshape(1, -1)
        if distance == 'euclidean':
            dists.append(euclidean_distances(review_1, review_2)[0][0])
        elif distance == 'manhattan':
            dists.append(manhattan_distances(review_1, review_2)[0][0])
        elif distance == 'cosine':
            dists.append(cosine_distances(review_1, review_2)[0][0])
        elif distance == 'pearsonr':
            dists.append(pearsonr(beer_1_reviews[f], beer_2_reviews[f])[0])
        else:
            dists.append(euclidean_distances(review_1, review_2)[0][0])
    
    return dists

print(calculate_similarity('Pale Ale', 'Imperial Stout', distance = 'euclidean'))
print(calculate_similarity('Pale Ale', 'Imperial Stout', distance = 'manhattan'))
print(calculate_similarity('Pale Ale', 'Imperial Stout', distance = 'cosine'))
print(calculate_similarity('Pale Ale', 'Imperial Stout', distance = 'pearsonr'))

[23.323807579381203]
[427.0]
[0.03030021850243525]
[-0.017235126331882216]


In [9]:
print(calculate_similarity('Pale Ale', 'Imperial Stout', features = ['review_aroma'], distance = 'euclidean'))
print(calculate_similarity('Pale Ale', 'Imperial Stout', features = ['review_aroma'], distance = 'manhattan'))
print(calculate_similarity('Pale Ale', 'Imperial Stout', features = ['review_aroma'], distance = 'cosine'))
print(calculate_similarity('Pale Ale', 'Imperial Stout', features = ['review_aroma'], distance = 'pearsonr'))

[22.38302928559939]
[423.0]
[0.018435759699908738]
[0.06688644957714024]


In [10]:
print(calculate_similarity('Pale Ale', 'Imperial Stout', features = ['review_overall', 'review_aroma'], distance = 'euclidean'))
print(calculate_similarity('Pale Ale', 'Imperial Stout', features = ['review_overall', 'review_aroma'], distance = 'manhattan'))
print(calculate_similarity('Pale Ale', 'Imperial Stout', features = ['review_overall', 'review_aroma'], distance = 'cosine'))
print(calculate_similarity('Pale Ale', 'Imperial Stout', features = ['review_overall', 'review_aroma'], distance = 'pearsonr'))

[23.323807579381203, 22.38302928559939]
[427.0, 423.0]
[0.03030021850243525, 0.018435759699908738]
[-0.017235126331882216, 0.06688644957714024]


# Sistema de recomendación por similaritud

In [11]:
unique_beer          = beer_reviews.beer_name.unique();
num_beer             = len(unique_beer) 
euclidean_similarity = []

for id_1 in range(0, num_beer - 1):
    for id_2 in range(id_1 + 1, num_beer):
        beer_1 = unique_beer[id_1]
        beer_2 = unique_beer[id_2]
        euclidean_similarity.append([beer_1, beer_2] + calculate_similarity(beer_1, beer_2))

euclidean_similarity = pd.DataFrame(euclidean_similarity,
                                    columns = ["beer1", "beer2", "overall_dist"])
euclidean_similarity = euclidean_similarity.sort_values(by = 'overall_dist', ascending = True)

euclidean_similarity.head()

Unnamed: 0,beer1,beer2,overall_dist
816,Weihenstephaner Hefeweissbier,Pliny The Elder,17.080691
704,Tröegs Nugget Nectar,Samuel Smith's Oatmeal Stout,17.190113
350,Sierra Nevada Pale Ale,Tröegs Nugget Nectar,17.71299
504,Chimay Grande Réserve (Blue),Bell's Hopslam Ale,18.041619
706,Tröegs Nugget Nectar,Weihenstephaner Hefeweissbier,18.069311


In [12]:
euclidean_similarity[euclidean_similarity.beer1 == "Duvel"].head()

Unnamed: 0,beer1,beer2,overall_dist
991,Duvel,Pliny The Elder,20.796634
1000,Duvel,Prima Pils,23.600847
990,Duvel,Ayinger Celebrator Doppelbock,23.953079
998,Duvel,HopDevil Ale,24.474477
1002,Duvel,Hop Wallop,25.019992


In [13]:
unique_beer          = beer_reviews.beer_name.unique();
num_beer             = len(unique_beer) 
euclidean_similarity = []

for id_1 in range(0, num_beer - 1):
    for id_2 in range(id_1 + 1, num_beer):
        beer_1 = unique_beer[id_1]
        beer_2 = unique_beer[id_2]
        euclidean_similarity.append([beer_1, beer_2] + calculate_similarity(beer_1, beer_2,  features = ['review_aroma']))

euclidean_similarity = pd.DataFrame(euclidean_similarity,
                                    columns = ["beer1", "beer2", "review_aroma"])
euclidean_similarity = euclidean_similarity.sort_values(by = 'review_aroma', ascending = True)

euclidean_similarity.head()

Unnamed: 0,beer1,beer2,review_aroma
706,Tröegs Nugget Nectar,Weihenstephaner Hefeweissbier,15.676415
708,Tröegs Nugget Nectar,Schneider Aventinus,16.194135
389,Sierra Nevada Torpedo Extra IPA,Samuel Smith's Oatmeal Stout,16.30184
717,Tröegs Nugget Nectar,Ayinger Celebrator Doppelbock,16.363068
8,Imperial Stout,Sierra Nevada Torpedo Extra IPA,16.552945


In [14]:
euclidean_similarity[euclidean_similarity.beer1 == "Duvel"].head()

Unnamed: 0,beer1,beer2,review_aroma
1002,Duvel,Hop Wallop,21.089097
1000,Duvel,Prima Pils,21.219095
998,Duvel,HopDevil Ale,21.377558
990,Duvel,Ayinger Celebrator Doppelbock,21.406775
999,Duvel,Brooklyn Black Chocolate Stout,21.737065


# Matrices de coocurencia

Creación de la matriz de coocurencia para estimar la puntuación de los usuarios

In [15]:
review_profilename = beer_reviews.review_profilename.unique()
coocuMatrix        = np.zeros(shape = (num_beer, num_beer))

for user in review_profilename:
    beer_list = beer_reviews[beer_reviews.review_profilename == user].beer_name.unique()
    for beer_1 in beer_list:
        for beer_2 in beer_list:
            pos_1 = [i for i, x in enumerate(unique_beer) if x == beer_1]
            pos_2 = [i for i, x in enumerate(unique_beer) if x == beer_2]
            coocuMatrix[pos_1, pos_2] += 1

## Algortimo de recomendación 

In [16]:
evaluateMatrix = np.zeros(shape = (num_beer))

user         = review_profilename[1]
user_reviews = beer_reviews[beer_reviews.review_profilename == user]
user_reviews = user_reviews.groupby('beer_name').apply(lambda x: mean(x))

for n in range(num_beer):
    pos = [i for i, x in enumerate(user_reviews.index) if x == unique_beer[n]]
    if len(pos) == 1:
        evaluateMatrix[n] = user_reviews['review_overall'][pos]
        
recomendation                     = np.dot(coocuMatrix, evaluateMatrix)
recomendation[evaluateMatrix > 0] = 0
recomendation                     = [i for i, x in enumerate(recomendation) if x == max(recomendation)]

print('recomendada', unique_beer[recomendation])

recomendada ['Old Rasputin Russian Imperial Stout']


In [17]:
print('recomendada', unique_beer[recomendation])

recomendada ['Old Rasputin Russian Imperial Stout']


In [18]:
evaluateMatrix = np.zeros(shape = (num_beer))
evaluateMatrix[30] = 1
evaluateMatrix[10] = 5
evaluateMatrix[2] = 5
evaluateMatrix[5] = 5
evaluateMatrix[40] = 5

recomendation                     = np.dot(coocuMatrix, evaluateMatrix)
recomendation[evaluateMatrix > 0] = 0
recomendation                     = [i for i, x in enumerate(recomendation) if x == max(recomendation)]

print('recomendada', unique_beer[recomendation])

recomendada ['Old Rasputin Russian Imperial Stout']
