In [249]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Reading beer csv file
beer = pd.read_csv('beer_data.csv')

In [250]:
beer.head()

Unnamed: 0,beer_beerid,review_profilename,review_overall
0,48215,stcules,3.0
1,52159,oline73,3.0
2,52159,alpinebryant,3.0
3,52159,rawthar,4.0
4,52159,RangerClegg,3.5


In [251]:
#Inspecting beer data frame columns
beer.columns

Index(['beer_beerid', 'review_profilename', 'review_overall'], dtype='object')

In [252]:
#Check for missing values for each column
beer.isnull().sum()

#'Review_profilename' column contains 100 missing values should be removed.

beer_beerid             0
review_profilename    100
review_overall          0
dtype: int64

In [253]:
#Filtering rows where review_profilename is not null
beer = beer[beer['review_profilename'].notnull()]

In [254]:
beer.count()

beer_beerid           475884
review_profilename    475884
review_overall        475884
dtype: int64

In [255]:
#Dropping duplicate rows from beer data frame
beer.drop_duplicates(subset=['beer_beerid','review_profilename'], keep='first', inplace=True)

In [256]:
beer.count()

beer_beerid           474462
review_profilename    474462
review_overall        474462
dtype: int64

In [257]:
#Grouping beer data frame by beer ID to check for frequency of number of ratings for each beer ID
beer_group = beer.groupby('beer_beerid')

In [258]:
#Calculating the mean frequency of number of  ratings
beer_group['review_profilename'].count().mean()

11.772082175466455

In [259]:
# 1.1 Taking all beers where no of ratings are greater than 11 (N=11)
beer_filter = beer_group.filter(lambda x: x['review_profilename'].count() >= 11)

In [260]:
# 2.1 What are the unique values of ratings?
beer_filter['review_overall'].unique()
#Unique values of rating are 5,4.5,4.0,3.5,3.0,2.5,2.0,1.5,1

array([4.5, 4. , 3. , 3.5, 2.5, 5. , 2. , 1.5, 1. ])

In [261]:
# 2.2.1 The average beer ratings
beer_filter.groupby('beer_beerid')['review_overall'].mean()

beer_beerid
5        3.553097
6        3.711462
7        3.317073
8        3.543478
9        3.378788
10       3.883028
11       3.232143
12       3.720000
13       3.333333
14       3.830508
15       3.824074
17       3.429461
19       4.009036
20       4.041667
21       4.250000
23       3.970588
24       3.894737
26       3.588235
27       3.590909
29       2.909091
30       4.094398
31       3.978758
32       4.010870
33       4.138507
34       4.281965
36       4.139286
39       4.334061
40       3.730769
44       3.647059
50       3.413793
           ...   
74123    3.708955
74131    4.125000
74144    3.531250
74272    3.648148
74321    3.250000
74323    3.920290
74390    3.718750
74405    4.153846
74491    3.812500
74530    3.989362
74548    3.770000
74579    3.730000
74591    3.375000
74634    3.657895
74759    4.083333
74783    3.217391
74827    3.947368
74904    3.911765
74942    3.638889
74986    4.217054
75013    4.179487
75086    3.660494
75160    4.269231
75188    4.13333

In [262]:
# 2.2.2 The average user ratings
beer_filter.groupby('review_profilename')['review_overall'].mean()

review_profilename
0110x011            4.303030
01Ryan10            5.000000
04101Brewer         4.000000
05Harley            4.152778
0beerguy0           4.166667
0runkp0s            3.500000
0tt0                3.964286
1000Bottles         4.071429
1001111             4.000000
100floods           4.035714
1050Sudz            4.000000
108Dragons          4.500000
1099                3.794118
1121987             4.333333
11millsown113       4.333333
11osixBrew          3.785714
11thFloorBrewing    4.375000
1229design          4.500000
12ouncecurls        5.000000
12percent           4.500000
12vUnion            4.500000
12vman              4.000000
130guy              3.100000
13aphomet           4.000000
13smurrf            5.000000
160Shillings        3.333333
1759Girl            3.687500
1759dallas          4.000000
1844original        4.500000
18alpha             3.750000
                      ...   
zooga               3.000000
zook74              3.857143
zoolou              3.25

In [263]:
# 2.2.3 The average number of ratings given to the beers
beer_filter.groupby('beer_beerid')['review_overall'].count().mean()

60.63628020801468

In [264]:
# 2.2.4 The average number of ratings given by the users
beer_filter.groupby('review_profilename')['review_overall'].count().mean()

18.671816126601357

In [265]:
# 3.Recommendation Models
# 3.1 Divide your data into training and testing dataset
from sklearn.model_selection import train_test_split
train, test = train_test_split(beer_filter, test_size=0.30, random_state=31)

In [266]:
print(train.shape)
print(test.shape)

(277508, 3)
(118932, 3)


In [267]:
train.groupby('review_profilename').count()

Unnamed: 0_level_0,beer_beerid,review_overall
review_profilename,Unnamed: 1_level_1,Unnamed: 2_level_1
0110x011,25,25
01Ryan10,1,1
04101Brewer,1,1
05Harley,26,26
0beerguy0,3,3
0tt0,13,13
1000Bottles,6,6
100floods,29,29
1050Sudz,1,1
1099,12,12


In [268]:
# pivot beer ratings into beer features
df_beer_features = train.pivot(
    index='beer_beerid',
    columns='review_profilename',
    values='review_overall'
).fillna(0)

In [269]:
df_beer_features.head()

review_profilename,0110x011,01Ryan10,04101Brewer,05Harley,0beerguy0,0tt0,1000Bottles,100floods,1050Sudz,1099,...,zulufactor,zumicroom,zwalk8,zwan,zymrgy,zymurgy4all,zymurgywhiz,zythus,zyzygy,zzajjber
beer_beerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [270]:
#These dataset will be used for prediction and evaluation. 
#Dummy train will be used for prediction of the bbers which has not been rated by the user.To ignore the beers rated by the user, we will mark it as 0 during prediction. The beers not rated by user is marked as 1 for prediction. 
#Dummy test will be used for evaluation. To evaluate, we will only make prediction on the beers rated by the user. So, this is marked as 1. This is just opposite of dummy_train


In [271]:
dummy_train = train.copy()
dummy_test = test.copy()

In [272]:
dummy_train['review_overall'] = dummy_train['review_overall'].apply(lambda x: 0 if x>=1 else 1)
dummy_test['review_overall'] = dummy_test['review_overall'].apply(lambda x: 1 if x>=1 else 0)

In [273]:
# The beers not rated by user is marked as 1 for prediction. 
dummy_train = dummy_train.pivot(
   index='beer_beerid',
    columns='review_profilename',
    values='review_overall'
).fillna(1)

# The beers not rated by user is marked as 0 for evaluation. 
dummy_test = dummy_test.pivot(
    index='beer_beerid',
    columns='review_profilename',
    values='review_overall'
).fillna(0)

In [274]:
dummy_train.head()

review_profilename,0110x011,01Ryan10,04101Brewer,05Harley,0beerguy0,0tt0,1000Bottles,100floods,1050Sudz,1099,...,zulufactor,zumicroom,zwalk8,zwan,zymrgy,zymurgy4all,zymurgywhiz,zythus,zyzygy,zzajjber
beer_beerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [275]:
dummy_test.head()

review_profilename,0110x011,05Harley,0runkp0s,0tt0,1000Bottles,1001111,100floods,108Dragons,1099,1121987,...,ztaylor1,ztoellner,ztprez,ztruempy,zuffenhausen,zuggy9,zwan,zymurgy4all,zymurgywhiz,zythus
beer_beerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [276]:
# User Similarity Matrix using Cosine Similarity
from sklearn.metrics.pairwise import pairwise_distances

# User Similarity Matrix
user_correlation = 1 - pairwise_distances(df_beer_features, metric='cosine')
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation)

[[1.         0.07125935 0.07811082 ... 0.         0.         0.03655956]
 [0.07125935 1.         0.04877168 ... 0.0266672  0.01762084 0.        ]
 [0.07811082 0.04877168 1.         ... 0.02331029 0.01341187 0.        ]
 ...
 [0.         0.0266672  0.02331029 ... 1.         0.         0.09341961]
 [0.         0.01762084 0.01341187 ... 0.         1.         0.04409194]
 [0.03655956 0.         0.         ... 0.09341961 0.04409194 1.        ]]


In [277]:
user_correlation.shape

(6538, 6538)

In [278]:
# User Similarity Matrix using adjusted Cosine
# Here, not removing the NaN values and calculating the mean only for the beers rated by the user
beers_features = train.pivot(
     index='beer_beerid',
    columns='review_profilename',
    values='review_overall'
)

In [279]:
beers_features.head()

review_profilename,0110x011,01Ryan10,04101Brewer,05Harley,0beerguy0,0tt0,1000Bottles,100floods,1050Sudz,1099,...,zulufactor,zumicroom,zwalk8,zwan,zymrgy,zymurgy4all,zymurgywhiz,zythus,zyzygy,zzajjber
beer_beerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,


In [280]:
#Normalising the rating of the beer for each user aroung 0 mean
mean = np.nanmean(beers_features, axis=1)
df_subtracted = (beers_features.T-mean).T

In [281]:
df_subtracted.head()

review_profilename,0110x011,01Ryan10,04101Brewer,05Harley,0beerguy0,0tt0,1000Bottles,100floods,1050Sudz,1099,...,zulufactor,zumicroom,zwalk8,zwan,zymrgy,zymurgy4all,zymurgywhiz,zythus,zyzygy,zzajjber
beer_beerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,


In [282]:
# Finding cosine similarity
from sklearn.metrics.pairwise import pairwise_distances

# User Similarity Matrix
user_correlation = 1 - pairwise_distances(df_subtracted.fillna(0), metric='cosine')
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation)

[[ 1.         -0.01216227  0.06485235 ...  0.          0.
   0.00372398]
 [-0.01216227  1.         -0.02246935 ...  0.          0.00866352
   0.        ]
 [ 0.06485235 -0.02246935  1.         ...  0.          0.
   0.        ]
 ...
 [ 0.          0.          0.         ...  1.          0.
   0.        ]
 [ 0.          0.00866352  0.         ...  0.          1.
   0.2942817 ]
 [ 0.00372398  0.          0.         ...  0.          0.2942817
   1.        ]]


In [283]:
#Prediction
#Doing the prediction for the users which are positively related with other users, and not the users which are negatively related as we are interested in the users which are more similar to the current users. So, ignoring the correlation for values less than 0. 
user_correlation[user_correlation<0]=0
user_correlation

array([[1.        , 0.        , 0.06485235, ..., 0.        , 0.        ,
        0.00372398],
       [0.        , 1.        , 0.        , ..., 0.        , 0.00866352,
        0.        ],
       [0.06485235, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.00866352, 0.        , ..., 0.        , 1.        ,
        0.2942817 ],
       [0.00372398, 0.        , 0.        , ..., 0.        , 0.2942817 ,
        1.        ]])

In [284]:
#Rating predicted by the user (for beers rated as well as not rated) is the weighted sum of correlation with the beer rating (as present in the rating dataset). 
user_predicted_ratings = np.dot(user_correlation, beers_features.fillna(0))
user_predicted_ratings

array([[0.25092263, 0.        , 0.        , ..., 0.03565012, 0.06958292,
        0.03751293],
       [0.92720111, 0.        , 0.01096611, ..., 0.        , 0.12413528,
        0.        ],
       [0.81718781, 0.125881  , 0.09950015, ..., 0.29963725, 0.0988606 ,
        0.        ],
       ...,
       [0.37824172, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.49732684, 0.        , 0.        , ..., 0.        , 0.029973  ,
        0.        ],
       [0.1427643 , 0.        , 0.        , ..., 0.02569732, 0.        ,
        0.        ]])

In [285]:
user_predicted_ratings.shape

(6538, 18778)

In [286]:
#Since we are interested only in the beers not rated by the user, we will ignore the beers rated by the user by making it zero.
user_final_rating = np.multiply(user_predicted_ratings,dummy_train)
user_final_rating.head()

review_profilename,0110x011,01Ryan10,04101Brewer,05Harley,0beerguy0,0tt0,1000Bottles,100floods,1050Sudz,1099,...,zulufactor,zumicroom,zwalk8,zwan,zymrgy,zymurgy4all,zymurgywhiz,zythus,zyzygy,zzajjber
beer_beerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,0.250923,0.0,0.0,0.246172,0.185059,0.102691,0.04727,0.631835,0.062471,0.30414,...,0.130106,0.0,0.0,0.034681,0.263626,0.500544,0.0,0.03565,0.069583,0.037513
6,0.927201,0.0,0.010966,0.90167,0.255823,0.462032,0.708844,0.676242,0.0,0.435434,...,0.144546,0.0,0.071885,0.266352,0.0,0.560119,0.083587,0.0,0.124135,0.0
7,0.817188,0.125881,0.0995,0.511857,0.04646,0.215919,0.166849,0.992749,0.031191,0.571399,...,0.017137,0.0,0.023487,0.017862,0.038508,0.485129,0.09962,0.299637,0.098861,0.0
8,0.32363,0.0,0.059545,0.215366,0.287933,0.010279,0.191127,0.136801,0.0,0.175085,...,0.0,0.019262,0.0,0.0,0.0,0.313402,0.068684,0.0,0.271676,0.0
9,0.437395,0.0,0.0,0.486326,0.051145,0.492705,0.457305,0.391214,0.0,0.55318,...,0.016482,0.0,0.094129,0.0,0.00905,0.229621,0.0,0.0,0.042924,0.0


In [287]:
# Finding the top 5 recommendation for the user 1
user_final_rating.iloc[1].sort_values(ascending=False)[0:5]

review_profilename
BuckeyeNation     18.621892
mikesgroove       18.117275
northyorksammy    17.506674
WesWes            15.522401
NeroFiddled       15.508726
Name: 6, dtype: float64

In [288]:
# Implementing Item Based Similarity
# Taking the transpose of the rating matrix to normalize the rating around the mean for different beer ID. In the user based similarity, we had taken mean for each user intead of each beer.
beer_features = train.pivot(
    index='beer_beerid',
    columns='review_profilename',
    values='review_overall'
).T

beer_features.head()

beer_beerid,5,6,7,8,9,10,11,12,13,14,...,75013,75086,75160,75188,75230,75894,76144,76323,76525,76816
review_profilename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0110x011,,,,,,,,,,,...,,,,,,,,,,
01Ryan10,,,,,,,,,,,...,,,,,,,,,,
04101Brewer,,,,,,,,,,,...,,,,,,,,,,
05Harley,,,,,,,,,,,...,,,,,,,,,,
0beerguy0,,,,,,,,,,,...,,,,,,,,,,


In [289]:
#Normalising the beer rating for each beer
mean = np.nanmean(beer_features, axis=1)
df_subtracted = (beer_features.T-mean).T

In [290]:
df_subtracted.head()

beer_beerid,5,6,7,8,9,10,11,12,13,14,...,75013,75086,75160,75188,75230,75894,76144,76323,76525,76816
review_profilename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0110x011,,,,,,,,,,,...,,,,,,,,,,
01Ryan10,,,,,,,,,,,...,,,,,,,,,,
04101Brewer,,,,,,,,,,,...,,,,,,,,,,
05Harley,,,,,,,,,,,...,,,,,,,,,,
0beerguy0,,,,,,,,,,,...,,,,,,,,,,


In [291]:
#Finding the cosine similarity. Note that since the data is normalised, both the cosine metric and correlation metric will give the same value. 
from sklearn.metrics.pairwise import pairwise_distances

# User Similarity Matrix
item_correlation = 1 - pairwise_distances(df_subtracted.fillna(0), metric='cosine')
item_correlation[np.isnan(item_correlation)] = 0
print(item_correlation)

[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


In [292]:
#Filtering the correlation only for which the value is greater than 0. (Positively correlated)
item_correlation[item_correlation<0]=0
item_correlation

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [293]:
#Prediction
item_predicted_ratings = np.dot((beer_features.fillna(0).T),item_correlation)
item_predicted_ratings

array([[0.93396344, 0.        , 0.        , ..., 0.        , 0.28431726,
        0.        ],
       [2.74408829, 0.        , 0.        , ..., 0.        , 5.8134427 ,
        0.        ],
       [1.34074524, 0.        , 0.        , ..., 0.        , 2.10394178,
        0.        ],
       ...,
       [0.01766629, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.25618477, 0.        , 0.        , ..., 0.        , 0.12278288,
        0.        ],
       [0.0471251 , 0.        , 0.        , ..., 0.        , 0.41674852,
        0.        ]])

In [294]:
item_predicted_ratings.shape

(6538, 18778)

In [295]:
dummy_train.shape

(6538, 18778)

In [296]:
#Filtering the rating only for the beers not rated by the user for recommendation
item_final_rating = np.multiply(item_predicted_ratings,dummy_train)
item_final_rating.head()

review_profilename,0110x011,01Ryan10,04101Brewer,05Harley,0beerguy0,0tt0,1000Bottles,100floods,1050Sudz,1099,...,zulufactor,zumicroom,zwalk8,zwan,zymrgy,zymurgy4all,zymurgywhiz,zythus,zyzygy,zzajjber
beer_beerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,0.933963,0.0,0.0,1.584415,0.453627,1.501425,0.1982,0.814406,0.0,4.43804,...,1.371579,0.0,0.0,0.0,0.490489,2.205171,0.0,0.0,0.284317,0.0
6,2.744088,0.0,0.0,4.472062,1.860689,4.595624,0.71632,2.10734,0.0,8.567385,...,3.504452,0.0,0.0,0.0,0.207807,5.592736,0.0,0.0,5.813443,0.0
7,1.340745,0.0,0.0,2.517756,1.302778,2.113676,1.745386,1.458686,0.0,5.049016,...,3.688498,0.0,0.0,0.0,1.005185,3.331587,0.0,0.0,2.103942,0.0
8,0.244857,0.0,0.0,0.117677,0.886829,0.296,0.0,0.309289,0.0,0.121257,...,0.138127,0.0,0.0,0.0,0.0,0.425196,0.0,0.0,0.143488,0.0
9,0.19399,0.0,0.0,0.222712,0.236444,0.289832,0.067017,0.236573,0.0,1.340159,...,1.207291,0.0,0.0,0.0,0.0,0.249249,0.0,0.0,0.29516,0.0


In [297]:
#Top 5 prediction for the user -1
item_final_rating.iloc[1].sort_values(ascending=False)[0:5]

review_profilename
Gusler          17.459382
Dogbrick        16.477865
mjurney         16.008589
euskera         15.907867
morbiddrumer    15.830601
Name: 6, dtype: float64

In [298]:
#Evaluation
#Evaluation will we same as you have seen above for the prediction. The only difference being, you will evaluate for the beer already rated by the user insead of predicting it for the beer not rated by the user. 
#Using User Similarity
test_beer_features = test.pivot(
    index='beer_beerid',
    columns='review_profilename',
    values='review_overall'
)
mean = np.nanmean(test_beer_features, axis=1)
test_df_subtracted = (test_beer_features.T-mean).T

# User Similarity Matrix
test_user_correlation = 1 - pairwise_distances(test_df_subtracted.fillna(0), metric='cosine')
test_user_correlation[np.isnan(test_user_correlation)] = 0
print(test_user_correlation)

[[ 1.         -0.01239958  0.00844929 ...  0.          0.
   0.        ]
 [-0.01239958  1.         -0.00440571 ...  0.          0.
   0.        ]
 [ 0.00844929 -0.00440571  1.         ...  0.          0.
   0.        ]
 ...
 [ 0.          0.          0.         ...  1.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          1.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   1.        ]]


In [299]:
test_user_correlation[test_user_correlation<0]=0
test_user_predicted_ratings = np.dot(test_user_correlation, test_beer_features.fillna(0))
test_user_predicted_ratings

array([[0.32356573, 1.50905275, 0.        , ..., 0.21254972, 0.        ,
        0.        ],
       [0.04745351, 0.32336221, 0.00752214, ..., 0.83123374, 0.02060392,
        0.        ],
       [0.08967816, 0.0343834 , 0.        , ..., 0.14500738, 0.00325497,
        0.05370168],
       ...,
       [0.        , 0.54465048, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [300]:
#Doing prediction for the beers rated by the user
test_user_final_rating = np.multiply(test_user_predicted_ratings,dummy_test)

In [230]:
test_user_final_rating.head()

beer_beerid,5,6,7,8,9,10,11,12,13,14,...,75013,75086,75160,75188,75230,75894,76144,76323,76525,76816
review_profilename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0110x011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
05Harley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0runkp0s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0tt0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000Bottles,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [231]:
#Calculating the RMSE for only the beers rated by user. For RMSE, normalising the rating to (1,5) range
from sklearn.preprocessing import MinMaxScaler
from numpy import *

X  = test_user_final_rating.copy() 
X = X[X>0]

scaler = MinMaxScaler(feature_range=(1, 5))
print(scaler.fit(X))
y = (scaler.transform(X))

print(y)

MinMaxScaler(copy=True, feature_range=(1, 5))
[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]


In [232]:
test_ = test.pivot(
    index='beer_beerid',
    columns='review_profilename',
    values='review_overall'
)

In [233]:
# Finding total non-NaN value
total_non_nan = np.count_nonzero(~np.isnan(y))

In [234]:
rmse = (sum(sum((test_ - y )**2))/total_non_nan)**0.5
print(rmse)

1.8576311710035232


In [236]:
#Using Item similarity
test_beer_features = test.pivot(
    index='beer_beerid',
    columns='review_profilename',
    values='review_overall'
).T

mean = np.nanmean(test_beer_features, axis=1)
test_df_subtracted = (test_beer_features.T-mean).T

test_item_correlation = 1 - pairwise_distances(test_df_subtracted.fillna(0), metric='cosine')
test_item_correlation[np.isnan(test_item_correlation)] = 0
test_item_correlation[test_item_correlation<0]=0

In [237]:
test_item_correlation.shape

(6514, 6514)

In [238]:
test_beer_features.shape

(6514, 13445)

In [240]:
test_item_predicted_ratings = (np.dot(test_item_correlation, test_beer_features.fillna(0))).T
test_item_final_rating = np.multiply(test_item_predicted_ratings,dummy_test)
test_item_final_rating.head()

beer_beerid,5,6,7,8,9,10,11,12,13,14,...,75013,75086,75160,75188,75230,75894,76144,76323,76525,76816
review_profilename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0110x011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
05Harley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0runkp0s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0tt0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000Bottles,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [241]:
test_ = test.pivot(
    index='beer_beerid',
    columns='review_profilename',
    values='review_overall'
)

In [243]:
from sklearn.preprocessing import MinMaxScaler
from numpy import *

X  = test_item_final_rating.copy() 
X = X[X>0]

scaler = MinMaxScaler(feature_range=(1, 5))
print(scaler.fit(X))
y = (scaler.transform(X))


test_ = test.pivot(
        index='beer_beerid',
    columns='review_profilename',
    values='review_overall'
)

# Finding total non-NaN value
total_non_nan = np.count_nonzero(~np.isnan(y))

MinMaxScaler(copy=True, feature_range=(1, 5))


In [244]:
rmse = (sum(sum((test_ - y )**2))/total_non_nan)**0.5
print(rmse)

2.2248806205195053


In [303]:
#Observations
# 5.Compare the performance of the two models using test data and suggest the one that should be deployed.
# We get RMSE of 2.22 using "Item similarity" and 1.85 using "User similarity".Hence Item Similarity model is better
# and should be deployed because it has higher RMSE value.

user_final_rating.columns

Index(['0110x011', '01Ryan10', '04101Brewer', '05Harley', '0beerguy0', '0tt0',
       '1000Bottles', '100floods', '1050Sudz', '1099',
       ...
       'zulufactor', 'zumicroom', 'zwalk8', 'zwan', 'zymrgy', 'zymurgy4all',
       'zymurgywhiz', 'zythus', 'zyzygy', 'zzajjber'],
      dtype='object', name='review_profilename', length=18778)

In [311]:
# 6. Give the names of the top 5 beers that you would recommend to the users 'cokes', 'genog' and 'giblet' using both the models.

#Cokes
user_final_rating['cokes'].sort_values(ascending=False)[0:5]

beer_beerid
7615     18.894527
2147     17.873162
28267    17.423951
17815    17.137311
26980    17.085183
Name: cokes, dtype: float64

In [312]:
#genog
user_final_rating['genog'].sort_values(ascending=False)[0:5]

beer_beerid
22076    4.565089
21363    4.556132
30956    4.543148
45617    4.427322
60886    4.408256
Name: genog, dtype: float64

In [314]:
#giblet
user_final_rating['giblet'].sort_values(ascending=False)[0:5]

beer_beerid
5976     4.876446
175      4.799745
24403    4.743715
4222     4.597274
855      4.578031
Name: giblet, dtype: float64