In [1]:
#Dataframe manipulation library
import pandas as pd
#Math functions, we'll only need the sqrt function so let's import only that
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#Storing the input item information into a pandas dataframe
inputitem_df = pd.read_csv('inputItems.csv')
#Storing the user information into a pandas dataframe
ratings_df = pd.read_csv('ratings.csv')
#Storing the mean rating of other users with common category 
userSubsetCate_df = pd.read_csv('userSubsetCate.csv')

In [3]:
inputitem_df.head()

Unnamed: 0.1,Unnamed: 0,item_id,rating,datetime,cate_name
0,23424,B00004RBDZ,5.0,2009-05-22,Traps
1,32160,B00004TBKG,5.0,2009-05-25,Traps
2,137462,B000A0VOD2,5.0,2013-02-26,Deer Off
3,201089,B000FZYW36,5.0,2010-07-15,Hooks & Hangers
4,242987,B000KL3GEQ,5.0,2013-07-12,Planters


In [4]:
ratings_df.head()

Unnamed: 0.1,Unnamed: 0,user_id,item_id,rating,datetime,cate_name
0,0,A2VNYWOPJ13AFP,981850006,5.0,2009-12-03,Combination Grill Brushes & Scrapers
1,1,A20DWVV8HML3AW,981850006,5.0,2013-06-13,Combination Grill Brushes & Scrapers
2,2,A3RVP3YBYYOPRH,981850006,5.0,2009-11-12,Combination Grill Brushes & Scrapers
3,3,A28XY55TP3Q90O,981850006,5.0,2011-08-24,Combination Grill Brushes & Scrapers
4,4,A3VZW1BGUQO0V3,981850006,5.0,2011-06-17,Combination Grill Brushes & Scrapers


In [5]:
userSubsetCate_df.head()

Unnamed: 0.1,Unnamed: 0,user_id,cate_name,rating
0,0,A002764429D4LF2SJIT18,Traps,1.0
1,1,A00311601FOXCO2AIRNIF,Lures,3.5
2,2,A00400301PNPWZTJFR6RA,Lures,4.0
3,3,A00406841NIK2FFZHXPC8,Planters,4.0
4,4,A004176027NXFTHNBG3MW,Flowers,4.0


In [6]:
userSubsetCateGroup_df = userSubsetCate_df.groupby(['user_id'])
userSubsetCateGroupSample_df = sorted(userSubsetCateGroup_df, key=lambda x: len(x[1]), reverse=True)

In [7]:
userSubsetCateGroupSample_df[0:3]

[('ALSAOZ1V546VT',
          Unnamed: 0        user_id           cate_name    rating
  105997      105997  ALSAOZ1V546VT                 Ant  5.000000
  105998      105998  ALSAOZ1V546VT  Cacti & Succulents  5.000000
  105999      105999  ALSAOZ1V546VT            Deer Off  5.000000
  106000      106000  ALSAOZ1V546VT             Feeders  3.800000
  106001      106001  ALSAOZ1V546VT             Flowers  4.750000
  106002      106002  ALSAOZ1V546VT        Garden Hoses  5.000000
  106003      106003  ALSAOZ1V546VT          Hand Tools  5.000000
  106004      106004  ALSAOZ1V546VT     Hooks & Hangers  5.000000
  106005      106005  ALSAOZ1V546VT               Lures  1.000000
  106006      106006  ALSAOZ1V546VT            Planters  5.000000
  106007      106007  ALSAOZ1V546VT        Pool Enzymes  5.000000
  106008      106008  ALSAOZ1V546VT              Stands  5.000000
  106009      106009  ALSAOZ1V546VT               Traps  3.666667),
 ('A3HPCRD9RX351S',
         Unnamed: 0         user_id

In [8]:
#calculate the avg rating for input items 
inputitem_avg = inputitem_df.groupby(['cate_name']).mean()
inputitem_avg.reset_index(level=0, inplace=True)

In [9]:
inputitem_avg.head()

Unnamed: 0.1,cate_name,Unnamed: 0,rating
0,Ant,555238.0,5.0
1,Cacti & Succulents,273424.0,5.0
2,Deer Off,137462.0,5.0
3,Feeders,437265.2,3.8
4,Flowers,416628.875,4.75


In [10]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetCateGroupSample_df:
    group = group.sort_values(by='cate_name')
    inputitem_avg = inputitem_avg.sort_values(by='cate_name')
    #Get the N for the formula
    nRatings = len(group)
    #Get the review scores for the items that they both have in common
    temp_df = inputitem_avg[inputitem_avg['cate_name'].isin(group['cate_name'].tolist())]
    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
    #Now let's calculate the pearson correlation between two users, so called, x and y
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    
    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [11]:
pearsonCorrelationDict.items()

dict_items([('ALSAOZ1V546VT', 1.0), ('A3HPCRD9RX351S', -0.4245472663190104), ('A100WO06OQR8BQ', 0.3363532501395922), ('A2G7B7FKP2O2PU', -0.5607706035753376), ('A3SZXYX9YEDGT6', 0.09444274855437215), ('A1E50L7PCVXLN4', 0.08738092164291214), ('A1IWR4YH4ZA9BM', 0.752396969082771), ('A1PO24RPVBV09N', 0.24831955688370497), ('A2UO040HWOP0C2', -0.39660777829621996), ('AY3XPKRAMKKY7', -0.502917131825271), ('A1660JZPHFX90N', -0.38654292882260555), ('A1RFZGWX46N8H7', 0.4350473706303844), ('A2HTPS0JV3Q8ZD', 0.889613781793197), ('A2KFAEOHLMLX0H', 0.20628294326537852), ('A2R1HUYHXV7H18', 0.4529398015895223), ('A3UKWQS8SRW6IO', -0.025796244861746247), ('A8WEXFRWX1ZHH', 0.5660615643283782), ('ACJT8MUC0LRF0', 0.22494912374198156), ('AYOMAHLWRQHUG', 0.5348128902580894), ('AZV26LP92E6WU', 0), ('A1152K2KUZ0J7W', -0.6837974687895424), ('A11OTLEDSW8ZXD', -0.2141432549338845), ('A132ETQPMHQ585', 0.00276768842174009), ('A19VW07IUHNV1R', -1.5567211377890134e-13), ('A1BVLNE9OX7RM', 0.978098895392604), ('A1EH2S

In [12]:
#Transform the pearsonCorrelationDict into a panda dataframe
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF['user_id'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.columns = ['similarityIndex','user_id']
pearsonDF.head()

Unnamed: 0,similarityIndex,user_id
0,1.0,ALSAOZ1V546VT
1,-0.424547,A3HPCRD9RX351S
2,0.336353,A100WO06OQR8BQ
3,-0.560771,A2G7B7FKP2O2PU
4,0.094443,A3SZXYX9YEDGT6


In [13]:
#Obtain the top X similar users to input user (i.e.Pikachui)
#set X = 50 (TBD)
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
topUsers.head()

Unnamed: 0,similarityIndex,user_id
951,1.0,A11I1I9QLMAM1A
2928,1.0,A2IT7S59GSASN7
5177,1.0,AF3BYMPWKWO8F
5277,1.0,AHRNDSKWH1YBT
5910,1.0,AYUF7YETYOLNX


In [14]:
# The next steps are sorting the items with weighted average of the 
# rating  using perason correlation as weight.

In [16]:
topUsersRating=topUsers.merge(ratings_df, left_on='user_id', right_on='user_id', how='inner')
topUsersRating.head()

Unnamed: 0.1,similarityIndex,user_id,Unnamed: 0,item_id,rating,datetime,cate_name
0,1.0,A11I1I9QLMAM1A,67095,B0000DI84D,5.0,2013-05-31,Racks
1,1.0,A11I1I9QLMAM1A,95382,B0002XK0IC,3.0,2009-06-21,Freestanding Grills
2,1.0,A11I1I9QLMAM1A,190290,B000FBQJA4,5.0,2014-06-17,Traps
3,1.0,A11I1I9QLMAM1A,190376,B000FCEPMM,4.0,2014-06-10,Traps
4,1.0,A11I1I9QLMAM1A,203256,B000GD3L0C,5.0,2014-06-10,Repellents


In [17]:
#Multiplies the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0.1,similarityIndex,user_id,Unnamed: 0,item_id,rating,datetime,cate_name,weightedRating
0,1.0,A11I1I9QLMAM1A,67095,B0000DI84D,5.0,2013-05-31,Racks,5.0
1,1.0,A11I1I9QLMAM1A,95382,B0002XK0IC,3.0,2009-06-21,Freestanding Grills,3.0
2,1.0,A11I1I9QLMAM1A,190290,B000FBQJA4,5.0,2014-06-17,Traps,5.0
3,1.0,A11I1I9QLMAM1A,190376,B000FCEPMM,4.0,2014-06-10,Traps,4.0
4,1.0,A11I1I9QLMAM1A,203256,B000GD3L0C,5.0,2014-06-10,Repellents,5.0


In [18]:
#Applies a sum to the topUsers after grouping it up by userId
tempTopUsersRating = topUsersRating.groupby('cate_name').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
cate_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Air Filters,4.0,19.0
Barbecue Forks & Claws,1.0,5.0
Barbecue Turners,1.0,5.0
Benches,1.0,3.0
Bird Seed,6.0,23.0


In [19]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()
#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['cate_name'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,cate_name
cate_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Air Filters,4.75,Air Filters
Barbecue Forks & Claws,5.0,Barbecue Forks & Claws
Barbecue Turners,5.0,Barbecue Turners
Benches,3.0,Benches
Bird Seed,3.833333,Bird Seed


In [20]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,cate_name
cate_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Mulch,5.0,Mulch
Flavorizer Bars,5.0,Flavorizer Bars
Easy Set,5.0,Easy Set
Plant Covers,5.0,Plant Covers
"Patio, Lawn & Garden",5.0,"Patio, Lawn & Garden"
Oil Filters,5.0,Oil Filters
Nozzles,5.0,Nozzles
No/No,5.0,No/No
Fire Rings,5.0,Fire Rings
Barbecue Forks & Claws,5.0,Barbecue Forks & Claws
