In [1]:
# basics
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
import ast

### Get category information for items from meta data

In [2]:
# Load the item metadata: the file holds one Python-literal record per line,
# and each parsed record is collected into `data` in file order.
with open('meta_Patio_Lawn_and_Garden_2018.json','r') as meta_file:
    data = [ast.literal_eval(raw_line) for raw_line in meta_file]

In [14]:
# Sample metadata record (the second entry parsed from the file)
data[1]

{'category': ['Patio, Lawn & Garden',
  'Outdoor Dcor',
  'Garden Sculptures & Statues',
  'Decorative Stones'],
 'tech1': '',
 'description': ['Stir up your spirit and refresh your soul with the hope and promise of salvation through this anointed masterpiece. Kenneth Copeland takes you on a journey through the faithfulness and holiness of God with these inspirational songs'],
 'fit': '',
 'title': 'Audio CD-I Was On His Mind',
 'also_buy': [],
 'tech2': '',
 'brand': 'Harrison House Publishing',
 'feature': [],
 'rank': ['>#1,228,459 in Patio, Lawn & Garden (See Top 100 in Patio, Lawn & Garden)',
  '>#7,298 in Patio, Lawn & Garden > Gardening > Garden Sculptures & Statues > Decorative Stones',
  '>#467,080 in Patio, Lawn & Garden > Outdoor Dcor'],
 'also_view': [],
 'main_cat': 'Amazon Home',
 'similar_item': '',
 'date': '',
 'price': '$16.99',
 'asin': '0881149659',
 'imageURL': [],
 'imageURLHighRes': []}

In [7]:
# Total number of item records parsed from the metadata file
len(data)

279456

In [9]:
# For every metadata record keep its ASIN and the last entry of its category list
list_itemid = [record['asin'] for record in data]
list_itemcate = [record['category'][-1] for record in data]

In [12]:
# Build the item -> category lookup table from the two parallel lists
itemCate_df = pd.DataFrame(
    list(zip(list_itemid, list_itemcate)),
    columns=['item_id', 'cate_name'],
)

In [13]:
# Preview the first rows of the item-to-category table
itemCate_df.head()

Unnamed: 0,item_id,cate_name
0,545417600,Lawn Mower Parts & Accessories
1,881149659,Decorative Stones
2,976533502,"Plants, Seeds & Bulbs"
3,979914620,Indoor Gardening & Hydroponics
4,984408525,Repellents


### Create `ratings_df` and merge category information

In [15]:
filename = "Patio_Lawn_and_Garden.csv"
# The ratings file has no header row, so column names are supplied here.
# Ids are read as strings: ASINs such as '0881149659' are all digits with a
# leading zero, and type inference could turn them into integers that no longer
# match the string `asin` keys of itemCate_df in the merge below.
ratings_df = pd.read_csv(
    filename,
    names=("item_id","user_id","rating","timestamp"),
    dtype={"item_id": str, "user_id": str},
)
# Timestamps are given in seconds since the Unix epoch
ratings_df['datetime'] = pd.to_datetime(ratings_df['timestamp'], unit = 's')
# Keyword form of drop: the positional axis argument is rejected by pandas 2.x.
# Reassigning instead of inplace=True keeps the cell safe to re-run.
ratings_df = ratings_df.drop(columns=['timestamp'])
# Left merge keeps every rating; items missing from the metadata get NaN cate_name
ratings_df = pd.merge(ratings_df, itemCate_df, how = 'left', on = 'item_id')
ratings_df.shape

(5431771, 5)

In [16]:
# Persist the merged ratings; with the default arguments the row index is
# written too, as an unnamed first column
ratings_df.to_csv('ratings_2018.csv')

In [22]:
# Preview the first ten merged rating records
ratings_df.head(10)

Unnamed: 0,item_id,user_id,rating,datetime,cate_name
0,986121126,A1SP1855LVHRVS,5.0,2015-05-30,Bug Zappers
1,986121126,AOAIHJZU5XMMH,5.0,2015-05-08,Bug Zappers
2,986121126,A2Z9J0DD7PX7D3,4.0,2016-11-28,Bug Zappers
3,986121126,A2FT1DZIV8CVOT,4.0,2016-11-23,Bug Zappers
4,986121126,A1HYKFP21Y02V4,4.0,2016-11-19,Bug Zappers
5,986121126,A2NJFK04BYEEJM,5.0,2016-11-17,Bug Zappers
6,986121126,A1V39OCNHU5HLQ,5.0,2016-11-14,Bug Zappers
7,986121126,A22ON5TJM4PQCD,5.0,2016-11-14,Bug Zappers
8,986121126,AQMYO3A6RTPH5,5.0,2016-11-04,Bug Zappers
9,986121126,A28Q0JA9H8WZOD,3.0,2016-10-28,Bug Zappers


### Select Pikachu and find all items that Pikachu has rated

In [26]:
# selectPikachu_df = ratings_df.groupby(['user_id']).size().reset_index(name='counts')
# potentialPikachu_df = selectPikachu_df.loc[selectPikachu_df['counts']>25]
# potentialPikachu_df.sample()

In [25]:
def create_inputItems_df (user_id, rating_df):
    """Return every rating record of one user, without the 'user_id' column.

    Parameters
    ----------
    user_id : str
        Id of the selected user ("Pikachu").
    rating_df : pandas.DataFrame
        Overall item-based user rating records; must contain a 'user_id' column.

    Returns
    -------
    pandas.DataFrame
        The rows of ``rating_df`` whose 'user_id' equals ``user_id``, keeping
        their original index labels, with the 'user_id' column removed.
        When called with this notebook's ratings_df the remaining columns are
        item_id, rating, datetime (converted from the timestamp) and cate_name
        (the category the item belongs to). ``rating_df`` itself is not modified.
    """
    pikachu_df = rating_df.loc[rating_df['user_id'] == user_id]
    # Keyword form of drop: the positional axis argument (``drop([...], 1)``)
    # is rejected by pandas 2.x.
    inputItems_df = pikachu_df.drop(columns=['user_id'])

    return inputItems_df

In [28]:
# Collect all ratings made by the selected user (the notebook's "Pikachu")
inputItems_df = create_inputItems_df('ALSAOZ1V546VT', ratings_df)

In [29]:
# Preview the selected user's rated items
inputItems_df.head()

Unnamed: 0,item_id,rating,datetime,cate_name
1752784,B004HB4I8I,5.0,2013-08-24,"Plants, Seeds & Bulbs"
1795806,B004PEIVUS,5.0,2013-03-07,Stands
2335391,B009QAJ6VO,5.0,2015-03-06,Fertilizers & Plant Food
2510073,B00C3YIYAO,5.0,2013-10-26,Pool Enzymes
2606469,B00D8GWX0I,5.0,2016-02-18,Leaf Blowers & Vacuums


### Find all rating history of users who purchased items in the same categories that Pikachu has rated before

In [30]:
def create_userSubsetCate_df (user_id, rating_df, inputItems_df):
    """Average per-category ratings of other users in the selected user's categories.

    Parameters
    ----------
    user_id : str
        Id of the selected user ("Pikachu"); their rows are excluded from the result.
    rating_df : pandas.DataFrame
        Overall item-based user rating records; must contain the columns
        'user_id', 'cate_name' and 'rating'.
    inputItems_df : pandas.DataFrame
        Items the selected user has rated (output of create_inputItems_df);
        only its 'cate_name' column is read.

    Returns
    -------
    pandas.DataFrame
        One row per (user_id, cate_name) pair, restricted to categories that
        appear in ``inputItems_df``, with columns:
        user_id, cate_name, rating (mean rating of that user in that category).
        Records of the selected user are NOT included. Index labels come from
        the grouped frame, so they have gaps where that user's rows were removed.
    """
    # Categories the selected user has rated. Missing categories (NaN) are
    # excluded, as a groupby on 'cate_name' would also leave them out.
    input_cates = inputItems_df['cate_name'].dropna().unique()

    # All rating records, from any user, that fall in one of those categories
    userSubset_df = rating_df[rating_df['cate_name'].isin(input_cates)]

    # Mean rating per user and category. 'rating' is selected explicitly so that
    # non-numeric columns (e.g. item_id) are never averaged, which raises a
    # TypeError on pandas 2.x.
    userSubsetCate_df = (
        userSubset_df
        .groupby(['user_id', 'cate_name'])['rating']
        .mean()
        .reset_index()
    )

    # Remove the selected user's own rows without mutating the grouped frame
    own_rows = userSubsetCate_df.index[userSubsetCate_df['user_id'] == user_id]
    return userSubsetCate_df.drop(index=own_rows)
    

In [57]:
# Group the per-user/per-category mean ratings by user.
# NOTE(review): userSubsetCate_df is not assigned in the visible cells; presumably
# it is the result of create_userSubsetCate_df(...) — confirm it is defined on a
# fresh kernel before this cell runs.
userSubsetCateGroup_df = userSubsetCate_df.groupby(['user_id'])

In [71]:
# On a GroupBy object, head() returns the first rows (5 by default) of *each*
# group, so the result can contain far more than five rows.
userSubsetCateGroup_df.head()

Unnamed: 0,user_id,cate_name,rating
0,A002764429D4LF2SJIT18,Traps,1.000000
1,A00311601FOXCO2AIRNIF,Lures,3.500000
2,A00400301PNPWZTJFR6RA,Lures,4.000000
3,A00406841NIK2FFZHXPC8,Planters,4.000000
4,A004176027NXFTHNBG3MW,Flowers,4.000000
5,A00436183LI7CN7VLFC18,Flowers,1.000000
6,A00473363TJ8YSZ3YAGG9,Feeders,4.000000
7,A00473363TJ8YSZ3YAGG9,Hooks & Hangers,4.000000
8,A00477922HSN48OM8838J,Planters,2.000000
9,A00495601IFV8DHJ1KD78,Garden Hoses,1.000000


In [60]:
# Order the (group key, group frame) pairs from the largest group to the smallest.
# Despite the _df suffix, the result is a plain Python list of tuples.
userSubsetCateGroupSample_df = sorted(
    userSubsetCateGroup_df,
    key=lambda group_pair: len(group_pair[1]),
    reverse=True,
)

In [65]:
# Show the three largest groups (first three entries of the sorted list)
userSubsetCateGroupSample_df[0:3]

[('ALSAOZ1V546VT',               user_id           cate_name    rating
  105997  ALSAOZ1V546VT                 Ant  5.000000
  105998  ALSAOZ1V546VT  Cacti & Succulents  5.000000
  105999  ALSAOZ1V546VT            Deer Off  5.000000
  106000  ALSAOZ1V546VT             Feeders  3.800000
  106001  ALSAOZ1V546VT             Flowers  4.750000
  106002  ALSAOZ1V546VT        Garden Hoses  5.000000
  106003  ALSAOZ1V546VT          Hand Tools  5.000000
  106004  ALSAOZ1V546VT     Hooks & Hangers  5.000000
  106005  ALSAOZ1V546VT               Lures  1.000000
  106006  ALSAOZ1V546VT            Planters  5.000000
  106007  ALSAOZ1V546VT        Pool Enzymes  5.000000
  106008  ALSAOZ1V546VT              Stands  5.000000
  106009  ALSAOZ1V546VT               Traps  3.666667),
 ('A3HPCRD9RX351S',               user_id     cate_name    rating
  78108  A3HPCRD9RX351S      Deer Off  4.500000
  78109  A3HPCRD9RX351S       Feeders  4.571429
  78110  A3HPCRD9RX351S       Flowers  4.000000
  78111  A3HPCR

In [107]:
# randomly selected pikachu #2 - a richer pikachu with a hot spring spa pool
# pikachu2_df = ratings_df.loc[ratings_df['user'] == 'A1B6O7SAIYG2N0']