In [1]:
# basics
import argparse
import os
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
import ast


# sklearn imports
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize

In [34]:
# The dataframe of overall item-base user rating records (including those of Pikachu)
# Column name: user_id(primary key for user), 
#              item_id(primary key for item, can find corresponding item: https://www.amazon.com/dp/'item_id'), 
#              rating, 
#              datetime(from timestamp, converted to YYYY-MM-DD format), 
#              cate_name(the category the item belongs to)
ratings_df = pd.read_csv("ratings.csv",usecols = ['user_id','item_id','rating','datetime','cate_name'])

In [35]:
# Sample rows for 'ratings_df'
ratings_df.head()

Unnamed: 0,user_id,item_id,rating,datetime,cate_name
0,A2VNYWOPJ13AFP,981850006,5.0,2009-12-03,Combination Grill Brushes & Scrapers
1,A20DWVV8HML3AW,981850006,5.0,2013-06-13,Combination Grill Brushes & Scrapers
2,A3RVP3YBYYOPRH,981850006,5.0,2009-11-12,Combination Grill Brushes & Scrapers
3,A28XY55TP3Q90O,981850006,5.0,2011-08-24,Combination Grill Brushes & Scrapers
4,A3VZW1BGUQO0V3,981850006,5.0,2011-06-17,Combination Grill Brushes & Scrapers


In [36]:
# The input dataframe of Pikachu(user_id = 'ALSAOZ1V546VT')
# Include all items pikachu has rated and the corresponding ratings
inputItems_df = pd.read_csv("inputItems.csv",usecols = ['item_id','rating','datetime','cate_name'])

In [37]:
# Sample rows for 'inputItems_df'
inputItems_df.head()

Unnamed: 0,item_id,rating,datetime,cate_name
0,B00004RBDZ,5.0,2009-05-22,Traps
1,B00004TBKG,5.0,2009-05-25,Traps
2,B000A0VOD2,5.0,2013-02-26,Deer Off
3,B000FZYW36,5.0,2010-07-15,Hooks & Hangers
4,B000KL3GEQ,5.0,2013-07-12,Planters


In [38]:
# The dataframe of other users with common categories of item and their corresponding mean rating of category-based 
userSubsetCate_df = pd.read_csv('userSubsetCate.csv',usecols = ['user_id','cate_name','rating'])
# Remove the rating record of Pikachu
userSubsetCate_df.drop(userSubsetCate_df[userSubsetCate_df['user_id']=='ALSAOZ1V546VT'].index, inplace = True)

In [39]:
# Sample rows for 'userSubsetCate_df'
userSubsetCate_df.head()

Unnamed: 0,user_id,cate_name,rating
0,A002764429D4LF2SJIT18,Traps,1.0
1,A00311601FOXCO2AIRNIF,Lures,3.5
2,A00400301PNPWZTJFR6RA,Lures,4.0
3,A00406841NIK2FFZHXPC8,Planters,4.0
4,A004176027NXFTHNBG3MW,Flowers,4.0


Two possible methods for finding similarity: 

In [None]:
# Create the user-item matrix for similarity 

def create_user_item_matrix(ratings, user_key="user",item_key="item"):

    n = len(set(ratings[user_key]))
    d = len(set(ratings[item_key]))

    user_mapper = dict(zip(np.unique(ratings[user_key]), list(range(n))))
    item_mapper = dict(zip(np.unique(ratings[item_key]), list(range(d))))

    user_inverse_mapper = dict(zip(list(range(n)), np.unique(ratings[user_key])))
    item_inverse_mapper = dict(zip(list(range(d)), np.unique(ratings[item_key])))

    user_ind = [user_mapper[i] for i in ratings[user_key]]
    item_ind = [item_mapper[i] for i in ratings[item_key]]

    # csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
    # where data, row_ind and col_ind satisfy the relationship a[row_ind[k], col_ind[k]] = data[k].
    X = sparse_matrix((ratings["rating"], (user_ind, item_ind)), shape=(n,d))
    
    return X, user_mapper, item_mapper, user_inverse_mapper, item_inverse_mapper, user_ind, item_ind    

Method 2: 
Follow Happer's code starting from 'Similarity of users to input user' part. 'userSubsetCateGroupSample_df' corresponds to 'userSubsetGroup'

In [28]:
userSubsetCateGroup_df = userSubsetCate_df.groupby(['user_id'])
userSubsetCateGroupSample_df = sorted(userSubsetCateGroup_df, key=lambda x: len(x[1]), reverse=True)

In [33]:
userSubsetCateGroupSample_df[0:3]

[('A3HPCRD9RX351S',               user_id     cate_name    rating
  78108  A3HPCRD9RX351S      Deer Off  4.500000
  78109  A3HPCRD9RX351S       Feeders  4.571429
  78110  A3HPCRD9RX351S       Flowers  4.000000
  78111  A3HPCRD9RX351S  Garden Hoses  4.000000
  78112  A3HPCRD9RX351S    Hand Tools  5.000000
  78113  A3HPCRD9RX351S         Lures  5.000000
  78114  A3HPCRD9RX351S      Planters  4.000000
  78115  A3HPCRD9RX351S        Stands  5.000000
  78116  A3HPCRD9RX351S         Traps  4.600000),
 ('A100WO06OQR8BQ',             user_id     cate_name    rating
  106  A100WO06OQR8BQ           Ant  1.000000
  107  A100WO06OQR8BQ      Deer Off  3.000000
  108  A100WO06OQR8BQ       Feeders  4.142857
  109  A100WO06OQR8BQ  Garden Hoses  4.625000
  110  A100WO06OQR8BQ    Hand Tools  5.000000
  111  A100WO06OQR8BQ         Lures  1.000000
  112  A100WO06OQR8BQ      Planters  1.000000
  113  A100WO06OQR8BQ         Traps  3.090909),
 ('A2G7B7FKP2O2PU',               user_id     cate_name  rating
  