In [10]:
import json, gzip, tqdm, math, csv, random, numpy as np, pandas as pd, os, array
from collections import defaultdict
from sklearn import linear_model
from IPython.display import display, HTML
def parse(path, max_ct = 5000):
  f = open(path, 'rb')
  ct = 0
  for l in tqdm.tqdm(f):
    yield eval(l)
    ct += 1
    if ct > max_ct:
      break
def getDF(path, max_ct = 5000):
  i = 0
  df = {}
  for d in parse(path, max_ct = max_ct):
    d["categories"] = d["categories"][0]
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')
def getDFRev(path, max_ct = 5000):
  i = 0
  df = {}
  for d in parse(path, max_ct = max_ct):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')


In [11]:
limit = 99999999
df = getDF('meta_Clothing_Shoes_and_Jewelry.json', max_ct = limit)
# filter metadata without images
df = df[df["imUrl"].notna()]
df_rev = getDFRev('reviews_Clothing_Shoes_and_Jewelry.json', max_ct = limit)
print("Count Nans: ", [(col, df[col].isna().sum()) for col in df.columns])

1503384it [02:16, 10978.09it/s]
5748920it [02:55, 32813.47it/s]


Count Nans:  [('asin', 0), ('related', 446185), ('title', 609), ('price', 928425), ('salesRank', 81093), ('imUrl', 0), ('brand', 1405173), ('categories', 0), ('description', 1417866)]


In [12]:
display(df[:1])
display(df_rev[:1])

Unnamed: 0,asin,related,title,price,salesRank,imUrl,brand,categories,description
0,37214,"{'also_viewed': ['B00JO8II76', 'B00DGN4R1Q', '...",Purple Sequin Tiny Dancer Tutu Ballet Dance Fa...,6.99,{'Clothing': 1233557},http://ecx.images-amazon.com/images/I/31mCncNu...,Big Dreams,"[Clothing, Shoes & Jewelry, Girls]",


Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2XVJBSRI3SWDI,31887,abigail,"[0, 0]",Perfect red tutu for the price. I baught it as...,5.0,Nice tutu,1383523200,"11 4, 2013"


In [13]:
cat_cts = defaultdict(int)
for cats in tqdm.tqdm(df["categories"]):
    for c in cats:
        cat_cts[c] += 1
cat_lists = list(cat_cts.items())
# print categories and their counts
cat_lists.sort(key = lambda x: -x[1])
print(cat_lists[:20])

100%|██████████| 1503305/1503305 [00:01<00:00, 991953.57it/s]

[('Clothing, Shoes & Jewelry', 1435790), ('Women', 502696), ('Clothing', 410680), ('Shoes & Accessories: International Shipping Available', 267208), ('Men', 215989), ('Novelty, Costumes & More', 112130), ('Accessories', 110695), ('Jewelry', 69197), ('Sports & Outdoors', 63031), ('Shoes', 58289), ('Dresses', 51687), ('Tops & Tees', 49797), ('Girls', 48399), ('Novelty', 46932), ('Shirts', 36812), ('New Arrivals', 36603), ('Lingerie, Sleep & Lounge', 36506), ('Baby', 33910), ('Costumes & Accessories', 32980), ('Casual', 31283)]





In [14]:
def filter_df_by_category(category, acceptable_asins):
  return df[df.apply(lambda x: category in x["categories"] and x["asin"] in acceptable_asins, axis=1)]
def filter_rev_by_ids(id_set):
  return df_rev[df_rev.apply(lambda x: x["asin"] in id_set, axis=1)]
# reads (asins, features) for all asins that exist in the asin_set
def readImageFeatures(path, asin_set):
  f = open(path, 'rb')
  while True:
    asin = f.read(10).decode("utf-8")
    if len(asin) == 0: break
    a = array.array('f')
    a.fromfile(f, 4096)
    if asin in asin_set:
      yield asin, a
# Reads ALL asins from a featurePath
def readFeatAsins(path):
  f = open(path, 'rb')
  while True:
    asin = f.read(10).decode("utf-8")
    if len(asin) == 0: break
    f.read(4096 * 4)
    yield asin
def createFeatsColumn(path, asin_set, asins):
  fts_dict = dict(readImageFeatures(path, asin_set))
  print(len(fts_dict), len(asins))
  assert(len(fts_dict) == len(asins))
  return [list(fts_dict[t]) for t in asins]
def write(base_path, meta, rev):
  os.makedirs(base_path, exist_ok=True)
  rev.to_json(os.path.join(base_path, "rev.json"))
  meta.to_json(os.path.join(base_path, "meta.json"))

In [15]:
# investigating discrepency
filter_name = "Shoes"

# set of asins that have image features
acceptable_asins = set(readFeatAsins("image_features_Clothing_Shoes_and_Jewelry.b"))
print("Length of acceptable asins: ", len(acceptable_asins))
# filtered rows that have the specified filter_name and also exist in the image features binary file
df_filtered = filter_df_by_category(filter_name, acceptable_asins)

id_set = set(df_filtered["asin"])
rev_filtered= filter_rev_by_ids(id_set)
# img_feats = dict(readImageFeatures("image_features_Clothing_Shoes_and_Jewelry.b", id_set))
feats_col = createFeatsColumn("image_features_Clothing_Shoes_and_Jewelry.b", id_set, df_filtered["asin"])
df_filtered["feats"] = feats_col
print("Length meta filtered: ", len(df_filtered))
print("Length rev filtered: ", len(rev_filtered))
print("Length of feats: ", len(feats_col))

Length of acceptable asins:  1494171
57985 57985
Length meta filtered:  57985
Length rev filtered:  165947
Length of feats:  57985


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [18]:
df_filtered[:2]

Unnamed: 0,asin,related,title,price,salesRank,imUrl,brand,categories,description,feats
147,8921463216,"{'also_viewed': ['8921463267', '8921463208', '...",Hello Kitty LALA Lovely Womens Summer Slippers...,,{'Beauty': 377777},http://ecx.images-amazon.com/images/I/41hbUxgB...,,"[Clothing, Shoes & Jewelry, Women, Shoes, Slip...",,"[0.0, 3.3559999465942383, 0.0, 0.0, 1.55359995..."
148,8921463267,"{'also_viewed': ['8921463259', 'B007682TD0', '...",Hello Kitty LALA Lovely Womens Summer Slippers...,22.99,{'Beauty': 171597},http://ecx.images-amazon.com/images/I/41H1HmaH...,Hello Kitty,"[Clothing, Shoes & Jewelry, Women, Shoes, Slip...",,"[0.0, 0.9203000068664551, 0.0, 0.0, 3.70679998..."


In [17]:
write("./data/shoes", df_filtered, rev_filtered)