In [42]:
import numpy as np
import pandas as pd
import json

In [43]:
# Load the scraped listing records (a list of dicts, one per item).
# JSON is UTF-8 by specification and this data contains non-ASCII text, so pin
# the encoding instead of relying on the platform's locale default.
with open("../data/item_with_images.json", "r", encoding="utf-8") as items_json:
    items = json.load(items_json)

In [44]:
# Peek at the first ten raw listing records to see which keys each item carries.
items[:10]

[{'title': 'kitchenaid measuring cups & spoons plastic',
  'metadata': '7 mins ago·westerly',
  'price': 5.0,
  'link': 'https://providence.craigslist.org/hsh/d/westerly-kitchenaid-measuring-cups/7726876655.html',
  'images': ['https://images.craigslist.org/00C0C_eGe6uOxr3zC_07K0ak_300x300.jpg'],
  'num_images': 1,
  'category': 'hsh',
  'metadata_length': 2,
  'date': '3/14',
  'location': 'westerly',
  'mileage': None},
 {'title': 'hoffritz measuring spoon stainless steel',
  'metadata': '7 mins ago·westerly',
  'price': 4.0,
  'link': 'https://providence.craigslist.org/hsh/d/westerly-hoffritz-measuring-spoon/7717240495.html',
  'images': ['https://images.craigslist.org/01010_kux8s1c6Il4_0jm0pO_300x300.jpg'],
  'num_images': 1,
  'category': 'hsh',
  'metadata_length': 2,
  'date': '3/14',
  'location': 'westerly',
  'mileage': None},
 {'title': "bell ballistic 5' 10mm cable lock",
  'metadata': '7 mins ago·westerly',
  'price': 7.0,
  'link': 'https://providence.craigslist.org/bop/d

In [45]:
# Read the array stored in text_embeds.npy; np.load opens and closes the file itself.
text_features = np.load("../data/text_embeds.npy")

In [46]:
# Read the array stored in reduced_features.npy; np.load opens and closes the file itself.
image_features = np.load("../data/reduced_features.npy")

In [47]:
# Dimensions of the loaded image feature array.
image_features.shape

(3253, 210)

In [48]:
# Sanity check: both feature arrays must have exactly one row per listing,
# image features checked first, then text features.
assert all(
    feature_matrix.shape[0] == len(items)
    for feature_matrix in (image_features, text_features)
)

In [49]:
# Build a frame from the listing dicts and keep six attribute columns,
# in this fixed order.
df = pd.DataFrame(items).loc[
    :, ["num_images", "category", "date", "mileage", "price", "location"]
]

In [50]:
# Display the selected attribute columns before any encoding is applied.
df

Unnamed: 0,num_images,category,date,mileage,price,location
0,1,hsh,3/14,,5.0,westerly
1,1,hsh,3/14,,4.0,westerly
2,1,bop,3/14,,7.0,westerly
3,1,tls,3/14,,3.0,westerly
4,1,hsh,3/14,,5.0,westerly
...,...,...,...,...,...,...
3248,1,atq,3/11,,900.0,lincoln
3249,2,fuo,3/11,,30.0,pawt
3250,1,trb,3/11,,6795.0,cm truck & trailer sales llc
3251,1,fuo,3/11,,300.0,saunderstown


In [51]:
# List the distinct date strings in order of first appearance.
pd.unique(df["date"])

array(['3/14', '3/13', '3/12', '3/16', '3/11'], dtype=object)

In [52]:
# Count listings with no mileage value. The vectorized Series reduction avoids
# iterating the boolean mask element by element with the builtin sum().
df["mileage"].isna().sum()

3053

In [53]:
# Map each "3/<day>" date string to its day offset from 3/11.
# Offsets follow the calendar, so 3/16 maps to 5 (there is no 3/15 entry).
date2order = {f"3/{day}": day - 11 for day in (11, 12, 13, 14, 16)}

In [54]:
# Translate each listing's date string into its ordinal via date2order.
df["date_idx"] = df["date"].map(date2order)

# Series.map yields NaN for any date that is not a key of date2order. Fail
# loudly here: a later blanket fillna(0.0) would otherwise silently relabel
# such rows as index 0, i.e. the earliest day.
assert df["date_idx"].notna().all(), (
    f"dates missing from date2order: "
    f"{sorted(set(df['date'].dropna()) - set(date2order))}"
)

In [55]:
# The raw date string is now redundant with date_idx, so remove it.
df = df.drop(columns="date")

In [56]:
# One-hot encode the two categorical columns; every indicator column is named
# "<column>_<value>". (change #1: location was added to the item attributes)
cat_dummy, loc_dummy = (
    pd.get_dummies(df[col], prefix=col) for col in ("category", "location")
)

In [57]:
# Swap the raw categorical columns for their one-hot encodings: the remaining
# columns come first, followed by the category and then the location indicators.
df = pd.concat(
    [df.drop(columns=["category", "location"]), cat_dummy, loc_dummy],
    axis=1,
)

In [58]:
# Replace every remaining NaN in the frame with 0.0.
# NOTE(review): this makes a missing mileage indistinguishable from a genuine
# 0 — confirm that is the intended encoding for items without mileage.
df = df.fillna(0.0)

In [59]:
# Display the encoded frame: numeric columns plus the category/location indicators.
df

Unnamed: 0,num_images,mileage,price,date_idx,category_app,category_art,category_atq,category_avo,category_bab,category_bfd,...,location_woonsocket,"location_woonsocket ,r.i.",location_woonsocket ri,"location_woonsocket, ri",location_ww/ coventry,location_ww/cov,location_ww/coventry,location_www.smdwoods.com,location_wyoming,"location_🇺🇸#piano mover and tuning ..ri,ma.,ct, insured!"
0,1,0.0,5.0,3,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,0.0,4.0,3,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1,0.0,7.0,3,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1,0.0,3.0,3,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1,0.0,5.0,3,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3248,1,0.0,900.0,0,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3249,2,0.0,30.0,0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3250,1,0.0,6795.0,0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3251,1,0.0,300.0,0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [60]:
# Write the processed feature table to disk; index=False leaves the row index
# out of the CSV so only the feature columns are written.
df.to_csv("../data/items_processed.csv", index = False)