In [43]:
import numpy as np
import pandas as pd
import json

In [44]:
# Load the item records. The encoding is pinned to UTF-8 because open()
# otherwise falls back to the platform's default encoding, which can
# mis-decode non-ASCII text on some systems.
with open("../data/item_with_images.json", "r", encoding="utf-8") as items_json:
    items = json.load(items_json)

In [45]:
# Read the text-embedding array directly from its .npy file; np.load
# opens and closes the file itself when given a path.
text_features = np.load("../data/text_embeds.npy")

In [46]:
# Read the reduced image-feature array directly from its .npy file;
# np.load opens and closes the file itself when given a path.
image_features = np.load("../data/reduced_features.npy")

In [47]:
# Check the dimensions of the loaded image-feature array.
image_features.shape

(3253, 210)

In [48]:
# Both feature arrays must have exactly one row per item record.
n_items = len(items)
assert image_features.shape[0] == n_items
assert text_features.shape[0] == n_items

In [49]:
# Keep only the tabular fields used as features.
feature_columns = ["num_images", "category", "date", "mileage", "price"]
df = pd.DataFrame(items)[feature_columns]

In [50]:
# Preview the selected columns before any encoding is applied.
df

Unnamed: 0,num_images,category,date,mileage,price
0,1,hsh,3/14,,5.0
1,1,hsh,3/14,,4.0
2,1,bop,3/14,,7.0
3,1,tls,3/14,,3.0
4,1,hsh,3/14,,5.0
...,...,...,...,...,...
3248,1,atq,3/11,,900.0
3249,2,fuo,3/11,,30.0
3250,1,trb,3/11,,6795.0
3251,1,fuo,3/11,,300.0


In [51]:
# List the distinct date strings present, in order of first appearance.
date_column = df["date"]
date_column.unique()

array(['3/14', '3/13', '3/12', '3/16', '3/11'], dtype=object)

In [52]:
# Count how many rows have no mileage value.
mileage_missing = df["mileage"].isna()
sum(mileage_missing)

3053

In [53]:
# Ordinal encoding of the date strings: each value is the number of days
# after "3/11". There is no "3/15" key, so index 4 is intentionally unused.
date2order = dict(
    zip(
        ("3/11", "3/12", "3/13", "3/14", "3/16"),
        (0, 1, 2, 3, 5),
    )
)

In [54]:
# Translate each date string into its ordinal index.
# Series.map returns NaN for any value that is not a key of the mapping.
# A later fillna(0.0) would silently turn such a NaN into 0, which is the
# index of "3/11", so fail loudly here instead of corrupting the feature.
date_idx = df["date"].map(date2order)
unmapped_dates = df.loc[date_idx.isna(), "date"].unique().tolist()
assert not unmapped_dates, f"dates missing from date2order: {unmapped_dates}"
df["date_idx"] = date_idx

In [55]:
# The raw date string is redundant once date_idx exists.
df = df.drop(columns=["date"])

In [56]:
# One-hot encode the category codes, one indicator column per category.
# The dtype is pinned because get_dummies defaults to bool from pandas 2.0
# on (uint8 before), which would write True/False instead of 0/1 to the
# exported CSV.
cat_dummy = pd.get_dummies(df["category"], dtype="uint8")

In [57]:
# Append the indicator columns, then remove the original category column.
with_dummies = pd.concat([df, cat_dummy], axis="columns")
df = with_dummies.drop(columns="category")

In [58]:
# Replace every remaining missing value with 0.0.
# NOTE(review): after this step a missing mileage cannot be told apart
# from a genuine 0 — confirm that downstream consumers are fine with that.
df = df.fillna(value=0.0)

In [59]:
# Preview the encoded frame: numeric columns plus one indicator column
# per category.
df

Unnamed: 0,num_images,mileage,price,date_idx,app,art,atq,avo,bab,bfd,...,tag,tix,tld,tls,trb,tro,vgm,wan,wtd,wto
0,1,0.0,5.0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0.0,4.0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0.0,7.0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,3.0,3,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1,0.0,5.0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3248,1,0.0,900.0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3249,2,0.0,30.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3250,1,0.0,6795.0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3251,1,0.0,300.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
# Persist the processed table without writing the row index as a column.
df.to_csv(path_or_buf="../data/items_processed.csv", index=False)