In [1]:
import pandas as pd
import clean_tabular_data as ctd

# Load the image table via the project's clean_tabular_data helpers.
images = ctd.import_image_data()

# Load the product table.
# NOTE(review): doGeoCode=False presumably skips geocoding of the location
# column - confirm against clean_tabular_data.import_product_data.
products = ctd.import_product_data(doGeoCode=False)


In [2]:
# Preview the first five image rows (5 is the head() default, spelled out here).
images.head(n=5)

Unnamed: 0,id,product_id
0,912bb259-3ad9-457b-9db1-ce1da9016057,5f5f57d7-778f-4336-bb10-b43863418c8c
1,b166d305-b852-4bdd-83f4-465b20da94fa,5f5f57d7-778f-4336-bb10-b43863418c8c
2,68f5a29d-0075-4d60-81c1-ab684a82e50c,c2c8949f-3cde-4651-a234-4a4a1b2a9ad4
3,f6a309d7-d247-446a-9b5e-aceefdd4334d,c2c8949f-3cde-4651-a234-4a4a1b2a9ad4
4,2c2b3a6f-15b3-4289-937a-15482d9f5781,8292aa4e-7f1b-4655-bf0e-f1f2c9e3ffaf


In [3]:
# Preview the first five product rows (5 is the head() default, spelled out here).
products.head(n=5)

Unnamed: 0,product_id,product_name,category,product_description,price,location
0,243809c0-9cfc-4486-ad12-3b7a16605ba9,"Mirror wall art | in Wokingham, Berkshire | Gu...","Home & Garden / Dining, Living Room Furniture ...","Mirror wall art. Posted by Nisha in Dining, Li...",5.0,"Wokingham, Berkshire"
1,1c58d3f9-8b93-47ea-9415-204fcc2a22e6,"Stainless Steel Food Steamer | in Inverness, H...",Home & Garden / Other Household Goods,Morphy Richard’s (model no 48755)Stainless ste...,20.0,"Inverness, Highland"
2,860673f1-57f6-47ba-8d2f-13f9e05b8f9a,"Sun loungers | in Skegness, Lincolnshire | Gum...",Home & Garden / Garden & Patio / Outdoor Setti...,I have 2 of these - collection only as I don’t...,20.0,"Skegness, Lincolnshire"
3,59948726-29be-4b35-ade5-bb2fd7331856,Coffee side table from Ammunition ammo box hai...,"Home & Garden / Dining, Living Room Furniture ...",Great reclaimed army ammunition box used as co...,115.0,"Radstock, Somerset"
4,16dbc860-696e-4cda-93f6-4dd4926573fb,Modern Shannon Sofa for sale at low cost | in ...,"Home & Garden / Dining, Living Room Furniture ...",New Design Shannon Corner sofa 5 Seater Avail...,450.0,"Delph, Manchester"


Extracting "root" category and "sub" categories for each product

In [4]:
# Split "Root / Sub / Sub..." on the first "/" only: the text before it becomes
# root_category and everything after it stays together in sub_categories.
# astype(str) mirrors the previous str(x) coercion, and the vectorised
# .str.split(expand=True) avoids building one pd.Series per row via apply.
products[['root_category','sub_categories']] = (
    products["category"].astype(str).str.split(pat="/", n=1, expand=True)
)

In [5]:
# Trim the whitespace left around the root category by the "/" split and store
# the column as a pandas Categorical in one step.
products['root_category'] = pd.Categorical(products['root_category'].str.strip())

# The integer code of each category is the numeric label for that row.
products['label'] = products['root_category'].cat.codes

In [6]:
# Print the column names, non-null counts and dtypes of the product table.
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7156 entries, 0 to 7155
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   product_id           7156 non-null   object  
 1   product_name         7156 non-null   object  
 2   category             7156 non-null   object  
 3   product_description  7156 non-null   object  
 4   price                7156 non-null   float64 
 5   location             7156 non-null   object  
 6   root_category        7156 non-null   category
 7   sub_categories       7156 non-null   object  
 8   label                7156 non-null   int8    
dtypes: category(1), float64(1), int8(1), object(6)
memory usage: 406.1+ KB


The `product_id` column, present in both `images` and `products`, links the two dataframes, so we can now use it to bring each product's category into the `images` dataframe.

In the cell below, the merge joins the two dataframes on the column they have in common (`product_id`), using it as the lookup key between them. The column list inside `[[ ]]` after the merge then selects which columns we want in the output dataframe.

In [7]:
# Join each image row to its product once, keyed explicitly on product_id
# rather than on whatever column names the two frames happen to share.
# Only id/product_id are taken from `images`, so the cell can be re-run safely
# after `images` has been overwritten with the enriched frame below.
# pd.merge defaults to an inner join, so images without a matching product are dropped.
images_with_products = pd.merge(images[['id','product_id']], products, on='product_id')

# (image id, label) pairs, saved to CSV later in the notebook.
training_data = images_with_products[['id','label']]
# Image table enriched with the product's root category and its numeric label.
images = images_with_products[['id','product_id','root_category','label']]

images.head()

Unnamed: 0,id,product_id,root_category,label
0,912bb259-3ad9-457b-9db1-ce1da9016057,5f5f57d7-778f-4336-bb10-b43863418c8c,Home & Garden,6
1,b166d305-b852-4bdd-83f4-465b20da94fa,5f5f57d7-778f-4336-bb10-b43863418c8c,Home & Garden,6
2,68f5a29d-0075-4d60-81c1-ab684a82e50c,c2c8949f-3cde-4651-a234-4a4a1b2a9ad4,Home & Garden,6
3,f6a309d7-d247-446a-9b5e-aceefdd4334d,c2c8949f-3cde-4651-a234-4a4a1b2a9ad4,Home & Garden,6
4,2c2b3a6f-15b3-4289-937a-15482d9f5781,8292aa4e-7f1b-4655-bf0e-f1f2c9e3ffaf,Home & Garden,6


In [8]:
import json

# Take each (category, label) pair from the same row so the pairing is correct
# by construction, instead of relying on two independent unique() calls
# returning their values in matching order. drop_duplicates keeps the
# first-appearance order, so the JSON written below is unchanged.
category_label_pairs = products[['root_category','label']].drop_duplicates()
categories = category_label_pairs['root_category'].tolist()
labels = category_label_pairs['label'].tolist()

# category name -> integer label
encoder = {category: int(label) for category, label in zip(categories, labels)}

#also add the reversed values so we can go cat>label or label>cat
# NOTE: JSON object keys are always strings, so json.dump writes the integer
# label keys as "0", "1", ... - look up str(label) after loading the file.
encoder.update({label: category for category, label in encoder.items()})

with open("image_decoder.json", "w") as outfile:
    json.dump(encoder, outfile)




Let's save the training data to csv for later.

In [9]:
# Write the (id, label) pairs to disk; index=False leaves the row index out of the file.
training_data.to_csv(path_or_buf="training_data.csv", index=False)

In [10]:
# Report how many entries `categories` holds.
category_count = len(categories)
print(category_count)

13


In [11]:
import clean_images

# Run the project's image-cleaning step on the "images/" path.
# NOTE(review): 224 is presumably the target image size in pixels - confirm
# against clean_images.clean_image_data.
image_dir = "images/"
size_arg = 224
clean_images.clean_image_data(image_dir, size_arg)

Resizing: 100%|██████████| 100% 12668/12668 [00:00]
