# Clean The Data

Collate userbase and itembase here:

* For each identified category, collate the product information.
    - Todo: identify categories
    - Todo: identify schema for products. Note that this can be a very wide and sparse table, because each type of product has different product features.
* For each product, add to userbase.
    - Todo: group the DataFrame by unique users. One user may have left reviews on multiple products, so we must capture every review per user in order to reconstruct each user's purchase/review history.

Please see the diagram:

![images/DataCleaning](../images/DataCleaning.png)

In [1]:
# Maps each top-level category to the subcategory names that belong to it.
# Each subcategory name is used verbatim as a directory name under the raw
# data path, so the strings must match the extract folders exactly.
# Every subcategory must appear only once per category: each entry is walked
# as a directory, so a repeated entry would collect its products twice.
categories_dict = {
    "Children": ["action figures", "building toys", "toddler toy", "toy airplanes", "toy cars",
                 "toy dolls", "baby bottle", "baby formula", "baby wipes", "car seat", "crib",
                 "diaper", "nursery", "pacifier", "stroller"],
    "Books": ["adventure novel", "fantasy novel", "mystery novel", "nonfiction novel", "romance novels",
              "science fiction novel", "thriller novel", "young adult novel"],
    "Cleaning Material": ["vacuum", "detergent", "mop", "broom", "dishwasher", "fabric conditioner"],
    "Kitchen": ["utensils", "air fryer", "coffee maker", "frying pan", "kitchen knife", "microwave",
                "oven", "over the counter", "steamer", "stove", "kitchen", "dining room"],
    "Bedroom": ["bedding", "bedroom", "toilet", "playroom", "desk lamp", "iron", "lamp", "bedframe",
                "bookshelf", "cabinet", "desk", "dresser", "nightstand"],
    "Living Room": ["carpet", "home decor", "living room", "ring doorbell", "wall mount", "curtain",
                    "coffee table", "couch", "chair", "furniture", "patio", "shoe rack"],
    "Bathroom": ["bathroom", "mattress", "pillow", "air purifier", "washing machine",
                 "air freshener", "mirror", "linen"],
    "Fashion": ["belt", "cap", "coat", "dress", "face mask", "jewelry", "men bag",
                "men jeans", "men shirt", "men shoes", "men sweater", "socks", "underwear",
                "women bag", "women jeans", "women shirt", "women shoes",
                "women sweater", "workout clothes"],
    "Electronic Devices": ["camera", "headphones", "laptop", "monitor", "smart watch",
                           "speakers", "surveillance camera",
                           "tablet", "television", "videogame console", "wifi router"],
    "Peripheral Devices": ["keyboard", "mouse", "webcam", "microphone", "printer", "projector",
                           "usb", "computer accessories"],
    "Computer Components": ["pc ram", "cpu cooler", "gpu", "hard drive", "intel amd processor",
                            "motherboard", "pc chassis", "pc fan", "pc power supply", "solid state drive"],
    "Mobile Accessories": ["cables", "chargers", "phone case", "screen protector", "tripod"],
    "Personal Care": ["conditioner", "deodorant", "face wash", "facial toner", "feminine wash",
                      "lotion", "makeup", "moisturizer", "mouthwash", "shaving cream", "shampoo", "soap",
                      "tampon", "tissue", "toothbrush", "vitamins"],
    "Car Stuff": ["car accessories", "dash cam", "gps", "ram vehicles", "tires", "garage"],
    "Office Supplies": ["office chair", "folder", "home_office", "notebook", "school supplies", "stationary", "seat cushion"],
    "Travel Essentials": ["first aid", "luggage", "packing cubes", "stanley cup", "travel essentials", "water flask", "portable fan"]
}

In [2]:
import os
import json
import re
import numpy as np
import pandas as pd

In [3]:
def remove_ascii_specials(input_string):
    """Return *input_string* with every non-ASCII character removed.

    Despite the name, ASCII characters (code points 0-127) are the ones that
    are kept; anything above that range is dropped.
    """
    ascii_only = [ch for ch in input_string if ord(ch) <= 0x7F]
    return ''.join(ascii_only)

def remove_special_chars(input_string):
    """Strip punctuation from *input_string* and normalise it.

    Removes every occurrence of the characters
    ``, ! ? . _ - / \\ & % ; : ' "`` and then trims surrounding whitespace
    and lower-cases the result.

    Args:
        input_string: The text to clean.

    Returns:
        The cleaned, stripped, lower-cased string.
    """
    # The ampersand is listed as a plain '&': writing it as '\&' yields the
    # two-character string backslash + '&', which never matches once the
    # backslashes themselves have been removed.
    forbidden = ",!?._-/\\&%;:'\""
    # One translate pass deletes all forbidden characters at once.
    deletion_table = str.maketrans('', '', forbidden)
    return input_string.translate(deletion_table).strip().lower()

def _parse_dollar_amount(text):
    """Return the amount following the first ``$`` in *text*, or None.

    Only the first whitespace-delimited token after the ``$`` is used, and
    thousands separators are removed before conversion. None is returned
    when *text* is not a string, has no ``$``, or the token is not a number.
    """
    if not isinstance(text, str):
        return None
    pieces = text.split("$")
    if len(pieces) < 2:
        return None
    token = pieces[1].split(' ')[0].replace(',', '')
    try:
        return float(token)
    except ValueError:
        return None


def clean_price(x):
    """Convert a raw price value into a float, or NaN when unavailable.

    Args:
        x: A price string such as ``"$1,234.50"``, a list of such strings
            (their mean is returned), or an already numeric value.

    Returns:
        The price as a float. ``np.nan`` is returned when no price can be
        parsed, when a list holds no parsable price, or when the value is
        falsy (``None``, ``""``, ``0``).
    """
    if isinstance(x, list):
        # Average only the entries that actually parse; an empty or fully
        # unparsable list has no price.
        amounts = [amount for amount in map(_parse_dollar_amount, x)
                   if amount is not None]
        if not amounts:
            return np.nan
        x = sum(amounts) / len(amounts)
    elif isinstance(x, str):
        x = _parse_dollar_amount(x)
        if x is None:
            return np.nan
    # Falsy values (None, 0) are treated as a missing price.
    if x:
        return float(x)
    return np.nan

In [4]:
# Build one items CSV per category from the raw per-product JSON extracts.
raw_data_path = "../dataset/extracts/amazon"

# Files whose name contains any of these markers are not product records.
_SKIPPED_FILE_MARKERS = ('sspa.json', '.csv', '_.json', '.txt')


def _load_product_json(path):
    """Return the parsed JSON content of *path*, or None if it cannot be read."""
    try:
        with open(path, encoding="utf-8") as handle:
            return json.load(handle)
    except (OSError, ValueError) as err:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        print(f"Skipping unreadable file {path}: {err}")
        return None


def _build_item_row(data):
    """Flatten one product record into a dict of column name -> value."""
    body = data['body']
    row = {}
    # The ASIN is taken from the last path segment of the product URL.
    row['ASIN'] = data['url'].split('/')[-1]
    row['name'] = body['name'].lower()
    row['price'] = body['price']
    row['merchantInfo'] = remove_special_chars(body['merchantInfo'])
    row['isPrime'] = body['isPrime']
    try:
        # Only the leading token of the review text is used as the rating.
        row['customerReview'] = float(body['customerReview'].split(' ')[0])
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        row['customerReview'] = np.nan
    row['customerReviewCount'] = body['customerReviewCount']

    desc = ""
    manufacturer_desc = ""
    if len(body['description']) > 0:
        desc = remove_special_chars(body['description'])
    if len(body['manufacturerProductDescription']) > 0:
        manufacturer_desc = remove_special_chars(
            body['manufacturerProductDescription'][0]['media_text'])
    row['description'] = desc + ' ' + manufacturer_desc
    row['byLineInfo'] = remove_special_chars(body['byLineInfo']['name'])

    # Each breadcrumb becomes its own indicator column.
    for bc in body['breadCrumbs']:
        row[bc['name']] = 1

    row['features'] = [remove_special_chars(remove_ascii_specials(x))
                       for x in body['features']]

    # Every product-information entry becomes a column named after the entry.
    for info in body['productInformation']:
        feat_name = remove_special_chars(remove_ascii_specials(info['name']))
        feat_value = remove_special_chars(remove_ascii_specials(info['value']))
        if 'best sellers' in feat_name:
            # Keep only the leading rank number, e.g. "#1234 in ..." -> 1234.0
            feat_value = float(feat_value.split(' ')[0].replace("#", ""))
        if 'customer review' not in feat_name and 'asin' not in feat_name:
            row[feat_name] = feat_value
    return row


os.makedirs("clean_data", exist_ok=True)

for category, subcategories in categories_dict.items():
    os.makedirs(f"clean_data/{category}", exist_ok=True)
    items = []
    for subcategory in subcategories:
        walk_dir = raw_data_path + f"/{subcategory}"
        for root, _subdirs, files in os.walk(walk_dir):
            for file in files:
                if any(marker in file for marker in _SKIPPED_FILE_MARKERS):
                    continue
                data = _load_product_json(os.path.join(root, file))
                # Skip unreadable files and records flagged with an error.
                if not isinstance(data, dict) or 'error' in data:
                    continue
                body = data.get('body')
                if not isinstance(body, dict) or 'productInformation' not in body:
                    continue
                items.append(_build_item_row(data))

    if not items:
        # Without any record there is no 'price' column to clean.
        print(f"No product records found for category {category}")
        continue

    df = pd.DataFrame(items)
    # Defensive: drop any row that holds no values at all.
    df.dropna(how='all', inplace=True)
    df['price'] = df['price'].apply(clean_price)
    df.to_csv(f"clean_data/{category}/items_{category}.csv")