# Cleanup Scripts
1. Use this to clean up filenames before pushing to GitHub so that the files are ingestible on Windows.
2. Use this to manually create the item-type CSV files in case the crawling process gets interrupted for some reason.
3. Count the number of items so far in the dataset.

In [2]:
import os
import json
import pandas as pd

In [3]:
# Tally the product files collected so far: every file under walk_dir whose
# name contains '.json', except those whose name contains 'sspa'.
walk_dir = 'extracts/amazon'
cnt = sum(
    1
    for _root, _subdirs, names in os.walk(walk_dir)
    for name in names
    if 'sspa' not in name and '.json' in name
)
cnt

30888

In [6]:
# Manually create the CSV file for one item type.
# Use this in case the crawler process gets interrupted before the CSV exists.
#
# Walks extracts/amazon/<item_name>, reads each product JSON file, builds one
# row per product, and writes the rows to amazon_<item_name>_manual.csv in the
# same directory. Products without a price, customerReview or
# customerReviewCount are left out of the CSV.
columns = [
    "url", "name", "asin",
    "image", "price", "isPrime",
    "offer", "customerReview", "customerReviewCount"
]
# NOTE(review): "isPrime" is listed above but never filled in below, so that
# column is empty in the generated CSV -- confirm whether this is intended.

# Everything that subscripting parsed JSON can raise: a missing key, a list
# that is too short, or a value of an unexpected type (e.g. None, not a dict).
# Catching only these lets Ctrl-C and genuine bugs propagate instead of being
# silently swallowed.
_LOOKUP_ERRORS = (KeyError, IndexError, TypeError)

item_name = 'pc chassis'
walk_dir = f'extracts/amazon/{item_name}'
product_data = []
for root, subdirs, files in os.walk(walk_dir):
    for file in files:
        # Skip 'sspa.json' / '_.json' files, CSVs (including earlier output of
        # this cell) and text files; everything else is parsed as product JSON.
        if 'sspa.json' in file or '.csv' in file or '_.json' in file or '.txt' in file:
            continue

        # The context manager closes the file even when json.load raises.
        with open(os.path.join(root, file)) as f:
            data = json.load(f)

        row = {}
        row['url'] = data['url']
        try:
            row['asin'] = data['body']['productInformation'][2]['value']
        except _LOOKUP_ERRORS:
            # Fall back to the last path segment of the product URL.
            row['asin'] = row['url'].split('/')[-1]
        try:
            row['name'] = data['body']['name']
        except _LOOKUP_ERRORS:
            row['name'] = row['asin']
        try:
            row['image'] = data['body']['mainImage']
        except _LOOKUP_ERRORS:
            row['image'] = None
        try:
            row['price'] = data['body']['price']
        except _LOOKUP_ERRORS:
            continue  # no price: drop this product
        try:
            row['offer'] = data['offer']
        except _LOOKUP_ERRORS:
            row['offer'] = None
        try:
            row['customerReview'] = data['body']['customerReview']
        except _LOOKUP_ERRORS:
            continue  # no review score: drop this product
        try:
            row['customerReviewCount'] = data['body']['customerReviewCount']
        except _LOOKUP_ERRORS:
            continue  # no review count: drop this product
        product_data.append(row)
df = pd.DataFrame(product_data, columns=columns)
df.to_csv(f"extracts/amazon/{item_name}/amazon_{item_name}_manual.csv")
df.shape

(226, 9)

In [67]:
# clean up filenames and make them ingestible on Windows
def windows_safe_name(file):
    """Return the cleaned-up name for `file`, or None if it is not handled.

    The stem is everything before the FIRST dot in the name; anything between
    the first dot and the extension is dropped.

    * names containing '.json': stem cut to its first 30 characters + '.json'
    * names containing '.txt' or '.csv': stem with ':' and '-' removed and
      spaces turned into '_', followed by that extension

    The checks run in the order json, txt, csv and the first match wins, so a
    file is never renamed more than once.
    """
    stem = file.split('.')[0]
    if '.json' in file:
        return stem[:30] + '.json'  # just take first 30 characters
    for ext in ('.txt', '.csv'):
        if ext in file:
            return stem.replace(':', '').replace('-', '').replace(' ', '_') + ext
    return None


def clean_filenames(walk_dir):
    """Rename every handled file below `walk_dir` to its cleaned-up name.

    A file is left untouched when its name is already clean, or when the
    cleaned-up name is already taken by another file in the same directory.
    The latter is reported and skipped because os.rename would otherwise
    silently replace the existing file on POSIX systems, e.g. when two long
    JSON names share their first 30 characters.
    """
    for root, _subdirs, files in os.walk(walk_dir):
        for file in files:
            new_name = windows_safe_name(file)
            if new_name is None or new_name == file:
                continue
            src = os.path.join(root, file)
            dst = os.path.join(root, new_name)
            if os.path.exists(dst):
                print(f'skipping {src}: {dst} already exists')
                continue
            os.rename(src, dst)


walk_dir = 'extracts/amazon'
clean_filenames(walk_dir)