In [3]:
!pip install nanoid

Defaulting to user installation because normal site-packages is not writeable
Collecting nanoid
  Using cached nanoid-2.0.0-py3-none-any.whl.metadata (3.5 kB)
Using cached nanoid-2.0.0-py3-none-any.whl (5.8 kB)
Installing collected packages: nanoid
Successfully installed nanoid-2.0.0


In [6]:
import csv
import json
import os
import re
import ast
import pandas as pd
from nanoid import generate

def get_uuid():
    alphanum = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
    nano_id = generate(alphanum, size=4)
    return nano_id

# Define folder paths
input_folder = "input"
output_folder = "output"
vendor_file = "amazon_vendor.csv"
l4_file = "l4_collection.json"

# Ensure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Function to format category names
def format_category_name(name):
    """Formats category names: lowercase, removes special chars, and replaces spaces with hyphens."""
    name = name.lower().replace("'", "").replace("&", "and").replace(",", "")
    name = re.sub(r'\s+', '-', name.strip())
    return name


# Load L4 collection data
l4_lookup = {}
l4_df = pd.read_json(os.path.join(input_folder, l4_file))
for index, row in l4_df.iterrows():
    # print(row)
    l4_lookup[format_category_name(str(row["category"]))] = row




# Load vendor data into a set for fast lookup
vendor_data_lookup = set()
vendor_data = {}
vendor_data_df = pd.read_csv(os.path.join(input_folder, vendor_file))
for index, row in vendor_data_df.iterrows():
    categories = [
        str(row["l1_cat"]), str(row["l2_cat"]), str(row["l3_cat"]), str(row["l4_cat"]),
        str(row["l5_cat"]), str(row["l6_cat"]), str(row["l7_cat"]), str(row["l8_cat"]),
        str(row["l9_cat"]), str(row["l10_cat"]), str(row["l11_cat"])
    ]

    categories = [val for val in categories if val != 'nan']

    key = ', '.join(categories)
    vendor_data_lookup.add(key)
    vendor_data[key] = row





output_data = []
l4_output_data = []

# Process each CSV file in the input folder
for file_name in os.listdir(input_folder):
    if file_name.endswith(".csv") and file_name not in [vendor_file]:  # Ignore vendor file
        input_path = os.path.join(input_folder, file_name)
        output_path = os.path.join(output_folder, f"{os.path.splitext(file_name)[0]}.json")

        
        with open(input_path, "r", encoding="utf-8") as f:
            reader = csv.DictReader(f)
            for row in reader:
                amazon_merged_entry_sd = row.get("Amazon Merged  Entry", "").strip()
              
                amazon_merged_entry_for_this_row = []

                try:
                  amazon_merged_entry_for_this_row = ast.literal_eval(amazon_merged_entry_sd)
        
                except:
                  continue

                l4_sd = format_category_name(row.get("L4", "").strip())
                l4_object = l4_lookup[l4_sd]

                amazon_ids_against_l4 = []

                for amazon_merged_entry in amazon_merged_entry_for_this_row:
                  if amazon_merged_entry in vendor_data_lookup:
                    amazon_data = vendor_data[amazon_merged_entry]
                    new_id = get_uuid()

                    output_entry = {
                        "id": new_id,
                        "l1_cat": str(amazon_data["l1_cat"]),
                        "l2_cat": str(amazon_data["l2_cat"]),
                        "l3_cat": str(amazon_data["l3_cat"]),
                        "l4_cat": str(amazon_data["l4_cat"]),
                        "l5_cat": str(amazon_data["l5_cat"]),
                        "l6_cat": str(amazon_data["l6_cat"]),
                        "l7_cat": str(amazon_data["l7_cat"]),
                        "l8_cat": str(amazon_data["l8_cat"]),
                        "l9_cat": str(amazon_data["l9_cat"]),
                        "l10_cat": str(amazon_data["l10_cat"]),
                        "l11_cat": str(amazon_data["l11_cat"]),
                        "sd_l4_id": str(l4_object["id"])
                    }

                    cleaned_output_entry = {k: v for k, v in output_entry.items() if v.lower() != "nan"}
                    output_data.append(cleaned_output_entry)
                    amazon_ids_against_l4.append(new_id)

                l4_output_entry = {
                    "id": str(l4_object["id"]),
                    "l1_id": str(l4_object["l1_id"]),
                    "l2_id": str(l4_object["l2_id"]),
                    "l3_id": str(l4_object["l3_id"]),
                    "category": str(l4_object["category"]),
                    "amazon_category_ids": amazon_ids_against_l4 
                }
                l4_output_data.append(l4_output_entry)

    print(f"Processed {file_name}")

# Save output JSON
with open(os.path.join(output_folder, "amazon_categories.json"), "w", encoding="utf-8") as f:
    json.dump(output_data, f, indent=4)

# Save output JSON
with open(os.path.join(output_folder, "l4_output.json"), "w", encoding="utf-8") as f:
    json.dump(l4_output_data, f, indent=4)

Processed amazon_vendor.csv
Processed beauty_and_personal_care.csv
Processed books_movies.csv
Processed l4_collection.json
