In [1]:
# Import required libraries: pandas for DataFrame handling, json for parsing JSON lines
import pandas as pd
import json

# Function to load .jsonl files (JSON Lines format: one JSON object per line)
def load_jsonl(file_path):
    """Load a JSON Lines file into a pandas DataFrame.

    Args:
        file_path: Path to a text file holding one JSON object per line.

    Returns:
        A DataFrame built from the parsed records, in file order. Blank
        (whitespace-only) lines are ignored.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which can fail on or garble non-ASCII text (e.g. on Windows).
    with open(file_path, 'r', encoding='utf-8') as f:
        # Skip blank lines (such as an extra newline at end of file), which
        # would otherwise make json.loads raise.
        return pd.DataFrame([json.loads(line) for line in f if line.strip()])

# Read the review and product-metadata JSON Lines files into DataFrames
reviews_df = load_jsonl("All_Beauty.jsonl")
meta_df = load_jsonl("meta_All_Beauty.jsonl")

# Report (rows, columns) of each frame as a quick sanity check
print("Loaded Reviews:", reviews_df.shape)
print("Loaded Metadata:", meta_df.shape)

# Give the listed metadata columns a 'product_' prefix so product fields stay
# distinguishable from review fields once the two frames are merged
meta_df_renamed = meta_df.rename(columns={
    name: 'product_' + name
    for name in (
        'title', 'images', 'description', 'features', 'price',
        'store', 'details', 'categories', 'videos',
    )
})

# Stop early with a clear message when either frame lacks the join key
if (
    'parent_asin' not in reviews_df.columns
    or 'parent_asin' not in meta_df_renamed.columns
):
    raise KeyError(" 'parent_asin' not found in both DataFrames.")

# Left join on 'parent_asin': every review row is kept, and product metadata
# is attached wherever a matching key exists
merged_df = reviews_df.merge(meta_df_renamed, on='parent_asin', how='left')
print("Merged Successfully:", merged_df.shape)

# Write the merged frame to CSV (without the index column) for downstream
# ingestion (e.g., Bronze layer copy)
merged_df.to_csv("All_Beauty_Merged_Clean.csv", index=False)
print("Merged and cleaned CSV saved as: All_Beauty_Merged_Clean.csv")


Loaded Reviews: (701528, 10)
Loaded Metadata: (112590, 14)
Merged Successfully: (701528, 23)
Merged and cleaned CSV saved as: All_Beauty_Merged_Clean.csv
