# Electronics Dataset Preprocessing

In [None]:
import json
import pandas as pd
from collections import Counter, defaultdict
import gzip
from pathlib import Path
import numpy as np
from tqdm import tqdm

# Set paths (relative to the notebook's working directory)
DATA_DIR = Path("../data")
REVIEWS_FILE = DATA_DIR / "Electronics.jsonl"
META_FILE = DATA_DIR / "meta_Electronics.jsonl"
OUTPUT_DIR = DATA_DIR / "processed"
# parents=True so a missing DATA_DIR does not abort the notebook with
# FileNotFoundError before any processing has started.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Configuration
TARGET_PRODUCTS = 1000  # Maximum number of products to keep
MIN_REVIEWS_PER_PRODUCT = 10  # Minimum reviews to be considered "popular"


## Step 1: Count Reviews per Product

In [None]:
def count_reviews_per_product(reviews_file):
    """Count the number of reviews for each product (parent_asin).

    Streams the file one line at a time, so memory use is driven by the
    number of distinct products rather than by the number of reviews.

    Args:
        reviews_file: Path to a JSON Lines file with one review object per
            line. A name ending in ".gz" is opened with gzip in text mode.

    Returns:
        Counter mapping parent_asin -> number of reviews seen for it.
        Reviews without a truthy 'parent_asin' are not counted.
    """
    print(f"Counting reviews per product from {reviews_file}...")

    review_counts = Counter()
    total_reviews = 0

    # Handle both .jsonl and .jsonl.gz files
    if str(reviews_file).endswith('.gz'):
        file_opener = gzip.open
        mode = 'rt'
    else:
        file_opener = open
        mode = 'r'

    with file_opener(reviews_file, mode, encoding='utf-8') as f:
        for line_num, line in enumerate(tqdm(f, desc="Processing reviews")):
            if line_num % 100000 == 0 and line_num > 0:
                print(f"Processed {line_num:,} reviews, found {len(review_counts):,} unique products")

            line = line.strip()
            if not line:
                # A blank line (e.g. a trailing newline) is not a malformed record.
                continue

            # Keep the try body minimal: only the parse can raise JSONDecodeError.
            try:
                review = json.loads(line)
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON at line {line_num + 1}")
                continue

            if not isinstance(review, dict):
                # Valid JSON that is not an object (null, a list, ...) has no
                # fields to read; skip it instead of failing on .get().
                print(f"Skipping non-object JSON at line {line_num + 1}")
                continue

            parent_asin = review.get('parent_asin')
            if parent_asin:
                review_counts[parent_asin] += 1
                total_reviews += 1

    print(f"\nTotal reviews processed: {total_reviews:,}")
    print(f"Unique products found: {len(review_counts):,}")

    return review_counts

# First full pass over the reviews file: tally reviews per parent_asin
review_counts = count_reviews_per_product(REVIEWS_FILE)


In [None]:
def select_top_products(review_counts, target_count=1000, min_reviews=10):
    """Select the most-reviewed products that meet a minimum review count.

    Args:
        review_counts: Mapping of product id (parent_asin) -> review count,
            e.g. the Counter returned by count_reviews_per_product.
        target_count: Maximum number of products to return.
        min_reviews: Products with fewer reviews than this are excluded.

    Returns:
        dict mapping product id -> review count with at most target_count
        entries, ordered from most to least reviewed. Empty when no product
        qualifies.
    """
    print(f"\nSelecting top {target_count} products with at least {min_reviews} reviews...")

    # Filter products with minimum review count
    filtered_products = {k: v for k, v in review_counts.items() if v >= min_reviews}
    print(f"Products with ≥{min_reviews} reviews: {len(filtered_products):,}")

    # Rank only the products that passed the filter. Ranking the unfiltered
    # counts would let products below min_reviews into the selection whenever
    # fewer than target_count products qualify.
    top_products = dict(Counter(filtered_products).most_common(target_count))

    print(f"\nSelected {len(top_products)} products")
    if not top_products:
        # min()/max() and the percentile calls below are undefined on empty input.
        print("No products satisfy the selection criteria")
        return top_products

    print(f"Review count range: {min(top_products.values())} - {max(top_products.values())}")

    # Show distribution
    counts = list(top_products.values())
    print("\nReview count statistics:")
    print(f"  Mean: {np.mean(counts):.1f}")
    print(f"  Median: {np.median(counts):.1f}")
    print(f"  75th percentile: {np.percentile(counts, 75):.1f}")
    print(f"  90th percentile: {np.percentile(counts, 90):.1f}")

    return top_products

# Pick the most-reviewed products, then keep their ids in a set for fast
# membership tests in the later passes over the data files.
top_products = select_top_products(
    review_counts,
    target_count=TARGET_PRODUCTS,
    min_reviews=MIN_REVIEWS_PER_PRODUCT,
)
selected_parent_asins = set(top_products)


In [None]:
def extract_product_metadata(meta_file, selected_asins, review_counts=None):
    """Extract metadata records for the selected products.

    Args:
        meta_file: Path to a JSON Lines metadata file with one product object
            per line. A name ending in ".gz" is opened with gzip in text mode.
        selected_asins: Collection of parent_asin values to keep.
        review_counts: Mapping parent_asin -> review count, stored on every
            returned record under 'review_count'. When omitted, the
            notebook-level `top_products` is used, which keeps existing
            two-argument calls working.

    Returns:
        List of product dicts, in file order, each extended with a
        'review_count' key.

    Raises:
        KeyError: If a selected parent_asin is missing from review_counts.
    """
    if review_counts is None:
        # Backward-compatible fallback to the notebook global.
        review_counts = top_products

    print(f"\nExtracting metadata for {len(selected_asins)} products...")

    products_metadata = []
    found_count = 0

    # Handle both .jsonl and .jsonl.gz files
    if str(meta_file).endswith('.gz'):
        file_opener = gzip.open
        mode = 'rt'
    else:
        file_opener = open
        mode = 'r'

    with file_opener(meta_file, mode, encoding='utf-8') as f:
        for line_num, line in enumerate(tqdm(f, desc="Processing metadata")):
            line = line.strip()
            if not line:
                # A blank line is not a malformed record.
                continue

            # Keep the try body minimal: only the parse can raise JSONDecodeError.
            try:
                product = json.loads(line)
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON at line {line_num + 1}")
                continue

            if not isinstance(product, dict):
                # Valid JSON that is not an object has no fields to read.
                print(f"Skipping non-object JSON at line {line_num + 1}")
                continue

            parent_asin = product.get('parent_asin')
            if parent_asin not in selected_asins:
                continue

            # Add review count to metadata
            product['review_count'] = review_counts[parent_asin]
            products_metadata.append(product)
            found_count += 1

            if found_count % 100 == 0:
                print(f"Found metadata for {found_count}/{len(selected_asins)} products")

    print(f"\nFound metadata for {found_count}/{len(selected_asins)} products")
    return products_metadata

# Pass the counts explicitly instead of relying on the global lookup
products_metadata = extract_product_metadata(
    META_FILE, selected_parent_asins, review_counts=top_products
)


In [None]:
def extract_sample_reviews(reviews_file, selected_asins, max_reviews_per_product=20):
    """Collect up to max_reviews_per_product reviews for each selected product.

    The reviews kept are the first ones encountered in file order, not a
    random sample. Scanning stops as soon as every selected product has
    reached its cap, which avoids reading the rest of a large file.

    Args:
        reviews_file: Path to a JSON Lines file with one review object per
            line. A name ending in ".gz" is opened with gzip in text mode.
        selected_asins: Sized collection of parent_asin values to keep.
        max_reviews_per_product: Cap on the reviews stored per product.

    Returns:
        dict mapping parent_asin -> list of trimmed review dicts with the
        keys 'asin', 'parent_asin', 'rating', 'title', 'text', 'timestamp',
        'verified_purchase' and 'helpful_vote'.
    """
    print(f"\nExtracting sample reviews (max {max_reviews_per_product} per product)...")

    product_reviews = defaultdict(list)
    total_extracted = 0
    products_wanted = len(selected_asins)
    products_full = 0  # selected products that have reached the cap

    # Handle both .jsonl and .jsonl.gz files
    if str(reviews_file).endswith('.gz'):
        file_opener = gzip.open
        mode = 'rt'
    else:
        file_opener = open
        mode = 'r'

    with file_opener(reviews_file, mode, encoding='utf-8') as f:
        for line_num, line in enumerate(tqdm(f, desc="Processing reviews")):
            line = line.strip()
            if not line:
                # A blank line is not a malformed record.
                continue

            # Keep the try body minimal: only the parse can raise JSONDecodeError.
            try:
                review = json.loads(line)
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON at line {line_num + 1}")
                continue

            if not isinstance(review, dict):
                # Valid JSON that is not an object has no fields to read.
                print(f"Skipping non-object JSON at line {line_num + 1}")
                continue

            parent_asin = review.get('parent_asin')
            if parent_asin not in selected_asins:
                continue

            collected = product_reviews[parent_asin]
            if len(collected) >= max_reviews_per_product:
                continue

            # Only keep essential review fields
            collected.append({
                'asin': review.get('asin'),
                'parent_asin': parent_asin,
                'rating': review.get('rating'),
                'title': review.get('title', ''),
                'text': review.get('text', ''),
                'timestamp': review.get('timestamp'),
                'verified_purchase': review.get('verified_purchase'),
                'helpful_vote': review.get('helpful_vote', 0)
            })
            total_extracted += 1

            if total_extracted % 1000 == 0:
                print(f"Extracted {total_extracted} reviews for {len(product_reviews)} products")

            if len(collected) == max_reviews_per_product:
                products_full += 1
                if products_full == products_wanted:
                    # Nothing left to collect; skip the remainder of the file.
                    break

    print(f"\nExtracted {total_extracted} reviews for {len(product_reviews)} products")
    return dict(product_reviews)

sample_reviews = extract_sample_reviews(REVIEWS_FILE, selected_parent_asins, max_reviews_per_product=20)


In [None]:
# Convert to DataFrame for analysis
df_products = pd.DataFrame(products_metadata)

print("=== DATASET SUMMARY ===")
print(f"Total products: {len(df_products)}")
print(f"Total reviews extracted: {sum(len(reviews) for reviews in sample_reviews.values())}")

print("\n=== PRODUCT METADATA FIELDS ===")
print(f"Available fields: {list(df_products.columns)}")

print("\n=== REVIEW COUNT DISTRIBUTION ===")
if 'review_count' in df_products.columns:
    print(df_products['review_count'].describe())

print("\n=== PRICE DISTRIBUTION ===")
if 'price' in df_products.columns:
    # Clean price data (remove nulls and convert to numeric)
    prices = pd.to_numeric(df_products['price'], errors='coerce').dropna()
    print(f"Products with price info: {len(prices)}/{len(df_products)}")
    if len(prices) > 0:
        print(prices.describe())

print("\n=== RATING DISTRIBUTION ===")
if 'average_rating' in df_products.columns:
    ratings = pd.to_numeric(df_products['average_rating'], errors='coerce').dropna()
    print(f"Products with rating info: {len(ratings)}/{len(df_products)}")
    if len(ratings) > 0:
        print(ratings.describe())

print("\n=== TOP 10 MOST REVIEWED PRODUCTS ===")
if 'review_count' in df_products.columns and 'title' in df_products.columns:
    # Select only the columns that are printed. Also asking for
    # 'average_rating' and 'price' would raise KeyError whenever the metadata
    # lacks them, even though neither value is shown.
    top_10 = df_products.nlargest(10, 'review_count')[['title', 'review_count']]
    for idx, row in top_10.iterrows():
        # A missing title arrives as None/NaN, which cannot be sliced.
        title = row['title'] if isinstance(row['title'], str) else ''
        print(f"{row['review_count']:,} reviews - {title[:80]}...")


In [None]:
# Write the selected product metadata, one JSON object per line
products_file = OUTPUT_DIR / "electronics_top1000_products.jsonl"
with open(products_file, 'w', encoding='utf-8') as out:
    out.writelines(
        json.dumps(product, ensure_ascii=False) + '\n'
        for product in products_metadata
    )

print(f"Saved {len(products_metadata)} products to {products_file}")

# Flatten the per-product review lists into a single JSON Lines file
reviews_file = OUTPUT_DIR / "electronics_top1000_reviews.jsonl"
total_reviews_saved = 0
with open(reviews_file, 'w', encoding='utf-8') as out:
    for reviews in sample_reviews.values():
        for review in reviews:
            out.write(json.dumps(review, ensure_ascii=False))
            out.write('\n')
            total_reviews_saved += 1

print(f"Saved {total_reviews_saved} reviews to {reviews_file}")

# Assemble the summary piece by piece; key order matches the written JSON
selection_criteria = {
    'target_products': TARGET_PRODUCTS,
    'min_reviews_per_product': MIN_REVIEWS_PER_PRODUCT,
    'max_reviews_per_product': 20,
}
dataset_info = {
    'source': 'Amazon Reviews 2023 - Electronics Category',
    'citation': 'Hou et al. (2024) - Bridging Language and Items for Retrieval and Recommendation (arXiv:2403.03952)',
    'processing_date': pd.Timestamp.now().isoformat(),
    'selection_criteria': selection_criteria,
}
statistics = {
    'total_products': len(products_metadata),
    'total_reviews': total_reviews_saved,
    'products_with_metadata': len(products_metadata),
    'products_with_reviews': len(sample_reviews),
}

# Review-count statistics are only available when the column exists
if 'review_count' in df_products.columns:
    review_count_column = df_products['review_count']
    statistics['review_count_stats'] = {
        'min': int(review_count_column.min()),
        'max': int(review_count_column.max()),
        'mean': float(review_count_column.mean()),
        'median': float(review_count_column.median()),
    }

summary = {'dataset_info': dataset_info, 'statistics': statistics}

summary_file = OUTPUT_DIR / "dataset_summary.json"
with open(summary_file, 'w', encoding='utf-8') as out:
    json.dump(summary, out, indent=2, ensure_ascii=False)

print(f"Saved dataset summary to {summary_file}")

print("\n=== PROCESSING COMPLETE ===")
print(f"Output files created in: {OUTPUT_DIR}")
for written_file, label in (
    (products_file, "Product metadata"),
    (reviews_file, "Sample reviews"),
    (summary_file, "Dataset summary"),
):
    print(f"  - {written_file.name}: {label}")


In [None]:
def _join_text(value):
    """Collapse a text field (a string or a list of strings) into one string."""
    if not value:
        return ''
    if isinstance(value, str):
        # Joining a bare string would insert a space between every character.
        return value
    return ' '.join(str(part) for part in value)


def _split_by_sentiment(reviews):
    """Return (positive, negative) reviews: rating >= 4 and rating <= 2."""
    positive, negative = [], []
    for review in reviews:
        rating = review.get('rating')
        if rating is None:
            # An unrated review is neither positive nor negative; comparing
            # None with a number would raise TypeError.
            continue
        if rating >= 4:
            positive.append(review)
        elif rating <= 2:
            negative.append(review)
    return positive, negative


def create_rag_documents(products_metadata, sample_reviews):
    """Create documents optimized for RAG retrieval.

    Every product yields one 'product' document. A product whose parent_asin
    is a key of sample_reviews additionally yields one 'review_summary'
    document placed directly after it.

    Args:
        products_metadata: Iterable of product dicts. The keys read are
            'parent_asin', 'title', 'description', 'features', 'price',
            'average_rating', 'rating_number', 'review_count', 'store',
            'categories' and 'details'; all are optional.
        sample_reviews: Mapping parent_asin -> list of review dicts, of which
            'rating' and 'text' are read.

    Returns:
        List of document dicts. Each has 'id', 'type', 'parent_asin' and a
        'content' string holding the searchable text.
    """
    rag_documents = []

    for product in products_metadata:
        parent_asin = product.get('parent_asin')

        # Create product document
        doc = {
            'id': f"product_{parent_asin}",
            'type': 'product',
            'parent_asin': parent_asin,
            'title': product.get('title', ''),
            'description': _join_text(product.get('description')),
            'features': _join_text(product.get('features')),
            'price': product.get('price'),
            'average_rating': product.get('average_rating'),
            'rating_number': product.get('rating_number'),
            'review_count': product.get('review_count'),
            'store': product.get('store', ''),
            'categories': product.get('categories', []),
            'details': product.get('details', {})
        }

        # Create searchable text content, skipping empty fields
        labelled_fields = (
            ("Product", doc['title']),
            ("Description", doc['description']),
            ("Features", doc['features']),
            ("Store", doc['store']),
        )
        content_parts = [f"{label}: {text}" for label, text in labelled_fields if text]
        if doc['categories']:
            content_parts.append(f"Categories: {' > '.join(doc['categories'])}")

        doc['content'] = ' '.join(content_parts)
        rag_documents.append(doc)

        # Add review summaries
        if parent_asin not in sample_reviews:
            continue

        reviews = sample_reviews[parent_asin]
        positive_reviews, negative_reviews = _split_by_sentiment(reviews)

        review_summary = {
            'id': f"reviews_{parent_asin}",
            'type': 'review_summary',
            'parent_asin': parent_asin,
            'product_title': doc['title'],
            'total_reviews': len(reviews),
            'positive_reviews': len(positive_reviews),
            'negative_reviews': len(negative_reviews)
        }

        # Sample review texts: look at the first five reviews of each kind,
        # drop those without text, and quote at most three.
        pos_texts = [r.get('text', '') for r in positive_reviews[:5] if r.get('text')]
        neg_texts = [r.get('text', '') for r in negative_reviews[:5] if r.get('text')]

        summary_parts = [f"Reviews for {doc['title']}"]
        if pos_texts:
            summary_parts.append(f"Positive feedback: {' '.join(pos_texts[:3])}")
        if neg_texts:
            summary_parts.append(f"Critical feedback: {' '.join(neg_texts[:3])}")

        review_summary['content'] = ' '.join(summary_parts)
        rag_documents.append(review_summary)

    return rag_documents

# Build the retrieval documents from the selected products and their reviews
rag_documents = create_rag_documents(products_metadata, sample_reviews)

# Persist them as JSON Lines, one document per line
rag_file = OUTPUT_DIR / "electronics_rag_documents.jsonl"
with open(rag_file, 'w', encoding='utf-8') as out:
    out.writelines(
        json.dumps(entry, ensure_ascii=False) + '\n'
        for entry in rag_documents
    )

document_type_counts = Counter(entry['type'] for entry in rag_documents)
print(f"Created {len(rag_documents)} RAG documents saved to {rag_file}")
print(f"Document types: {document_type_counts}")
