## 1. Import Libraries

In [2]:
import json
import os
import re
import time
from datetime import datetime
from pathlib import Path
from xml.sax.saxutils import escape

from dotenv import load_dotenv
from openai import OpenAI

## 2. Configure OpenAI API Key

In [3]:
# The notebook lives in notebooks/, so the project root is the parent of
# the current working directory.
project_root = Path.cwd().resolve().parent

# Read variables from the project-level .env file (explicit path, no search)
load_dotenv(project_root / ".env")

# Pull the key from the environment and fail fast when it is missing or empty
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("OPENAI_API_KEY not found in .env")
print("API key loaded successfully")

# Shared OpenAI client used by the generation cells below
client = OpenAI(api_key=api_key)
print("OpenAI client initialized")


API key loaded successfully
OpenAI client initialized


## 3. Load Product Summary Data

In [4]:
import json

# Read the per-category product summary JSON into memory
with open('../datasets/processed/product_summary_data.json', mode='r', encoding='utf-8') as data_file:
    summary_data = json.load(data_file)

print(f"✓ Loaded data for {len(summary_data)} categories")
print("\nCategories:")

# The product list may be stored under 'top_products' or 'top_3_products'
for entry in summary_data:
    label = entry['category']
    if 'top_products' in entry:
        listed = entry['top_products']
    else:
        listed = entry.get('top_3_products', [])
    print(f"  - {label}: {len(listed)} products")

✓ Loaded data for 5 categories

Categories:
  - Computers and electronics: 3 products
  - Amazon devices & accessories: 3 products
  - Entertainment Appliances: 3 products
  - Kids Toys & kids entertainment: 3 products
  - Batteries and household essentials: 2 products


## 4. Helper Functions for Prompt Generation

In [5]:
# Formats product details into readable text for the LLM.
def format_product_info(product, rank):
    """
    Render one ranked product as a plain-text block for the prompt.

    Args:
        product: Dict providing 'name', 'avg_rating', 'total_reviews',
            'quality_score', 'positive_pct', 'negative_pct' and
            'neutral_pct'. 'positive_reviews' and 'negative_reviews'
            are optional lists of review strings.
        rank: Number shown in the "Product N" heading.

    Returns:
        Multi-line string with the product statistics, followed by up to
        three positive and up to three negative review quotes.
    """
    stat_lines = [
        "",
        f"Product {rank}: {product['name']}",
        f"- Average Rating: {product['avg_rating']}/5.0",
        f"- Total Reviews: {product['total_reviews']:,}",
        f"- Quality Score: {product['quality_score']:.2f}/5.0",
        (
            f"- Sentiment Distribution: {product['positive_pct']:.1f}% positive, "
            f"{product['negative_pct']:.1f}% negative, "
            f"{product['neutral_pct']:.1f}% neutral"
        ),
        "",
    ]
    text = "\n".join(stat_lines)

    # Each review section is emitted only when that list is present and non-empty
    for heading, key in (
        ("\nPositive Review Examples:\n", 'positive_reviews'),
        ("\nNegative Review Examples:\n", 'negative_reviews'),
    ):
        if not product.get(key):
            continue
        text += heading
        text += "".join(
            f'  {number}. "{quote}"\n'
            for number, quote in enumerate(product[key][:3], 1)
        )

    return text


def format_worst_product_info(product):
    """
    Render the lowest-ranked product as a "Product to Avoid" text block.

    Args:
        product: Dict providing 'name', 'avg_rating', 'total_reviews',
            'quality_score' and 'negative_pct'. 'negative_reviews' is an
            optional list of review strings.

    Returns:
        Multi-line string with the product statistics, followed by up to
        five negative review quotes under "Common Complaints".
    """
    header = "\n".join([
        "",
        f"Product to Avoid: {product['name']}",
        f"- Average Rating: {product['avg_rating']}/5.0",
        f"- Total Reviews: {product['total_reviews']:,}",
        f"- Quality Score: {product['quality_score']:.2f}/5.0",
        f"- Negative Reviews: {product['negative_pct']:.1f}%",
        "",
    ])

    # Without complaints there is nothing to append after the statistics
    if not product.get('negative_reviews'):
        return header

    complaints = "".join(
        f'  {number}. "{quote}"\n'
        for number, quote in enumerate(product['negative_reviews'][:5], 1)
    )
    return header + "\nCommon Complaints:\n" + complaints


def create_summarization_prompt(category_data):
    """
    Assemble the full meta-review prompt for one product category.

    Args:
        category_data: Dict with a 'category' name and a product list under
            'top_products' (or 'top_3_products'). Optional keys:
            'worst_product' (dict) and 'category_metadata' (dict that may
            carry 'total_products' and 'top_label').

    Returns:
        The prompt text: article requirements, the formatted products,
        per-product guidance, either a "Product to Avoid" section or a
        short note, and the final writing guidelines.
    """
    category = category_data['category']
    if 'top_products' in category_data:
        ranked = category_data['top_products']
    else:
        ranked = category_data.get('top_3_products', [])
    to_avoid = category_data.get('worst_product')

    # Optional metadata overrides the defaults derived from the product list
    meta = category_data.get('category_metadata', {})
    total_products = meta.get('total_products', len(ranked))
    top_label = meta.get('top_label', f'Top {len(ranked)}')

    pieces = [f"""You are a professional product reviewer writing for a consumer electronics blog. Write a comprehensive, engaging article about products in the {category} category on Amazon.

# Article Requirements:
1. Write in a conversational, helpful tone (like Wirecutter or The Verge)
2. Be specific and cite actual review examples
3. Compare and contrast the products clearly
4. Provide actionable recommendations
5. Length: 600-800 words

# Structure:
## Introduction
- Brief overview of the {category} category
- Why these products matter to consumers

## {top_label}
"""]

    # One formatted block per product, each followed by a blank line
    for position, item in enumerate(ranked, 1):
        pieces.append(format_product_info(item, position))
        pieces.append("\n")

    pieces.append("""
### For Each Product:
- Summarize what customers love (based on positive reviews)
- Highlight key features and strengths
- Mention common complaints (based on negative reviews)
- Who should buy this product?

## Key Differences
- Create a clear comparison highlighting what makes each product unique
- Help readers choose based on their needs

## Common Issues Across Products
- Aggregate the top complaints mentioned in negative reviews
- Provide context and severity of each issue
""")

    # Either describe the product to avoid or explain why there is none
    if to_avoid:
        pieces.append("\n## Product to Avoid\n")
        pieces.append(format_worst_product_info(to_avoid))
        pieces.append("\n- Explain why this product underperforms based on the complaints\n")
    elif total_products <= 3:
        pieces.append("\n## Note")
        pieces.append(f"\n- This category has limited options ({total_products} products)\n")
        pieces.append("- All available products are included above\n")
    else:
        pieces.append("\n## Note")
        pieces.append("\n- All products in this category perform reasonably well\n")

    pieces.append("""
## Final Recommendation
- Provide a clear "best overall" pick
- Suggest "best value" or "best for specific use case" alternatives
- End with a confident recommendation

# Writing Guidelines:
- Use direct quotes from actual reviews (provided above)
- Be balanced - mention both pros and cons
- Use headers and bullet points for readability
- Write product names in bold on first mention
- Include specific numbers (ratings, review counts) to build credibility
- Avoid marketing language - be honest and helpful
""")

    return "".join(pieces)

# Defining functions produces no output, so print a confirmation for the reader
print("✓ Helper functions defined")

✓ Helper functions defined


## 6. Test Prompt Generation (Optional)

In [6]:
# Build the prompt for the first category only, as a quick visual check
test_prompt = create_summarization_prompt(summary_data[0])

rule = "=" * 100
print(rule)
print("SAMPLE PROMPT FOR FIRST CATEGORY")
print(rule)
print(test_prompt[:2000])  # Only the first 2000 characters are shown
print("\n... (truncated for display) ...\n")
print(rule)
print(f"Full prompt length: {len(test_prompt)} characters")

SAMPLE PROMPT FOR FIRST CATEGORY
You are a professional product reviewer writing for a consumer electronics blog. Write a comprehensive, engaging article about products in the Computers and electronics category on Amazon.

# Article Requirements:
1. Write in a conversational, helpful tone (like Wirecutter or The Verge)
2. Be specific and cite actual review examples
3. Compare and contrast the products clearly
4. Provide actionable recommendations
5. Length: 600-800 words

# Structure:
## Introduction
- Brief overview of the Computers and electronics category
- Why these products matter to consumers

## Top 3

Product 1: Fire Tablet, 7 Display, Wi-Fi, 8 GB - Includes Special Offers, Black
- Average Rating: 4.67/5.0
- Total Reviews: 15
- Quality Score: 1.05/5.0
- Sentiment Distribution: 100.0% positive, 0.0% negative, 0.0% neutral

Positive Review Examples:
  1. "I initially had trouble deciding between the paperwhite and the voyage because reviews more or less said the same thing: the p

## 7. Generate Summaries with ChatGPT

In [7]:
def generate_summary(category_data, model="gpt-3.5-turbo", temperature=0.7, max_tokens=2000):
    """
    Generate a summary article for a category using ChatGPT.

    Errors raised by the API call or while reading the response are not
    propagated: they are printed and converted into an
    "Error generating summary for ..." string so a batch run keeps going.
    Errors raised while building the prompt are not caught.

    Args:
        category_data: Dictionary containing category and product information
        model: OpenAI model to use ("gpt-3.5-turbo" or "gpt-4" or "gpt-4-turbo")
        temperature: Creativity level (0.0-1.0, higher = more creative)
        max_tokens: Upper bound on completion tokens; adjust based on the
            desired article length

    Returns:
        Generated article text, or an error message string starting with
        "Error generating summary for" when generation failed. Always a str.
    """
    category = category_data['category']
    print(f"\nGenerating summary for: {category}...")

    # Create the prompt
    prompt = create_summarization_prompt(category_data)

    try:
        # Call ChatGPT API
        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": "You are an expert product reviewer who writes honest, detailed, and helpful reviews for consumers. You analyze customer feedback to provide balanced recommendations."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            temperature=temperature,
            max_tokens=max_tokens
        )

        choice = response.choices[0]
        summary = choice.message.content

        # The message content is optional in the API response. Treat a
        # missing/empty body as a failure instead of returning None, which
        # would crash the cells that write the summary to disk.
        if not summary:
            raise ValueError(
                f"model returned no text (finish_reason={choice.finish_reason})"
            )

        # Token usage is optional too; a missing usage object must not
        # discard a summary that was generated successfully.
        usage = response.usage
        if usage is not None:
            print(f"  ✓ Generated ({usage.total_tokens} tokens: {usage.prompt_tokens} prompt + {usage.completion_tokens} completion)")
        else:
            print("  ✓ Generated (token usage not reported)")

        # "length" means the completion stopped at the max_tokens cap
        if choice.finish_reason == "length":
            print(f"  ! Output stopped at the max_tokens limit ({max_tokens}); the article may be cut off")

        return summary

    except Exception as e:
        # Deliberate best-effort: report and return an error string
        print(f"  ✗ Error: {str(e)}")
        return f"Error generating summary for {category}: {str(e)}"


# Defining the function produces no output, so print a confirmation for the reader
print("✓ Summary generation function defined")

✓ Summary generation function defined


## 8. Generate Summaries for All Categories

In [8]:
# Run settings
MODEL = "gpt-3.5-turbo"  # Options: "gpt-3.5-turbo" (cheapest) or "gpt-4" (best quality)
TEMPERATURE = 0.7  # 0.0 = more focused, 1.0 = more creative
DELAY_BETWEEN_CALLS = 2  # Seconds to wait between API calls (to avoid rate limits)

banner = "=" * 100
n_categories = len(summary_data)

print(banner)
print("GENERATING SUMMARIES FOR ALL CATEGORIES")
print(banner)
print(f"Model: {MODEL}")
print(f"Temperature: {TEMPERATURE}")
print(f"Categories to process: {n_categories}")
print()

# One record per category: the article text plus how it was produced
generated_summaries = []

for position, category_data in enumerate(summary_data, start=1):
    category = category_data['category']
    print(f"[{position}/{n_categories}] Processing: {category}")

    # The API call happens first; the timestamp is taken once it returns
    summary_text = generate_summary(
        category_data=category_data,
        model=MODEL,
        temperature=TEMPERATURE,
    )
    record = {
        'category': category,
        'summary': summary_text,
        'generated_at': datetime.now().isoformat(),
        'model': MODEL,
        'temperature': TEMPERATURE,
    }
    generated_summaries.append(record)

    # Pause between requests (avoid rate limits); skip after the last one
    if position != n_categories:
        time.sleep(DELAY_BETWEEN_CALLS)

print()
print(banner)
print(f"✓ COMPLETED: Generated {len(generated_summaries)} summaries")
print(banner)

GENERATING SUMMARIES FOR ALL CATEGORIES
Model: gpt-3.5-turbo
Temperature: 0.7
Categories to process: 5

[1/5] Processing: Computers and electronics

Generating summary for: Computers and electronics...
  ✓ Generated (3959 tokens: 2915 prompt + 1044 completion)
[2/5] Processing: Amazon devices & accessories

Generating summary for: Amazon devices & accessories...
  ✓ Generated (3214 tokens: 2221 prompt + 993 completion)
[3/5] Processing: Entertainment Appliances

Generating summary for: Entertainment Appliances...
  ✓ Generated (1874 tokens: 987 prompt + 887 completion)
[4/5] Processing: Kids Toys & kids entertainment

Generating summary for: Kids Toys & kids entertainment...
  ✓ Generated (2752 tokens: 1588 prompt + 1164 completion)
[5/5] Processing: Batteries and household essentials

Generating summary for: Batteries and household essentials...
  ✓ Generated (2034 tokens: 1244 prompt + 790 completion)

✓ COMPLETED: Generated 5 summaries


## 9. Display Generated Summaries

In [9]:
# Print every article under a category banner, followed by its provenance line
heavy_rule = "=" * 100
light_rule = "-" * 100

for number, entry in enumerate(generated_summaries, start=1):
    print("\n" + heavy_rule)
    print(f"CATEGORY {number}: {entry['category'].upper()}")
    print(heavy_rule)
    print()
    print(entry['summary'])
    print()
    print(light_rule)
    print(f"Generated: {entry['generated_at']} | Model: {entry['model']}")
    print()


CATEGORY 1: COMPUTERS AND ELECTRONICS

## Computers and Electronics on Amazon: A Comprehensive Review

In today's tech-savvy world, consumers are constantly on the lookout for the latest and greatest in computers and electronics. With the vast array of options available on Amazon, it can be overwhelming to find the right products that suit your needs. In this article, we'll dive into the top three products in the Computers and Electronics category on Amazon, highlighting their features, pros, cons, and customer feedback to help you make an informed decision.

## Top 3 Products:

### **Fire Tablet, 7 Display, Wi-Fi, 8 GB - Includes Special Offers, Black**
- **Average Rating**: 4.67/5.0
- **Total Reviews**: 15
- **Quality Score**: 1.05/5.0

#### What Customers Love:
- Customers appreciate the responsiveness and ease of use of the touch screen.
- Reading on the Fire Tablet is comfortable and doesn't strain the eyes like some other devices.
- International shipping options from Amazon ens

## 10. Save Summaries to Files

In [10]:
from pathlib import Path

# Create output directory one level above notebooks/
output_dir = Path('../generated_summaries')
output_dir.mkdir(parents=True, exist_ok=True)

print("Saving summaries...")
print()

# Characters that cannot appear in a file name on Windows and/or POSIX
# (path separators, reserved punctuation, control characters). Category
# names come from data, so they must not be trusted to be path-safe.
UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')

# Save each summary as a separate file
for summary in generated_summaries:
    # Create filename (sanitize category name): lowercase, spaces to
    # underscores, '&' spelled out, then neutralize any remaining
    # characters that would break or redirect the output path.
    filename = (
        summary['category']
        .lower()
        .replace(' ', '_')
        .replace('&', 'and')
    )
    filename = UNSAFE_FILENAME_CHARS.sub('_', filename)

    filepath = output_dir / f"{filename}.txt"

    # Write a small provenance header followed by the article text
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(f"# {summary['category']}\n")
        f.write(f"# Generated: {summary['generated_at']}\n")
        f.write(f"# Model: {summary['model']} (temperature: {summary['temperature']})\n")
        f.write("\n" + "="*100 + "\n\n")
        f.write(summary['summary'])

    print(f"  ✓ Saved: {filepath.resolve()}")

print()
print(f"✓ All summaries saved to '{output_dir.resolve()}/'")

Saving summaries...

  ✓ Saved: C:\Users\yogan\Documents\JupyterNotebook_ironhack\Bootcamp\week_06_02_project\project_nlp_automated_customers_reviews\generated_summaries\computers_and_electronics.txt
  ✓ Saved: C:\Users\yogan\Documents\JupyterNotebook_ironhack\Bootcamp\week_06_02_project\project_nlp_automated_customers_reviews\generated_summaries\amazon_devices_and_accessories.txt
  ✓ Saved: C:\Users\yogan\Documents\JupyterNotebook_ironhack\Bootcamp\week_06_02_project\project_nlp_automated_customers_reviews\generated_summaries\entertainment_appliances.txt
  ✓ Saved: C:\Users\yogan\Documents\JupyterNotebook_ironhack\Bootcamp\week_06_02_project\project_nlp_automated_customers_reviews\generated_summaries\kids_toys_and_kids_entertainment.txt
  ✓ Saved: C:\Users\yogan\Documents\JupyterNotebook_ironhack\Bootcamp\week_06_02_project\project_nlp_automated_customers_reviews\generated_summaries\batteries_and_household_essentials.txt

✓ All summaries saved to 'C:\Users\yogan\Documents\JupyterNoteb

## 11. Save All Summaries as Single Document

In [11]:
from pathlib import Path
from datetime import datetime

# Output directory sits one level above notebooks/
output_dir = Path('../generated_summaries')
output_dir.mkdir(parents=True, exist_ok=True)

# All categories go into one text file
combined_filepath = output_dir / "all_category_summaries.txt"
rule = "=" * 100

with combined_filepath.open('w', encoding='utf-8') as out:
    # Document header: title, creation time, model used
    out.write(
        "# Amazon Product Category Reviews\n"
        f"# Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
        f"# Model: {MODEL}\n"
        "\n" + rule + "\n\n"
    )

    # One banner-delimited section per category
    for number, entry in enumerate(generated_summaries, start=1):
        out.write(
            f"\n\n{rule}\n"
            f"CATEGORY {number}: {entry['category'].upper()}\n"
            f"{rule}\n\n"
        )
        out.write(entry['summary'])
        out.write("\n\n")

print(f"✓ Combined document saved: {combined_filepath.resolve()}")

✓ Combined document saved: C:\Users\yogan\Documents\JupyterNotebook_ironhack\Bootcamp\week_06_02_project\project_nlp_automated_customers_reviews\generated_summaries\all_category_summaries.txt


In [12]:
# -----------------------------
# SAVE AS PDF (single document)
# -----------------------------

from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.platypus import HRFlowable
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.pagesizes import letter
from reportlab.lib.units import inch
from reportlab.lib import colors

# Reuses output_dir defined by the earlier save cells
pdf_filepath = output_dir / "all_category_summaries.pdf"

doc = SimpleDocTemplate(str(pdf_filepath), pagesize=letter)
elements = []
styles = getSampleStyleSheet()

# Title
elements.append(Paragraph("Amazon Product Category Reviews", styles["Heading1"]))
elements.append(Spacer(1, 0.2 * inch))
elements.append(Paragraph(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", styles["Normal"]))
elements.append(Paragraph(escape(f"Model: {MODEL}"), styles["Normal"]))
elements.append(Spacer(1, 0.3 * inch))
elements.append(HRFlowable(width="100%", thickness=1, color=colors.grey))
elements.append(Spacer(1, 0.3 * inch))

# Categories
for i, summary in enumerate(generated_summaries, 1):
    # Paragraph parses its text as XML-like markup, so data-derived text
    # (category names such as "A & B", model-written articles containing
    # '<' or '>') must be escaped or the build can fail / render wrongly.
    heading_markup = escape(f"CATEGORY {i}: {summary['category'].upper()}")
    elements.append(Paragraph(heading_markup, styles["Heading2"]))
    elements.append(Spacer(1, 0.2 * inch))

    # Escape first, then insert the <br/> tags, so the line-break markup
    # itself is not escaped.
    body_markup = escape(summary['summary']).replace('\n', '<br/>')
    elements.append(Paragraph(body_markup, styles["BodyText"]))

    elements.append(Spacer(1, 0.4 * inch))
    elements.append(HRFlowable(width="100%", thickness=0.5, color=colors.lightgrey))
    elements.append(Spacer(1, 0.3 * inch))

doc.build(elements)

print(f"✓ PDF saved: {pdf_filepath.resolve()}")

✓ PDF saved: C:\Users\yogan\Documents\JupyterNotebook_ironhack\Bootcamp\week_06_02_project\project_nlp_automated_customers_reviews\generated_summaries\all_category_summaries.pdf


## 12. Save Summaries as JSON

In [13]:
from pathlib import Path
import json

# Output directory sits one level above notebooks/
output_dir = Path('../generated_summaries')
output_dir.mkdir(parents=True, exist_ok=True)

# JSON export for easy integration with web apps; ensure_ascii=False keeps
# non-ASCII characters readable in the file
json_filepath = output_dir / "summaries.json"
with json_filepath.open('w', encoding='utf-8') as json_file:
    json.dump(generated_summaries, json_file, indent=2, ensure_ascii=False)

print(f"✓ JSON file saved: {json_filepath.resolve()}")

✓ JSON file saved: C:\Users\yogan\Documents\JupyterNotebook_ironhack\Bootcamp\week_06_02_project\project_nlp_automated_customers_reviews\generated_summaries\summaries.json
