In [12]:
import json
from pathlib import Path
from tqdm import tqdm
from openai import OpenAI
import time
from datetime import datetime
import io
import re
# Initialize OpenAI client
client = OpenAI()

def log_info(message):
    """Log information to a file."""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with open('batch_processing_log.txt', 'a') as log_file:
        log_file.write(f"{timestamp} - {message}\n")
    print(message)

In [18]:
def load_flyer_data():
    data_dir = Path('flyer_data')
    all_items = []
    
    # Create progress bar
    pbar = tqdm(list(data_dir.glob('*.json')), desc="Loading flyer data")
    
    for file in pbar:
        try:
            # Update progress bar description
            pbar.set_description(f"Loading {file.name}")
            
            with open(file, 'r', encoding='utf-8') as f:
                flyer_data = json.load(f)
            
            store = flyer_data['store']
            for item in flyer_data['items']:
                # Preserve all original fields and add 'store'
                item_data = item.copy()
                item_data['store'] = store
                all_items.append(item_data)
        except Exception as e:
            print(f"\nError loading flyer {file.name}: {str(e)}")
            continue
    
    print(f"\nLoaded {len(all_items)} items from {len(list(data_dir.glob('*.json')))} flyers")
    return all_items

# Load the flyer data
items = load_flyer_data()
print(f"Total items loaded: {len(items)}")

# Display a sample of the loaded items
print("\nSample of loaded items:")
for item in items[:2]:
    print(json.dumps(item, indent=2))
    print()

Loading Walmart_Flyer_20241016-20241022.json: 100%|██████████| 61/61 [00:00<00:00, 106.71it/s]                                  


Loaded 4005 items from 61 flyers
Total items loaded: 4005

Sample of loaded items:
{
  "id": 884241215,
  "flyer_id": 6894584,
  "name": "Lifeline Ankle/Wrist Weights 2 lb Pair",
  "display_type": 1,
  "cutout_image_url": "https://f.wishabi.net/page_items/353966557/1729173117/extra_large.jpg",
  "brand": "Lifeline",
  "valid_from": "2024-10-17T00:00:00-04:00",
  "valid_to": "2024-10-23T23:59:59-04:00",
  "left": 4152.2822265625,
  "bottom": -1336.7900390625,
  "right": 4485.7998046875,
  "top": -841.0139770507812,
  "page_destination": null,
  "available_to": "2024-10-23T23:59:59-04:00",
  "video_url": null,
  "price": "9.99",
  "store": "Academy Sports + Outdoors"
}

{
  "id": 884241132,
  "flyer_id": 6894584,
  "name": "PUMA Men's Axelion Spark Training Shoes",
  "display_type": 1,
  "cutout_image_url": "https://f.wishabi.net/page_items/353966514/1729173111/extra_large.jpg",
  "brand": "PUMA",
  "valid_from": "2024-10-17T00:00:00-04:00",
  "valid_to": "2024-10-23T23:59:59-04:00",
  




In [19]:
def prepare_batch_input(items):
    """Prepare the input file for the Batch API."""
    batch_input = []
    for i, item in enumerate(tqdm(items, desc="Preparing batch input")):
        prompt = f"""Analyze the following product: "{item['name']}"

1. Is this product edible (consumable as food)? Respond with 'True' or 'False'.
2. On a scale of 1-10, how nutritious is this product from a macronutrient perspective?
   Consider protein, healthy fats, and minerals. Foods containing seed oils, soybeans, and corn byproducts should be scored lower.
   (1 being least nutritious, 10 being most nutritious). Non-edibles should be scored 1.

Only respond with JSON, in the following format. Ensure that the JSON is properly closed and not missing any quotes:
{{
  "edible": "True/False",
  "nutrition_score": 1-10
}}"""

        batch_input.append({
            "custom_id": f"item-{i}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o-mini",
                "messages": [
                    {"role": "system", "content": "You are a nutritionist and food expert."},
                    {"role": "user", "content": prompt}
                ],
                "max_tokens": 30
            }
        })
    
    with open('batch_input.jsonl', 'w') as f:
        for item in batch_input:
            f.write(json.dumps(item) + '\n')
    
    print(f"Prepared {len(batch_input)} items for batch processing")
    return batch_input

# Prepare the batch input
batch_input = prepare_batch_input(items)
print(f"Batch input file created: batch_input.jsonl")

Preparing batch input: 100%|██████████| 4005/4005 [00:00<00:00, 444867.25it/s]

Prepared 4005 items for batch processing
Batch input file created: batch_input.jsonl





In [22]:
def upload_file(client):
    """Upload the input file for the Batch API."""
    print("Uploading input file...")
    with open('batch_input.jsonl', 'rb') as f:
        file = client.files.create(file=f, purpose='batch')
    log_info(f"File uploaded successfully. File ID: {file.id}")
    return file.id

# Upload the file
file_id = upload_file(client)
log_info(f"File ID for batch processing: {file_id}")

Uploading input file...
File uploaded successfully. File ID: file-KYd3XXt6KQG1AaNGafkFXibX
File uploaded successfully. File ID: file-KYd3XXt6KQG1AaNGafkFXibX
File ID for batch processing: file-KYd3XXt6KQG1AaNGafkFXibX
File ID for batch processing: file-KYd3XXt6KQG1AaNGafkFXibX


In [23]:
def create_batch(client, file_id):
    """Create and submit the batch job."""
    print("Creating batch job...")
    batch = client.batches.create(
        input_file_id=file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h"
    )
    log_info(f"Batch created successfully. Batch ID: {batch.id}")
    return batch.id

# Create the batch
batch_id = create_batch(client, file_id)
log_info(f"Batch ID: {batch_id}")

Creating batch job...
Batch created successfully. Batch ID: batch_67187fd99b308190b80c530a4271be3f
Batch ID: batch_67187fd99b308190b80c530a4271be3f


In [3]:
batch_id = "batch_67187fd99b308190b80c530a4271be3f"

In [5]:
def check_batch_status(client, batch_id):
    """Check the status of the batch job."""
    log_info("Checking batch status...")
    while True:
        batch = client.batches.retrieve(batch_id)
        log_info(f"Current status: {batch.status}")
        if batch.status == 'completed':
            log_info(f"Batch completed. Output file ID: {batch.output_file_id}")
            return batch.output_file_id
        elif batch.status in ['failed', 'expired', 'cancelled']:
            error_message = f"Batch failed with status: {batch.status}"
            log_info(error_message)
            raise Exception(error_message)
        time.sleep(60)  # Wait for 1 minute before checking again

# Check batch status
output_file_id = check_batch_status(client, batch_id)
log_info(f"Output file ID: {output_file_id}")

Checking batch status...
Current status: completed
Batch completed. Output file ID: file-RJneyiZRgs3GLrzMDq9u5Gmm
Output file ID: file-RJneyiZRgs3GLrzMDq9u5Gmm


In [20]:
def extract_json_from_content(content):
    """Extract JSON from content, handling both plain JSON and code blocks."""
    # Check if content is wrapped in code blocks
    match = re.search(r'```(?:json)?\n(.*?)\n```', content, re.DOTALL)
    if match:
        json_str = match.group(1)
    else:
        json_str = content
    
    # Remove any leading/trailing whitespace
    json_str = json_str.strip()
    
    return json.loads(json_str)

def retrieve_results(client, file_id):
    """Retrieve and process the batch results."""
    log_info("Retrieving batch results...")
    file_content = client.files.content(file_id)
    
    # Convert binary content to text
    text_content = io.BytesIO(file_content.read()).getvalue().decode('utf-8')
    
    results = []
    for line in tqdm(text_content.splitlines(), desc="Processing results"):
        try:
            result = json.loads(line)
            custom_id = result['custom_id']
            content = result['response']['body']['choices'][0]['message']['content']
            response = extract_json_from_content(content)
            results.append({
                'id': custom_id,
                'classification': response
            })
        except json.JSONDecodeError as e:
            log_info(f"Error decoding JSON: {e}. Skipping line: {line[:100]}...")
        except KeyError as e:
            log_info(f"KeyError: {e}. Skipping line: {line[:100]}...")
    
    log_info(f"Retrieved and processed {len(results)} results")
    return results

# Retrieve and process results
results = retrieve_results(client, output_file_id)
log_info(f"Total results processed: {len(results)}")

# Update items with results
print("Updating items with results...")
for result in tqdm(results, desc="Updating items"):
    item_index = int(result['id'].split('-')[1])
    items[item_index]['classification'] = result['classification']

# Save the updated items to a file
with open('classified_items.json', 'w', encoding='utf-8') as f:
    json.dump(items, f, indent=2)

print(f"Saved {len(items)} classified items to classified_items.json")

# Display a sample of the results
print("\nSample of classified items:")
for item in items[:5]:
    print(json.dumps(item, indent=2))
    print()

Retrieving batch results...


Processing results: 100%|██████████| 4005/4005 [00:00<00:00, 71140.48it/s]


Retrieved and processed 4005 results
Total results processed: 4005
Updating items with results...


Updating items: 100%|██████████| 4005/4005 [00:00<00:00, 667654.51it/s]


Saved 4005 classified items to classified_items.json

Sample of classified items:
{
  "id": 884241215,
  "flyer_id": 6894584,
  "name": "Lifeline Ankle/Wrist Weights 2 lb Pair",
  "display_type": 1,
  "cutout_image_url": "https://f.wishabi.net/page_items/353966557/1729173117/extra_large.jpg",
  "brand": "Lifeline",
  "valid_from": "2024-10-17T00:00:00-04:00",
  "valid_to": "2024-10-23T23:59:59-04:00",
  "left": 4152.2822265625,
  "bottom": -1336.7900390625,
  "right": 4485.7998046875,
  "top": -841.0139770507812,
  "page_destination": null,
  "available_to": "2024-10-23T23:59:59-04:00",
  "video_url": null,
  "price": "9.99",
  "store": "Academy Sports + Outdoors",
  "classification": {
    "edible": "False",
    "nutrition_score": 1
  }
}

{
  "id": 884241132,
  "flyer_id": 6894584,
  "name": "PUMA Men's Axelion Spark Training Shoes",
  "display_type": 1,
  "cutout_image_url": "https://f.wishabi.net/page_items/353966514/1729173111/extra_large.jpg",
  "brand": "PUMA",
  "valid_from": "

In [17]:
# Update items with results
print("Updating items with results...")
for result in tqdm(results, desc="Updating items"):
    item_index = int(result['id'].split('-')[1])
    items[item_index].update(result)

# Save the updated items to a file
with open('classified_items.json', 'w', encoding='utf-8') as f:
    json.dump(items, f, indent=2)

print(f"Saved {len(items)} classified items to classified_items.json")

# Display a sample of the results
print("\nSample of classified items:")
for item in items:
    print(json.dumps(item, indent=2))
    print()

Updating items with results...


Updating items: 100%|██████████| 4005/4005 [00:00<00:00, 888792.99it/s]

Saved 4005 classified items to classified_items.json

Sample of classified items:
{
  "store": "Academy Sports + Outdoors",
  "name": "Lifeline Ankle/Wrist Weights 2 lb Pair",
  "brand": "Lifeline",
  "price": "9.99",
  "image": "https://f.wishabi.net/page_items/353966557/1729173117/extra_large.jpg",
  "id": "item-0",
  "edible": "False",
  "nutrition_score": 1
}

{
  "store": "Academy Sports + Outdoors",
  "name": "PUMA Men's Axelion Spark Training Shoes",
  "brand": "PUMA",
  "price": "39.99",
  "image": "https://f.wishabi.net/page_items/353966514/1729173111/extra_large.jpg",
  "id": "item-1",
  "edible": "False",
  "nutrition_score": 1
}

{
  "store": "Academy Sports + Outdoors",
  "name": "Magellan Outdoors Boys' Arctic Fleece Jacket",
  "brand": "Magellan Outdoors",
  "price": "18.75",
  "image": "https://f.wishabi.net/page_items/353966537/1729173110/extra_large.jpg",
  "id": "item-2",
  "edible": "False",
  "nutrition_score": 1
}

{
  "store": "Academy Sports + Outdoors",
  "name


