PDFs

In [33]:
import base64
import os
import json
from groq import Groq
from PIL import Image
import fitz  # PyMuPDF
from pdf2image import convert_from_path

# Function to encode the image
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 text string.

    The file is opened in binary mode, read in full, base64-encoded and the
    resulting bytes are decoded as UTF-8 so the value can be embedded in text.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()

    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

# Function to process PDF and extract images

# Prompt sent to the vision model together with each rendered page.
_VISION_PROMPT = """Analyze this image with special emphasis on identifying the exact product type and category. Then describe all guidelines, specifications, and requirements. Focus on:

1. Product type/category identification (CRITICAL):
   - Exact product name and model
   - Product category/family
   - Brand and sub-brand if present
   - Any product identifiers (SKU, model number, etc.)

2. Technical specifications and requirements specific to the identified product
3. Safety guidelines or warnings for this product type
4. Product parameters or limitations
5. Visual patterns or layouts suggesting guidelines
6. Instructional or requirement text
7. Tables or structured information with guidelines
8. Icons/symbols indicating requirements

IMPORTANT: For each guideline found, explicitly state which product or product category it applies to. If multiple products appear, organize information by product type. Be extremely precise in product identification and categorization. If no guideline information is found, state that clearly."""

# Number of page analyses sent per summarisation request (kept small to avoid token limits).
_SUMMARY_BATCH_SIZE = 3

# Folder that receives the per-PDF result files.
_PDF_OUTPUT_DIR = 'pdf_analyses'


def _page_image_to_base64_jpeg(image):
    """Encode one rendered page as a base64 JPEG string without touching the disk."""
    import io  # local import so this block does not depend on the file's import cell

    buffer = io.BytesIO()
    image.save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode('utf-8')


def _describe_page(client, base64_image):
    """Ask the vision model to describe the guidelines on one page; return its text reply."""
    vision_response = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": _VISION_PROMPT},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}",
                        },
                    },
                ],
            }
        ],
        model="llama-3.2-11b-vision-preview",
        temperature=0.2,  # Reduced temperature for more focused responses
        stream=False
    )
    return vision_response.choices[0].message.content


def _consolidate_batch(client, batch):
    """Turn a batch of page analyses into a list of structured guideline dicts."""
    summary_response = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"""Based on these page analyses, extract and structure the guidelines into clear, specific items:

{json.dumps(batch, indent=2)}

Create a detailed JSON structure that captures all unique guidelines mentioned. Group similar items and maintain specificity. Format as:
{{
  "guidelines": [
    {{
      "product_type": "Specific product or category this guideline applies to",
      "category": "Technical specs|Requirements|Safety guidelines|Parameters",
      "description": "Detailed description of the guideline",
      "priority": "high|medium|low"
    }}
  ]
}}

Be comprehensive but avoid duplicates. Preserve specific measurements, requirements, and technical details. Ensure each guideline clearly indicates which product type it applies to."""
            }
        ],
        model="llama-3.3-70b-versatile",
        response_format={"type": "json_object"},
        temperature=0.2
    )

    batch_results = json.loads(summary_response.choices[0].message.content)
    # The model may omit the key (or return null) when a batch has no guidelines;
    # treat that as "nothing found" instead of failing with KeyError/TypeError.
    guidelines = batch_results.get("guidelines") if isinstance(batch_results, dict) else None
    return guidelines if isinstance(guidelines, list) else []


def _write_results(output_filename, all_page_analyses, consolidated_guidelines):
    """Write the raw and consolidated analyses to ``output_filename`` as UTF-8 JSON."""
    # Create the folder here so the function also works on a fresh kernel
    # where no other cell has created it yet.
    os.makedirs(os.path.dirname(output_filename) or '.', exist_ok=True)

    results = {
        "raw_page_analyses": all_page_analyses,
        "consolidated_guidelines": consolidated_guidelines
    }

    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)


def process_pdf(pdf_path):
    """Extract product guidelines from a PDF with Groq vision + text models.

    Every page except the first is rendered to an image and described by the
    vision model. The page descriptions are then summarised, a few pages at a
    time, into structured guideline records. The outcome is written to
    ``pdf_analyses/guidelines_<pdf name>.json``.

    If anything fails after at least one page has been analysed (for example a
    rate-limit error during summarisation), the work done so far is written to
    ``pdf_analyses/guidelines_<pdf name>_partial.json`` and the original
    exception is re-raised, so completed API calls are not thrown away and an
    earlier complete result file is not overwritten.

    Args:
        pdf_path: Path of the PDF file to analyse.

    Returns:
        None. Results are written to disk and the output path is printed.

    Raises:
        Whatever the PDF conversion, the Groq client, JSON parsing or the file
        write raises; errors are not swallowed.
    """
    # Convert PDF pages to images
    images = convert_from_path(pdf_path)

    # Initialize Groq client
    client = Groq(
        api_key=os.environ.get("GROQ_API_KEY"),
    )

    pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
    output_filename = os.path.join(_PDF_OUTPUT_DIR, f"guidelines_{pdf_stem}.json")
    partial_filename = os.path.join(_PDF_OUTPUT_DIR, f"guidelines_{pdf_stem}_partial.json")

    all_page_analyses = []
    consolidated_guidelines = []

    try:
        # Skip the first page; numbering starts at 2 so it matches the PDF's pages.
        for page_number, image in enumerate(images[1:], start=2):
            base64_image = _page_image_to_base64_jpeg(image)
            all_page_analyses.append({
                "page_number": page_number,
                "analysis": _describe_page(client, base64_image)
            })

        # Process analyses in smaller batches to avoid token limits
        for start in range(0, len(all_page_analyses), _SUMMARY_BATCH_SIZE):
            batch = all_page_analyses[start:start + _SUMMARY_BATCH_SIZE]
            consolidated_guidelines.extend(_consolidate_batch(client, batch))
    except Exception:
        # Keep what was already paid for, then let the caller see the failure.
        if all_page_analyses:
            _write_results(partial_filename, all_page_analyses, consolidated_guidelines)
            print(f"\nPartial results saved to {partial_filename}")
        raise

    # Save results to JSON file in pdf_analyses folder
    _write_results(output_filename, all_page_analyses, consolidated_guidelines)

    print(f"\nResults saved to {output_filename}")

# PDFs to analyse; the names are resolved relative to the current working directory.
pdf_files = [
    "Guide_Line_Dry Groseries.pdf",
    "Guide_Line_Pantallas.v.2.0.2019.pdf",
]

# Run the extraction pipeline on one document at a time, announcing each first.
for pdf_file in pdf_files:
    print(f"\nProcessing {pdf_file}...")
    process_pdf(pdf_file)



Processing Guide_Line_Dry Groseries.pdf...


RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.3-70b-versatile` in organization `org_01jf84n2z5fsq8x0s2q93chdqe` on : Limit 100000, Used 99209, Requested 1902. Please try again in 15m59.271s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': '', 'code': 'rate_limit_exceeded'}}

Excel


In [30]:
import pandas as pd
import os
import json
from groq import Groq
from typing import Dict, List

# Ensure each results folder is present before anything tries to write into it.
# exist_ok=True makes re-running this cell harmless.
output_dirs = [
    'excel_analyses',
    'pdf_analyses',
    'restructured_analyses',
]
for dir_name in output_dirs:
    os.makedirs(dir_name, exist_ok=True)

def analyze_excel_file(excel_path):
    """Summarise every sheet of an Excel workbook and save the summary as JSON.

    For each sheet the summary records the column names, the row count, the
    number of header levels and the first rows as sample data. The summary is
    written to ``excel_analyses/<workbook name>_analysis.json``.

    Args:
        excel_path: Path of the workbook to analyse.

    Returns:
        A dict with ``filename``, ``sheet_count`` and ``sheets`` keys, or
        ``None`` if the workbook could not be opened or the summary could not
        be saved (the error is printed). A sheet that fails to load is
        reported and left out of ``sheets``; the other sheets are still
        analysed.
    """
    print(f"\nAnalyzing {os.path.basename(excel_path)}...")

    # Read all sheets from the Excel file
    try:
        sheets = {}

        # The context manager closes the workbook handle even when a sheet fails.
        with pd.ExcelFile(excel_path) as excel_file:
            sheet_names = list(excel_file.sheet_names)

            for sheet_name in sheet_names:
                print(f"\nReading sheet: {sheet_name}")

                try:
                    # Read each sheet into a DataFrame, handling multi-header columns
                    df = pd.read_excel(excel_file, sheet_name=sheet_name)

                    # Record the header depth BEFORE flattening; afterwards the
                    # columns are a plain Index and the depth would always be 1.
                    header_levels = df.columns.nlevels

                    # Convert multi-index columns to string format
                    if isinstance(df.columns, pd.MultiIndex):
                        df.columns = [' - '.join(str(level) for level in col if pd.notna(level))
                                      for col in df.columns.values]

                    # Basic analysis of sheet structure
                    print(f"Columns found: {list(df.columns)}")
                    print(f"Number of rows: {len(df)}")

                    column_names = list(map(str, df.columns))

                    # Make the sample JSON-safe: string keys, and None instead
                    # of NaN/NaT so the saved file contains valid `null`s.
                    sample = df.head().astype(object)
                    sample.columns = column_names
                    sample = sample.where(sample.notna(), None)

                    # Store sheet data with serializable column names
                    sheets[sheet_name] = {
                        "columns": column_names,
                        "row_count": len(df),
                        "header_levels": header_levels,
                        "sample_data": sample.to_dict('records')
                    }

                except Exception as e:
                    print(f"Error reading sheet {sheet_name}: {str(e)}")

        analysis = {
            "filename": os.path.basename(excel_path),
            "sheet_count": len(sheet_names),
            "sheets": sheets
        }

        # Save individual Excel analysis
        os.makedirs('excel_analyses', exist_ok=True)
        output_filename = os.path.join('excel_analyses', f"{os.path.splitext(os.path.basename(excel_path))[0]}_analysis.json")
        with open(output_filename, 'w', encoding='utf-8') as f:
            # default=str covers cell values json cannot encode natively (e.g. timestamps).
            json.dump(analysis, f, indent=2, ensure_ascii=False, default=str)
        print(f"Analysis saved to {output_filename}")

        return analysis

    except Exception as e:
        print(f"Error analyzing Excel file: {str(e)}")
        return None

# Module-level Groq client; the API key is read from the GROQ_API_KEY environment variable.
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)

def analyze_and_restructure_json(json_data: Dict, filename: str) -> Dict:
    """
    Use Groq LLM to analyze and restructure the JSON data into a clearer format.

    Args:
        json_data: The data to restructure. It is serialized to JSON text
            before being placed in the prompt.
        filename: Name of the source file; its stem is used to name the
            output file ``restructured_analyses/<stem>_restructured.json``.

    Returns:
        The model's reply parsed into a Python dictionary.

    Raises:
        json.JSONDecodeError: If the model's reply is not valid JSON.
        Errors from the Groq client or from writing the file also propagate.
    """
    # Serialize explicitly: interpolating the dict into the f-string would embed
    # its Python repr (single quotes, nan, object reprs), which is not JSON.
    # default=str keeps values json cannot encode natively from raising.
    serialized_input = json.dumps(json_data, ensure_ascii=False, default=str)

    prompt = f"""
    Analyze this JSON data and restructure it into a clearer, more organized format:
    
    {serialized_input}
    
    Please organize it with the following considerations:
    1. Group guidelines by product category
    2. Sort by priority within each category
    3. Create clear hierarchical relationships
    4. Standardize property names
    5. Add metadata section
    
    Return only the restructured JSON without any explanation.
    """

    # Uses the module-level `client` created outside this function.
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="llama-3.3-70b-versatile",
        response_format={"type": "json_object"},
        temperature=0.2
    )

    # Parse the JSON string into a Python dictionary
    restructured_json = json.loads(chat_completion.choices[0].message.content)

    # Save individual restructured analysis with pretty formatting.
    # Create the folder here so the function does not rely on another cell having run.
    os.makedirs('restructured_analyses', exist_ok=True)
    output_filename = os.path.join('restructured_analyses', f"{os.path.splitext(filename)[0]}_restructured.json")
    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(restructured_json, f,
                 indent=4,
                 ensure_ascii=False,
                 sort_keys=True,
                 separators=(',', ': '))
    print(f"Restructured analysis saved to {output_filename}")

    return restructured_json

# Process Excel files
# Folder holding the source workbooks. Set the PDF_ANALYZER_DATA_DIR environment
# variable to run the notebook on another machine; the default is the original location.
excel_data_dir = os.environ.get(
    "PDF_ANALYZER_DATA_DIR",
    r"C:\Users\gairo\OneDrive\Documents\Neurotry\PDF_Analyzer",
)

excel_files = [
    os.path.join(excel_data_dir, "IF2 abarrotes secos.xlsx"),
    os.path.join(excel_data_dir, "IF2_Pantallas_Audio.xlsx"),
]

# Process each Excel file individually
for excel_file in excel_files:
    print(f"\nProcessing {os.path.basename(excel_file)}...")

    # Analyze Excel file
    analysis = analyze_excel_file(excel_file)

    # analyze_excel_file returns None when the workbook could not be analysed;
    # in that case there is nothing to restructure for this file.
    if analysis:
        # Restructure the analysis
        restructured_data = analyze_and_restructure_json(
            {"excel_file_analysis": analysis},
            os.path.basename(excel_file)
        )

print("\nAll analyses completed and saved in separate directories")



Processing IF2 abarrotes secos.xlsx...

Analyzing IF2 abarrotes secos.xlsx...

Reading sheet: Hoja1
Columns found: ['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Facet', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Información de GDA´S', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'DIMENSIONES DEL ARTÍCULO CON EMPAQUE', 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'DIMENSIONES DEL ARTÍCULO SIN EMPAQUE', 'Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 35', 'Unnamed: 36', 'Unnamed: 37', 'Unnamed: 38', 'Unnamed: 39', 'Unnamed: 40', 'Unnamed: 41', 'Unnamed: 42', 'Unnamed: 43', 'Unnamed: 44', 'Unnamed: 45', 'Unnamed: 46', 'Unnamed: 47', 'Unnamed: 48', 'Unnamed: 49', 'Unnamed: 50', 'Unnamed: 51', 'Unnamed: 52', 'Unnamed: 53', 'Unnamed: 54', 'Unnamed: 55',