In [None]:
from openai import OpenAI
import base64
import os
import json
import pandas as pd

In [None]:
# Shared OpenAI client used by the processing cell below.
# The key is read from the OPENAI_API_KEY environment variable so that no
# secret is ever stored in (or committed with) the notebook. If the variable
# is unset, api_key is None and the OpenAI client reports the missing key.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [None]:
# Closed set of category labels; the list's repr is interpolated into the
# prompt below, so the model is asked to pick one of exactly these strings.
product_category = [
    'Tools & Home Improvement',
    'Automotive',
    'Home & Kitchen',
    'Toys & Games',
    'Patio, Lawn & Garden',
    'Sports & Outdoors',
    'Clothing, Shoes & Jewelry',
    'Electronics',
    'Arts, Crafts & Sewing',
    'Industrial & Scientific',
    'Pet Supplies',
    'Office Products',
    'Musical Instruments',
    'Grocery & Gourmet Food',
    'Beauty & Personal Care',
    'Appliances',
    'Health & Household',
    'Baby Products',
    'Cell Phones & Accessories',
    'Video Games',
    'CDs & Vinyl',
]

# Instruction text sent with every post. It is an f-string, so the doubled
# braces ({{ and }}) are escapes that render as literal { and } in the
# JSON example shown to the model.
prompt = f'''
            The following text and picture were posted by an influencer on Instagram. 
            Please analyze the product he/she endorses, what brand the product is and what category the product is in, select the category from this list: {product_category}
            Output only in json format, for example:
            {{
                "nodes": [
                    {{"name": "", "attribute": ""}},
                ],
                "edges": [
                    {{"source": "", "target": "", "relationship": ""}},
                ]

            }}
            Please identify the attribute from the following categories: 'product_name', 'brand', 'product_category'.
            Please identify the relationship from the following categories: 'use_product', 'its_brand', 'product_category'.
            If you couldn't infer accurate information, please do not output and proceed to the next picture.
            Make sure the target and source of edges match an existing node. 
            Do not include the markdown triple quotes above and below the JSON, jump straight into it with a curly bracket.
            '''

In [None]:
def encode_image(image_path):
    """Return the file at ``image_path`` as a base64-encoded string.

    The file is read in binary mode; the base64 bytes are then decoded as
    UTF-8 so the caller receives a ``str`` that can be embedded in text.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

In [None]:
# Label every (caption, image) post pair with GPT-4o and save one JSON file
# per post.
#
# Input layout:  posts_dir/<user_folder>/<post>.txt with a sibling <post>.jpg
# Output layout: output_dir/<user_folder>/<post>.json
#
# Requires `client`, `prompt` and `encode_image` from the cells above.

posts_dir = ""   # root folder that holds one sub-folder per user
output_dir = ""  # root folder for results; "" resolves against the working directory

for user_folder in sorted(os.listdir(posts_dir)):
    user_folder_path = os.path.join(posts_dir, user_folder)
    # Stray files in posts_dir (e.g. .DS_Store) are not user folders; listing
    # them below would raise NotADirectoryError.
    if not os.path.isdir(user_folder_path):
        continue

    output_folder_path = os.path.join(output_dir, user_folder)
    os.makedirs(output_folder_path, exist_ok=True)

    # Handle one post at a time so only a single encoded image is held in
    # memory, instead of every image of the user at once.
    for file in sorted(os.listdir(user_folder_path)):
        if not file.endswith(".txt"):
            continue

        file_prefix = os.path.splitext(file)[0]
        image_path = os.path.join(user_folder_path, f"{file_prefix}.jpg")
        # A caption is only processed when its matching .jpg exists.
        if not os.path.exists(image_path):
            continue

        # Read with an explicit encoding instead of the platform default.
        # NOTE(review): assumes the caption files were saved as UTF-8 - confirm.
        file_path = os.path.join(user_folder_path, file)
        with open(file_path, "r", encoding="utf-8") as txt_file:
            txt_content = txt_file.read()
        base64_image = encode_image(image_path)

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a helpful assistant!"},
                {"role": "user", "content": [
                    {"type": "text", "text": prompt},
                    {"type": "text", "text": txt_content},
                    # The paired files are .jpg, so declare JPEG (not PNG)
                    # in the data URL.
                    {"type": "image_url", "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}"}},
                ]},
            ],
            temperature=0.0,
        )
        results = response.choices[0].message.content

        # `results` is the reply as text. Dumping that string directly writes
        # a single quoted/escaped JSON string literal, not the nodes/edges
        # object, so parse it first. If the reply is empty, None, or not valid
        # JSON, fall back to storing the raw value exactly as before.
        try:
            parsed_results = json.loads(results)
        except (json.JSONDecodeError, TypeError):
            parsed_results = results

        output_file = os.path.join(output_folder_path, f"{file_prefix}.json")
        with open(output_file, "w") as f:
            json.dump(parsed_results, f, indent=4)

    print(f"Saved data for {user_folder}")
                    