In [2]:
from utils.base_prompts import (
    PROMPT_A1,
    PROMPT_A2,
    PROMPT_B,
    PROMPT_C
)
from utils.config import (
    IMAGE_RAW_PATH,
    IMAGE_HEATMAP_PATH,
    MODEL_ID
)

import requests
from PIL import Image
import json
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [39]:
class MarketingAgent:
    """
    Main class that contains the four different prompt pipelines
    """

    def __init__(
            self, 
            image_raw_path: str = IMAGE_RAW_PATH,
            image_heatmap_path: str = IMAGE_HEATMAP_PATH,
            model_id: str = MODEL_ID
        ):
        self.image_raw = Image.open(image_raw_path)
        self.image_heat = Image.open(image_heatmap_path)
        self.pipe = pipeline("image-to-text", model=model_id)
    
    def format_prompt(self, prompt):
        complete_prompt = fr'USER: <image>\n {prompt} \nASSISTANT:\n'
        return complete_prompt
    
    def clean_output(self, prompt_output):
        return json.loads(prompt_output[0]["generated_text"].split("ASSISTANT:\\n\n\n", 1)[-1].replace(r'\_', '_'))
    
    def combine_outputs(
            self,
            json_output_A1,
            json_output_A2,
            json_output_B
        ):
        # Extract elements from each JSON output
        ad_description = json_output_A1[0]["ad_description"]
        ad_purpose = json_output_A1[0]["ad_purpose"]
        ad_saliency_description = json_output_A2[0]["saliency_description"]
        ad_cognitive_description = json_output_B[0]["cognitive_description"]

        # Combine into a new JSON object
        json_combined = {
            "ad_description": ad_description,
            "ad_purpose": ad_purpose,
            "ad_saliency_description": ad_saliency_description,
            "ad_cognitive_description": ad_cognitive_description
        }
        return json_combined

    def run_marketing_prompt(self, image, prompt, json_combined={}, multimodal=True):
        if multimodal:
            prompt = self.format_prompt(prompt)
            prompt_output = self.pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
            print(prompt_output[0]["generated_text"].split("ASSISTANT:\n", 1)[-1].replace(r'\_', '_'))
        else:
            prompt = f'{prompt} {json_combined}'
            prompt = self.format_prompt(prompt)
            prompt_output = self.pipe(prompt=prompt, generate_kwargs={"max_new_tokens": 200})
        json_output = self.clean_output(prompt_output)
        return json_output

In [37]:
def format_prompt(self, prompt):
    complete_prompt = fr'USER: <image>\n {prompt} \nASSISTANT:'
    return complete_prompt

def clean_output(prompt_output):
    return json.loads(prompt_output[0]["generated_text"].split("ASSISTANT:\\n\n\n", 1)[-1].replace(r'\_', '_'))

def combine_outputs(
        json_output_A1,
        json_output_A2,
        json_output_B
    ):
    # Extract elements from each JSON output
    ad_description = json_output_A1[0]["ad_description"]
    ad_purpose = json_output_A1[0]["ad_purpose"]
    ad_saliency_description = json_output_A2[0]["saliency_description"]
    ad_cognitive_description = json_output_B[0]["cognitive_description"]

    # Combine into a new JSON object
    json_combined = {
        "ad_description": ad_description,
        "ad_purpose": ad_purpose,
        "ad_saliency_description": ad_saliency_description,
        "ad_cognitive_description": ad_cognitive_description
    }
    return json_combined

def run_marketing_prompt(self, image, prompt, json_combined={}, multimodal=True):
    if multimodal:
        prompt = self.format_prompt(prompt)
        prompt_output = self.pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
    else:
        prompt = f'{prompt} {json_combined}'
        prompt = self.format_prompt(prompt)
        prompt_output = self.pipe(prompt=prompt, generate_kwargs={"max_new_tokens": 200})
    json_output = self.clean_output(prompt_output)
    return json_output

In [40]:
prompt = predict_helper.format_prompt(PROMPT_A1)
prompt_output = predict_helper.pipe(predict_helper.image_raw, prompt=prompt, generate_kwargs={"max_new_tokens": 200})

KeyboardInterrupt: 

In [None]:
prompt_output

[{'generated_text': 'USER:  \\n \n<role>\nYou are a Senior Insights Manager with decades of experience, and a background\nin marketing.\n</role>\n<input-overview>\nYou are provided with an image of a digital advertisement.\n</input-overview>\n<task>\nYou have two tasks:\n1) Provide a detailed description of the advert. In other words, identify and\ndescribe the key elements such as the product being advertised, the brand name,\nand the call-to-action (CTA), where available.\n2) Additionally, assess and determine the primary purpose of the advertisement,\ni.e. whether it is aimed at brand building or aimed at driving conversion.\n</task>\n<response-template>\nProvide the output in the following JSON format\n```\n[\n    {\n        "ad_description":$description,\n        "ad_purpose":$purpose\n    }\n]\n```\n\nIn this format, $description is a placeholder for the description of the\nadvert, $purpose can only be either "brand-building" or "conversion".\n</response-template>.\n \\nASSISTANT

In [38]:
json.loads(prompt_output[0]["generated_text"].split("ASSISTANT:\\n\n\n", 1)[-1].replace(r'\_', '_'))

[{'ad_description': 'A woman wearing a colorful jacket and a knitted hat is posing for a photo. She is wearing a pink hat and a pink jacket. The background is a brightly colored gradient. The advertisement is for Snowstyle, a clothing brand. The call-to-action is to visit the Snowstyle website.',
  'ad_purpose': 'conversion'}]

In [35]:
predict_helper = MarketingAgent()

Loading checkpoint shards: 100%|██████████| 3/3 [00:06<00:00,  2.27s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [36]:
predict_helper.run_marketing_prompt(predict_helper.image_raw,PROMPT_A1)

USER:  \n 
<role>
You are a Senior Insights Manager with decades of experience, and a background
in marketing.
</role>
<input-overview>
You are provided with an image of a digital advertisement.
</input-overview>
<task>
You have two tasks:
1) Provide a detailed description of the advert. In other words, identify and
describe the key elements such as the product being advertised, the brand name,
and the call-to-action (CTA), where available.
2) Additionally, assess and determine the primary purpose of the advertisement,
i.e. whether it is aimed at brand building or aimed at driving conversion.
</task>
<response-template>
Provide the output in the following JSON format
```
[
    {
        "ad_description":$description,
        "ad_purpose":$purpose
    }
]
```

In this format, $description is a placeholder for the description of the
advert, $purpose can only be either "brand-building" or "conversion".
</response-template>.
 \nASSISTANT:\n

[
{
"ad_description": "A woman wearing a colorfu

JSONDecodeError: Expecting value: line 1 column 1 (char 0)