Kernel: huggingface1

### Imports

In [None]:
import sys
import os
import json

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Make the local ``util`` helpers importable from this notebook.
util_path = '../util'

# Resolve the relative location to an absolute directory
child_dir = os.path.abspath(util_path)

# Register the directory only once, so re-running the cell does not add duplicates
if child_dir in sys.path:
    print('child_dir already in sys.path')
else:
    sys.path.append(child_dir)
    print('child_dir added to sys.path')

In [None]:
from util import image_to_base64, getFileList, call_llm_api
from tqdm import tqdm

### Config

In [None]:
# Folder that is scanned for the .png images to annotate
images_base_path = "/mnt/batch/tasks/shared/LS_root/mounts/clusters/computeinstance10-gpu/code/datasets/face_mask/images"

# Train and test files will be created at this path (Task = <MORE_DETAILED_CAPTION>)
annotations_captions_jsonl_base_path = "../annotations"

In [None]:
# Connection settings passed to call_llm_api when generating annotations.
# The endpoint and key are read from the GPT4V_ENDPOINT / GPT4V_KEY environment variables
# when they are set, so real credentials never have to be typed into (and saved with) the
# notebook. The original placeholders remain the fallback values.
GPT4V_ENDPOINT = os.environ.get('GPT4V_ENDPOINT', 'https://XXXXX.openai.azure.com/openai/deployments/gpt-4o-global-standard/chat/completions')
GPT4V_API_VERSION = '2024-05-01-preview' #'2024-02-15-preview'
GPT4V_KEY = os.environ.get('GPT4V_KEY', 'YOUR_API_KEY')
GPT4V_ENABLED = False # Must be set to True to generate annotations, safety switch

In [None]:
# Echo the safety switch so it is obvious whether the LLM cells will do any work
print('GPT4V_ENABLED:{}'.format(GPT4V_ENABLED))

Specify which config to use; this will depend on the use case. Create an additional config (e.g. c) if required for a use case.

In [None]:
# Select the active few-shot configuration:
#   'a' -> annotation caption as per few_shot_config_a few-shot examples
#   'b' -> annotation caption as per few_shot_config_b few-shot examples

# which_config_enabled = "a"
which_config_enabled = "b"

In [None]:
# Few-shot examples for config 'a': image file name -> expected caption and yes/no attributes.
# Each value is serialised with json.dumps and shown to the LLM as the expected response.
few_shot_config_a = {
    "maksssksksss0.png": dict(
        caption="There are two people standing next to each other. There is a woman wearing a white jacket and a black hat. The woman on the left side is wearing a face mask on her face. There are lot of people in the background and appears to be a busy street.",
        everyone_wearing_mask="no",
        anyone_wearing_glasses="no",
    ),
    "maksssksksss1.png": dict(
        caption="There are people standing a que. A person at the front is checking the temperature with a digital thermo-meter. They are wearing warm cloths and some people are wearing masks. This is an indoor location.",
        everyone_wearing_mask="no",
        anyone_wearing_glasses="yes",
    ),
}

In [None]:
# Only used by config 'b': number of question/answer pairs expected from the LLM,
# matching the few-shot example format defined in the cell below.
no_of_questions = 4

In [None]:
# Few-shot examples for config 'b': image file name -> numbered question/answer pairs.
# Keys follow the question_<n> / answer_<n> pattern that the save step later splits on.
few_shot_config_b = {
    "maksssksksss0.png": dict(
        question_1="What are people doing?",
        answer_1="Walking, people are walking on a street.",
        question_2="Does this look like a photo taken indoors?",
        answer_2="No, it looks like it was taken outdoors.",
        question_3="Is this photo taken during the day or night?",
        answer_3="Evening, it appears to be taken during the evening (or night) due to the lighting.",
        question_4="Are people carrying any items?",
        answer_4="Yes, one person is holding a mobile phone.",
    ),
    "maksssksksss1.png": dict(
        question_1="What are people doing?",
        answer_1="Waiting, people are waiting in a queue in a crowded area for temperature check.",
        question_2="Does this look like a photo taken indoors?",
        answer_2="Yes, it looks like it was taken indoors.",
        question_3="Is this photo taken during the day or night?",
        answer_3="Unclear, as it is indoors and artificial lighting is used.",
        question_4="Are people carrying any items?",
        answer_4="Yes, some people are carrying backpacks.",
    ),
}

In [None]:
# Set which few-shot examples to be used

if which_config_enabled == 'a':
    few_shot_config = few_shot_config_a
    print(f'Config a')
elif which_config_enabled == 'b':
    few_shot_config = few_shot_config_b
    print(f'Config b')
else:
    # Fail here with a clear message; otherwise few_shot_config stays undefined and a
    # later cell fails with a NameError that does not point at the real cause.
    raise ValueError(f"Unsupported which_config_enabled: {which_config_enabled!r} (expected 'a' or 'b')")

In [None]:
# For how many images automated annotations to be created, these will be used for fine-tuning
training_example_count = 500  # 10 # 50

# Arguments forwarded to train_test_split when the annotations are saved
test_size = 0.1
random_state = 42

print('The code will create annotations for {} images from the images_base_path'.format(training_example_count))

In [None]:
# Physical files that will be created (train and test)

# Only the file-name stem differs between the configs; the train/test paths are built once below.
if which_config_enabled == 'a':
    _annotations_file_stem = 'face_caption_annotations_sample'
elif which_config_enabled == 'b':
    _annotations_file_stem = 'face_input_text_annotations_sample'
else:
    # Without this branch the path variables stay undefined and later cells fail with a NameError.
    raise ValueError(f"Unsupported which_config_enabled: {which_config_enabled!r} (expected 'a' or 'b')")

annotations_jsonl_path_train = os.path.join(annotations_captions_jsonl_base_path, _annotations_file_stem + '_train' + '.jsonl')
annotations_jsonl_path_test = os.path.join(annotations_captions_jsonl_base_path, _annotations_file_stem + '_test' + '.jsonl')
print(f'Config {which_config_enabled} annotations')

In [None]:
# Task prompt stored as the "prefix" of every config 'a' annotation record
# prefix = '<MORE_DETAILED_CAPTION_CUSTOM>'
prefix = '<MORE_DETAILED_CAPTION>'

### Common Functions

In [None]:
# Build the few-shot examples in the format passed to call_llm_api:
# one dictionary per example image, holding the base64-encoded image and the
# JSON-serialised annotation that the LLM is expected to produce for it.
encoded_image_text_pairs = []  # List of dictionaries

for example_image_name, expected_annotation in few_shot_config.items():
    example_image_path = os.path.join(images_base_path, example_image_name)
    encoded_image_text_pairs.append(
        {
            "base64_image": image_to_base64(example_image_path),
            "expected_response": json.dumps(expected_annotation),  # Expected output text
        }
    )

In [None]:
# Collect the .png files under images_base_path and keep only the first
# training_example_count entries (subset size comes from the config cell).
# NOTE(review): the order, and therefore which images are selected, is whatever
# getFileList returns — confirm it is deterministic if reproducible subsets matter.
image_list = getFileList(images_base_path, '.png')[:training_example_count]

In [None]:
# print(len(getFileList(images_base_path, '.png')))

In [None]:
%%time

# Configs 'a' and 'b' share one request loop and differ only in the record they keep:
#   'a' stores the raw LLM text as the "suffix" of a caption record,
#   'b' parses the LLM text as JSON so the question/answer pairs can be split out when saving.

def _extract_llm_content(llm_response_json):
    """Return the message text of the first choice in an LLM response.

    Args:
        llm_response_json: Value returned by call_llm_api; assumed to be a dict when
            truthy (it is indexed by key), and may be None or empty.

    Returns:
        The value at choices[0]["message"]["content"], or None when the response is
        empty, has no choices, or the first choice carries no message content.
    """
    if not llm_response_json:
        return None
    choices = llm_response_json.get("choices") or []
    if not choices:
        return None
    message = choices[0].get("message") or {}
    return message.get("content")


if GPT4V_ENABLED and which_config_enabled in ('a', 'b'):

    print(f'Processing {len(image_list)} images, which_config_enabled {which_config_enabled}')

    image_annotations = []
    skipped_images = []  # Images for which the LLM gave no usable annotation

    for img in tqdm(image_list):

        img_name = os.path.basename(img)

        # Generate only for images that are not in the few-shot example list
        if img_name in few_shot_config:
            continue

        encoded_image = image_to_base64(img)
        llm_response_json = call_llm_api(GPT4V_ENDPOINT, GPT4V_API_VERSION, GPT4V_KEY, encoded_image, encoded_image_text_pairs, which_config_enabled)

        llm_content = _extract_llm_content(llm_response_json)
        if llm_content is None:
            skipped_images.append(img_name)
            continue

        if which_config_enabled == 'a':
            image_annotations.append({"image": img_name, "prefix": prefix, "suffix": llm_content})
            continue

        # Config 'b': the reply must be a JSON object. One malformed reply must not
        # abort the whole run and throw away the annotations collected so far.
        try:
            parsed_content = json.loads(llm_content)
        except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError
            skipped_images.append(img_name)
            continue

        # The save step looks up question_<n>/answer_<n> keys, so anything but a dict is unusable
        if not isinstance(parsed_content, dict):
            skipped_images.append(img_name)
            continue

        image_annotations.append({"image": img_name, "content": parsed_content})

    if skipped_images:
        print(f'Skipped {len(skipped_images)} images without a usable LLM response: {skipped_images}')

    print(f'Completed, which_config_enabled: {which_config_enabled}')

In [None]:
# image_annotations

Materialise the responses into JSON file records. This is where you decide how the response is transformed and saved as annotations in the expected format.

In [None]:
# Config 'a'

def _write_jsonl(jsonl_path, records):
    """Write ``records`` to ``jsonl_path`` as JSON Lines (one JSON document per line).

    Args:
        jsonl_path: Destination file; it is overwritten if it already exists.
        records: Iterable of JSON-serialisable dictionaries.
    """
    # Create the output folder first: a missing folder would otherwise make open() fail
    # after the (slow, billable) LLM run and lose the in-memory annotations.
    os.makedirs(os.path.dirname(os.path.abspath(jsonl_path)), exist_ok=True)
    with open(jsonl_path, 'w') as jsonl_file:
        for record in records:
            jsonl_file.write(json.dumps(record) + '\n')


if GPT4V_ENABLED and which_config_enabled == 'a':

    # Split the list into train and test sets
    image_annotations_train, image_annotations_test = train_test_split(image_annotations, test_size=test_size, random_state=random_state)
    print(f'len(image_annotations_train):{len(image_annotations_train)}')
    print(f'len(image_annotations_test):{len(image_annotations_test)}')

    # Save each split into its own jsonl file
    _write_jsonl(annotations_jsonl_path_train, image_annotations_train)
    _write_jsonl(annotations_jsonl_path_test, image_annotations_test)

    print(f'Files created: \n{annotations_jsonl_path_train}, \n{annotations_jsonl_path_test}')

In [None]:
# Config 'b'

def _write_qa_pairs_jsonl(jsonl_path, annotations, question_count):
    """Write one JSON line per question/answer pair found in ``annotations``.

    Args:
        jsonl_path: Destination file; it is overwritten if it already exists.
        annotations: Iterable of {"image": ..., "content": {...}} dictionaries, where
            content holds question_<n> / answer_<n> keys.
        question_count: Highest pair number to look for (pairs are numbered from 1).

    Each written record has the form {"image", "prefix": question, "suffix": answer}.
    Pair numbers for which the question or the answer key is missing are skipped.
    """
    # Create the output folder first: a missing folder would otherwise make open() fail
    # after the (slow, billable) LLM run and lose the in-memory annotations.
    os.makedirs(os.path.dirname(os.path.abspath(jsonl_path)), exist_ok=True)
    with open(jsonl_path, 'w') as jsonl_file:
        for annotation in annotations:
            content = annotation["content"]
            for question_no in range(1, question_count + 1):
                question_key = f'question_{question_no}'
                answer_key = f'answer_{question_no}'

                # Check if question/answer pair exists
                if question_key in content and answer_key in content:
                    qa_pair_response = {"image": annotation["image"],
                                        "prefix": content[question_key],
                                        "suffix": content[answer_key]}
                    jsonl_file.write(json.dumps(qa_pair_response) + '\n')


if GPT4V_ENABLED and which_config_enabled == 'b':

    # Split per image (before expanding into pairs) so that all question/answer pairs
    # of one image end up in the same split
    image_annotations_train, image_annotations_test = train_test_split(image_annotations, test_size=test_size, random_state=random_state)
    print(f'len(image_annotations_train):{len(image_annotations_train)}')
    print(f'len(image_annotations_test):{len(image_annotations_test)}')

    # Save each split into its own jsonl file
    _write_qa_pairs_jsonl(annotations_jsonl_path_train, image_annotations_train, no_of_questions)
    _write_qa_pairs_jsonl(annotations_jsonl_path_test, image_annotations_test, no_of_questions)

    print(f'Files created: \n{annotations_jsonl_path_train}, \n{annotations_jsonl_path_test}')

### Create annotation copy without Caption
These new files, with the `_trimmed` suffix, will not have the full caption; they keep only the dictionary-format fields.

e.g.
{"image": "maksssksksss539.png", "prefix": "<MORE_DETAILED_CAPTION>", "suffix": "{\"everyone_wearing_mask\": \"no\", \"anyone_wearing_glasses\": \"yes\"}"}

In [None]:
# Derive the "_trimmed" output paths from the train/test annotation paths
annotations_jsonl_path_train_trimmed = annotations_jsonl_path_train.replace(
    '_train.jsonl', '_train_trimmed.jsonl'
)
annotations_jsonl_path_test_trimmed = annotations_jsonl_path_test.replace(
    '_test.jsonl', '_test_trimmed.jsonl'
)

print('annotations_jsonl_path_train_trimmed:{}'.format(annotations_jsonl_path_train_trimmed))
print('annotations_jsonl_path_test_trimmed:{}'.format(annotations_jsonl_path_test_trimmed))

In [None]:
# Config 'a'

def _write_trimmed_jsonl(source_path, trimmed_path):
    """Copy a config 'a' annotation file, keeping only the yes/no fields of each suffix.

    Args:
        source_path: Existing JSONL file whose records hold "image", "prefix" and a
            "suffix" containing JSON text.
        trimmed_path: Destination JSONL file; it is overwritten if it already exists.

    Returns:
        Number of records skipped because their suffix was not valid JSON or did not
        contain both "everyone_wearing_mask" and "anyone_wearing_glasses".
    """
    skipped = 0
    with open(source_path, 'r') as source_file, open(trimmed_path, 'w') as trimmed_file:
        for line in source_file:
            if not line.strip():
                continue  # Tolerate blank lines in the source file

            json_obj = json.loads(line)

            # The suffix is unvalidated LLM text. Parse it once, and skip a bad record
            # instead of raising mid-file and leaving a truncated output.
            try:
                suffix_fields = json.loads(json_obj["suffix"])
                trimmed_suffix = {"everyone_wearing_mask": suffix_fields["everyone_wearing_mask"],
                                  "anyone_wearing_glasses": suffix_fields["anyone_wearing_glasses"]}
            except (TypeError, ValueError, KeyError):  # json.JSONDecodeError is a ValueError
                skipped += 1
                continue

            # NOTE(review): the suffix is written as a nested JSON object, whereas the
            # untrimmed files and the example in the section text carry it as a
            # JSON-encoded string — confirm which form the consumer expects.
            json_obj_trimmed = {"image": json_obj["image"],
                                "prefix": json_obj["prefix"],
                                "suffix": trimmed_suffix}

            trimmed_file.write(json.dumps(json_obj_trimmed) + '\n')
    return skipped


if which_config_enabled == 'a':

    # Same transformation for the train file and the test file
    for _source_path, _trimmed_path in ((annotations_jsonl_path_train, annotations_jsonl_path_train_trimmed),
                                        (annotations_jsonl_path_test, annotations_jsonl_path_test_trimmed)):
        _skipped_count = _write_trimmed_jsonl(_source_path, _trimmed_path)
        if _skipped_count:
            print(f'Skipped {_skipped_count} records with an unusable suffix in {_source_path}')