In [247]:
import glob
import json
import os
from typing import Dict, List, Optional

import pytesseract
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAI
from PIL import Image
from pydantic import BaseModel, model_validator


In [248]:
# Glob pattern for the screenshots: every entry exactly two levels below
# ./screen_question (one sub-folder per topic, then the image files).
folder_path = "./screen_question/*/*"

In [249]:
# Preview the paths matched by the pattern.
glob.glob(folder_path)

['./screen_question/S3/CleanShot 2024-07-05 at 16.50.24@2x.png',
 './screen_question/S3/CleanShot 2024-07-05 at 16.52.05@2x.png',
 './screen_question/S3/CleanShot 2024-07-05 at 16.50.54@2x.png',
 './screen_question/S3/CleanShot 2024-07-05 at 16.50.39@2x.png',
 './screen_question/S3/CleanShot 2024-07-05 at 16.50.11@2x.png',
 './screen_question/S3/CleanShot 2024-07-05 at 16.52.38@2x.png',
 './screen_question/S3/CleanShot 2024-07-05 at 16.52.22@2x.png',
 './screen_question/SNS-SQS-KENESIS/CleanShot 2024-07-09 at 18.32.36@2x.png',
 './screen_question/SNS-SQS-KENESIS/CleanShot 2024-07-07 at 12.12.55@2x.png',
 './screen_question/SNS-SQS-KENESIS/CleanShot 2024-07-07 at 12.02.03@2x.png',
 './screen_question/SNS-SQS-KENESIS/CleanShot 2024-07-07 at 12.03.44@2x.png',
 './screen_question/SNS-SQS-KENESIS/CleanShot 2024-07-07 at 12.10.52@2x.png',
 './screen_question/SNS-SQS-KENESIS/CleanShot 2024-07-07 at 12.09.26@2x.png',
 './screen_question/SNS-SQS-KENESIS/CleanShot 2024-07-07 at 12.13.33@2x.png',

In [250]:

# glob.glob() returns paths in a file-system-dependent order; sort them so that
# slices of this list (e.g. list_file[:5]) select the same files on every run.
list_file = sorted(glob.glob(folder_path))

In [251]:

from langchain_community.llms import Ollama

# Local LLM used for the extraction. Assumes Ollama is installed and the model
# has been pulled with `ollama pull gemma2`.
#
# SECURITY: a ChatOpenAI(model="gpt-4", api_key=...) instance with a hardcoded
# API key used to be created here and was immediately overwritten by the Ollama
# model below. It has been removed; the leaked key must be revoked. If OpenAI is
# needed again, supply the key through the OPENAI_API_KEY environment variable
# instead of writing it in the notebook.
model = Ollama(model="gemma2")


In [270]:
# System prompt sent to the LLM (as the SystemMessage in get_json_question).
# It contains two worked examples -- raw question text followed by the expected
# JSON -- and a list of output constraints. The reply is parsed with json.loads,
# so the model must answer with JSON only.
# NOTE(review): the second example's JSON is wrapped in single quotes while the
# first is not -- confirm this does not push the model to emit quoted output.
prompt = """
I extract the data from this question:
Question 4:

Which of the following is an IAM best practice?

O Create several IAM Users for one physical person

© Don't use the root user account

O Share your AWS account credentials with your colleague, so (s)he can perform a task for you

O Do not enable MFA for easier access

And I return only the data in JSON format in this form:
#######
{
    "question": "Which of the following is an IAM best practice?",
    "options": [
        "Create several IAM Users for one physical person",
        "Don't use the root user account",
        "Share your AWS account credentials with your colleague, so (s)he can perform a task for you",
        "Do not enable MFA for easier access"
    ],
    "answer": "Don't use the root user account"
}
other exemple:
Question 2:

You have enabled versioning in your S3 bucket which already contains a lot of files. Which version will the
existing files have?

O 1

O 0

O -1

© null
#######
'{"question":"You have enabled versioning in your S3 bucket which already contains a lot of files. Which version will the existing files have?","options":["1","0","-1","null"],"answer":"null"}'

In some questions, it is asked to find the "incorrect" answer or "Except" and the data should be put in the "answer" field
answer is a str not a list
Be careful that the "answer" must be in list "options"
Correct any typos while respecting the previous constraints
Remove unnecessary line breaks

Respond without an introductory phrase like "Here is the JSON data corresponding to the question"
"""


In [271]:


def extract_text(image_path):
    """Run Tesseract OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image to read.

    Returns:
        The text produced by ``pytesseract.image_to_string`` for that image.
    """
    # Open the image in a context manager so the underlying file handle is
    # released as soon as OCR is done (the image was previously never closed).
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img)

In [272]:

class Question(BaseModel):
    """A multiple-choice question with its options and the expected answer.

    Validation enforces that ``answer`` is one of the entries of ``options``.
    """

    question: str
    options: List[str]
    answer: str
    # Filled in after construction by get_json_question (name of the image's
    # parent folder).
    theme: str = ''
    # Optional so that an explicit None is accepted: model_dump() emits
    # 'embedding': None, and such a dict must be loadable back into Question.
    embedding: Optional[List[float]] = None

    @model_validator(mode='after')
    def correct_must_be_in_reponses(self):
        """Reject the model when ``answer`` is not one of ``options``.

        Raises:
            ValueError: if ``answer`` is missing from ``options``.
        """
        if self.answer not in self.options:
            raise ValueError('All correct answers must be in the responses list')
        return self

In [273]:
def get_json_question(image_path, max_retries=3):
    """OCR a screenshot, ask the LLM to structure it, and return a Question.

    The OCR text is sent to ``model`` together with the system ``prompt``; the
    reply is parsed as JSON and validated through ``Question``. The question's
    ``theme`` is set to the name of the folder containing the image.

    Args:
        image_path: Path of the screenshot to process.
        max_retries: Maximum number of LLM attempts before giving up
            (defaults to 3, the previously hard-coded value).

    Returns:
        The validated ``Question``, or ``None`` when every attempt failed
        (each failure is printed).
    """
    ocr_text = extract_text(image_path)
    messages = [
        SystemMessage(content=prompt),
        HumanMessage(content=ocr_text),
    ]
    theme = os.path.basename(os.path.dirname(image_path))

    for _ in range(max_retries):
        try:
            reponse = model.invoke(input=messages)
            # LLM-style models (e.g. Ollama) return a plain string, whereas
            # chat models return a message object carrying the text in
            # `.content`; accept both so json.loads always receives a str.
            raw_json = reponse if isinstance(reponse, str) else reponse.content
            q = Question(**json.loads(raw_json))
            q.theme = theme
            return q
        except Exception as e:
            # Best effort: the model may return invalid JSON or an answer that
            # is not among the options; report the failure and try again.
            print ("Erreur pour le fichier:",image_path)
            print(e)
    return None

In [274]:
from tqdm import tqdm

# Extract a question from each of the first five screenshots, showing a
# progress bar. Entries are None for files whose extraction failed.
questions = []
for image_file in tqdm(list_file[:5]):
    extracted = get_json_question(image_file)
    questions.append(extracted)

100%|██████████| 5/5 [00:17<00:00,  3.54s/it]


In [278]:
# get_json_question returns None when all attempts fail for a file; skip those
# entries instead of crashing on None.model_dump().
quizz_question_dict = [q.model_dump() for q in questions if q is not None]

In [279]:
# Display the extracted questions as plain dicts.
quizz_question_dict

[{'question': 'You have enabled versioning in your S3 bucket which already contains a lot of files. Which version will the existing files have?',
  'options': ['1', '0', '-1', 'null'],
  'answer': 'null',
  'theme': 'S3',
  'embedding': None},
 {'question': 'You have 3 S3 buckets. One source bucket A, and two destination buckets B and C in different AWS Regions. You want to replicate objects from bucket A to both bucket B and C. How would you achieve this?',
  'options': ['Configure replication from bucket A to bucket B, then from bucket A to bucket C',
   'Configure replication from bucket A to bucket B, then from bucket B to bucket C',
   'Configure replication from bucket A to bucket C, then from bucket C to bucket B'],
  'answer': 'Configure replication from bucket A to bucket B, then from bucket A to bucket C',
  'theme': 'S3',
  'embedding': None},
 {'question': 'You want the content of an S3 bucket to be fully available in different AWS Regions. That will help your team perform 

In [280]:
# Serialise the questions to ./quizz_question.json (ASCII-escaped, 2-space indent).
serialized_questions = json.dumps(quizz_question_dict, ensure_ascii=True, indent=2)
with open('./quizz_question.json', 'w', encoding='utf-8') as out_file:
    out_file.write(serialized_questions)

In [281]:
# Display the in-memory list again after it has been written to disk.
quizz_question_dict

[{'question': 'You have enabled versioning in your S3 bucket which already contains a lot of files. Which version will the existing files have?',
  'options': ['1', '0', '-1', 'null'],
  'answer': 'null',
  'theme': 'S3',
  'embedding': None},
 {'question': 'You have 3 S3 buckets. One source bucket A, and two destination buckets B and C in different AWS Regions. You want to replicate objects from bucket A to both bucket B and C. How would you achieve this?',
  'options': ['Configure replication from bucket A to bucket B, then from bucket A to bucket C',
   'Configure replication from bucket A to bucket B, then from bucket B to bucket C',
   'Configure replication from bucket A to bucket C, then from bucket C to bucket B'],
  'answer': 'Configure replication from bucket A to bucket B, then from bucket A to bucket C',
  'theme': 'S3',
  'embedding': None},
 {'question': 'You want the content of an S3 bucket to be fully available in different AWS Regions. That will help your team perform 

In [282]:
# NOTE(review): same display as the previous cell -- candidate for removal.
quizz_question_dict

[{'question': 'You have enabled versioning in your S3 bucket which already contains a lot of files. Which version will the existing files have?',
  'options': ['1', '0', '-1', 'null'],
  'answer': 'null',
  'theme': 'S3',
  'embedding': None},
 {'question': 'You have 3 S3 buckets. One source bucket A, and two destination buckets B and C in different AWS Regions. You want to replicate objects from bucket A to both bucket B and C. How would you achieve this?',
  'options': ['Configure replication from bucket A to bucket B, then from bucket A to bucket C',
   'Configure replication from bucket A to bucket B, then from bucket B to bucket C',
   'Configure replication from bucket A to bucket C, then from bucket C to bucket B'],
  'answer': 'Configure replication from bucket A to bucket B, then from bucket A to bucket C',
  'theme': 'S3',
  'embedding': None},
 {'question': 'You want the content of an S3 bucket to be fully available in different AWS Regions. That will help your team perform 