In [24]:
from langchain_community.chat_models import ChatOllama # type: ignore
from langchain_core.output_parsers import StrOutputParser # type: ignore
from langchain_core.prompts import ChatPromptTemplate # type: ignore
from langchain_core.output_parsers import JsonOutputParser # type: ignore
from langchain_core.pydantic_v1 import BaseModel, Field # type: ignore
from langchain_teddynote.messages import stream_response  # type: ignore
from bs4 import BeautifulSoup # type: ignore
from openai import OpenAI # type: ignore
from tqdm import tqdm # type: ignore
from groq import Groq # type: ignore
import re
import json
import os
import requests # type: ignore
import copy
import time

In [None]:
# Groq client used for both Llama extraction passes.
# The key is read from the GROQ_API_KEY environment variable so that no
# credential is ever stored in the notebook; a missing variable raises
# KeyError here instead of surfacing later as a failed request.
groq_client = Groq(
    api_key=os.environ["GROQ_API_KEY"]
)

In [47]:
# Model identifier: selects the ./models/<model_name>/ folder that is read
# from and written to, and is interpolated into the prompts.
model_name = "bert-base-cased"

In [48]:
# Read the saved page for the selected model (UTF-8 text) ...
with open(f"./models/{model_name}/Contents.txt", mode="r", encoding="utf-8") as contents_file:
    html_content = contents_file.read()

# ... and parse it with the standard-library HTML parser backend.
soup = BeautifulSoup(markup=html_content, features="html.parser")

In [49]:
# Flatten the parsed page to its text content and trim surrounding whitespace.
model_description = soup.get_text().strip()

In [50]:
# Target metadata schema: field name -> plain-English description of the field.
# The mapping is interpolated as-is into prompt_1st, prompt_2nd and input_text.
metadata_item = dict(
    ModelArchitecture="The structure or framework of the model (e.g., Transformer, RNN, CNN)",
    Purpose="The main use cases or applications the model is designed for (e.g., natural language processing, image recognition)",
    ModelTrainingTime="The amount of time taken to train the model",
    MaintainerContact="Contact information for the model's maintainers",
    RelatedDocumentation="Documents or resources related to the model",
    CommunityFeedback="Feedback from users or the community",
    InferenceSpeed="The speed of the model's inference (how quickly it makes predictions)",
    PerformanceMetric="Metrics used to evaluate the model’s performance (e.g., accuracy, F1 score)",
    KnownLimitations="Known limitations or constraints of the model",
    IntendedUse="The intended use or purpose of the model",
    EthicalConsiderations="Ethical considerations when using the model",
)

In [51]:
# First-pass prompt: asks the model to copy out, verbatim, every paragraph of
# the model description that relates to the metadata fields.
# `metadata_item` is interpolated via the dict's string form and
# `model_description` is the stripped page text.
# NOTE(review): the Llama header/eot tokens are written by hand here, while the
# whole string is later sent as the content of a single "user" message —
# confirm the chat endpoint does not also apply its own chat template.
prompt_1st = f"""
<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
Follow the instructions below to ensure that the Llama 3.1 model can accurately understand and perform the task.
<|eot_id|>

<|start_header_id|>user<|end_header_id|>
Metadata Item:  
{metadata_item}

Model Description:  
{model_description}

Instructions:  
1. Identify the paragraph(s) from the {model_name} model description that contain content related to the Metadata Item.  
2. Extract and reproduce the entire paragraph(s) exactly as they appear in the original model_description.  
3. Do not make any changes, interpretations, or omissions. Preserve the original context and meaning fully.  
4. Extract the paragraph(s) as generously as possible, even if the content may only be indirectly related to the Metadata Item.  
5. Double-check that all paragraphs relevant to the Metadata Item have been included.  
6. Do not generate or add any additional information beyond what is explicitly present in the model description.
<|eot_id|>

<|start_header_id|>assistant<|end_header_id|>
Answer:
<|eot_id|>                                                                                                           
"""

In [52]:
# First pass: send the paragraph-extraction prompt to Groq as one user turn.
groq_chat = groq_client.chat.completions.create(
    model="llama3-70b-8192",
    messages=[dict(role="user", content=prompt_1st)],
)

In [53]:
# Text of the first choice in the first-pass reply; it is interpolated into
# prompt_2nd as the narrowed-down model description.
content_related_metadataitem = groq_chat.choices[0].message.content

In [54]:
# Second-pass prompt: works on the first-pass excerpt
# (`content_related_metadataitem`) and asks for short values/keywords per
# metadata field, returned as JSON.
# NOTE(review): as with prompt_1st, the Llama header/eot tokens are written by
# hand and the string is sent as a single "user" message — confirm the chat
# endpoint does not also apply its own chat template.
prompt_2nd = f"""
<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
Please carefully follow the instructions to help the Llama 3 model understand and complete the task accurately.
<|eot_id|>

<|start_header_id|>user<|end_header_id|>
Metadata Item:  
{metadata_item}

Model Description related to Metadata Item:  
{content_related_metadataitem}

Instructions:  
1. Find and extract the specific instances from the {model_name} model description that directly relate to the Metadata Item.  
2. Reproduce the instance(s) exactly as they appear without making any modifications or omissions.
3. Do not add any additional information beyond what is present in the original description.
4. Do not include long sentences, descriptions, or interpretations. Only extract and return the value or keyword that directly corresponds to the metadata item.
5. Ensure all matched metadata items and their corresponding values are returned in valid JSON format. Follow strict JSON formatting rules.
<|eot_id|>

<|start_header_id|>assistant<|end_header_id|>
Answer:
<|eot_id|>
"""

In [55]:
# Second pass: send the value-extraction prompt to Groq as one user turn.
# This rebinds `groq_chat`, replacing the first-pass response object.
groq_chat = groq_client.chat.completions.create(
    model="llama3-70b-8192",
    messages=[dict(role="user", content=prompt_2nd)],
)

In [56]:
# Text of the first choice in the second-pass reply; it is placed at the top
# of input_text for the final OpenAI request.
res = groq_chat.choices[0].message.content

In [None]:
# OpenAI client used for the final JSON-mapping request.
# The key is read from the OPENAI_API_KEY environment variable so that no
# credential is ever stored in the notebook; a missing variable raises
# KeyError here instead of surfacing later as a failed request.
openai_client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"]
)

In [58]:
# Final prompt: the second-pass result (`res`) followed by instructions to map
# it onto the metadata fields, with null for fields that are not mentioned, and
# to answer in JSON. The indentation inside the literal is part of the string
# that gets sent.
input_text = f"""{res}\n\n
                Above content provided contains the description for the {model_name} model. 
                Please extract instances that correspond to the following metadata items.
                *** 
                If a specific item is not mentioned in the content, insert a null value for that metadata.
                Match the metadata items with specific values or keywords found in the model description.
                Avoid long descriptive sentences or explanations—only extract the direct value or keyword that matches the metadata item.
                ***
                Metadata items to map:\n{metadata_item}\n
                Once extracted, please return the result in a well-formatted JSON structure. 
                Ensure the output adheres to proper JSON syntax. """

In [59]:
# Final pass: ask gpt-4o-mini to map the extracted values onto the metadata
# fields, sending the prompt as one user turn.
openai_chat = openai_client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[dict(role="user", content=input_text)],
)

In [60]:
# Text of the first choice in the OpenAI reply. As the displayed output shows,
# the JSON may be wrapped in an introductory sentence and a code fence.
response_message = openai_chat.choices[0].message.content

In [61]:
# Display the raw reply to see what surrounds the JSON payload.
response_message

'Here is the extracted metadata in valid JSON format based on the provided content:\n\n```json\n{\n  "ModelArchitecture": "transformers",\n  "Purpose": "sequence classification, token classification or question answering",\n  "ModelTrainingTime": "one million steps",\n  "MaintainerContact": null,\n  "RelatedDocumentation": null,\n  "CommunityFeedback": null,\n  "InferenceSpeed": null,\n  "PerformanceMetric": "Glue test results: MNLI-(m/mm), QQP, QNLI, SST-2, CoLA, STS-B, MRPC, RTE",\n  "KnownLimitations": "biased predictions",\n  "IntendedUse": "masked language modeling, next sentence prediction, fine-tuning on downstream tasks",\n  "EthicalConsiderations": "biased predictions"\n}\n```'

In [62]:
# Locate the JSON object inside the reply: a greedy match from the first "{"
# to the last "}", with DOTALL so the match can span multiple lines.
gpt_inst = re.search(r'(\{.*\})', response_message, re.DOTALL)
# re.search returns None when the reply contains no braces; fail here with a
# readable message instead of an AttributeError on `gpt_inst.group(0)` later.
if gpt_inst is None:
    raise ValueError(
        f"No JSON object found in the model response: {response_message!r}"
    )

In [63]:
# Display the matched text (group 0 is the whole match, "{" through "}").
gpt_inst.group(0)

'{\n  "ModelArchitecture": "transformers",\n  "Purpose": "sequence classification, token classification or question answering",\n  "ModelTrainingTime": "one million steps",\n  "MaintainerContact": null,\n  "RelatedDocumentation": null,\n  "CommunityFeedback": null,\n  "InferenceSpeed": null,\n  "PerformanceMetric": "Glue test results: MNLI-(m/mm), QQP, QNLI, SST-2, CoLA, STS-B, MRPC, RTE",\n  "KnownLimitations": "biased predictions",\n  "IntendedUse": "masked language modeling, next sentence prediction, fine-tuning on downstream tasks",\n  "EthicalConsiderations": "biased predictions"\n}'

In [64]:
# Keep the matched JSON text as a string; it is parsed into a dict afterwards.
chatgpt_inst = gpt_inst.group(0)

In [66]:
# Convert the extracted JSON string into a dictionary.
# Parse straight from the regex match rather than from `chatgpt_inst` itself:
# the previous form overwrote its own input, so running the cell a second time
# passed the already-parsed dict to json.loads and raised TypeError.
chatgpt_inst = json.loads(gpt_inst.group(0))

In [67]:
# Display the model identifier that determines the output folder used next.
model_name

'bert-base-cased'

In [68]:
# Write the parsed metadata as Contents.json in the model's folder,
# pretty-printed with a 4-space indent.
folder_path = f'./models/{model_name}/'

with open(f'{folder_path}Contents.json', mode='w') as out_file:
    json.dump(obj=chatgpt_inst, fp=out_file, indent=4)