In [9]:
from rich import print
import os
from openai import OpenAI
from datasets import Dataset, DatasetDict, load_dataset
import json
from dotenv import load_dotenv

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

# Initialisation of LLM

In [10]:



# Model identifier passed as `model=` by the generation helpers below.
MODEL = "meta/llama-3.1-405b-instruct"

# Client pointed at the NVIDIA integrate endpoint. The API key is read from the
# environment, so a missing NVIDIA_API_KEY raises KeyError right here.
client = OpenAI(
    api_key=os.environ["NVIDIA_API_KEY"],
    base_url="https://integrate.api.nvidia.com/v1",
)

# Creation of subtopics

In [11]:
# Number of Git subtopics to request from the model.
n_subtopics = 3

# Prompt for the subtopic list. `{n_subtopics}` is filled in by
# generate_subtopics() via str.format; without the placeholder the requested
# count was silently ignored. The comma-separated format matches how the reply
# is later split into `subtopic_list`.
TOPIC_GENERATION_PROMPT_TEMPLATE = """\
You are a GIT expert. I want to create a synthetic dataset of natural language and Git commands. Create a list of {n_subtopics} subtopics. Provide only the list without numbers, and without any description of the subtopics. The subtopics must be separated by a comma. It is forbidden to not follow those instructions.
List:
"""
def generate_subtopics(client, n_subtopics):
    """Request a list of Git subtopics from the chat model.

    Args:
        client: Object exposing ``chat.completions.create``.
        n_subtopics: Value handed to the topic prompt template's ``format`` call.

    Returns:
        The complete chat-completion response object; callers read the text
        from ``choices[0].message.content``.
    """
    user_message = {
        "role": "user",
        "content": TOPIC_GENERATION_PROMPT_TEMPLATE.format(n_subtopics=n_subtopics),
    }
    return client.chat.completions.create(
        model=MODEL,
        messages=[user_message],
        temperature=0.2,
        top_p=0.7,
    )
# Fetch the subtopics once and echo the raw model text as a quick sanity check;
# the text is written to disk in the next cell.
responses = generate_subtopics(client=client, n_subtopics=n_subtopics)
print(responses.choices[0].message.content)


# Generating instructions based on subtopics

In [12]:
# Save the generated subtopics to CSV, one row per line of model output.

import csv
lines = responses.choices[0].message.content.splitlines()
# Explicit UTF-8 so non-ASCII model output is written the same way on every platform.
with open('output_subtopics.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    for line in lines:
        # Trim each comma-separated field and drop empties so blank lines and
        # trailing commas do not produce empty rows or cells.
        fields = [field.strip() for field in line.split(',') if field.strip()]
        if fields:
            writer.writerow(fields)

# Number of instructions requested from the model for each subtopic.
n_instructions = 25

# Prompt used by generate_instructions(); `{n_instructions}` and `{sub_topic}`
# are filled in via str.format. The model is asked for an unnumbered,
# newline-separated list with no other text.
# NOTE(review): the prompt text contains the typo "assitant"; it is left
# unchanged here because the string is sent to the model verbatim.
INSTRUCTION_PROMPT_TEMPLATE = """\
The objective is to create a dataset of user instructions in natural language that should be returned by Git commands.
Given a topic in Git, generate {n_instructions} possible concise instructions that could be given to an AI assitant about that topic.
Write some of these instructions as if given by someone with limited knowledge of Git terminologies and knowledge, 
like a beginner programmer. Your response should be in a list format.

The topic is: {sub_topic}
The list must be without numbers. The questions/instructions should be separated by a newline character. There must be no other text than the list.
"""
# Split the model reply into individual subtopics. The reply may be
# comma-separated, newline-separated, or both, so newlines are treated like
# commas; surrounding whitespace and empty entries are dropped.
subtopic_list = [
    entry.strip()
    for entry in responses.choices[0].message.content.replace("\n", ",").split(",")
    if entry.strip()
]
def generate_instructions(client, sub_topic, n_instructions):
    """Ask the chat model for instructions about one Git subtopic.

    Args:
        client: Object exposing ``chat.completions.create``.
        sub_topic: Subtopic text inserted into the instruction prompt.
        n_instructions: Number of instructions requested in the prompt.

    Returns:
        The text content of the first choice in the model's reply.
    """
    print(f"Generating Instructions for {sub_topic}.")
    filled_prompt = INSTRUCTION_PROMPT_TEMPLATE.format(
        n_instructions=n_instructions,
        sub_topic=sub_topic,
    )
    completion = client.chat.completions.create(
        model=MODEL,
        messages=[dict(role="user", content=filled_prompt)],
        temperature=0.2,
        top_p=0.7,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


def instructions_generator(client, subtopic_list, n_instructions):
    """Collect the raw instruction text for every subtopic, in input order.

    Calls ``generate_instructions`` once per entry of ``subtopic_list`` and
    returns the resulting strings as a list.
    """
    collected = []
    for subtopic in subtopic_list:
        collected.append(generate_instructions(client, subtopic, n_instructions))
    return collected

instruction_list = instructions_generator(client, subtopic_list, n_instructions)

# Flatten the per-subtopic replies into one list of instructions. Each line is
# stripped *before* the emptiness check, so whitespace-only lines (or a stray
# "\r") no longer end up as empty instructions sent to the model later.
instruction_list_formatted = []
for instruction_set in instruction_list:
    stripped_lines = (line.strip() for line in instruction_set.split("\n"))
    instruction_list_formatted.extend(line for line in stripped_lines if line)
print(instruction_list_formatted)


# Generating responses

In [14]:
# Save the generated instructions to CSV, one instruction per row.

import csv
# Write the flattened instruction list rather than the subtopic reply, which is
# what this cell wrote before. Each instruction is a single field, so commas
# inside it are quoted by csv.writer instead of being split into columns.
with open('output_instructions.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    for instruction_text in instruction_list_formatted:
        writer.writerow([instruction_text])

# Generating responses

# Prompt used by generate_responses(); `{instruction}` is filled in via
# str.format with a single generated instruction.
# NOTE(review): "Given an question" is a grammar slip in the prompt text; it is
# left unchanged here because the string is sent to the model verbatim.
RESPONSE_PROMPT_TEMPLATE = """\
Given an question/instruction related to Git, generate a response that could be given. 
Keep the response on-topic, informative, concise.

The user prompt is: {instruction}
"""
def generate_responses(client, instruction):
    """Generate one model answer for a single Git instruction.

    Args:
        client: Object exposing ``chat.completions.create``.
        instruction: Instruction text inserted into the response prompt.

    Returns:
        The text content of the first choice in the model's reply
        (the request caps the reply at 1024 tokens).
    """
    user_message = {
        "role": "user",
        "content": RESPONSE_PROMPT_TEMPLATE.format(instruction=instruction),
    }
    completion = client.chat.completions.create(
        model=MODEL,
        messages=[user_message],
        temperature=0.2,
        top_p=0.7,
        max_tokens=1024,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def response_generator(client, instruction_list):
    """Return one generated answer per instruction, in input order.

    Calls ``generate_responses`` once for every entry of ``instruction_list``.
    """
    answers = []
    for instruction in instruction_list:
        answers.append(generate_responses(client, instruction))
    return answers

# One model answer per formatted instruction, in the same order.
instruction_response_list = response_generator(client, instruction_list_formatted)

# Pair every instruction with its answer as a small record.
instruction_response_pair_list = []
for instruction, response in zip(instruction_list_formatted, instruction_response_list):
    instruction_response_pair_list.append(dict(instruction=instruction, responses=response))


def get_scores_from_response(score_response_template):
    """Map each logprob token of the first choice to its logprob value.

    Args:
        score_response_template: Response object whose
            ``choices[0].logprobs.content`` is an iterable of items with
            ``token`` and ``logprob`` attributes.

    Returns:
        A dict ``{token: logprob}``; if a token repeats, the last value wins.
    """
    token_entries = score_response_template.choices[0].logprobs.content
    return {entry.token: entry.logprob for entry in token_entries}

def get_response_and_scores(client, model, question, response_content):
    """Send a question/answer exchange to ``model`` and return its token scores.

    Args:
        client: Object exposing ``chat.completions.create``.
        model: Model identifier used for the request.
        question: Text sent as the user turn.
        response_content: Text sent as the assistant turn.

    Returns:
        The dict produced by ``get_scores_from_response`` for the reply.
    """
    user_turn = {"role": "user", "content": question}
    assistant_turn = {"role": "assistant", "content": response_content}
    completion = client.chat.completions.create(
        model=model,
        messages=[user_turn, assistant_turn],
    )
    return get_scores_from_response(completion)

# Reward model used to score each instruction/response pair.
# NOTE(review): assumed to be NVIDIA's Nemotron-4 340B reward model, whose reply
# is expected to expose "helpfulness" and "verbosity" as logprob tokens.
# Confirm the model id and its availability before running.
REWARD_MODEL = "nvidia/nemotron-4-340b-reward"

# A pair is kept only if helpfulness >= 3 and verbosity <= 2.5.
helpfulness_THRESHOLD = 3
verbosity_THRESHOLD = 2.5

# Start from every generated pair and score each one. Previously
# `synthetic_data` was an empty list and `score_list` was never defined, so the
# filter below always produced an empty dataset.
synthetic_data = list(instruction_response_pair_list)
score_list = [
    get_response_and_scores(client, REWARD_MODEL, pair["instruction"], pair["responses"])
    for pair in synthetic_data
]

synthetic_data = [
    data
    for data, scores in zip(synthetic_data, score_list)
    if not (scores["helpfulness"] < helpfulness_THRESHOLD
            or scores["verbosity"] > verbosity_THRESHOLD)
]