# Creating a dataset for a bot that thinks

### First we'll load some libraries, load the LLM (which takes about 15 minutes), and connect it to the rest of the code via `LLM_conection`

In [1]:
import torch
from trl import SFTTrainer
import os
from datasets import load_dataset
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, is_bfloat16_supported
import time
import json

from src.Basic_LLM_functions import LLM_conection
from src.Dataset_generation import generate_full_question,save_question_and_log,subtopic_prompt_template,question_prompt_template
from src.Chain_of_thought import Chain_of_thought

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [2]:
# Sequence-length limit handed to the loader below.
max_seq_length = 2048

# Load the model and tokenizer from the local "LLMs/Meta-Llama-3.1-8B" path
# (make sure this path is correct on your machine).
model, tokenizer = FastLanguageModel.from_pretrained(
    # What to load and how large a context to configure.
    model_name="LLMs/Meta-Llama-3.1-8B",
    max_seq_length=max_seq_length,
    # Precision / quantization options.
    dtype=None,
    load_in_4bit=True,
    # Download and remote-code options.
    local_files_only=True,
    force_download=False,
    trust_remote_code=False,
)


DEBUG: Starting new HTTPS connection (1): raw.githubusercontent.com:443
DEBUG: https://raw.githubusercontent.com:443 "GET /unslothai/unsloth/main/unsloth/models/mapper.py HTTP/11" 200 2189
DEBUG: Starting new HTTPS connection (1): huggingface.co:443
DEBUG: https://huggingface.co:443 "HEAD /LLMs/Meta-Llama-3.1-8B/resolve/main/adapter_config.json HTTP/11" 401 0


==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 24.0 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


DEBUG: https://huggingface.co:443 "HEAD /unslothai/other/resolve/main/config.json HTTP/11" 200 0
DEBUG: Attempting to acquire lock 140221051725712 on /home/jaime/.cache/huggingface/hub/.locks/models--unslothai--other/c0fd5260818f672a9119328c53cc33df84f30698.lock
DEBUG: Lock 140221051725712 acquired on /home/jaime/.cache/huggingface/hub/.locks/models--unslothai--other/c0fd5260818f672a9119328c53cc33df84f30698.lock
DEBUG: https://huggingface.co:443 "GET /unslothai/other/resolve/main/config.json HTTP/11" 200 635
DEBUG: Attempting to release lock 140221051725712 on /home/jaime/.cache/huggingface/hub/.locks/models--unslothai--other/c0fd5260818f672a9119328c53cc33df84f30698.lock
DEBUG: Lock 140221051725712 released on /home/jaime/.cache/huggingface/hub/.locks/models--unslothai--other/c0fd5260818f672a9119328c53cc33df84f30698.lock
DEBUG: https://huggingface.co:443 "HEAD /unslothai/other/resolve/main/config.json HTTP/11" 200 0
DEBUG: Attempting to acquire lock 140221050403472 on /home/jaime/.cach

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LLMs/Meta-Llama-3.1-8B does not have a padding token! Will use pad_token = <|finetune_right_pad_id|>.


In [3]:
# Wrap the tokenizer with the "chatml" chat template. The mapping uses the same
# "from"/"value" keys and "human"/"gpt" labels as the conversation records that
# the last cell of this notebook writes out.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",
    mapping={
        "role": "from",
        "content": "value",
        "user": "human",
        "assistant": "gpt",
    },
)

# Put the model into Unsloth's inference mode.
# NOTE(review): `model` is rebound to the call's return value; confirm that the
# installed Unsloth version returns the model here rather than None.
model = FastLanguageModel.for_inference(model)

Unsloth: Will map <|im_end|> to EOS = <|eot_id|>.


In [4]:
# Hand the loaded model and tokenizer to LLM_conection; a later cell calls
# LLM_conection.Get_answer(prompt) without passing a model explicitly.
LLM_conection.set_model(model, tokenizer)

### Let's create the questions

In [5]:
# Number of questions to generate; change this to produce more or fewer.
num_questions = 1000

# (label shown in the progress output, key looked up in the question record)
printed_fields = (
    ("Main Topic", "main_topic"),
    ("Subtopic", "subtopic"),
    ("Question", "question"),
)

for i in range(num_questions):
    print(f"Generating question {i+1}/{num_questions}")

    # Generate one question and persist it together with the prompt templates.
    question_data = generate_full_question(model, tokenizer)
    save_question_and_log(
        question_data,
        subtopic_prompt_template,
        question_prompt_template,
    )

    # Echo the generated record so progress can be followed in the output.
    for label, key in printed_fields:
        print(f"{label}: {question_data[key]}")
    print("\n---\n")

    # Two-second pause between iterations (originally described as a guard
    # against hitting API rate limits).
    time.sleep(2)

print("Question generation complete. Check the 'questions' and 'log' folders for output.")

Generating question 1/1000
Main Topic: Statistics and Data Science
Subtopic: Identifying patterns in skewed distributions using quantile-based regression techniques.
Question: In a hypothetical dataset containing exam scores of students from a particular school, where the scores are skewed towards higher values, a researcher is interested in understanding the relationship between the scores and the number of hours students spent studying. Using quantile-based regression techniques, the researcher aims to identify patterns in the data that could help in predicting the scores of students who have studied for a certain number of hours. If the 75th percentile of the scores corresponds to students who have studied for approximately 5 hours, and the 25th percentile corresponds to students who have studied for approximately 2 hours, what can be inferred about the scores of students who have studied for 3 hours, and how does this information relate to the overall distribution of scores in the 

In [6]:
# Replace the module-level `main_topics` attribute of src.Dataset_generation
# with a single-entry list.
# NOTE(review): presumably this limits later question generation to
# Mathematics; confirm that the generator reads `main_topics` at call time.
import src.Dataset_generation as df_gen

df_gen.main_topics = ["Mathematics"]

### Let's create the answers via Chain of Thought

In [7]:
# Create the answers directory if it is missing, then record which files it
# already contains.
os.makedirs("Dataset_creation/answers", exist_ok=True)
answered_questions = os.listdir("Dataset_creation/answers")

In [8]:
# Collect (filename, text) pairs for every question that has no answer yet.
txt_files_contents = []
folder_path = "Dataset_creation/questions"

# Answers are stored as "<stem>.json" while questions are "<stem>.txt", so a
# direct filename comparison never matches. Compare on the extension-less stem.
answered_stems = {os.path.splitext(name)[0] for name in answered_questions}

for filename in os.listdir(folder_path):
    # Only plain-text question files are considered.
    if not filename.endswith(".txt"):
        continue
    # Skip questions whose answer file already exists.
    if os.path.splitext(filename)[0] in answered_stems:
        continue
    # Read as UTF-8, matching how the final cell of this notebook reads the
    # same question files.
    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
        txt_files_contents.append((filename, file.read()))

In [23]:
# Run the chain-of-thought pipeline on every pending question and save each
# usable result as "<question stem>.json" in Dataset_creation/answers.
dataset = []  # kept from the original cell; nothing below appends to it

# Make the cell self-sufficient: the output folder may not exist yet.
os.makedirs("Dataset_creation/answers", exist_ok=True)

for filename, question in txt_files_contents:
    print(f"Processing {filename}")
    output = Chain_of_thought(question, verbose=0)
    print(f"Finished {filename}")

    print(output)

    # Only keep results that contain at least one simplified step; a missing
    # or empty "simplified_steps" entry is skipped instead of raising.
    if not output.get("simplified_steps"):
        print(f"No simplified steps for {filename}; nothing saved.")
        continue

    # Save the result under the question's name with a .json extension.
    answer_path = os.path.join(
        "Dataset_creation/answers",
        os.path.splitext(filename)[0] + ".json",
    )
    with open(answer_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=4)
    print(f"Saved {answer_path}")

Processing sample_2024-09-27-20-49-56_Mathematics.txt
Processing sample_2024-09-27-20-49-56_Mathematics.txt
{'question': "What is the most convincing argument for reframing the Sleeping Beauty problem as a scenario where Beauty is not actually sleeping, but rather, in a state of suspended animation, and how does this reframing impact the probability of her waking up on Sunday, given that the original problem's outcome is equally likely to occur on either Saturday or Sunday?", 'information': "This is a classic problem in philosophy and probability theory, known as the Sleeping Beauty problem. The original problem states that Sleeping Beauty is put to sleep on Sunday and a fair coin is flipped. If the coin lands heads up, Beauty is awakened on Sunday and given some information about the outcome of the coin flip. If the coin lands tails up, Beauty is awakened on Monday. The question is, what is the probability that the coin landed heads up, given that Beauty is awakened on Sunday?\n\nThe 

KeyboardInterrupt: 

### Let's create the finetuning dataset

In [14]:
# Turn every saved chain-of-thought record into a "thinking out loud" answer
# written by the LLM, stored in Answers_finetuning_dataset under the same
# filename as the input record.
os.makedirs("Answers_finetuning_dataset", exist_ok=True)

# NOTE(review): this cell reads from "answers", whereas the chain-of-thought
# cell writes to "Dataset_creation/answers" — confirm the working directory or
# the intended folder before running.
answered_questions = os.listdir("answers")

for filename in answered_questions:
    if not filename.endswith(".json"):
        continue

    # Load the record and close the file before the slow LLM call below.
    with open(os.path.join("answers", filename), 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Number the reasoning steps, one per line ("1. ...", "2. ...").
    steps = data["simplified_steps"]
    steps_combined = "".join(
        f"{i}. {step}\n" for i, step in enumerate(steps, start=1)
    )

    # The prompt text (including its indentation) is kept exactly as before so
    # the generated data does not change.
    prompt = f"""Given the question: {data['question']}.
                        We´ve found the answer to be: {data['answer']}.
                        The reasoning to get this answer has been the following:
                        {steps_combined}
                        Write out an output an LLM could give to this questions as if it were thinking through the problem, and talking to itself to make sure it´s reasoning throughout is correct.
                        Act like you are 2 agents, one that is reasoning, and another that is trying to figure out if the reasoning so far is correct.
                        And then, finally when it finishes it´s reasoning, put Reasoning: and then the reasoning, and after, put the answer by putting Answer: and then the answer.
                        """

    output = LLM_conection.Get_answer(prompt)

    # Written as UTF-8 because the final cell of this notebook reads these
    # files back with encoding='utf-8'. The file keeps the input's ".json"
    # name even though what is written is the raw Get_answer text.
    with open(os.path.join('Answers_finetuning_dataset', filename), 'w', encoding='utf-8') as file:
        file.write(output)
    

## Let's create the fine-tuning data file (JSON Lines)

In [10]:
import os
import pandas as pd
import numpy as np

# Pair every question with its generated answer and write the pairs as a
# JSON Lines file of two-turn conversations (human question, gpt answer).

# Define folder paths
# NOTE(review): earlier cells read questions from "Dataset_creation/questions";
# this cell uses "questions" — confirm the working directory.
questions_folder = 'questions'
answers_folder = 'Answers_finetuning_dataset'
output_folder = 'Fine_tuning_data'
output_filename = 'CoT_data.jsonl'

# Ensure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# One entry per question/answer pair
conversations_list = []

# Sorted so the output file has the same row order on every run
question_files = sorted(os.listdir(questions_folder))

for q_file in question_files:
    # Base filename without extension, e.g. "sample_..._Law"
    q_name, q_ext = os.path.splitext(q_file)

    # The answer may be stored under the question's exact filename or, as the
    # answer-generation cell of this notebook does, under "<stem>.json".
    # Try both, preferring the exact name.
    a_path = None
    for a_file in (q_file, q_name + '.json'):
        candidate_path = os.path.join(answers_folder, a_file)
        if os.path.exists(candidate_path):
            a_path = candidate_path
            break

    if a_path is None:
        # Skip if the answer file does not exist
        print(f"Skipping {q_file}: corresponding answer file not found.")
        continue

    # Read the question content
    with open(os.path.join(questions_folder, q_file), 'r', encoding='utf-8') as f:
        question = f.read().strip()

    # Read the answer content
    with open(a_path, 'r', encoding='utf-8') as f:
        answer = f.read().strip()

    # Build the two-turn conversation as an object array
    conversation = np.array([
        {'from': 'human', 'value': question},
        {'from': 'gpt', 'value': answer}
    ], dtype=object)

    conversations_list.append(conversation)

# Create a DataFrame with the conversations
df = pd.DataFrame({'conversations': conversations_list})

# The parquet export is disabled; the dataset is written as JSON Lines
# (one record per line) instead.
output_jsonl_path = os.path.join(output_folder, output_filename)
df.to_json(output_jsonl_path, orient='records', lines=True)

print(f"Data saved to {output_jsonl_path}")


Skipping sample_2024-09-17-23-38-36_Law.txt: corresponding answer file not found.
Skipping sample_2024-09-17-23-39-10_History.txt: corresponding answer file not found.
Skipping sample_2024-09-18-00-19-01_Physics.txt: corresponding answer file not found.
Skipping sample_2024-09-18-01-56-01_Biology.txt: corresponding answer file not found.
Skipping sample_2024-09-18-02-47-35_Political_Science.txt: corresponding answer file not found.
Skipping sample_2024-09-18-02-56-03_Biology.txt: corresponding answer file not found.
Skipping sample_2024-09-18-02-58-17_Chemistry.txt: corresponding answer file not found.
Skipping sample_2024-09-18-03-05-26_Law.txt: corresponding answer file not found.
Skipping sample_2024-09-18-03-18-40_Economics.txt: corresponding answer file not found.
Skipping sample_2024-09-18-03-22-50_Earth_Sciences.txt: corresponding answer file not found.
Skipping sample_2024-09-18-03-34-36_Sociology.txt: corresponding answer file not found.
Skipping sample_2024-09-18-03-34-41_Psy