# Creating a dataset for a bot that thinks

### First we'll import some libraries, load the LLM (which takes about 15 min), and connect it to the rest of the code via `LLM_conection`

In [9]:
import torch
from trl import SFTTrainer
import os
from datasets import load_dataset
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, is_bfloat16_supported
import time
import json

from src.Basic_LLM_functions import LLM_conection
from src.Dataset_generation import generate_full_question,save_question_and_log,subtopic_prompt_template,question_prompt_template
from src.Chain_of_thought import Chain_of_thought

In [2]:
# Sequence-length limit passed to the loader below.
max_seq_length = 2048

# Load the Llama 3.1 8B checkpoint from the local "LLMs" folder with 4-bit loading on.
# NOTE(review): the original line carried a commented-out "-bnb-4bit" name suffix;
# it is still left off the model name here.
# dtype=None is passed through unchanged — presumably lets the loader pick the dtype; confirm.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="LLMs/Meta-Llama-3.1-8B",
    dtype=None,
    load_in_4bit=True,
    max_seq_length=max_seq_length,
)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 24.0 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LLMs/Meta-Llama-3.1-8B does not have a padding token! Will use pad_token = <|finetune_right_pad_id|>.


In [3]:
# Field/role names that the chat template should expect in each conversation turn.
role_mapping = {
    "role": "from",
    "content": "value",
    "user": "human",
    "assistant": "gpt",
}

# Rebind the tokenizer to the version configured with the "chatml" template.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",
    mapping=role_mapping,
)

# Put the model into inference mode; `model` is rebound to whatever the call returns.
model = FastLanguageModel.for_inference(model)

Unsloth: Will map <|im_end|> to EOS = <|eot_id|>.


In [4]:
# Register the loaded model and tokenizer with LLM_conection so the helper
# modules imported from src can use them.
LLM_conection.set_model(model, tokenizer)

### Let's create the questions

In [5]:
# How many questions to generate in this run — raise it for a bigger dataset.
num_questions = 1

for i in range(num_questions):
    print(f"Generating question {i+1}/{num_questions}")

    # Produce one question with the loaded model, then hand it to the save/log
    # helper together with the two prompt templates.
    question_data = generate_full_question(model, tokenizer)
    save_question_and_log(
        question_data,
        subtopic_prompt_template,
        question_prompt_template,
    )

    # Echo the generated fields so progress is visible in the cell output.
    print(f"Main Topic: {question_data['main_topic']}")
    print(f"Subtopic: {question_data['subtopic']}")
    print(f"Question: {question_data['question']}")
    print("\n---\n")

    # Short pause between iterations (originally added to avoid hitting API rate limits).
    time.sleep(2)

print("Question generation complete. Check the 'questions' and 'log' folders for output.")

Generating question 1/1
Main Topic: Mathematics
Subtopic: Considering the implications of fractal geometry on chaos theory in complex systems.
Question: What are the theoretical and practical implications of applying fractal geometry to model the emergence of chaotic behavior in complex systems, particularly in the context of the Mandelbrot set and the Lorenz attractor, and how do these insights inform our understanding of the interplay between determinism and randomness in these systems, considering the work of pioneers such as Benoit Mandelbrot and Edward Lorenz, and the potential applications of these findings in fields like meteorology and finance?

---

Question generation complete. Check the 'questions' and 'log' folders for output.


### Let's create the answers via Chain of Thought

In [6]:
# Folder where the answers are stored; created on first use.
answers_dir = "answers"
os.makedirs(answers_dir, exist_ok=True)

# Names of the files already present in the answers folder.
answered_questions = os.listdir(answers_dir)

In [7]:
# Collect (filename, question text) pairs for every question that has no answer yet.
txt_files_contents = []
folder_path = "questions"

# Answers are written as "answers/<question filename>.json", so an answered
# question shows up in `answered_questions` as e.g. "foo.txt.json", not "foo.txt".
# Comparing the bare question filename alone would never match and every
# question would be queued again on each run.
answered = set(answered_questions)

# Sort the listing: os.listdir returns names in arbitrary order, and downstream
# only the first pending question is processed.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".txt"):
        continue
    # Skip questions already answered (also accept a same-named entry, as the
    # original check did).
    if filename in answered or f"{filename}.json" in answered:
        continue
    with open(os.path.join(folder_path, filename), 'r') as file:
        txt_files_contents.append((filename, file.read()))

In [10]:
dataset = []

# We only want to run one chain of thought for now, so just the first pending
# question (if any) is taken.
for filename, question in txt_files_contents[:1]:
    output = Chain_of_thought(question)

    # Nothing is written when the run produced no reasoning.
    if len(output["reasoning"]) == 0:
        continue

    # Store the result as JSON in the answers folder, named after the question file.
    with open(f"answers/{filename}.json", 'w') as answer_file:
        json.dump(output, answer_file, indent=4)

----------------------
Attempt 1 for question: What are the key methodological adaptations required to apply the phenomenological analysis framework to the cultural significance of contemporary street art, and how can these adaptations be evaluated in terms of their effectiveness in uncovering the underlying themes and meanings that street artists intend to convey through their work?
----------------------
Additional information:
To apply the phenomenological analysis framework to the cultural significance of contemporary street art, the following key methodological adaptations are required:

1. **Participant selection and recruitment**: Identify a diverse group of participants who are familiar with street art, including artists, art critics, curators, and enthusiasts. Ensure that the sample is representative of the broader street art community, including both local and international perspectives.
2. **Data collection methods**: Employ a combination of qualitative methods, such as:
	* 