### <b><span style='color:#F1A424'>Table of Contents</span></b> <a class='anchor' id='top'></a>
<div style=" background-color:#3b3745; padding: 13px 13px; border-radius: 8px; color: white">
<li> <a href="#install_libraries">Install libraries</a></li>
<li><a href="#import_libraries">Import Libraries</a></li>
<li><a href="#load_data">Load Data</a></li>
<li><a href="#configuration">Configuration</a></li>
<li><a href="#configure_parameters">Configure Quantization and LORA-specific parameters</a></li>
<li><a href="#load_model">Load Model</a></li>
<li><a href="#training">Training</a></li>
<li><a href="#testing">Testing</a></li>
<li><a href="#save_model">Saving model for inference</a></li>
</div>




# <b><span style='color:#F1A424'>|</span> Install Libraries</b><a class='anchor' id='install_libraries'></a> [↑](#top) 

***

Install all the required libraries for this notebook.

In [None]:
# !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.33.1 trl==0.4.7
# !pip install accelerate peft==0.4.0 bitsandbytes transformers==4.33.1 trl==0.4.7
# !pip install -q bitsandbytes==0.41.1 transformers==4.38.2 accelerate==0.21.0 peft==0.4.0 trl==0.4.7
# !pip install -q bitsandbytes transformers accelerate peft trl
# !pip install bitsandbytes>=0.43.0 transformers>=4.43.0 accelerate>=0.28.0 peft>=0.5.0 trl>=0.7.11
!pip install -q --upgrade transformers==4.43.0
!pip install -q --upgrade bitsandbytes accelerate peft trl


In [None]:
# Report the installed version of every library this notebook depends on
import importlib

# Libraries whose versions should be checked
libraries = ["bitsandbytes", "transformers", "accelerate", "peft", "trl"]

# Import each library and print its version; a package that cannot be imported
# is reported instead of raising, so the cell always runs to completion.
for lib in libraries:
    try:
        module = importlib.import_module(lib)
        # Fall back to a placeholder string when the module has no __version__ attribute
        version = getattr(module, "__version__", "Versão desconhecida")
        print(f"{lib}: {version}")
    except ImportError:
        print(f"{lib}: Não instalada")


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory recursively and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# <b><span style='color:#F1A424'>|</span> Import Libraries</b><a class='anchor' id='import_libraries'></a> [↑](#top) 

***

Import all the required libraries for this notebook.

In [None]:
# Import necessary libraries
import pandas as pd
from tqdm import tqdm

import os
import torch
# import cuda
from datasets import load_dataset
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoModel,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from sklearn.metrics.pairwise import cosine_similarity

# <b><span style='color:#F1A424'>|</span> Load custom dataset</b><a class='anchor' id='load_data'></a> [↑](#top) 

***

A custom dataset is used in this notebook. You can use any data, but the dataset must contain two columns named 'prompt' and 'response'. The 'prompt' column holds the input text and the 'response' column holds the text the model should learn to produce.

In [None]:
import random
# Competition training questions; the code below reads QuestionText, CorrectAnswer,
# Answer{A-D}Text and Misconception{A-D}Id from each row.
df = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv")
# Misconception mapping table. It is not referenced again under this name in the
# visible cells (the inference section reloads the same file as misc_map_df).
df_mis_map = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv")

# Prefixes used to build the 'prompt' and 'response' columns
question_prefix = "Question:"
incorrect_answer_prefix = "Incorrect Answer:"
correct_answer_prefix = "Correct Answer:"
response_start = "Misconception for incorrect answer:"

def simular_respostas(df, error_rate=0.75, seed=None):
    """Simulate one student answer per question and build prompt/response pairs.

    For every row a wrong answer is simulated with probability ``error_rate``
    (so the data focuses on mistakes); otherwise the correct answer is used.

    Args:
        df: DataFrame with the columns QuestionText, CorrectAnswer,
            Answer{A-D}Text and Misconception{A-D}Id.
        error_rate: Probability of simulating an incorrect answer for a row.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the output is reproducible; when ``None`` the global
            ``random`` module state is used, as before.

    Returns:
        DataFrame with two string columns, 'prompt' and 'response', one row
        per input row and in the same order.
    """
    rng = random.Random(seed) if seed is not None else random

    options = ['AnswerAText', 'AnswerBText', 'AnswerCText', 'AnswerDText']
    misconception_columns = ['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId']

    prompts = []
    responses = []

    for _, row in df.iterrows():
        question = f"{question_prefix} {row['QuestionText']}"
        correct_answer = row['CorrectAnswer']

        # CorrectAnswer is normally an option letter ('A'-'D'); resolve it to the
        # option column and its text. If it is not a known letter, fall back to
        # treating the value itself as the answer text.
        correct_column = f"Answer{correct_answer}Text"
        if correct_column in options:
            correct_text = row[correct_column]
        else:
            correct_column = None
            correct_text = correct_answer

        # Wrong options: every option except the correct column (and except any
        # option whose text is identical to the correct answer).
        incorrect_options = [
            opt for opt in options
            if opt != correct_column and row[opt] != correct_text
        ]

        # Draw first so the number of random calls per row does not depend on the data
        simulate_error = rng.random() < error_rate

        if simulate_error and incorrect_options:
            chosen_option = rng.choice(incorrect_options)
            chosen_misconception = misconception_columns[options.index(chosen_option)]

            prompt = (
                f"{question}\n\n{incorrect_answer_prefix} {row[chosen_option]}"
                f"\n\n{correct_answer_prefix} {correct_text}"
            )

            misconception_id = row[chosen_misconception]
            if pd.isna(misconception_id):
                # The chosen distractor has no labelled misconception
                response = f"{response_start} No specific misconception mapped."
            else:
                response = f"{response_start} Misconception ID {int(misconception_id)}"
        else:
            # Simulated correct answer (also used when no wrong option exists)
            prompt = f"{question}\n\n{correct_answer_prefix} {correct_text}"
            response = "Correct answer. No misconception."

        prompts.append(prompt)
        responses.append(response)

    return pd.DataFrame({'prompt': prompts, 'response': responses})

# Build the simulated prompt/response dataset
df_simulado = simular_respostas(df)

# From here on `df` refers to the simulated prompt/response frame; the raw
# train.csv frame it held before is no longer reachable under that name.
df = df_simulado

In [None]:
# DATASET DE TESTE
# Load and display the first few rows of the dataset
# df = pd.read_csv("/content/drive/MyDrive/task2_10k.csv")
# df = pd.read_csv("/kaggle/input/test-dataset/dataset_baixado.csv")
# df = df.drop(['product', 'category'], axis=1)
# df = df.rename(columns={'description': 'prompt'})
# df = df.rename(columns={'text': 'response'})

# df.head()

In [None]:
# Preprocess the dataset by removing the "<hyperlink>" and "<mention>" placeholders
# from every response. Vectorised string operations replace the former row-by-row
# loop, whose chained assignment (df['response'][i] = ...) raises
# SettingWithCopyWarning and does not modify df when pandas copy-on-write is enabled.
df['response'] = (
    df['response']
    .str.replace("<hyperlink>", "", regex=False)
    .str.replace("<mention>", "", regex=False)
)

In [None]:
# Split the dataset into final test data and remaining data.
# Rows at positions 8000-9999 become the held-out final test set.
# NOTE(review): this positional slice is empty when df has 8000 rows or fewer,
# in which case nothing is held out — confirm the dataset is large enough.
final_test_data = df[8000:10000]
# Remove the held-out rows (matched by index label) from the working frame
df = df.drop(final_test_data.index)

In [None]:
# Here test set is the validation set
# Split the data into train and test sets, with 90% in the train set.
# random_state fixes the sample, so the split is the same on every run.
train_df = df.sample(frac=0.9, random_state=42)
# The remaining 10% (rows whose index was not sampled) form the validation set
test_df = df.drop(train_df.index)

# Save the dataframes to .jsonl files (one JSON record per line) in the current
# working directory; the dataset-loading cell reads them back from /kaggle/working/.
train_df.to_json('train.jsonl', orient='records', lines=True)
test_df.to_json('test.jsonl', orient='records', lines=True)

# <b><span style='color:#F1A424'>|</span> Configuration</b><a class='anchor' id='configuration'></a> [↑](#top) 

***

Central repository for this notebook's hyperparameters.

In [None]:
# Set up model configuration and training parameters

# --- Model and output names ---
# model_name = "NousResearch/llama-2-7b-chat-hf"
model_name = "/kaggle/input/llama-3.2/transformers/1b-instruct/1"
# model_name = "meta-llama/Llama-3.2-1B" # test
# model_name = "meta-llama/Llama-Guard-3-1B" # test
# model_name = "NousResearch/Llama-3.2-1B" # test
# model_name = "meta-llama/Llama-3.2-1B" # test
# NOTE(review): this path is not referenced by the dataset-loading cell, which reads
# /kaggle/working/train.jsonl directly — confirm whether it is still needed.
dataset_name = "/content/train.jsonl"
# Directory name under which the trained LoRA adapter is saved
new_model = "llama-3.2-1b-eedi_misconceptions"

# --- LoRA parameters ---
# TODO: check which LoRA parameters can be tuned for better performance
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

# --- 4-bit quantization (bitsandbytes) parameters ---
use_4bit = True
# Name of the torch dtype; resolved with getattr(torch, ...) when the model is loaded
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

# --- Training parameters ---
output_dir = "./results"
num_train_epochs = 10
fp16 = False
bf16 = False
per_device_train_batch_size = 4
per_device_eval_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "constant"
# -1 leaves the step count to be derived from num_train_epochs
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 25
logging_steps = 5

# --- SFTTrainer parameters ---
max_seq_length = None
packing = False

# --- Device placement ---
# device_map = {"cpu": 0}
# device_map = {"cuda": 0}
# device_map = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Previously "cuda:0"; switched to "auto" because CUDA parallelism was not working
# (the cause is still unknown).
device_map = "auto"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device_map)
print(device)

In [None]:
def _add_text_column(examples):
    """Build the 'text' training field for a batch: prompt, ' [/INST] ', response."""
    return {
        'text': [
            prompt + ' [/INST] ' + response
            for prompt, response in zip(examples['prompt'], examples['response'])
        ]
    }

# Load the train/validation JSONL files written by the split cell.
# Each file is a standalone dataset, hence split="train" for both.
train_dataset = load_dataset('json', data_files='/kaggle/working/train.jsonl', split="train")
valid_dataset = load_dataset('json', data_files='/kaggle/working/test.jsonl', split="train")

# Add the combined 'text' column to both datasets using the same formatter
train_dataset_mapped = train_dataset.map(_add_text_column, batched=True)
valid_dataset_mapped = valid_dataset.map(_add_text_column, batched=True)


# <b><span style='color:#F1A424'>|</span> Configuration of Quantization and LORA parameters</b><a class='anchor' id='configure_parameters'></a> [↑](#top) 

***

Because the model is large, it is loaded with 4-bit quantization to reduce GPU memory usage.

In [None]:
# Configure quantization parameters
# Resolve the dtype name from the configuration cell to the torch dtype object
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Load pre-trained model with the quantization settings above
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)

# Turn off use_cache on the model config before training
model.config.use_cache = False
# NOTE(review): presumably carried over from a Llama-2 fine-tuning recipe —
# confirm this setting is still needed for this model.
model.config.pretraining_tp = 1
# NOTE(review): device_map and batched do not look like tokenizer-loading options —
# confirm they have any effect here.
tokenizer = AutoTokenizer.from_pretrained(model_name, device_map=device_map, trust_remote_code=True, batched=True)
# Reuse the end-of-sequence token for padding and pad on the right
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Configure LoRA-specific parameters (values come from the configuration cell)
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# <b><span style='color:#F1A424'>|</span> Training</b><a class='anchor' id='training'></a> [↑](#top) 

***


In [None]:
# Testando as entradas do modelo
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token
# encoded = tokenizer("This is a test sentence", padding=True, truncation=True, max_length=10)
# print(encoded)

In [None]:
# train_dataset_mapped

In [None]:
# Training arguments; every value comes from the configuration cell.
# per_device_eval_batch_size and gradient_checkpointing were defined there but
# never passed on, so they silently had no effect — they are now forwarded.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    # report_to="all",
    report_to=["none"],  # Disables W&B (and every other logging integration)
    evaluation_strategy="steps",
    eval_steps=50  # Evaluate every 50 steps
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset_mapped,
    eval_dataset=valid_dataset_mapped,
    peft_config=peft_config,
    dataset_text_field="text",  # column produced by the dataset-mapping cell
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing
)

# Train the model
trainer.train()
# Save the fine-tuned model under the `new_model` directory
trainer.model.save_pretrained(new_model)

# <b><span style='color:#F1A424'>|</span> Testing</b><a class='anchor' id='testing'></a> [↑](#top) 

***

Testing on test data

In [None]:
# NOTE: considering skipping this test step because it takes a very long time to run
# Suppress logging messages to avoid unnecessary output
# logging.set_verbosity(logging.CRITICAL)

# Create text generation pipelines using the specified model and tokenizer
# Define two pipelines with different maximum lengths
# pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=250)
# pipe2 = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=500)

# Initialize an empty list to store generated text
# generated_text = []

# Iterate over the test data
"""
for i in tqdm(range(len(final_test_data))):
    # Extract the prompt from the test data
    prompt = final_test_data['prompt'].iloc[i]
    
    # Attempt to generate text using the first pipeline with a max length of 250
    try:
        result = pipe(prompt)
        # Append the generated text to the list, extracting the relevant part after '[/INST]'
        generated_text.append(result[0]['generated_text'].split('[/INST]')[1])
    except:
        # If an exception occurs, try the second pipeline with a max length of 500
        try:
            result = pipe2(prompt)
            # Append the generated text to the list, extracting the relevant part after '[/INST]'
            generated_text.append(result[0]['generated_text'].split('[/INST]')[1])
        except:
            # If both pipelines fail, append a default placeholder text
            generated_text.append("ABCD1234@#")

# The 'generated_text' list now contains the generated text for each prompt in the test data
"""

In [None]:
# Assign the generated text to a new column 'generated_text' in the 'final_test_data' DataFrame
# final_test_data['generated_text'] = generated_text

# Reset the index of the DataFrame for a cleaner representation in the CSV file
# final_test_data = final_test_data.reset_index(drop=True)

# Save the DataFrame to a CSV file at the specified path
# final_test_data.to_csv('/content/drive/MyDrive/llama3_finetune_output_1128.csv', index=False)

# <b><span style='color:#F1A424'>|</span> Saving Model for inference</b><a class='anchor' id='save_model'></a> [↑](#top) 

***


In [None]:
# Set the path where the merged model will be saved
# NOTE(review): model_path is not used anywhere in the visible cells and the merged
# model is never written to disk here — confirm whether a save step is missing.
model_path = "/result/model_merged/llama-3.2-1b-eedi_misconceptions"

# Reload the base model in FP16 and configure settings
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)

# Attach the LoRA adapter saved under `new_model` to the freshly loaded base model
model = PeftModel.from_pretrained(base_model, new_model)

# Merge the LoRA weights into the base model; `model` now rebinds to the merged model
model = model.merge_and_unload()

# Reload the tokenizer from the base model path
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# NOTE(review): unlike the training tokenizer, no pad token is assigned here —
# confirm generation does not need one.


# Text-generation pipeline used by predict_misconception; caps each reply at 55 new tokens
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, trust_remote_code=True, max_new_tokens=55)

# <b><span style='color:#F1A424'>|</span> Functions</b><a class='anchor' id='functions'></a> [↑](#top) 

In [None]:
# min / max example questions for prompt generation
# for each question all answers with non-NAN misconceptions will be used
import time
import sys

# True when the full (hidden) test set is present; the public test.csv is a tiny stub
scoring = True
# NOTE: train_df and test_df are rebound here to the raw competition CSVs,
# replacing the train/validation frames created before fine-tuning.
train_df = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv")
test_df = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv")
# If it's just the test stub (fewer than 10 rows) then we aren't scoring
if len(test_df) < 10:
    scoring = False

evaluating_on_train = False
# Causes us to do evaluation on the training data instead of the test stub if we aren't scoring
eval_on_train_if_not_scoring = True
# How many train questions to use when swapping in train for test (MAP@25 score estimate).
# 100 questions takes about 30 minutes (more questions = better scoring estimate).
questions_for_train_eval = 100

# Set evaluating_on_train to True / swap in train for test
if scoring == False and eval_on_train_if_not_scoring:
    evaluating_on_train = True
    print("Doing evaluation / scoring on the train data")
    test_df = train_df.head(questions_for_train_eval)

# Original text prefix
question_prefix = "Question:"

# Fixed assistant turns inserted between the user turns of the few-shot conversation
llm_correct_response_for_rewrite = "Provide me with the correct answer for a baseline."
llm_incorrect_response_for_rewrite = "Now - provide the incorrect answer and I will anaylze the difference to infer the misconception."

# Modified text prefix
incorrect_answer_prefix = "Incorrect Answer:"
correct_answer_prefix = "Correct Answer:"

# Providing this as the start of the response helps keep things relevant.
# Note the trailing space: this value differs from the one used to build the training data.
response_start = "Misconception for incorrect answer: "

# Bounds on the number of example questions selected per test question
min_example_questions = 5
max_example_questions = 8

# Example question messages limited to this many words.
# Assures we don't run out of GPU RAM (if notebook throws exception - try reducing...)
max_words_for_examples = 1400

# Maximum new tokens the model should generate for responses.
# NOTE(review): this variable is not passed to the pipeline in the visible cells
# (the pipeline is built with a literal max_new_tokens=55) — confirm it is still needed.
max_new_tokens = 55


# Misconception id -> name table, used for few-shot examples and for retrieval
misc_map_df = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv")

def generate_filtered_df(df, question, min_rows=5, max_rows=10, verbose=False, random_seed=42):
    """Pick example questions from `df` to accompany `question` in a few-shot prompt.

    Candidates are gathered in priority order: every row sharing the question's
    ConstructId, then (only while fewer than `min_rows` are selected) rows sharing
    its SubjectId, then whatever rows come first in `df`. The question itself is
    always excluded. The selection is finally cut to the first `max_rows` rows.

    Args:
        df: Pool of candidate questions with QuestionId, ConstructId and SubjectId columns.
        question: Mapping/Series providing "QuestionId", "SubjectId" and "ConstructId".
        min_rows: Target minimum number of rows to return (fewer if the pool is too small).
        max_rows: Hard cap on the number of rows returned.
        verbose: When True, print how many rows each stage contributed.
        random_seed: Seed applied to NumPy's global RNG on every call.

    Returns:
        DataFrame of selected rows with a fresh 0..n-1 index.
    """
    # Seeds the global NumPy RNG; rows below are taken with head(), not sampled,
    # so the selection itself does not depend on the seed.
    np.random.seed(random_seed)

    question_id = question["QuestionId"]
    subject_id = question["SubjectId"]
    construct_id = question["ConstructId"]

    # Never offer the question as an example of itself (matters when testing on train)
    pool = df[df['QuestionId'] != question_id]

    # Stage 1: all rows with the same construct
    picked = pd.concat([pd.DataFrame(), pool[pool['ConstructId'] == construct_id]])
    construct_count = len(picked)
    if verbose:
        print(f"Matched ConstructId {construct_id}: {construct_count} rows")

    # Stage 2: top up with same-subject rows that are not already selected
    subject_count = 0
    shortfall = min_rows - len(picked)
    if shortfall > 0:
        same_subject = pool[(pool['SubjectId'] == subject_id) & ~pool.index.isin(picked.index)]
        picked = pd.concat([picked, same_subject.head(shortfall)])
        subject_count = len(picked) - construct_count
        if verbose:
            print(f"Added rows from SubjectId {subject_id}: {subject_count} rows")

    # Stage 3: top up with the first remaining rows of the pool
    random_count = 0
    shortfall = min_rows - len(picked)
    if shortfall > 0:
        leftovers = pool[~pool.index.isin(picked.index)]
        picked = pd.concat([picked, leftovers.head(shortfall)])
        random_count = len(picked) - (construct_count + subject_count)
        if verbose:
            print(f"Added random rows to meet minimum: {random_count} rows")

    # Stage 4: enforce the cap
    if len(picked) > max_rows:
        picked = picked.head(max_rows)
        if verbose:
            print(f"Reduced to maximum: {max_rows} rows")

    if verbose:
        print(f"\nFinal DataFrame composition:")
        print(f"ConstructId matches: {construct_count}")
        print(f"SubjectId matches: {subject_count}")
        print(f"Random additions: {random_count}")
        print(f"Total rows: {len(picked)}")

    return picked.reset_index(drop=True)

def get_train_messages_for_df(filtered_train_df, skip_nan_misconceptions=True, answers=['A', 'B', 'C', 'D'], verbose = False):
    """Turn example questions into few-shot tuples, one per labelled wrong answer.

    Args:
        filtered_train_df: Example questions (ConstructName, QuestionText, CorrectAnswer,
            Answer{X}Text and Misconception{X}Id columns).
        skip_nan_misconceptions: Kept for interface compatibility. A wrong answer without
            a misconception id never produces a message, whatever this flag is set to.
        answers: Answer letters to consider for each question.
        verbose: When True, print the accumulated sys.getsizeof() total of the messages.

    Returns:
        List of [question text, correct answer text, incorrect answer text,
        misconception name] lists. The misconception name is looked up in the
        module-level `misc_map_df` by index label equal to the integer misconception id.
    """
    messages = []
    total_bytes = 0

    for _, example in filtered_train_df.iterrows():
        for letter in answers:
            # The correct option has no misconception to learn from
            if letter == example['CorrectAnswer']:
                continue

            misconception_id = example[f'Misconception{letter}Id']
            # Unlabelled distractors contribute nothing
            if pd.isna(misconception_id):
                continue

            entry = [
                f"{example['ConstructName']}: {example['QuestionText']}",
                example[f'Answer{example["CorrectAnswer"]}Text'],
                example[f'Answer{letter}Text'],
                misc_map_df.loc[int(misconception_id), 'MisconceptionName']
            ]
            messages.append(entry)
            total_bytes += sum(sys.getsizeof(part) for part in entry)

    if verbose:
        print(f"Size of returned data: {total_bytes} bytes")

    return messages

def clean_response(my_string, response_start):
    """Reduce a raw model reply to its first sentence/line of misconception text.

    Leading whitespace is dropped, a leading `response_start` prefix is removed if
    present, and the text is cut at the first '.' or newline (whichever comes
    first; neither character is kept). The result is stripped of surrounding whitespace.

    Args:
        my_string: Raw generated text.
        response_start: Prefix to remove from the start of the text when present.

    Returns:
        The cleaned text (possibly empty).
    """
    text = my_string.lstrip()

    if text.startswith(response_start):
        text = text[len(response_start):]

    # Positions of the candidate terminators that actually occur in the text
    cut_points = [pos for pos in (text.find('.'), text.find('\n')) if pos != -1]
    if cut_points:
        text = text[:min(cut_points)]

    return text.strip()


def predict_misconception(question, question_letter_to_test, example_sequences, max_word_count=max_words_for_examples, verbose=False):
    """Ask the text-generation pipeline for the misconception behind one wrong answer.

    A chat conversation is assembled from as many few-shot examples as fit in the
    word budget, followed by the actual question, and passed to the module-level `pipe`.

    Args:
        question: Row providing ConstructName, QuestionText, CorrectAnswer and Answer{X}Text.
        question_letter_to_test: Letter of the answer whose misconception is requested.
        example_sequences: Iterable of (question, correct answer, incorrect answer,
            misconception) tuples used as few-shot examples, in priority order.
        max_word_count: Budget, in whitespace-separated words, for the whole conversation.
            Examples are added in order until the next one would exceed it.
        verbose: When True, display every message and the total word count.

    Returns:
        Whatever `pipe(messages)` returns.
    """
    correct_question_letter = question["CorrectAnswer"]
    question_text = f"{question['ConstructName']}: \n {question['QuestionText']}\n"
    correct_answer_text = question[f"Answer{correct_question_letter}Text"]
    incorrect_answer_text = question[f"Answer{question_letter_to_test}Text"]
    if correct_question_letter == question_letter_to_test:
        print("WARNING: Tested letter is for a correct answer!")

    def build_turns(q_text, right_text, wrong_text):
        # The five-turn exchange shared by the examples and the real question
        return [
            {"role": "user", "content": f"{question_prefix} {q_text}"},
            {"role": "assistant", "content": llm_correct_response_for_rewrite},
            {"role": "user", "content": f"{correct_answer_prefix} {right_text}"},
            {"role": "assistant", "content": llm_incorrect_response_for_rewrite},
            {"role": "user", "content": f"{incorrect_answer_prefix} {wrong_text}"},
        ]

    def count_words(turns):
        return sum(len(turn["content"].split()) for turn in turns)

    # The real question always goes in; its words are charged against the budget first
    actual_prompt_messages = build_turns(question_text, correct_answer_text, incorrect_answer_text)
    current_word_count = count_words(actual_prompt_messages)

    example_messages = []
    for ex_question, ex_correct, ex_incorrect, ex_misconception in example_sequences:
        # An example is the same exchange plus the assistant's final answer
        example_set = build_turns(ex_question, ex_correct, ex_incorrect)
        example_set.append({"role": "assistant", "content": f"{response_start} {ex_misconception}"})

        example_words = count_words(example_set)
        if current_word_count + example_words > max_word_count:
            if verbose:
                print("Word count limit reached.")
            # Stop at the first example that does not fit; later ones are not tried
            break

        example_messages += example_set
        current_word_count += example_words

    # Examples first, the real question last
    messages = example_messages + actual_prompt_messages

    if verbose:
        print("Example Messages:")
        for message in example_messages:
            display(message)
        print("\nActual Prompt Messages:")
        for message in actual_prompt_messages:
            display(message)
        print(f"\nTotal word count: {current_word_count}")

    return pipe(messages)

def process_test_questions(df):
    """Generate a misconception description for every incorrect answer of every question.

    For each question the few-shot examples are selected once from the module-level
    `train_df`, then the model is queried once per incorrect answer letter.

    Args:
        df: Questions to process; each row needs QuestionId, CorrectAnswer and the
            columns read by generate_filtered_df / predict_misconception.

    Returns:
        DataFrame with one row per (question, incorrect answer):
        'QuestionId_Answer' (e.g. "123_B") and 'MiscPredText' (cleaned model reply).
        When the module-level `evaluating_on_train` is True, a 'TrainMiscId' column
        holds the labelled misconception id, or None when it is missing/not a number.
    """
    results = []
    start_time = time.time()
    total_items = 0

    for question_index in range(len(df)):
        question = df.iloc[question_index]

        # Skip rows that lack the columns needed to identify the question
        if "QuestionId" not in question or "CorrectAnswer" not in question:
            print(f"Erro: Coluna ausente na linha {question_index}")
            continue

        correct_answer = question["CorrectAnswer"]
        question_id = question["QuestionId"]

        # The example selection depends only on the question, not on the answer
        # letter, so build it once per question instead of once per answer choice.
        filtered_df = generate_filtered_df(train_df, question, min_rows=min_example_questions, max_rows=max_example_questions)
        example_sequences = get_train_messages_for_df(filtered_df)

        for answer_choice in ['A', 'B', 'C', 'D']:
            if answer_choice == correct_answer:
                continue

            response = predict_misconception(question, answer_choice, example_sequences, verbose=False)

            # The last chat message of the first generation is the model's reply
            just_response = clean_response(response[0]['generated_text'][-1]['content'], response_start)

            result = {
                'QuestionId_Answer': f"{question_id}_{answer_choice}",
                'MiscPredText': just_response
            }

            # Attach the ground-truth misconception id when evaluating on train data
            if evaluating_on_train:
                misc_id_column = f"Misconception{answer_choice}Id"
                if misc_id_column in question:
                    try:
                        result['TrainMiscId'] = int(question[misc_id_column])
                    except (ValueError, TypeError):
                        # NaN (no labelled misconception) or a non-numeric value
                        result['TrainMiscId'] = None
                else:
                    result['TrainMiscId'] = None
                    print(f"Warning: {misc_id_column} not found for question {question_id}")

            results.append(result)
            total_items += 1
            # Progress indicator: one dot per processed answer
            print(".", end="", flush=True)

    end_time = time.time()
    total_time = end_time - start_time
    avg_time_per_item = total_time / total_items if total_items > 0 else 0

    print(f"\nTotal execution time: {total_time:.2f} seconds")
    print(f"Total items processed: {total_items}")
    print(f"Average time per item: {avg_time_per_item:.2f} seconds")
    print(f"Time for 1000 questions * 3 incorrect answers (3000 items): {(avg_time_per_item * 3000) / 3600} hours")

    return pd.DataFrame(results)

def create_submission_dataframe(predicted_misc, test_sorted_indices):
    """Pair each prediction row with its 25 best-ranked misconception indices.

    Args:
        predicted_misc: DataFrame with a 'QuestionId_Answer' column (and optionally
            'TrainMiscId'), one row per predicted answer.
        test_sorted_indices: For each row of `predicted_misc`, in the same order, a
            sequence of misconception indices sorted best-first.

    Returns:
        DataFrame with 'QuestionId_Answer' and 'MisconceptionId' (the first 25 indices
        joined by single spaces). 'TrainMiscId' is carried over when the module-level
        `evaluating_on_train` is True and the row has that field.
    """
    def to_record(prediction, ranked_indices):
        record = {
            'QuestionId_Answer': prediction['QuestionId_Answer'],
            'MisconceptionId': ' '.join(str(index) for index in ranked_indices[:25]),
        }
        # Keep the ground-truth id alongside when scoring locally on train data
        if evaluating_on_train and 'TrainMiscId' in prediction:
            record['TrainMiscId'] = prediction['TrainMiscId']
        return record

    records = [
        to_record(prediction, ranked_indices)
        for (_, prediction), ranked_indices in zip(predicted_misc.iterrows(), test_sorted_indices)
    ]
    return pd.DataFrame(records)

In [None]:
# Generate a free-text misconception description for every incorrect answer
predicted_misc = process_test_questions(test_df)


# Use the first GPU when one is available, otherwise fall back to the CPU
device = "cuda:0" if torch.cuda.is_available() else "cpu"
bge_tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/bge-large-en-v1.5/transformers/default/1/bge-large-en-v1.5')
bge_model = AutoModel.from_pretrained('/kaggle/input/bge-large-en-v1.5/transformers/default/1/bge-large-en-v1.5')
bge_model.eval()
bge_model.to(device)

per_gpu_batch_size = 8

def prepare_inputs(text, tokenizer, device):
    """Tokenize a list of strings into padded tensors placed on `device`.

    Returns a dict with 'input_ids' and 'attention_mask'.
    """
    # NOTE(review): max_length=1024 may exceed what the embedding model accepts —
    # confirm against the model's maximum sequence length.
    tokenizer_outputs = tokenizer.batch_encode_plus(
        text,
        padding        = True,
        return_tensors = 'pt',
        max_length     = 1024,
        truncation     = True
    )
    result = {
        'input_ids': tokenizer_outputs.input_ids.to(device),
        'attention_mask': tokenizer_outputs.attention_mask.to(device),
    }
    return result

def embed_texts(texts, tokenizer, model, device, batch_size=per_gpu_batch_size):
    """Embed `texts` in batches and return an (n_texts, dim) array of unit vectors.

    Each embedding is the model's first-token output, L2-normalised. Inference runs
    under torch.no_grad() so no autograd graph is built for the forward passes.
    """
    vectors = []
    with torch.no_grad():
        for batch_start in tqdm(range(0, len(texts), batch_size)):
            batch = texts[batch_start:batch_start + batch_size]
            encoded_input = prepare_inputs(batch, tokenizer, device)
            batch_embeddings = model(**encoded_input)[0][:, 0]
            batch_embeddings = torch.nn.functional.normalize(batch_embeddings, p=2, dim=1)
            vectors.append(batch_embeddings.cpu().numpy())
    return np.concatenate(vectors, axis=0)


# Embed every known misconception name (the retrieval candidates) and time it
start_time = time.time()

MisconceptionName = list(misc_map_df['MisconceptionName'].values)
all_ctx_vector = embed_texts(MisconceptionName, bge_tokenizer, bge_model, device)
# Reports the shape of the full embedding matrix (previously only the last batch)
print("Sentence embeddings:", all_ctx_vector.shape)

# Stop the timer
end_time = time.time()

# Calculate and print the execution time
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")

# Embed the generated misconception descriptions (the queries)
test_texts = list(predicted_misc['MiscPredText'].values)
all_text_vector = embed_texts(test_texts, bge_tokenizer, bge_model, device)
print(all_text_vector.shape)

# Rank all misconceptions for each prediction, most similar first.
# The ranked values are row positions in misc_map_df and are used as misconception
# ids downstream — NOTE(review): assumes row position equals MisconceptionId; confirm.
test_cos_sim_arr = cosine_similarity(all_text_vector, all_ctx_vector)
test_sorted_indices = np.argsort(-test_cos_sim_arr, axis=1)

submission_df = create_submission_dataframe(predicted_misc, test_sorted_indices)
submission_df.head(10)

# <b><span style='color:#F1A424'>|</span> Submit</b><a class='anchor' id='submit'></a> [↑](#top) 

In [None]:
# TrainMiscId is only a local scoring aid and must not appear in the submission file.
# errors="ignore" keeps this cell re-runnable: on a second run the column is already
# gone and a plain drop would raise KeyError.
if evaluating_on_train:
    submission_df = submission_df.drop(columns="TrainMiscId", errors="ignore")

submission_df.to_csv("submission.csv", index=False)
submission_df