# Imports de bibliotecas necessárias

In [1]:
import json
import pandas as pd
import re
import unicodedata
from datasets import Dataset
from transformers import  Trainer, TrainingArguments, BertTokenizer, BertForQuestionAnswering, pipeline, AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch




# Carregamento e limpeza do arquivo

In [12]:
data = []

# Carregar o arquivo JSON
with open('trn.json', 'r') as f:
    for line in f:
        data.append(json.loads(line))

# Converter para DataFrame
df = pd.DataFrame(data)

# Print da quantidade de linhas antes da limpeza do arquivo
print(f"linhas antes da limpeza:{len(df)}")

# Remove registros inválidos (nulos)
df.dropna(subset=['title', 'content'], inplace=True)

# Remove linhas duplicadas
df.drop_duplicates(subset=['title', 'content'], inplace=True)

# Utiliza clean_text para:
# Remover caracteres especiais
# Normaliza e remove caracteres não ASCII,  afim de facilitar processamento
# Converte para lowercase e remove espaços em branco
def clean_text(text):
    if isinstance(text, str):  # Verificar se é uma string
        text = re.sub(r'[^\w\s]', '', text)
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
        text = text.lower().strip()
    return text

df['title'] = df['title'].apply(clean_text)
df['content'] = df['content'].apply(clean_text)

# Remove registros com conteúdo vazio
df = df[df['content'].str.strip() != '']
df = df[df['title'].str.strip() != '']

# Print da quantidade de linhas após da limpeza do arquivo
print(f"linhas após da limpeza:{len(df)}")

# Path to the output JSON file
file_path = 'data_cleaned.json'

# Grava o df como json para processamento
df.to_json(file_path, orient="records", lines=True, force_ascii=False)


linhas antes da limpeza:2248619
linhas após da limpeza:1366477


In [2]:
# Divide as informações de title e content em perguntas e respostas para treinar o modelo
# Realiza a Tokenizacao
# Retorna o dict que contem as questões e contextos tokenizados e as respectivas posicoes de inicio e fim 
def preprocess_function(examples):
    questions = examples['title']  # Perguntas
    contexts = examples['content']  # Respostas
    
    encodings = tokenizer(questions, contexts, truncation=True, padding=True, max_length=512)
    
    start_positions = []
    end_positions = []

    for i in range(len(examples['content'])):
        response = examples['content'][i]
        
        start_idx = response.find(response)
        end_idx = start_idx + len(response) - 1
        
        start_positions.append(start_idx)
        end_positions.append(end_idx)
    
    encodings['start_positions'] = start_positions
    encodings['end_positions'] = end_positions    
    
    return encodings


In [3]:
from transformers import Trainer, TrainingArguments, AutoModelForQuestionAnswering, AutoTokenizer
from datasets import Dataset
import pandas as pd
import torch

# Carregar o modelo pré treinado 
# SQuAD (Stanford Question Answering Dataset) 
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Carregar o tokenizador
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Converte o arquivo json para treinamento em dataframe
df_to_train = pd.read_json('data_cleaned.json', orient='records', lines=True)

# Seleciona os primeiros X registros para treinamento
dataset = Dataset.from_pandas(df_to_train.head(1000))

# Realiza a tokenizacao para cada entrada do dataset através da função preprocess_function
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Divide o dataset em treino e teste
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

train_data = split_dataset['train']
eval_data = split_dataset['test']

# Define os parametros de treinamento:
training_args = TrainingArguments(
    output_dir='./results_v2',
    eval_strategy='epoch', 
    learning_rate=3e-5,
    per_device_train_batch_size=8, 
    num_train_epochs=4, 
    weight_decay=0.01,
)

# Inicializa o trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data
)

# Treinar o modelo
trainer.train()

# Salva o modelo treinado
trainer.save_model("./results_v2/qa_fine_tuned_model")


tokenizer.save_pretrained("./results_v2/qa_fine_tuned_model")


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

  0%|          | 0/452 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 2.71016263961792, 'eval_runtime': 2.6001, 'eval_samples_per_second': 38.46, 'eval_steps_per_second': 5.0, 'epoch': 1.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 2.5320568084716797, 'eval_runtime': 2.6063, 'eval_samples_per_second': 38.369, 'eval_steps_per_second': 4.988, 'epoch': 2.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 2.561081886291504, 'eval_runtime': 2.6174, 'eval_samples_per_second': 38.206, 'eval_steps_per_second': 4.967, 'epoch': 3.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 2.7695562839508057, 'eval_runtime': 2.6869, 'eval_samples_per_second': 37.218, 'eval_steps_per_second': 4.838, 'epoch': 4.0}
{'train_runtime': 306.4324, 'train_samples_per_second': 11.748, 'train_steps_per_second': 1.475, 'train_loss': 2.23751344933974, 'epoch': 4.0}


('./results_v2/qa_fine_tuned_model\\tokenizer_config.json',
 './results_v2/qa_fine_tuned_model\\special_tokens_map.json',
 './results_v2/qa_fine_tuned_model\\vocab.txt',
 './results_v2/qa_fine_tuned_model\\added_tokens.json',
 './results_v2/qa_fine_tuned_model\\tokenizer.json')

# Execução modelos

Execução do modelo BERT uncased, sem treinamento com arquivo especializado. 

In [6]:
from transformers import BertForQuestionAnswering, BertTokenizer, pipeline


model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")  # Caminho onde você salvou o modelo
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # Ou o tokenizer que você usou

context = "high quality 3 layer ballet tutu 12 inches in length"
question = "girls ballet tutu neon pink" 

question_answerer = pipeline("question-answering" , model=model, tokenizer=tokenizer, device=0)

answer  = question_answerer(question=question, context=context)

print(f"Resposta: {answer}")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


Resposta: {'score': 0.011804047971963882, 'start': 28, 'end': 42, 'answer': 'tutu 12 inches'}


Execução do modelo bert large (SQUAD - Stanford Question Answering Dataset).

In [7]:

model = BertForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")  # Caminho onde você salvou o modelo
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")  # Ou o tokenizer que você usou

context = "high quality 3 layer ballet tutu 12 inches in length"
question = "girls ballet tutu neon pink" 

question_answerer = pipeline("question-answering" , model=model, tokenizer=tokenizer, device=0)

answer  = question_answerer(question=question, context=context)

print(f"Resposta: {answer}")

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


Resposta: {'score': 0.1022646576166153, 'start': 0, 'end': 20, 'answer': 'high quality 3 layer'}


Execução do modelo BERT treinado com o arquivo utilizado no trabalho.

In [4]:
from transformers import BertForQuestionAnswering, BertTokenizer
import torch

model = BertForQuestionAnswering.from_pretrained("./results_v2/qa_fine_tuned_model")  # Caminho onde você salvou o modelo
tokenizer = BertTokenizer.from_pretrained("./results_v2/qa_fine_tuned_model")  # Ou o tokenizer que você usou

context = "high quality 3 layer ballet tutu 12 inches in length"
question = "girls ballet tutu neon pink" 

question_answerer = pipeline("question-answering" , model=model, tokenizer=tokenizer, device=0)

answer  = question_answerer(question=question, context=context)

print(f"Resposta: {answer}")

Device set to use cuda:0


Resposta: {'score': 1.5885715356489527e-06, 'start': 0, 'end': 42, 'answer': 'high quality 3 layer ballet tutu 12 inches'}
