META 3
PLN - PROCESSAMENTO DE LINGUAGEM NATURAL (LECD) 2023/2024
NOME: Guilherme Cardoso / Francisco Rua

### IMPORTS 

In [None]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from googletrans import Translator

# Load spaCy's small Portuguese pipeline ("pt_core_news_sm").
nlp = spacy.load("pt_core_news_sm") # spaCy model for Portuguese


### CARREGAR BANCO DE DADOS

In [None]:
# Read the dialogue corpus and the four domain databases from JSON files in
# the working directory, in a fixed order that matches the names on the left.
dialogues, hotels, attractions, trains, restaurants = (
    pd.read_json(json_path)
    for json_path in (
        "dialogosCoimbra.json",
        "hotelsCoimbra_db.json",
        "attractionsCoimbra_db.json",
        "trainsCoimbra_db.json",
        "restaurantsCoimbra_db.json",
    )
)

# To inspect a table while exploring, display e.g. `hotels.head()`.

### ANALISAR DADOS

In [None]:
# Missing values, as recorded by the authors from `DataFrame.info()`:
# dialogues, hotels, attractions and trains had none; restaurants had 79,
# spread over different columns.

# A row counts as a duplicate when it repeats an earlier (id, nameMultiWoz) pair.
duplicate_key = ['id', 'nameMultiWoz']

# `total_duplicates` is rebound for each table in turn, so after this cell it
# holds the restaurants count. The authors recorded 0 duplicates for every table.
hotels_duplicates = hotels.duplicated(subset=duplicate_key)
total_duplicates = hotels_duplicates.sum()

attractions_duplicates = attractions.duplicated(subset=duplicate_key)
total_duplicates = attractions_duplicates.sum()

trains_duplicates = trains.duplicated(subset=duplicate_key)
total_duplicates = trains_duplicates.sum()

restaurants_duplicates = restaurants.duplicated(subset=duplicate_key)
total_duplicates = restaurants_duplicates.sum()


### SUBSTITUIÇÃO DOS VALORES EM FALTA

In [None]:
# Replace missing values in the affected restaurant columns with the '!'
# placeholder.
#
# The fill is applied to the DataFrame itself. The previous form,
# `restaurants[col].fillna('!', inplace=True)`, is a chained in-place call on a
# temporary Series: pandas warns about it and, with Copy-on-Write enabled,
# leaves `restaurants` unchanged. Rebinding the name also makes the cell
# idempotent on re-run.
restaurants = restaurants.fillna(
    {column: '!' for column in ('area', 'introduction', 'phone', 'signature')}
)

# restaurants.info()  # authors' note: 0 missing values after the fill

### EXTRAÇÃO DE PERGUNTAS / RESPOSTAS

In [None]:
# Walk every dialogue once and route each turn's utterance by speaker:
# "USER" turns become questions, "SYSTEM" turns become answers. Turns with any
# other (or missing) speaker are ignored. Order of appearance is preserved.
questions = []
answers = []
for turns in dialogues['turns']:
    for turn in turns:
        speaker = turn.get('speaker')
        if speaker == "USER":
            questions.append(turn['utterance'])
        elif speaker == "SYSTEM":
            answers.append(turn['utterance'])

# Inspect with e.g. `questions[:5]` / `answers[:5]`.


### NORMALIZAÇÃO DO TEXTO

In [None]:
import re

def normalize_text(text):
    """Lower-case `text`, drop punctuation and normalise its whitespace.

    Punctuation is removed *before* whitespace is collapsed, so removing a
    free-standing symbol (e.g. the dash in "a - b") cannot leave a double
    space behind. Leading and trailing whitespace is stripped as well.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string: lower-case, containing only word characters
        separated by single spaces.
    """
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # drop everything that is not a word char or whitespace
    text = re.sub(r'\s+', ' ', text)  # collapse runs of whitespace into one space
    return text.strip()

# Normalise every question and answer, keeping their order.
questions = list(map(normalize_text, questions))
answers = list(map(normalize_text, answers))


### ANÁLISE BÁSICA DE PERGUNTAS E RESPOSTAS

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter

# Length of every question and answer, measured in whitespace-separated words.
questions_lengths = [len(words) for words in map(str.split, questions)]
answers_lengths = [len(words) for words in map(str.split, answers)]

print(f"Comprimento médio das perguntas: {pd.Series(questions_lengths).mean()}")
print(f"Comprimento médio das respostas: {pd.Series(answers_lengths).mean()}")

# Word frequencies over the whole corpus (questions followed by answers).
all_words = ' '.join(questions + answers)
word_freq = Counter(all_words.split())

# Render the frequencies as a word cloud.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate_from_frequencies(word_freq)

fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud)
ax.axis('off')
plt.show()


### DIVIDIR DADOS PARA TREINO E TESTE

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the (question, answer) pairs for testing. The fixed
# random_state makes the split reproducible across runs.
train_questions, test_questions, train_answers, test_answers = train_test_split(
    questions,
    answers,
    test_size=0.2,
    random_state=42,
)


### TOKENIZAÇÃO DOS DADOS

In [None]:
from transformers import BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
import torch

# Initialise the BERT tokenizer from the 'bert-base-uncased' checkpoint.
# NOTE(review): the notebook also loads a Portuguese spaCy model; confirm that
# an English uncased checkpoint is the intended choice for this corpus.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def find_answer_positions(tokenizer, contexts, answers):
    """Locate each answer's token span inside its tokenized context.

    Each context is encoded with special tokens and each answer without them,
    and the answer's token ids are searched for as a contiguous run inside the
    context's token ids.

    Args:
        tokenizer: Object exposing `encode(text, add_special_tokens=...)` that
            returns a list of token ids.
        contexts: Iterable of context strings.
        answers: Iterable of answer strings, paired with `contexts` by position.

    Returns:
        A `(start_positions, end_positions)` tuple of lists with one entry per
        (context, answer) pair. Positions are inclusive indices into the
        special-token-inclusive context encoding. When an answer is empty or
        does not occur in its context, a message is printed and both positions
        fall back to 0.
    """
    start_positions = []
    end_positions = []
    for context, answer in zip(contexts, answers):
        context_ids = tokenizer.encode(context, add_special_tokens=True)
        # Encode the answer without special tokens rather than slicing them
        # off afterwards, so no assumption is made about how many are added.
        answer_ids = tokenizer.encode(answer, add_special_tokens=False)
        span = len(answer_ids)

        match_start = None
        # An empty answer would trivially "match" at index 0 and produce an
        # end position of -1, so it is treated as not found.
        if span:
            match_start = next(
                (
                    index
                    for index in range(len(context_ids) - span + 1)
                    if context_ids[index:index + span] == answer_ids
                ),
                None,
            )

        if match_start is None:
            print(f"Answer not found in the context: {context} / {answer}")
            # Fallback: point both positions at the first token.
            start_positions.append(0)
            end_positions.append(0)
        else:
            start_positions.append(match_start)
            end_positions.append(match_start + span - 1)

    return start_positions, end_positions

# Compute the answer spans for both splits and turn them into tensors.
# NOTE(review): the questions themselves are passed as the "contexts" here, so
# a span is only found when the answer text occurs inside the question —
# confirm this is the intended training signal.
train_start_positions, train_end_positions = (
    torch.tensor(positions)
    for positions in find_answer_positions(tokenizer, train_questions, train_answers)
)
test_start_positions, test_end_positions = (
    torch.tensor(positions)
    for positions in find_answer_positions(tokenizer, test_questions, test_answers)
)

# Tokenize the questions of each split as one padded, truncated batch of tensors.
tokenize_options = dict(padding=True, truncation=True, return_tensors='pt')
train_encodings = tokenizer(train_questions, **tokenize_options)
test_encodings = tokenizer(test_questions, **tokenize_options)

# Each dataset item is (input_ids, attention_mask, start_position, end_position).
train_dataset = TensorDataset(
    train_encodings['input_ids'], train_encodings['attention_mask'],
    train_start_positions, train_end_positions,
)
test_dataset = TensorDataset(
    test_encodings['input_ids'], test_encodings['attention_mask'],
    test_start_positions, test_end_positions,
)

# Batches of 8; only the training data is shuffled.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=8)


### TREINAR O MODELO

In [None]:
from transformers import BertForQuestionAnswering, AdamW
from torch.utils.data import RandomSampler, DataLoader

# Load the pre-trained checkpoint wrapped in a question-answering head.
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# NOTE(review): `AdamW` comes from `transformers` in this cell's import line;
# recent transformers releases deprecate it in favour of `torch.optim.AdamW` —
# confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=5e-5)

# Number of full passes over the training data.
num_epochs = 3

# Field names matching the tensor order of each dataset item.
batch_fields = ('input_ids', 'attention_mask', 'start_positions', 'end_positions')

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        # Move the batch to the model's device and name its tensors.
        batch = tuple(tensor.to(device) for tensor in batch)
        inputs = dict(zip(batch_fields, batch))

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Forward pass; with positions supplied the first output is the loss.
        outputs = model(**inputs)
        loss = outputs[0]
        total_loss += loss.item()

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

    # Mean loss over the batches of this epoch.
    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Average training loss: {avg_train_loss}')

# Persist the fine-tuned weights.
model_path = "./bert_for_qa.pth"
torch.save(model.state_dict(), model_path)


### AVALIAÇÃO DO MODELO

In [None]:

# Put the model in evaluation mode before measuring the held-out loss.
model.eval()

total_eval_loss = 0
for batch in test_dataloader:
    # Move the batch to the model's device and name its tensors.
    batch = tuple(tensor.to(device) for tensor in batch)
    inputs = dict(zip(
        ('input_ids', 'attention_mask', 'start_positions', 'end_positions'),
        batch,
    ))

    # Forward pass only; no gradients are tracked during evaluation.
    with torch.no_grad():
        outputs = model(**inputs)

    loss = outputs[0]
    total_eval_loss += loss.item()

# Mean loss over all test batches.
avg_eval_loss = total_eval_loss / len(test_dataloader)
print(f'Average evaluation loss: {avg_eval_loss}')


### EXTRACT KEYWORDS

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Make sure the required NLTK resources are available locally.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

def extract_keywords(question, language='english'):
    """Return the alphabetic, non-stop-word tokens of `question`.

    The question is lower-cased and tokenized with NLTK; tokens that are stop
    words of `language` or that contain non-alphabetic characters are dropped.

    Args:
        question: The question text.
        language: NLTK language name used for both the stop-word list and the
            tokenizer. Defaults to 'english', the previously hard-coded value;
            pass 'portuguese' for Portuguese questions.

    Returns:
        A list of keyword strings in order of appearance.
    """
    stop_words = set(stopwords.words(language))
    word_tokens = word_tokenize(question.lower(), language=language)

    return [
        word
        for word in word_tokens
        if word.isalpha() and word not in stop_words
    ]


In [None]:
def create_keyword_index(dataframe):
    """Build a word -> row-position index over the text columns of a DataFrame.

    Every string cell of every text column is lower-cased and split on
    whitespace. Each resulting word is mapped to the positional row numbers
    (0-based, as produced by `enumerate`) where it occurs. A position is
    recorded once per occurrence, so it may repeat.

    Columns are treated as text when their dtype is `object` or a pandas
    string dtype. The previous `dtype == 'object'` check silently skipped
    string-dtype columns.

    Args:
        dataframe: The DataFrame to index.

    Returns:
        A dict mapping each word to a list of row positions.
    """
    keyword_index = {}
    for coluna in dataframe.columns:
        column = dataframe[coluna]
        dtype = column.dtype
        is_text = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )
        if not is_text:
            continue
        for position, item in enumerate(column):
            # Text columns may still hold non-string cells (e.g. missing values).
            if isinstance(item, str):
                for word in item.lower().split():
                    keyword_index.setdefault(word, []).append(position)
    return keyword_index

# Build one keyword index per domain table.
hotels_index, attractions_index, trains_index, restaurants_index = map(
    create_keyword_index,
    (hotels, attractions, trains, restaurants),
)

def find_context_in_json(keywords, dataframes_indices):
    """Pick the table whose index holds the most hits for any single keyword.

    For every (keyword, table) pair in which the keyword is present in the
    table's index, the number of recorded positions is taken as the score.
    The table of the highest-scoring pair wins. On ties the first pair
    encountered wins (keywords in the outer order, tables in dict order).

    Args:
        keywords: Iterable of keyword strings.
        dataframes_indices: Dict mapping table name -> keyword index
            (word -> list of positions).

    Returns:
        The winning table name, or None when no keyword has a positive count
        in any index.
    """
    scored_hits = (
        (len(index[word]), df_name)
        for word in keywords
        for df_name, index in dataframes_indices.items()
        if word in index
    )
    # max() returns the first maximal element, matching a strict ">" scan.
    best_count, best_name = max(
        scored_hits,
        key=lambda hit: hit[0],
        default=(0, None),
    )
    return best_name if best_count > 0 else None


In [None]:
def get_context_text(df_name, keywords, dataframes):
    """Collect the text cells of one table that mention any of the keywords.

    For every keyword and every text column of the selected table, the cells
    containing the keyword (case-insensitive, literal substring match) are
    gathered. Duplicates are removed while keeping first-seen order, so the
    returned context is the same on every run.

    Args:
        df_name: Key of the table to search in `dataframes`.
        keywords: Iterable of keyword strings.
        dataframes: Dict mapping table name -> DataFrame.

    Returns:
        The matching cell texts joined by single spaces; an empty string when
        nothing matches.

    Raises:
        KeyError: If `df_name` is not a key of `dataframes`.
    """
    df = dataframes[df_name]

    # Text columns are those with object or pandas string dtype.
    text_columns = [
        coluna
        for coluna in df.columns
        if pd.api.types.is_object_dtype(df[coluna].dtype)
        or pd.api.types.is_string_dtype(df[coluna].dtype)
    ]

    context_texts = []
    for keyword in keywords:
        for coluna in text_columns:
            # regex=False: the keyword is user text, not a pattern.
            mask = df[coluna].str.contains(keyword, na=False, case=False, regex=False)
            context_texts.extend(df.loc[mask, coluna].tolist())

    # dict.fromkeys de-duplicates while preserving order (a set would not).
    return " ".join(dict.fromkeys(context_texts))

In [None]:
def answer_question(question, context, model, tokenizer):
    """Extract the answer span for `question` from `context` with a QA model.

    The question/context pair is tokenized (truncating only the context when
    the pair is too long), passed through the model, and the most likely
    start and end tokens inside the context segment are decoded to text.

    The earlier version never called the model: it returned every token after
    the first [SEP], i.e. the whole context.

    Args:
        question: The question text.
        context: The passage expected to contain the answer.
        model: A question-answering model returning `start_logits` and
            `end_logits`, and exposing a `device` attribute.
        tokenizer: The matching tokenizer (callable on a text pair, with
            `sep_token_id` and `decode`).

    Returns:
        The decoded answer string. It may be empty if the predicted span
        covers only special tokens.
    """
    model.eval()
    encoded = tokenizer(
        question,
        context,
        return_tensors='pt',
        add_special_tokens=True,
        truncation='only_second',  # never cut the question, only the context
        max_length=512,  # BERT's maximum sequence length
    )
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    input_ids = inputs['input_ids'][0]

    # Index of the [SEP] token that separates the question from the context.
    sep_index = input_ids.tolist().index(tokenizer.sep_token_id)

    # Restrict the span to the context segment by masking out [CLS], the
    # question tokens and the first [SEP].
    start_logits = outputs.start_logits[0].clone()
    end_logits = outputs.end_logits[0].clone()
    start_logits[: sep_index + 1] = float('-inf')
    end_logits[: sep_index + 1] = float('-inf')

    start = int(torch.argmax(start_logits))
    # The end is searched from the start onwards so the span is never inverted.
    end = start + int(torch.argmax(end_logits[start:]))

    answer = tokenizer.decode(input_ids[start:end + 1], skip_special_tokens=True)

    return answer


In [None]:
def interactive_qa(model, tokenizer, dataframes_indices, dataframes):
    """Run a console question-answering loop until the user types 'sair'.

    Each question is reduced to keywords, the keywords select a table, the
    matching text of that table becomes the context, and the model's answer
    is printed. Any exception raised while handling a question is reported
    and ends the loop.

    Args:
        model: The question-answering model passed to `answer_question`.
        tokenizer: The tokenizer passed to `answer_question`.
        dataframes_indices: Dict mapping table name -> keyword index.
        dataframes: Dict mapping table name -> DataFrame.
    """
    while True:
        try:
            user_question = input("Digite a pergunta (ou 'sair' para encerrar): ")
            if user_question.lower() == 'sair':
                print("Saindo do sistema de QA.")
                break

            terms = extract_keywords(user_question)
            source_name = find_context_in_json(terms, dataframes_indices)

            # No table matched any keyword: report it and ask again.
            if source_name is None:
                print("Não foi possível encontrar um contexto relevante para a pergunta.")
                continue

            passage = get_context_text(source_name, terms, dataframes)
            reply = answer_question(user_question, passage, model, tokenizer)
            print(f"Resposta: {reply}\n")

        except Exception as e:
            print(f"Ocorreu um erro: {e}")
            break

# Keyword index of each domain table, keyed by table name.
dataframes_indices = dict(
    hotels=hotels_index,
    attractions=attractions_index,
    trains=trains_index,
    restaurants=restaurants_index,
)

# The domain tables themselves, under the same keys.
dataframes = dict(
    hotels=hotels,
    attractions=attractions,
    trains=trains,
    restaurants=restaurants,
)

# Start the interactive console session.
interactive_qa(model, tokenizer, dataframes_indices, dataframes)
