## Common imports

In [9]:
import pandas as pd
import numpy as np
import json
import os
import torch
import re
import string
from typing import List, Tuple
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import download
import gtts
from IPython.display import Audio


In [None]:
download('punkt')
download('stopwords')

In [None]:
# Reading the dataset into a list

import os
files = []
for dirname, _, filenames in os.walk(os.getcwd() + '/Covid'):
    for filename in filenames:
        if filename.lower().endswith((".json")):
            files.append(os.path.join(dirname, filename))

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class KnowledgeRetriever:
    def __init__(self, knowledge_base_path: str):
        # Load the knowledge base
        self.data = pd.read_csv(knowledge_base_path, header=None, names=['paper_id', 'text', 'relation'])

        # Combine text for each paper_id
        self.combined_text = self.data.groupby('paper_id')['text'].apply(lambda x: ' '.join(x)).reset_index()

        # Vectorize the text using TF-IDF
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.vectorizer.fit_transform(self.combined_text['text'])

    def get_matching_articles(self, query: str, top_n: int = 5):
        """
        Retrieve the top_n articles that match the given query.

        :param query: A preprocessed user query.
        :param top_n: The number of top articles to retrieve.
        :return: A list of tuples containing the article ID and similarity score.
        """
        query_vector = self.vectorizer.transform([query])
        similarity_scores = cosine_similarity(query_vector, self.tfidf_matrix)
        ranked_indices = similarity_scores.argsort().flatten()[::-1]

        top_paper_ids = self.combined_text.iloc[ranked_indices[:top_n]]['paper_id'].tolist()
        top_scores = similarity_scores[0, ranked_indices[:top_n]].tolist()

        return list(zip(top_paper_ids, top_scores))
    
    def get_article_text(self, paper_id):
        for item in files[:20]:  # Remember to match this to the number of articles originally processed
            with open(item, 'r') as object:
                data = object.read()
            obj = json.loads(data)
            paperid = obj['paper_id']

            if paperid == paper_id:  # Filter the appropriate article based on id
                title = obj['metadata']['title']
                abstract = obj['abstract']
                body = obj['body_text']

                article_text = title + " " + " ".join([item['text'] for item in abstract]) + " " + " ".join([item['text'] for item in body])
                break
        else:
            article_text = None

        return article_text


In [12]:
from transformers import BertTokenizer, BertForQuestionAnswering
import tqdm as notebook_tqdm

class AnswerGenerator:
    def __init__(self, model_name = "bert-large-uncased-whole-word-masking-finetuned-squad") :
        # Initialize the device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load the pre-trained BERT model and tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForQuestionAnswering.from_pretrained(model_name)

        # move the model to the device
        self.model.to(self.device)

    def generate_answer(self, question, context) :
        """
        Generate a natural language answer from given question and context.
        :param question: A natural language question.
        :param context: The context from which the answer can be extracted.
        :return: A natural language answer to the question
        """

        # Encode the input text using the tokenizer
        inputs = self.tokenizer.encode_plus(question, context, return_tensors = "pt", max_length = 512, truncation = True)

        # Move the input tensors to the device
        input_ids = inputs["input_ids"].to(self.device)
        token_type_ids = inputs["token_type_ids"].to(self.device)
        attention_mask = inputs["attention_mask"].to(self.device)

        # Pass the input tensors through the model to get the start and end scores for the answer span
        outputs = self.model(input_ids, token_type_ids = token_type_ids, attention_mask = attention_mask)
        start_scores = outputs.start_logits
        end_scores = outputs.end_logits

        #Find the indices of the highest start and end scores
        start_idx = torch.argmax(start_scores).item()
        end_idx = torch.argmax(end_scores).item()

        # Decode the answer  by converting the token IDs back to tokens and then to natural language
        answer = self.tokenizer.convert_tokens_to_string(self.tokenizer.convert_ids_to_tokens(input_ids[0][start_idx:end_idx + 1]))

        return answer

### Pre-processing the text

In [14]:
import re

def preprocess_text(text) :
	# remove special characters
	text = re.sub('[^a-zA-Z0-9]', ' ', text)

	#Replace multiple spaces with a single space
	text = re.sub('\s+', ' ', text).strip()

	return text

In [16]:

def main_loop():
    """
    In the `main_loop()` function, we first create instances of `KnowledgeRetrieval` and `AnswerGenerator`. 
    Then, for each user question, we preprocess the question, retrieve the top relevant articles, and get the text of the most relevant article. 
    Finally, we use the `AnswerGenerator` instance to generate a natural language answer using the BERT model.
    """

    knowledge_retrieval = KnowledgeRetriever('triples.csv')
    answer_generator = AnswerGenerator()
    
    while True:
        question = input("Please enter your question or type 'quit' to exit: ")
        if question.strip().lower() == 'quit':
            break
        
        # Preprocess the question
        question = preprocess_text(question)
            
        # Retrieve the top relevant articles using the get_matching_articles()
        top_articles = knowledge_retrieval.get_matching_articles(question, top_n=1)

        # Retrieve the text of the most relevant article
        most_relevant_article_id, _ = top_articles[0]
        most_relevant_text = knowledge_retrieval.get_article_text(most_relevant_article_id)
        
        # Generate a natural language answer based on BERT
        answer = answer_generator.generate_answer(question, most_relevant_text)
        
        print("Answer: {}".format(answer))
        
        # Check if the user wants spoken responses
        spoken_response = input("Would you like the response to be spoken? (yes/no): ").strip().lower()

        if spoken_response == "yes":
            # Convert the answer text to audio using the text_to_speech() function
            tts = gtts.gTTS(answer, lang="en")
            tts.save("answer.mp3")

            # Play the audio
            display(Audio("answer.mp3", autoplay=True))
        
    print("\n Thank you for using our answering system! Goodbye!")
    
if __name__ == "__main__":
    main_loop()




Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Answer: [SEP]


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Answer: [SEP]



 Thank you for using our answering system! Goodbye!
