In [1]:
import functions
import nltk
#nltk.download('punkt')  # Download the necessary tokenizer data (only required once)
import re
import pandas as pd
import numpy as np
import openai
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()
# Read the OpenAI key from the environment; os.environ.get returns None when
# OPENAI_API_KEY is not set, so a missing key is not reported here.
api_key = os.environ.get("OPENAI_API_KEY")
# Hand the key to the openai module. The key is never hard-coded in this notebook.
openai.api_key = api_key

In [2]:
# PDF to query, given relative to the notebook's working directory
file_path = "input_sample/BERT.pdf"
# Example questions; leave exactly one assignment uncommented.
# The trailing labels name the kind of query each example is meant to exercise.
#test_query_raw = "What is the key reference of the paper?" # reference related
test_query_raw = "How does BERT address the limitations of current techniques?" #direct query
#test_query_raw = "what is the key insight of the proposed method?" #indirect query
# Extract the document text through the project helper
raw_text = functions.extract_raw_text_from_pdf(file_path)
# The helper returns the in-text citations plus a second, modified copy of the text.
# NOTE(review): what exactly is changed in `cleaned_text` is decided inside `functions` - confirm there.
in_text_citations, cleaned_text = functions.extract_in_text_citations(raw_text)  # cleaned_text is the modified text

In [3]:
# Pair every in-text citation with the context that functions.get_context
# returns for it from the raw text.
# The rows are collected first and the frame is built once: calling pd.concat
# inside the loop re-copies the whole frame on every iteration.
citation_rows = [
    {'citation': citation, 'context': functions.get_context(raw_text, citation)}
    for citation in in_text_citations
]
# Passing `columns` keeps both columns present even when no citation was found
raw_citation_context = pd.DataFrame(citation_rows, columns=['citation', 'context'])

#raw_citation_context

In [4]:

# Patterns for the two trailing sections of the paper. No DOTALL flag is used,
# so `.` stops at a newline and each match reaches only to the end of its line.
references_regex = r'References.*'
appendix_regex = r'Appendix.*'

# Text of the "References" section (kept for inspection)
references = re.findall(references_regex, cleaned_text)
# Text of any "Appendix" part
appendix = re.findall(appendix_regex, cleaned_text)
appendix_text = ' '.join(appendix)

# Drop everything matched from "References" onward, then re-attach the appendix
text_before_references = re.sub(references_regex, '', cleaned_text)
cleaned_text0 = text_before_references + appendix_text

# Prepend two keywords to the text.
# NOTE: `cleaned_text` is overwritten here, so running this cell twice
# prepends the keywords twice.
cleaned_text = "title " + "author " + cleaned_text0

In [5]:
# NOTE: despite its name, `all_sentences` holds the space-separated words of the text
all_sentences = cleaned_text.split(' ')

# Slide a 250-word window over the text in steps of 200 words, so that
# neighbouring chunks overlap by 50 words.
CHUNK_SIZE = 250
CHUNK_STEP = 200
chunks = []
for start in range(0, len(all_sentences), CHUNK_STEP):
    window = all_sentences[start:start + CHUNK_SIZE]
    chunks.append(' '.join(window))

In [6]:
# One row per chunk; the untouched chunk text lives in the 'text' column
raw_df = pd.DataFrame({'text': chunks})
# Run every chunk through the project's sentence processor.
# Note that `df` is the resulting Series (one processed value per chunk), not a DataFrame.
chunk_texts = raw_df['text']
df = chunk_texts.apply(functions.process_sentence)
df

0     [title, author, bert, pretraining, deep, bidir...
1     [aim, predict, relationship, sentence, analyzi...
2     [harmful, applying, ﬁnetuning, based, approach...
3     [us, shallow, concatenation, independently, tr...
4     [sentence, lefttoright, generation, next, sent...
                            ...                        
60    [semantically, equivalent, qnli, question, nat...
61    [task, consisting, sentence, extracted, movie,...
62    [report, singletask, ﬁnetuning, result, paper,...
63    [ablation, study, evaluate, effect, different,...
64    [target, token, themask, symbol, mlm, ame, mea...
Name: text, Length: 65, dtype: object

In [7]:

# Normalise the raw question with the same helper that was applied to the chunks
in_test_query = functions.process_sentence(test_query_raw)
# Words that merely refer to the document itself are dropped from the query
in_test_query = [x for x in in_test_query if x not in ["paper","text","article","thesis"]]

# Bound up front so that later cells reading `selected_rows` also work when the
# citation branch below is taken; it stays empty in that case.
selected_rows = []

# if the query contains 'reference','references' or 'citation' ,'citations'
if any(x in in_test_query for x in ['reference','references','citation','citations']):
    # Reference-related query: search the citation contexts instead of the chunks.
    # The result is a Series indexed by citation whose values are the processed contexts.
    df_chosen = (
        raw_citation_context
        .set_index('citation')['context']
        .apply(functions.process_sentence)
    )
    test_query = in_test_query

else:
    # Positions of all chunks that share at least one token with the query.
    # Positions (not index labels) are collected because they are fed to `.iloc` below.
    selected_rows = [
        position
        for position, tokens in enumerate(df)
        if any(x in tokens for x in in_test_query)
    ]

    if selected_rows:
        # Direct query: at least one chunk mentions a query word, so only those
        # chunks are kept (their original index labels are preserved).
        df_selected = df.iloc[selected_rows]
        df_chosen = df_selected
        test_query = in_test_query
    else:
        # Indirect query: no chunk mentions any query word, so every chunk is kept
        # and the query is widened with one synonym per word where one is available.
        df_chosen = df
        expand_query = in_test_query.copy()
        for word in in_test_query:
            synonyms = functions.find_synonyms(word)
            try:
                expand_query.append(synonyms.pop())
            except (KeyError, IndexError):
                # pop() on an empty set raises KeyError, on an empty list IndexError;
                # either way there is no synonym to add for this word
                continue
        test_query = expand_query

In [8]:
# Diagnostic: number of entries in `selected_rows`, the chunk indices collected by the keyword match above
print(len(selected_rows))

53


In [9]:
# Read the pre-trained GloVe vectors into a {word: vector} lookup
embeddings_dict = {}
with open("glove/glove.6B.50d.txt", 'r', encoding="utf-8") as glove_file:
    for entry in glove_file:
        # first whitespace-separated field is the word, the rest are its coefficients
        fields = entry.split()
        embeddings_dict[fields[0]] = np.asarray(fields[1:], dtype='float32')

# Distinct tokens across all processed chunks
vocabulary = list({token for sentence in df for token in sentence})

# Every token occurrence, in chunk order (repeats included)
corpus = [token for sentence in df for token in sentence]

# Unigram probability of each vocabulary word, as computed by the project helper
unigram_probabilities = {
    term: functions.calculate_unigram_probability(corpus, term)
    for term in vocabulary
}

In [10]:

# Turn the selected Series into a plain {index label: processed value} dict,
# which is the form passed to the embedding helper below
df_dict = df_chosen.to_dict() # set in correct input format
# Embed every selected entry.
# NOTE(review): 0.5 is passed positionally; its meaning is defined by functions.sentence_embedding - confirm there.
df_dict_vec = functions.sentence_embedding(embeddings_dict, df_dict, 0.5, unigram_probabilities)

In [11]:
# Wrap the query in a one-entry dict (key 0) so it has the same input form as the chunks
test_query_dict = {0: test_query}
# Embed the query with the same helper and the same arguments used for the chunks;
# the query vector is read back later as test_query_vec[0]
test_query_vec = functions.sentence_embedding(embeddings_dict, test_query_dict, 0.5, unigram_probabilities)

In [12]:
def _rank_similar_chunks(query, k):
    """Return the ``k`` entries of ``df_dict_vec`` most similar to ``query``.

    Every vector in the module-level ``df_dict_vec`` is scored against
    ``query`` with ``functions.cosine_similarity``.

    Args:
        query: the query vector, passed as first argument to the similarity helper.
        k: maximum number of results to return.

    Returns:
        A list of ``(key, similarity)`` tuples sorted by descending similarity.
        It holds fewer than ``k`` items when ``df_dict_vec`` has fewer entries
        (slicing past the end is harmless, so no explicit length check is needed).
    """
    scores = {
        key: functions.cosine_similarity(query, df_dict_vec[key])
        for key in df_dict_vec
    }
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return ranked[:k]


# get the top 3 similar chunks
def get_top3_similar_chunks(query):
    """Return up to three ``(key, similarity)`` pairs, best match first."""
    return _rank_similar_chunks(query, 3)


# get the top 5 similar chunks
def get_top5_similar_chunks(query):
    """Return up to five ``(key, similarity)`` pairs, best match first."""
    return _rank_similar_chunks(query, 5)

## If the query is about references

In [13]:
def ask_question(paragraph, question):
    """Ask the chat model ``question`` about ``paragraph`` and return its reply.

    Args:
        paragraph: text from the paper, sent as context in a user message.
        question: the question to answer about that text.

    Returns:
        The content of the first choice returned by
        ``openai.ChatCompletion.create`` for model ``gpt-3.5-turbo``.
    """
    chat_history = [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': 'Here are some texts from a paper: ' + paragraph},
        # The question is asked by the user. Sending it with the 'assistant' role
        # would present it to the model as one of its own earlier replies.
        {'role': 'user', 'content': question}
    ]

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=chat_history
    )

    return response.choices[0].message.content


# if the query contains 'reference','references' or 'citation' ,'citations'
if any(x in in_test_query for x in ['reference','references','citation','citations']):

    # get top 5 similar citation contexts as (citation, similarity) pairs
    top_5_dict = get_top5_similar_chunks(test_query_vec[0])

    # keep only the keys (the citation strings) of the top 5 entries
    top_5_chunks = [entry[0] for entry in top_5_dict]

    # remove "()" from the citations
    in_text_citations0 = [re.sub(r'[()]', '', x) for x in top_5_chunks]
    # an entry such as "A; B" holds several citations: split it and flatten the result
    in_text_citations2 = [
        item
        for combined in in_text_citations0
        for item in combined.split('; ')
    ]

    # count how often each individual citation occurs
    in_text_citations_dict = {}
    for citation_name in in_text_citations2:
        in_text_citations_dict[citation_name] = in_text_citations_dict.get(citation_name, 0) + 1

    # order the dictionary by the counts, most frequent first
    in_text_citations_dict = dict(sorted(in_text_citations_dict.items(), key=lambda item: item[1], reverse=True))

    print("Answer: The top 5 important references are: \n (listed in order) \n", list(in_text_citations_dict.keys())[:5])

else:
    top_3_dict = get_top3_similar_chunks(test_query_vec[0])

    # get the chunk indices from the keys of the ranked pairs
    top_3_index = [int(i[0]) for i in top_3_dict]
    # Join with a space so the last word of one chunk is not fused with the
    # first word of the next one.
    final_chunk = ' '.join(raw_df.iloc[i]["text"] for i in top_3_index)

    paragraph = final_chunk
    question = test_query_raw

    answer = ask_question(paragraph, question)
    print("Answer:", answer)
    

Answer: BERT (Bidirectional Encoder Representations from Transformers) addresses the limitations of current techniques, particularly the unidirectionality constraint of standard language models. BERT pretrains deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As such, BERT alleviates the previously mentioned unidirectionality constraint by using a deep bidirectional architecture, allowing for better representation of contexts for both sentence-level and token-level tasks. This allows for better performance in these tasks, as BERT can be fine-tuned with just one additional output layer, without substantial task-specific architecture modifications.


## if the query is NOT about references

In [14]:


# top_3_dict = get_top3_similar_chunks(test_query_vec[0])

# # get the index from the dictionary keys
# top_3_index = [int(i[0]) for i in top_3_dict]
# final_chunk = ''
# for i in top_3_index:
#     final_chunk += raw_df.iloc[i]["text"]


# def ask_question(paragraph, question):
#     chat_history = [
#         {'role': 'system', 'content': 'You are a helpful assistant.'},
#         {'role': 'user', 'content': 'Here are some texts from a paper: ' + paragraph},
#         {'role': 'assistant', 'content': question}
#     ]

#     response = openai.ChatCompletion.create(
#         model="gpt-3.5-turbo",
#         messages=chat_history
#     )

#     answer = response.choices[0].message.content
#     return answer

# paragraph = final_chunk
# question = test_query_raw



# answer = ask_question(paragraph, question)
# print("Answer:", answer)

