In [1]:
import functions
import nltk
#nltk.download('punkt')  # Download the necessary tokenizer data (only required once)
import re
import pandas as pd
import numpy as np
import openai
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()
# Read the OpenAI key from the environment; os.environ.get returns None when
# OPENAI_API_KEY is not set, in which case openai.api_key is set to None.
api_key = os.environ.get("OPENAI_API_KEY")
openai.api_key = api_key  # the key is taken from the environment, not hardcoded here

In [2]:
# Input PDF and the question to answer about it.
file_path = "BERT.pdf"
test_query_raw = "What is the key reference of the paper?" # reference related
# Alternative test queries; uncomment one to exercise a different query type.
#test_query_raw = "What performance did BERT achieve on natural language processing tasks?" #direct query
#test_query_raw = "what is the key insight of the proposed method?" #indirect query
raw_text = functions.extract_raw_text_from_pdf(file_path)
# Returns the in-text citations and a modified copy of the text ("cleaned_text").
# NOTE(review): what exactly is changed in cleaned_text is defined in the
# functions module and is not visible here — confirm before relying on it.
in_text_citations, cleaned_text = functions.extract_in_text_citations(raw_text)

In [3]:
# Pair every in-text citation with the context returned by
# functions.get_context for it in the raw text.
# The records are collected first and the DataFrame is built once: growing a
# frame with pd.concat inside the loop copies all previous rows on every
# iteration (quadratic) and concatenates onto an empty frame.
citation_records = [
    {'citation': citation, 'context': functions.get_context(raw_text, citation)}
    for citation in in_text_citations
]
# `columns=` keeps the two expected columns even when no citation was found.
raw_citation_context = pd.DataFrame(citation_records, columns=['citation', 'context'])

#raw_citation_context

In [4]:

# Patterns for the "References" and "Appendix" sections. Each one matches from
# the keyword up to wherever `.*` stops (no DOTALL flag is used).
references_pattern = re.compile(r'References.*')
appendix_pattern = re.compile(r'Appendix.*')

# Capture the References part and any Appendix part of the text.
references = references_pattern.findall(cleaned_text)
appendix = appendix_pattern.findall(cleaned_text)

# Drop the text matched from "References" onwards, then put the captured
# Appendix text back at the end.
text_without_references = references_pattern.sub('', cleaned_text)
cleaned_text0 = text_without_references + ' '.join(appendix)

# Prepend the key words "title" and "author" to the text.
keyword_prefix = "title " + "author "
cleaned_text = keyword_prefix + cleaned_text0

In [5]:
# Split the text on single spaces into word tokens.
all_sentences = cleaned_text.split(' ')
# Build overlapping chunks: a new chunk starts every 250 tokens and holds up
# to 300 tokens, so consecutive chunks share 50 tokens.
chunks = []
for start in range(0, len(all_sentences), 250):
    window = all_sentences[start:start + 300]
    chunks.append(' '.join(window))

In [6]:
# create a dataframe with the chunks
raw_df = pd.DataFrame(chunks, columns=['text'])
# process the sentences
df = raw_df['text'].apply(functions.process_sentence)
df

0     [title, author, bert, pretraining, deep, bidir...
1     [task, featurebased, andﬁnetuning, featurebase...
2     [lefttoright, language, model, pretraining, ml...
3     [objective, used, well, objective, discriminat...
4     [supervised, downstream, task, advantage, appr...
5     [transfer, learning, large, pretrained, model,...
6     [total, parameters110m, bert, large, l24, h102...
7     [figure, 1, denote, input, embedding, e, ﬁnal,...
8     [literature, case, ﬁnal, hidden, vector, corre...
9     [sentence, aandbfor, pretraining, example, 50,...
10    [critical, use, documentlevel, corpus, rather,...
11    [gpu, starting, exact, pretrained, model7we, d...
12    [httpsgluebenchmarkcomleaderboard, number, tas...
13    [obtains, 728, date, writing, ﬁnd, bert, large...
14    [batch, size, 32, table, 2, show, top, leaderb...
15    [742, 771, published, unet, ensemble, 714, 749...
16    [dev, set, maximize, f1, use, triviaqa, data, ...
17    [849, 865, 926, 879, ltr, nsp, 821, 843, 7

In [7]:

# Process the raw query with the same function used for the chunks.
in_test_query = functions.process_sentence(test_query_raw)
# remove "paper","text","article","thesis" from the query
in_test_query = [x for x in in_test_query if x not in ["paper","text","article","thesis"]]

# Indices of chunks sharing at least one word with the query; only filled in
# by the non-reference branch below.
selected_rows = []

# if the query contains 'reference', 'references', 'citation' or 'citations',
# search the citation contexts instead of the text chunks
if any(x in in_test_query for x in ['reference','references','citation','citations']):
    df_chosen_temp = raw_citation_context
    # set the citation as the index of the dataframe
    # NOTE(review): a citation string that occurs more than once produces
    # duplicate index labels here — confirm how the later to_dict() call
    # should treat them.
    df_chosen_temp = df_chosen_temp.set_index('citation')
    # df_chosen should be a Series (citation -> context)
    df_chosen = df_chosen_temp['context']
    # process the sentences
    df_chosen = df_chosen.apply(functions.process_sentence)
    test_query = in_test_query

else:
    # find all the rows containing at least one word in the query
    
    for i in range(len(df)):
        if any(x in df[i] for x in in_test_query):
            selected_rows.append(i)

    # check whether the query words appear directly in the text or not
    if len(selected_rows) >= 1:
        # extract the selected rows from the Series, keeping their index
        df_selected = df.iloc[selected_rows]
        df_chosen = df_selected 
        test_query = in_test_query
    else:
        # no chunk shares a word with the query: search all chunks
        df_chosen = df
         # expand the query with one synonym per word where one is available
        expand_query = in_test_query.copy()
        for word in in_test_query:
            synonyms = functions.find_synonyms(word)
            try:
                expand_query.append(synonyms.pop())
            # NOTE(review): KeyError is what set.pop() raises on an empty set;
            # presumably find_synonyms returns a set — confirm.
            except KeyError:
                continue
        test_query = expand_query   

In [8]:
# Number of chunks that share a word with the query; selected_rows is only
# filled for queries that are not reference/citation related.
print(len(selected_rows))

0


In [9]:
# Read the pre-trained GloVe embeddings: on every line the first field is the
# word and the remaining fields are its vector components.
embeddings_dict = {}
with open("glove/glove.6B.50d.txt", 'r', encoding="utf-8") as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        embeddings_dict[fields[0]] = np.asarray(fields[1:], dtype='float32')

# Flatten the processed chunks into a single list of tokens (the corpus).
corpus = [token for tokens in df for token in tokens]

# The vocabulary is the list of distinct tokens in the corpus.
vocabulary = list(set(corpus))

# Map each vocabulary word to the unigram probability computed for it
# over the corpus.
unigram_probabilities = {
    token: functions.calculate_unigram_probability(corpus, token)
    for token in vocabulary
}

In [10]:

# Convert the chosen Series to a plain dict (index label -> processed tokens),
# the input format passed to functions.sentence_embedding.
df_dict = df_chosen.to_dict() # set in correct input format
# Embed every candidate entry; the result is indexed by the same keys as df_dict.
# NOTE(review): the meaning of the 0.5 argument is defined by
# functions.sentence_embedding and is not visible here — confirm.
df_dict_vec = functions.sentence_embedding(embeddings_dict, df_dict, 0.5, unigram_probabilities)

In [11]:
# Wrap the query in a single-entry dict (key 0) so it can go through the same
# embedding function, with the same arguments, as the candidate entries.
test_query_dict = {0: test_query}
test_query_vec = functions.sentence_embedding(embeddings_dict, test_query_dict, 0.5, unigram_probabilities)

In [12]:
def _top_k_similar_chunks(query, k):
    """Rank the entries of `df_dict_vec` by cosine similarity to `query`.

    Parameters
    ----------
    query
        Embedding of the query, passed as the first argument to
        functions.cosine_similarity.
    k : int
        Maximum number of entries to return.

    Returns
    -------
    list of (key, similarity) tuples
        The `k` best-scoring entries, highest similarity first. Fewer than
        `k` tuples are returned when `df_dict_vec` holds fewer entries.
    """
    scores = {
        key: functions.cosine_similarity(query, vector)
        for key, vector in df_dict_vec.items()
    }
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    # Slicing already copes with len(ranked) < k, so no length guard is needed.
    return ranked[:k]


# get the top 3 similar chunks
def get_top3_similar_chunks(query):
    """Return up to 3 (key, similarity) pairs most similar to `query`.

    The previous implementation sliced with [:2] and therefore returned only
    two entries despite its name.
    """
    return _top_k_similar_chunks(query, 3)


# get the top 5 similar chunks
def get_top5_similar_chunks(query):
    """Return up to 5 (key, similarity) pairs most similar to `query`."""
    return _top_k_similar_chunks(query, 5)

In [13]:
# Inspect the candidate entries the query will be ranked against.
df_chosen

citation
(Peters et al., 2018a; Radford et al., 2018)    [unlike, recent, language, representation, mod...
(Bowman et al., 2015; Williams et al., 2018)    [include, sentencelevel, task, natural, langua...
(Peters et al., 2018a)                          [featurebased, approach, elmo, peter, et, al, ...
(Radford et al., 2018)                          [ﬁnetuning, approach, generative, pretrained, ...
(Vaswani et al., 2017)                          [example, openai, gpt, author, use, lefttorigh...
                                                                      ...                        
(Warstadt et al., 2018)                         [cola, corpus, linguistic, acceptability, bina...
(Cer et al., 2017)                              [stsb, semantic, textual, similarity, benchmar...
(Dolan and Brockett, 2005)                      [2018, paraphrasing, dolan, brockett, 2005, ai...
(Bentivogli et al., 2009)                       [rte, recognizing, textual, entailment, binary...
(Levesque e

In [14]:
# Inspect the unprocessed citation/context table.
raw_citation_context

Unnamed: 0,citation,context
0,"(Peters et al., 2018a; Radford et al., 2018)",Unlike recent language representation models ...
1,"(Bowman et al., 2015; Williams et al., 2018)",These include sentence-level tasks such as na...
2,"(Peters et al., 2018a)","The feature-based approach, such as ELMo (Pe..."
3,"(Radford et al., 2018)","The ﬁne-tuning approach, such as the Generati..."
4,"(Vaswani et al., 2017)","For example, in OpenAI GPT, the authors use a..."
...,...,...
88,"(Warstadt et al., 2018)",CoLA The Corpus of Linguistic Acceptability i...
89,"(Cer et al., 2017)",STS-B The Semantic Textual Similarity Benchma...
90,"(Dolan and Brockett, 2005)",", 2018) and paraphrasing (Dolan and Brockett,..."
91,"(Bentivogli et al., 2009)",RTE Recognizing Textual Entailment is a binar...


In [26]:
# Rank the candidates against the embedded query (stored under key 0 of
# test_query_vec) and keep the best five (key, similarity) pairs.
test5 = get_top5_similar_chunks(test_query_vec[0])
# Key of the best-scoring candidate.
test5[0][0]

'(Peters et al., 2018a; Radford et al., 2018)'

In [27]:
import pandas as pd

def get_index_of_value(df, value):
    """Locate the first cell of `df` that equals `value`.

    Cells are scanned row by row, left to right. The match is found
    positionally on the boolean mask instead of via ``df[mask].stack()``:
    that approach only works while ``stack()`` drops the masked-out NaN
    cells, which depends on the pandas version in use.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to search.
    value
        Value compared against every cell with ``==``.

    Returns
    -------
    tuple or None
        ``(row_label, column_label)`` of the first matching cell, or ``None``
        when no cell equals `value` (including when `df` is empty).
    """
    # Missing comparison results count as "no match".
    mask = (df == value).to_numpy(dtype=bool, na_value=False)
    # nonzero() reports hits in row-major order, i.e. the same order in which
    # stacking the frame would list them.
    row_positions, column_positions = mask.nonzero()
    if row_positions.size == 0:
        return None
    return df.index[row_positions[0]], df.columns[column_positions[0]]
    
# Build the paragraph sent to the model: one "citation:context;" entry for
# each top-ranked citation, concatenated in ranking order.
# Iterating over test5 itself (instead of range(5)) avoids an IndexError when
# fewer than five entries were ranked.
result_parts = []
for citation, _score in test5:
    location = get_index_of_value(raw_citation_context, citation)
    if location is None:
        # The key is not in the citation table (e.g. the ranking was made over
        # text chunks rather than citations): skip it instead of failing on
        # unpacking None.
        continue
    index, column = location
    # `index` is a row label, so look the context up by label with .loc.
    result_parts.append(citation + ":" + raw_citation_context.loc[index, 'context'] + ";")
result = "".join(result_parts)

# # Example usage
# index, column = get_index_of_value(raw_citation_context, test5[0][0])
# # use the index get the context
# result = " " + test5[0][0] + ":"+ raw_citation_context.iloc[index]['context'] + "\n"

result


'(Peters et al., 2018a; Radford et al., 2018): Unlike recent language representation models  (Peters et al., 2018a; Radford et al., 2018) , BERT is designed to pretrain deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers;(Bowman et al., 2015; Williams et al., 2018): These include sentence-level tasks such as natural language inference  (Bowman et al., 2015; Williams et al., 2018)  and paraphrasing (Dolan and Brockett, 2005), which aim to predict the relationships between sentences by analyzing them holistically, as well as token-level tasks such as named entity recognition and question answering, where models are required to produce ﬁne-grained output at the token level (Tjong Kim Sang and De Meulder, 2003; Rajpurkar et al;(Peters et al., 2018a): The feature-based approach, such as ELMo  (Peters et al., 2018a) , uses task-speciﬁc architectures that include the pre-trained representations as additional features;(Rad

In [28]:
# Number of space-separated tokens in the paragraph built from the citations.
len(result.split(' '))

223

In [29]:
def ask_question(paragraph, question, intro='Here are some texts from a paper: '):
    """Ask gpt-3.5-turbo a question about text taken from the paper.

    Parameters
    ----------
    paragraph : str
        Retrieved text from the paper; it is appended directly to `intro`.
    question : str
        The question to answer.
    intro : str, optional
        Sentence placed in front of `paragraph` to tell the model what the
        text is. Defaults to the generic introduction used for text chunks.

    Returns
    -------
    str
        Message content of the first choice in the API response.
    """
    chat_history = [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': intro + paragraph},
        # The question comes from the user, so it is sent with the 'user'
        # role; 'assistant' is the role of the model's own earlier replies.
        {'role': 'user', 'content': question},
    ]

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=chat_history
    )

    return response.choices[0].message.content


question = test_query_raw
#question = "A reference might be used in a paper for different purposes. For example, some are used as representative works in a research direction, some are used as the reference for a dataset. What we are looking for here is a paper that directly influence the design of the research method in the current paper." + test_query_raw

# if the query contains 'reference','references' or 'citation' ,'citations'
if any(x in in_test_query for x in ['reference','references','citation','citations']):

    # get top 5 similar chunks; here the keys are in-text citation strings
    top_5_dict = get_top5_similar_chunks(test_query_vec[0])
    top_5_chunks = [key for key, _similarity in top_5_dict]

    # remove "()" from the in-text citations
    in_text_citations0 = [re.sub(r'[()]', '', x) for x in top_5_chunks]
    # split grouped citations ("A; B") into their individual citations
    in_text_citations1 = [x.split('; ') for x in in_text_citations0]
    # flatten the list
    in_text_citations2 = [item for sublist in in_text_citations1 for item in sublist]
    # count how often each individual citation occurs among the top entries
    in_text_citations_dict = {}
    for single_citation in in_text_citations2:
        in_text_citations_dict[single_citation] = in_text_citations_dict.get(single_citation, 0) + 1

    # order the dictionary by the counts, most frequent first
    in_text_citations_dict = dict(sorted(in_text_citations_dict.items(), key=lambda item: item[1], reverse=True))
    # The tally is only used by the optional debug output below.
    #print(top_5_chunks)
    #print("Answer: The top 5 important references are: \n (listed in order) \n", list(in_text_citations_dict.keys())[:5])

    # `result` is the "citation:context;" paragraph built in an earlier cell.
    paragraph = result
    answer = ask_question(
        paragraph,
        question,
        intro='Here are some texts from a paper, the format is "in-text citations: the content of that citation;"',
    )

else:
    top_3_dict = get_top3_similar_chunks(test_query_vec[0])

    # the keys of the ranking are positions of the chunks in raw_df
    top_3_index = [int(i[0]) for i in top_3_dict]
    # Join the chunks with a space so the last word of one chunk does not
    # fuse with the first word of the next.
    final_chunk = ' '.join(raw_df.iloc[i]["text"] for i in top_3_index)

    paragraph = final_chunk
    answer = ask_question(paragraph, question)

print("Answer:", answer)
    

  similarity = dot_product / (norm_a * norm_b)


Answer: The key reference of the paper is Peters et al., 2018a.
