In [28]:
import pandas as pd
import PyPDF2
from PyPDF2 import PdfReader
# import bertopic
# from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns
from matplotlib import pyplot as plt
import warnings
from transformers import pipeline
import re
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 500)


In [29]:
def extract_text_from_pdfs(pdf_files, max_pages=48):
    """Extract the text of each PDF into a two-column DataFrame.

    Parameters
    ----------
    pdf_files : iterable of str or path-like
        Paths of the PDF files to read. Any iterable works, including the
        lazy generator returned by ``Path.glob``.
    max_pages : int or None, default 48
        Only the first ``max_pages`` pages of each file are read.
        ``None`` reads every page.

    Returns
    -------
    pandas.DataFrame
        One row per input file with columns ``file`` (the file's base name)
        and ``text`` (the concatenated text of the pages that were read).
        The frame is empty, but still has both columns, when ``pdf_files``
        yields nothing.
    """
    # Local import so the function does not rely on a later notebook cell
    # having imported Path already.
    from pathlib import Path

    # Collect plain records and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    records = []

    for pdf_file in pdf_files:
        with open(pdf_file, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)

            num_pages = len(pdf_reader.pages)
            page_limit = num_pages if max_pages is None else min(num_pages, max_pages)

            # `or ""` guards against pages for which no text could be
            # extracted and the reader hands back None instead of a string.
            text = "".join(
                (pdf_reader.pages[page_num].extract_text() or "")
                for page_num in range(page_limit)
            )

        # Path(...) accepts both str and Path inputs; the original
        # `pdf_file.name` failed for plain string paths.
        records.append({'file': Path(pdf_file).name, 'text': text})

    return pd.DataFrame(records, columns=['file', 'text'])


In [30]:
from pathlib import Path

# Folder (relative to the notebook) that holds the source PDF.
path = 'data/'

# Path.glob returns a lazy generator, so `files` can only be consumed once.
files = Path(path).glob(
    "NLP (Natural Language Processing) for NLP (Natural Language Programming).pdf"
)

# One row per matched PDF, with columns `file` and `text`.
df = extract_text_from_pdfs(files)
df

Unnamed: 0,file,text
0,NLP (Natural Language Processing) for NLP (Natural Language Programming).pdf,"NLP (Natural Language Processing)\nfor NLP (Natural Language Programming)\nRada Mihalcea1, Hugo Liu2, and Henry Lieberman2\n1Computer Science Department, University of North Texas\nrada@cs.unt.edu\n2Media Arts and Sciences, Massachusetts Institute of Technology\n{hugo, henry }@media.mit.edu\nAbstract. Natural Language Processing holds great promise for making com-\nputer interfaces that are easier to use for people, since people will (hopefully) be\nable to talk to the computer in their own ..."


In [31]:
# Import the re module for regular expressions
import re

def preprocess_text(text_list, min_words=15):
    """Drop the strings that are too short to be useful sentences.

    Parameters
    ----------
    text_list : iterable of str
        Candidate sentences.
    min_words : int, default 15
        A string is kept only when it has strictly more than ``min_words``
        tokens. Tokens are obtained with ``str.split(" ")``, so consecutive
        spaces produce empty tokens that also count.

    Returns
    -------
    list of str
        The strings that passed the length check, in their original order.
    """
    return [text for text in text_list if len(text.split(" ")) > min_words]


def remove_short_sentences(df):
    """Filter every row's sentence list through ``preprocess_text``.

    The ``sentences`` column of ``df`` is overwritten in place; the same
    frame object is returned so the call can be used in an assignment.
    """
    filtered_sentences = df['sentences'].apply(preprocess_text)
    df['sentences'] = filtered_sentences
    return df

In [32]:
# Flatten line breaks to spaces, then cut each document at every "." to get
# a list of candidate sentences per row.
df['sentences'] = df['text'].map(
    lambda raw_text: raw_text.replace("\n", " ").split(".")
)

# Discard fragments that are too short (see preprocess_text).
df = remove_short_sentences(df)
df

Unnamed: 0,file,text,sentences
0,NLP (Natural Language Processing) for NLP (Natural Language Programming).pdf,"NLP (Natural Language Processing)\nfor NLP (Natural Language Programming)\nRada Mihalcea1, Hugo Liu2, and Henry Lieberman2\n1Computer Science Department, University of North Texas\nrada@cs.unt.edu\n2Media Arts and Sciences, Massachusetts Institute of Technology\n{hugo, henry }@media.mit.edu\nAbstract. Natural Language Processing holds great promise for making com-\nputer interfaces that are easier to use for people, since people will (hopefully) be\nable to talk to the computer in their own ...","[NLP (Natural Language Processing) for NLP (Natural Language Programming) Rada Mihalcea1, Hugo Liu2, and Henry Lieberman2 1Computer Science Department, University of North Texas rada@cs, Natural Language Processing holds great promise for making com- puter interfaces that are easier to use for people, since people will (hopefully) be able to talk to the computer in their own language, rather than learn a specialized language of computer commands, For programming, however, the necessity of ..."


In [33]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer('all-MiniLM-L6-v2')
cosine_threshold = 0.3  # minimum cosine similarity for a sentence to count as a match

queries = ['Natural Language Processing holds great promise for making computer interfaces that are easier to use for people'] #search query

print("\n Semantic Search Results")

# The query does not change between documents, so encode it once up front
# instead of once per document. The reshape collapses the result to a single
# row, which means this cell is only meaningful for exactly one query.
query_embedding = model.encode(queries).reshape((1, -1))

# Each entry is (row position in df, sentence text, cosine similarity).
results = []
for i, document in enumerate(df['sentences']):
    # A document whose sentences were all filtered out has nothing to score.
    if len(document) == 0:
        continue

    sentence_embeddings = model.encode(document)

    # One pairwise call scores every sentence of the document against the
    # query; column 0 is the similarity to that single query.
    similarities = cosine_similarity(sentence_embeddings, query_embedding)[:, 0]

    results.extend(
        (i, sentence, similarity)
        for sentence, similarity in zip(document, similarities)
    )

# Best matches first.
results = sorted(results, key=lambda x: x[2], reverse=True)


 Semantic Search Results


In [34]:
print(f"Query: {queries}")
print("Order by most relevant sentences in corpus:\n")

# Collect the matches as plain records and add them to the frame in a single
# concat: DataFrame.append was removed in pandas 2.0, and appending inside
# the loop copied the whole frame on every match.
match_columns = ['file', 'query', 'sentence', 'cosine_score']
match_records = []

for idx, sentence, distance in results:
    # `distance` holds a cosine similarity: larger means more relevant.
    if distance > cosine_threshold:
        # Look the file name up from the row the sentence came from rather
        # than repeating it as a literal.
        source_file = df['file'].iloc[idx]
        print(f"{sentence.strip()}, \n{source_file}\nCosine Score: {distance:.4f})")
        print('-----------------------')
        match_records.append({
            'file': source_file,
            'query': queries[0],
            'sentence': sentence,
            'cosine_score': distance,
        })

# Passing `columns` guarantees the match columns exist even when nothing
# cleared the threshold, so the cell that selects on `sentence` still works.
# NOTE: re-running this cell appends the same matches again.
df = pd.concat(
    [df, pd.DataFrame(match_records, columns=match_columns)],
    ignore_index=True,
)

Query: ['Natural Language Processing holds great promise for making computer interfaces that are easier to use for people']
Order by most relevant sentences in corpus:

Natural Language Processing holds great promise for making com- puter interfaces that are easier to use for people, since people will (hopefully) be able to talk to the computer in their own language, rather than learn a specialized language of computer commands, 
NLP (Natural Language Processing) for NLP (Natural Language Programming).pdf
Cosine Score: 0.8245)
-----------------------
We thus see natural language programming as a potential large scale end-user (or rather, end- computer) application of text processing tools, which puts forward challenges for the natural language processing community and could eventually trigger advances in this ﬁeld, 
NLP (Natural Language Processing) for NLP (Natural Language Programming).pdf
Cosine Score: 0.6969)
-----------------------
As it turns out, advances in natural language pro

In [36]:
# Only the appended match rows have a non-null `sentence`; show them ranked
# from most to least similar.
(
    df.loc[df['sentence'].notna(), ['file', 'query', 'sentence', 'cosine_score']]
    .sort_values(by='cosine_score', ascending=False)
)

Unnamed: 0,file,query,sentence,cosine_score
1,NLP (Natural Language Processing) for NLP (Natural Language Programming).pdf,Natural Language Processing holds great promise for making computer interfaces that are easier to use for people,"Natural Language Processing holds great promise for making com- puter interfaces that are easier to use for people, since people will (hopefully) be able to talk to the computer in their own language, rather than learn a specialized language of computer commands",0.824523
2,NLP (Natural Language Processing) for NLP (Natural Language Programming).pdf,Natural Language Processing holds great promise for making computer interfaces that are easier to use for people,"We thus see natural language programming as a potential large scale end-user (or rather, end- computer) application of text processing tools, which puts forward challenges for the natural language processing community and could eventually trigger advances in this ﬁeld",0.696854
3,NLP (Natural Language Processing) for NLP (Natural Language Programming).pdf,Natural Language Processing holds great promise for making computer interfaces that are easier to use for people,"As it turns out, advances in natural language processing helped the task of natural language programming",0.662843
4,NLP (Natural Language Processing) for NLP (Natural Language Programming).pdf,Natural Language Processing holds great promise for making computer interfaces that are easier to use for people,But we believe that natural language processing could also beneﬁt from natural lan- guage programming,0.652010
5,NLP (Natural Language Processing) for NLP (Natural Language Programming).pdf,Natural Language Processing holds great promise for making computer interfaces that are easier to use for people,"We then describe in 1Here, the obvious use of programming languages for coding natural language processing sys- tems is not considered as a “meaningful” interaction",0.634438
...,...,...,...,...
80,NLP (Natural Language Processing) for NLP (Natural Language Programming).pdf,Natural Language Processing holds great promise for making computer interfaces that are easier to use for people,"Almost all natural languages are built atop the basic construction called independent clause ,w h i c h at its heart has a who-does-what structure, or subject-verb-di rectObject-i ndirectObject (SVO) construction",0.327958
81,NLP (Natural Language Processing) for NLP (Natural Language Programming).pdf,Natural Language Processing holds great promise for making computer interfaces that are easier to use for people,"While this is still a long term goal, in this section we show how we can automatically generate computer program skeletons that can be used as a starting point for creating procedural computer programs",0.321752
82,NLP (Natural Language Processing) for NLP (Natural Language Programming).pdf,Natural Language Processing holds great promise for making computer interfaces that are easier to use for people,"Note that although all steps, as identiﬁed by the step ﬁnding process, can play the role of informative comments in addition to the programming statements they generate, only those steps that are not explicitly marked as comments by the comment identiﬁcation process can be turned into programming statements",0.310989
83,NLP (Natural Language Processing) for NLP (Natural Language Programming).pdf,Natural Language Processing holds great promise for making computer interfaces that are easier to use for people,"For example, consider the following progression of descriptions and the simplest common denominator representation implied by all utter- ances up to that step",0.310474


In [37]:
# Gather every sentence whose cosine similarity beats the threshold, keeping
# the ranked order of `results`.
texts = [sentence for _, sentence, score in results if score > cosine_threshold]

# Glue the selected sentences together (no separator) into a single string.
final_text = "".join(texts)

In [38]:
from transformers import BertForQuestionAnswering, AutoTokenizer
# Checkpoint identifier shared by the model and its tokenizer.
modelname = 'deepset/bert-base-cased-squad2'
# Load the pretrained question-answering model for that checkpoint.
model_qa = BertForQuestionAnswering.from_pretrained(modelname)
# Load the tokenizer for the same checkpoint. A tokenizer splits text into
# tokens and converts them to integer token IDs using the vocabulary the
# model understands.
tokenizer = AutoTokenizer.from_pretrained(modelname)

Downloading (…)lve/main/config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/152 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [42]:
# Wrap the already-loaded model and tokenizer in a question-answering pipeline.
nlp = pipeline('question-answering', model=model_qa, tokenizer=tokenizer)

# The answer is searched for only inside the sentences selected by the
# semantic search, not inside the whole document.
context = final_text

nlp(question='what are the authors?', context=context)

{'score': 0.02184503711760044,
 'start': 5493,
 'end': 5508,
 'answer': 'Lieberman & Liu'}