In [1]:
##PDF SCRAPING LIBRARIES

from langchain_community.document_loaders import PDFMinerLoader,PyPDFLoader,PDFMinerPDFasHTMLLoader
import re
from langchain_community.docstore.document import Document


#PDF SCRAPING

##PyPDFLoader###
def scrape_pdf_PyPDF(url):
    

    loader = PyPDFLoader(url,extract_images=True)
    pages = loader.load_and_split()

    return pages




##PDFMinerLoader###
def scrape_pdf_PDFMiner(url):
    

    loader = PDFMinerPDFasHTMLLoader(url)
    data = loader.load()[0]
    
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(data.page_content,'html.parser')
    content = soup.find_all('div')
    
    
    cur_fs = None
    cur_text = ''
    snippets = []   # first collect all snippets that have the same font size
    for c in content:
        sp = c.find('span')
        if not sp:
            continue
        st = sp.get('style')
        if not st:
            continue
        fs = re.findall('font-size:(\d+)px',st)
        if not fs:
            continue
        fs = int(fs[0])
        if not cur_fs:
            cur_fs = fs
        if fs == cur_fs:
            cur_text += c.text
        else:
            snippets.append((cur_text,cur_fs))
            cur_fs = fs
            cur_text = c.text
    #snippets.append((cur_text,cur_fs))



    print(snippets)


    cur_idx = -1
    semantic_snippets = []
    # Assumption: headings have higher font size than their respective content
    for s in snippets:
        # if current snippet's font size > previous section's heading => it is a new heading
        if not semantic_snippets or s[1] > semantic_snippets[cur_idx].metadata['heading_font']:
            metadata={'heading':s[0], 'content_font': 0, 'heading_font': s[1]}
            metadata.update(data.metadata)
            semantic_snippets.append(Document(page_content='',metadata=metadata))
            cur_idx += 1
            continue

        # if current snippet's font size <= previous section's content => content belongs to the same section (one can also create
        # a tree like structure for sub sections if needed but that may require some more thinking and may be data specific)
        if not semantic_snippets[cur_idx].metadata['content_font'] or s[1] <= semantic_snippets[cur_idx].metadata['content_font']:
            semantic_snippets[cur_idx].page_content += s[0]
            semantic_snippets[cur_idx].metadata['content_font'] = max(s[1], semantic_snippets[cur_idx].metadata['content_font'])
            continue

        # if current snippet's font size > previous section's content but less than previous section's heading than also make a new
        # section (e.g. title of a PDF will have the highest font size but we don't want it to subsume all sections)
        metadata={'heading':s[0], 'content_font': 0, 'heading_font': s[1]}
        metadata.update(data.metadata)
        semantic_snippets.append(Document(page_content='',metadata=metadata))
        cur_idx += 1


    return semantic_snippets


In [2]:
#url='./PDF_FOLDER/adaptive_pooling.pdf'
url="./PDF_SOURCE/shannon_51.pdf"

In [3]:
results1=scrape_pdf_PyPDF(url)

In [4]:
results1[7]

Document(metadata={'source': './PDF_SOURCE/shannon_51.pdf', 'page': 5}, page_content='2\n1\n2\n16\n58\n19\n5\n1\n4\n3\n2\n1\n1\n1\n1\n1\n1\n1\n1\n7\n48\n17\n3\n4\n3\n2\n8\n2\n4\n3\n1\n1\n1\n1\n1\n18\n66\n15\n5\n4\n6\n1\n1\n1\n1\n»\n66\n13\n9\n4\n1\n1\n1\n2\n1\n1\n110\n67\n10\n4\n4\n6\n1\n1\n1\n1\n3\n1\n1\nu\n62\n9\n7\n5\n5\n4\n1\n1\n1\n2\n1\n1\n112\n58\n14\n7\n6\n2\n2\n4\n2\n1\n1\n1\n1\n113\n66\n9\n4\n4\n3\n3\n1\n2\n2\n1\n1\n1\n1\n214\n72\n6\n9\n3\n4\n1\n1\n2\n215\n60\n18\n5\n5\n1\n4\n3\n1\n1\n1\n1100\n80\n7\n3\n4\n2\n1\n1\n1\n1\nthe entry 19 in column 6, row 2, means that with five letters known thi cor\nrect letter was obtained on the second guess nineteen times out of the hun\ndred. The first two columns of this table were not obtained by the experi-\nmental procedure outlined above but were calculated directly from the\nknown letter and digram frequencies. Thus with no known letters the most\nprobable symbol is the space (probability .182); the next guess, if this is\nwrong, should

In [5]:
result=scrape_pdf_PDFMiner(url)

[('Prediction and Entropy of Printed English\n', 11), ('By  C.  E.  SHANNON\n(Manuscript  Received  Sept.  75,\nA  new  method  of  estimating  the  entropy  and  redundancy  of  a  language  is\ndescribed.  This  method  exploits  the  knowledge  of  the  language  statistics  pos-\nsessed  by  those who  speak  the  language,  and  depends  on  experimental  results\nin  prediction  of  the  next  letter  when  the  preceding  text  is  known.  Results  of\nexperiments in prediction are given, and some properties of an ideal  predictor are\ndeveloped.\n', 8), ('1.  INTRODUCTION\n', 10), ('IN  A  previous  paper1  the  entropy  and  redundancy  of  a  language  have\n', 26), ('been  defined.  The  entropy  is  a  statistical  parameter  which  measures,\nin  a  certain  sense,  how  much  information  is  produced  on  the  average  for\neach letter of a text in the language. If the language is translated into binary\ndigits (0 or 1) in the most efficient way, the entropy // is the av

In [6]:
len(result)

56

In [7]:
from pyvis.network import Network
import networkx as nx

G = nx.DiGraph()

G.add_node(1, label=result[1].metadata['heading'] , title=result[1].metadata['heading'] , color="red")

for i in range(2,len(result)):
    topic_name="Topic_"+str(i)
    content_name=topic_name+'_'+'content'
    G.add_node(topic_name, label=result[i].metadata['heading'] , title=result[i].metadata['heading'],color="blue")
    G.add_node(content_name, label=content_name , title=result[i].page_content, color="green")
    G.add_edge(1,topic_name)
    G.add_edge(topic_name,content_name)


In [8]:

nt = Network('800px', '2000px',notebook=True)
# populates the nodes and edges data structures
nt.from_nx(G)
nt.toggle_physics(True)
nt.show('nx1.html')


nx1.html


In [49]:
G

<networkx.classes.digraph.DiGraph at 0x7ef0d95a2d40>

In [140]:
G.edges

OutEdgeView([(1, 'Topic_2'), (1, 'Topic_3'), (1, 'Topic_4'), (1, 'Topic_5'), (1, 'Topic_6'), (1, 'Topic_7'), (1, 'Topic_8'), (1, 'Topic_9'), (1, 'Topic_10'), (1, 'Topic_11'), (1, 'Topic_12'), (1, 'Topic_13'), (1, 'Topic_14'), ('Topic_2', 'Topic_2_content'), ('Topic_3', 'Topic_3_content'), ('Topic_4', 'Topic_4_content'), ('Topic_5', 'Topic_5_content'), ('Topic_6', 'Topic_6_content'), ('Topic_7', 'Topic_7_content'), ('Topic_8', 'Topic_8_content'), ('Topic_9', 'Topic_9_content'), ('Topic_10', 'Topic_10_content'), ('Topic_11', 'Topic_11_content'), ('Topic_12', 'Topic_12_content'), ('Topic_13', 'Topic_13_content'), ('Topic_14', 'Topic_14_content')])

nx.html
