In [1]:
# PDF2TEXT

import pandas as pd
import numpy as np
import string
import os, io

from pathlib import Path
import requests
    
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

def get_pdf_from_url(pdf_path, pdf_url, timeout=30):
    """Download ``pdf_url`` and write the response body to ``pdf_path``.

    Parameters
    ----------
    pdf_path : str or path-like
        Destination file; it is created or overwritten.
    pdf_url : str
        Address of the PDF to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Without this check an
        HTML error page would be saved and later fed to the PDF parser.
    """
    response = requests.get(pdf_url, timeout=timeout)
    response.raise_for_status()
    Path(pdf_path).write_bytes(response.content)
    
def pdf_info(pdf_path):
    """Return the first document-information dictionary of a PDF file.

    The file is opened in binary mode, parsed with pdfminer and closed again
    before returning. An empty dict is returned when the document carries no
    information dictionary.
    """
    # Local import: only this function needs it.
    from pdfminer.pdftypes import resolve1

    with open(pdf_path, 'rb') as pdf:
        info = PDFDocument(PDFParser(pdf)).info
        if not info:
            return {}
        # Resolve values while the file is still open so the returned dict
        # does not hold references that need the closed stream.
        return {key: resolve1(value) for key, value in info[0].items()}

def pdf2txt(pdf_path):
    """Extract the text of every page of the PDF at ``pdf_path``.

    Each page is fed through a pdfminer ``TextConverter`` that writes into an
    in-memory string buffer; the accumulated buffer content is returned.
    The file, the converter and the buffer are closed even when parsing fails.
    """
    resource_manager = PDFResourceManager()
    string_buffer = io.StringIO()
    converter = TextConverter(resource_manager,
                              string_buffer,
                              laparams=LAParams(),
                              codec='utf-8')
    try:
        interpreter = PDFPageInterpreter(resource_manager, converter)
        with open(pdf_path, 'rb') as pdf:
            # get_pages yields pages lazily, so the file must stay open
            # for the whole loop.
            for page in PDFPage.get_pages(pdf, check_extractable=True):
                interpreter.process_page(page)
        # Read the text before the buffer is closed in the finally block.
        return string_buffer.getvalue()
    finally:
        converter.close()
        string_buffer.close()



In [2]:
# TEXT CLEANER

from bs4 import BeautifulSoup
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Stop-word list: NLTK's English words plus the retweet markers.
# Note: this rebinds the name `stopwords` from the corpus reader to a plain list.
stopwords = nltk.corpus.stopwords.words('english') + ['RT', 'rt']

# Shared lemmatizer instance handed to lemmatize_words by the cleaners.
WNL = WordNetLemmatizer()

def remove_mentions(text):
    """Drop every space-separated token that starts with '@'."""
    kept = []
    for token in text.split(' '):
        if token.startswith('@'):
            continue
        kept.append(token)
    return ' '.join(kept)
def remove_url(text):
    """Remove http/https URLs from ``text``.

    A URL runs up to the next whitespace, quote or angle bracket, so query
    strings, dashes, underscores and fragments are removed together with the
    host instead of being left behind. Stopping at quotes and angle brackets
    keeps surrounding HTML markup intact.
    """
    return re.sub(r"https?://[^\s<>\"']+", '', text)
def html_strip_lxml(text):
    """Parse ``text`` with BeautifulSoup's 'lxml' parser and return its text content."""
    soup = BeautifulSoup(text, 'lxml')
    return soup.get_text()
def remove_special_characters(text, preserve):
    """Replace every character that is not an ASCII letter with a space.

    Parameters
    ----------
    text : str
        Input text.
    preserve : str
        Extra characters to keep, e.g. ``'.!?'``. Each character is taken
        literally: it is escaped before being placed in the character class,
        so ``]``, ``\\``, ``^`` or ``-`` cannot break or alter the pattern.
    """
    pattern = "[^a-zA-Z{}]".format(re.escape(preserve))
    return re.sub(pattern, " ", text)
def lowercase_text(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered
def strip_inner_spaces(text):
    """Collapse every run of whitespace to a single space and trim both ends."""
    tokens = text.split()
    return ' '.join(map(str.strip, tokens))
def remove_stop_words(text):
    """Drop every whitespace-separated token found in the module-level ``stopwords``.

    Matching is exact and case-sensitive. The lookup set is built once per
    call instead of once per token.
    """
    stop_set = set(stopwords)
    return ' '.join(w for w in text.split() if w not in stop_set)
def lemmatize_words(text, WNL):
    """Lemmatize each whitespace-separated token of ``text`` as a verb.

    ``WNL`` is any object exposing ``lemmatize(word, pos=...)``; the results
    are joined back with single spaces.
    """
    lemmas = []
    for token in text.split():
        lemmas.append(WNL.lemmatize(token, pos='v'))
    return ' '.join(lemmas)

# def cleaner(text):
#     text = remove_mentions(text)
#     text = remove_url(text)
#     text = html_strip_lxml(text)
#     text = remove_special_characters(text,preserve='.')
#     text = lowercase_text(text)
#     text = strip_inner_spaces(text)
#     text = remove_stop_words(text)
#     #text = lemmatize_words(text, WNL)
#     return text

def sentence_tokenizer_cleaner(text):
    """Light clean-up applied before sentence tokenization.

    Removes URLs, strips HTML markup and normalizes whitespace. Punctuation
    and letter case are left untouched (the special-character step is
    disabled here).
    """
    for step in (remove_url, html_strip_lxml, strip_inner_spaces):
        text = step(text)
    return text

# def custom_cleaner(text):
#     text = remove_url(text) # Dont use for link catch-up
#     text = html_strip_lxml(text) # Dont use to split layout paragraphs
#     text = remove_special_characters(text, preserve='') #='.?!') #234567890') # Dont use for dates
#     text = lowercase_text(text) # Dont for entities
#     text = strip_inner_spaces(text)
#     text = remove_stop_words(text)
#     #text = lemmatize_words(text, WNL)
#     return text

def BOW_cleaner(text):
    """Full clean-up for bag-of-words features, including lemmatization.

    Steps, in order: remove URLs, strip HTML, replace non-letters with
    spaces, lower-case, normalize whitespace, drop stop words, lemmatize
    with the shared ``WNL`` instance.
    """
    without_markup = html_strip_lxml(remove_url(text))
    letters_only = remove_special_characters(without_markup, preserve='')
    normalized = strip_inner_spaces(lowercase_text(letters_only))
    return lemmatize_words(remove_stop_words(normalized), WNL)

def EXPLORATION_cleaner(text):
    """Clean-up for exploration: same steps as BOW_cleaner, without lemmatization.

    URLs, HTML, non-letter characters, upper case and stop words are all
    removed, so links, dates and entity casing do not survive.
    """
    steps = (
        remove_url,
        html_strip_lxml,
        lambda t: remove_special_characters(t, preserve=''),
        lowercase_text,
        strip_inner_spaces,
        remove_stop_words,
    )
    for step in steps:
        text = step(text)
    return text

def TO_SUMMARY_cleaner(text):
    """Clean-up used ahead of summarization (no lemmatization).

    Applies, in order: URL removal, HTML stripping, replacement of
    non-letters with spaces, lower-casing, whitespace normalization and
    stop-word removal.
    """
    stripped = remove_url(text)
    stripped = html_strip_lxml(stripped)
    letters = remove_special_characters(stripped, preserve='')
    compact = strip_inner_spaces(lowercase_text(letters))
    return remove_stop_words(compact)

# Usage on a single string (any of the cleaners defined above works the same way)
#clean = BOW_cleaner('your text')

# Usage on a dataframe column
#df['clean_COLUMN_NAME'] = df.COLUMN_NAME.apply(BOW_cleaner)

In [3]:
# PDF TEXT TO SENTS

from nltk.tokenize import sent_tokenize

def text_to_sentences_df(text, sent_min_chars):
    """Split ``text`` into sentences and keep those with enough cleaned content.

    Parameters
    ----------
    text : str
        Raw text, e.g. the output of pdf2txt.
    sent_min_chars : int
        A sentence is kept only if its cleaned form is longer than this
        many characters.

    Returns
    -------
    pandas.DataFrame
        Columns ``initial_sentences`` (sentence as tokenized) and
        ``clean_sent`` (cleaned sentence), with a fresh 0..n-1 index.
    """
    sentences = sent_tokenize(sentence_tokenizer_cleaner(text))

    # Workaround for the pandas & spacy integration issue
    # (https://github.com/jupyter/notebook/issues/4369).
    # get_ipython only exists inside IPython, so skip it elsewhere.
    try:
        get_ipython().config.get('IPKernelApp', {})['parent_appname'] = ""
    except NameError:
        pass

    # EXPLORATION_cleaner runs the same steps as the commented-out
    # custom_cleaner that was referenced here, which is not defined.
    cleaned = [EXPLORATION_cleaner(sentence) for sentence in sentences]
    df = pd.DataFrame({'initial_sentences': sentences, 'clean_sent': cleaned})

    # A boolean array with .loc also works when there are no sentences at all.
    keep = np.array([len(c) > sent_min_chars for c in cleaned], dtype=bool)
    return df.loc[keep].reset_index(drop=True)

In [4]:
# BAG OF WORDS

from collections import Counter
from itertools import chain

def bag_of_words(df, column, word_min_chars, my_words=None, paper_words=None,
                 min_occurrences=2):
    """Add per-word count columns and a bag-of-words sentence ranking to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sentence. It is modified in place and also returned.
    column : str
        Column holding the cleaned sentence strings.
    word_min_chars : int
        Only words longer than this many characters are counted.
    my_words, paper_words : iterable of str, optional
        Keywords that keep their column and weight even when they occur
        fewer than ``min_occurrences`` times. When omitted, module-level
        variables of the same names are used if they exist (the previous
        behaviour relied on those globals); otherwise nothing is protected.
    min_occurrences : int, optional
        Words occurring fewer times than this in the whole text are dropped
        unless protected.

    Returns
    -------
    pandas.DataFrame
        ``df`` with one integer count column per retained word (in order of
        first appearance) and a ``RANKING_BOW1`` column holding, for each
        sentence, the sum of the weights of its words.
    """
    # Backward-compatible fallback to the notebook-level keyword lists.
    if my_words is None:
        my_words = globals().get('my_words', ())
    if paper_words is None:
        paper_words = globals().get('paper_words', ())
    protected = set(my_words) | set(paper_words)

    sentences = df.loc[:, column].to_list()
    tokenized = [[w for w in s.split() if len(w) > word_min_chars]
                 for s in sentences]

    # Occurrences in the whole text; Counter keeps first-appearance order.
    totals = Counter(chain.from_iterable(tokenized))

    # Keep frequent words and any protected keyword.
    kept_words = [w for w, n in totals.items()
                  if n >= min_occurrences or w in protected]

    # Per-sentence count vectors, written positionally so the result does
    # not depend on the frame's index labels.
    per_sentence = [Counter(tokens) for tokens in tokenized]
    for word in kept_words:
        df[word] = [counts[word] for counts in per_sentence]

    # Weight = occurrences relative to the (truncated) median count of the
    # five most frequent retained words. Dropped words weigh 0.
    word_weights = dict.fromkeys(totals, 0)
    if kept_words:
        kept_totals = Counter({w: totals[w] for w in kept_words})
        top_counts = [n for _, n in kept_totals.most_common(5)]
        occurence_weight = int(np.median(top_counts))
        for word in kept_words:
            word_weights[word] = totals[word] / occurence_weight

    # Sentence value: sum of the weights of all its words.
    df['RANKING_BOW1'] = [sum(word_weights.get(w, 0) for w in s.split())
                          for s in sentences]

    return df

In [19]:
# AS A WHOLE

def pdf_summary_df(pdf_url, my_words, paper_words,
                N_sentences=5,sent_min_chars=20,word_min_chars=2):
    """Download a PDF and build the ranked-sentence dataframe for it.

    The PDF is saved as 'MY_PDF.pdf' in the current working directory, its
    text is split into sentences, the bag-of-words ranking is added, and the
    resulting frame is written to 'PDF_CSV.csv' before being returned.

    ``my_words``, ``paper_words`` and ``N_sentences`` are accepted but not
    referenced in this function's body.
    """
    local_pdf = os.path.join(os.getcwd(), 'MY_PDF.pdf')
    get_pdf_from_url(local_pdf, pdf_url)

    # The metadata is read but its result is not used afterwards.
    pdf_info(local_pdf)

    sentences_df = text_to_sentences_df(pdf2txt(local_pdf), sent_min_chars)
    ranked_df = bag_of_words(sentences_df, 'clean_sent', word_min_chars)

    ranked_df.to_csv('PDF_CSV.csv', sep=',', index=False)
    return ranked_df

def pdf_summary(df, N_sentences):
    """Join the ``N_sentences`` highest-ranked sentences into one string.

    Rows are ordered by ``RANKING_BOW1`` (descending) and the matching
    ``initial_sentences`` values are concatenated with single spaces.
    """
    ranked = df.sort_values(by='RANKING_BOW1', ascending=False)
    top_sentences = ranked['initial_sentences'][:N_sentences]
    return ' '.join(top_sentences)

In [20]:
# SCRIPT OPEN

# Number of sentences in the generated summary
N_sentences = 5

# PDFs used while testing; the last entry is the one that gets summarized.
TEST_PDF_URLS = [
    'https://arxiv.org/pdf/1904.00157.pdf',
    'https://arxiv.org/pdf/1806.09525.pdf',
    'https://assets.cureus.com/uploads/technical_report/pdf/17211/1553801088-20190328-62-44vky7.pdf',
    'https://arxiv.org/pdf/1904.00110.pdf',
]

# Target PDF URL
pdf_url = TEST_PDF_URLS[-1]

# User defined keyword pick
my_words = ['ai','psychology']

# Paper - related keywords (predefined)
paper_words = ['title', 'subject',
                'author',
                'keyword', 'keywords', 'category', 'categories',
                'abstract', 
                'introduction', 
                'intro', 
                'method', 'methods',
                'result', 'results',
                'discussion',
                'conclusions', 'conclusion',
                'acknowledgement', 'acknowledgements',
                'references', 'citation', 'citations']

# Get summary (N_sentences is passed through instead of repeating the literal)
summary_df = pdf_summary_df(pdf_url, my_words, paper_words,
                            N_sentences=N_sentences, sent_min_chars=20, word_min_chars=2)
summary = pdf_summary(summary_df, N_sentences)
summary

'We trained them with a newly created dataset of 2.2 million article titles, ab- stracts and keyphrase strings that we processed and released.1 The selected text summarization models are compared with popular unsupervised and super- vised methods using ROUGE (Lin, 2004) and full- match F1 metrics. Despite using advanced deep learning models, large quantities of data and many days of com- putation, our systematic evaluation on four test datasets reveals that the explored text sum- marization methods could not produce bet- ter keyphrases than the simpler unsupervised methods, or the existing supervised ones. 3.2 Text Summarization Methods To overcome the three problems mentioned in Sec- tion 3.1, we explore abstractive text summariza- tion models proposed in the literature, trained with article abstracts and titles as sources and keyword strings as targets. Motivated by recent advances in neural ma- chine translation and abstractive text summariza- tion (Vaswani et al., 2017; Foster et a

In [9]:
# # FOR FURTHER USE - GET ADJS NOUNS HEADS ETC

# # EXPLORE: Word parts

# import spacy
# from itertools import chain
# from collections import Counter
# nlp = spacy.load('en_core_web_sm')


# all_nouns = []
# all_adjs = []
# for sent in all_sents:

#     sent_nouns = [w.text for w in nlp(sent) if w.tag_ in ['NN']]
#     all_nouns.extend(sent_nouns)

#     sent_adjs = [w.text for w in nlp(sent) if w.tag_ in ['JJ']]
#     all_adjs.extend(sent_adjs)    


# #Counter(all_words).most_common(50)
# #Counter(all_nouns).most_common(30)
# Counter(all_adjs).most_common(30)

# # ALSO

# # df['links'] = '' 
# # df['section'] = ''
# # df['dates'] = ''
# # df['entities'] = ''
# # df['x'] = ''



# # # STILL NOT SURE WHAT I NEED THIS FOR
# # # General text info

# # # Text
# # text = pdf_text

# # all_chars = text
# # uni_chars = set(text)

# # all_words = text.split()
# # uni_words = set(text.split())

# # all_sents = text.split('.')
# # uni_sents = set(text.split('.'))

[('able', 16),
 ('computational', 12),
 ('rst', 10),
 ('open', 9),
 ('recent', 8),
 ('key', 8),
 ('original', 7),
 ('relevant', 7),
 ('new', 6),
 ('second', 6),
 ('third', 6),
 ('good', 6),
 ('reproducible', 6),
 ('erent', 5),
 ('many', 4),
 ('simple', 4),
 ('excellent', 4),
 ('main', 4),
 ('repository', 4),
 ('available', 4),
 ('great', 3),
 ('several', 3),
 ('online', 3),
 ('full', 3),
 ('generic', 3),
 ('literate', 3),
 ('pro', 3),
 ('konkol', 3),
 ('kray', 3),
 ('analy', 3)]