# Creating Name-Index based on Search Words

Search for specific names in a PDF, retrieve the page numbers on which they appear, and create a DataFrame with the names and their page numbers. Great for first drafts!

In [2]:
import pandas as pd
import more_itertools as mit
import PyPDF2, os
import re

In [7]:
# Testing on a PDF book
file = "2019 Carlson Lundahl - Ett forskningsinstitut expanderar.pdf"
# Binary read mode. The handle is deliberately not closed in this cell because
# pdfReader keeps being used by the cells further down.
pdfFileObj = open(file, mode="rb")
pdfReader = PyPDF2.PdfFileReader(pdfFileObj, strict=False)

summary = "Total number of pages in {}: {}.".format(file, pdfReader.numPages)
print(summary)

Total number of pages in 2019 Carlson Lundahl - Ett forskningsinstitut expanderar.pdf: 824.


# Encoding experiments on the extracted text of a single page (index 8).
pageObj = pdfReader.getPage(8)
text = pageObj.extractText()
# Round-trip through latin-1; 'replace' substitutes '?' for every character
# latin-1 cannot encode.
text_latin = text.encode('latin-1', 'replace').decode('latin-1')
text_latin

# The same page text as UTF-8 encoded bytes.
text_utf8 = pageObj.extractText().encode('UTF-8')
text_utf8

# NOTE: despite the variable name, this rebinds text_utf8 to ASCII bytes;
# 'ignore' silently drops every non-ASCII character.
text_utf8 = pageObj.extractText().encode('ASCII', 'ignore')
text_utf8

# Same page once more; `text` is rebound from str to UTF-8 bytes here.
pageObj = pdfReader.getPage(8)
text = pageObj.extractText().encode('utf-8')

## Searching for specific search words

In [5]:
# Names to look up. They are written in lower case because the page text is
# lower-cased before it is compared against them.
search_words = [
    "naja",
    "wallander",
    "wallenberg",
    "dahmén",
    "bentzel",
]

In [6]:
# Parallel lists: names[i] was found on page pagenumbers[i] (one entry per hit).
names = []
pagenumbers = []

# Extract and tokenise every page exactly once instead of once per search
# word; text extraction is the expensive step and is independent of the word.
# NOTE(review): the range starts at 1, so the page at index 0 is never searched
# and the recorded numbers are the indices passed to getPage() (zero-based in
# PyPDF2) -- confirm this is the numbering wanted for the index.
page_words = []
for pageNum in range(1, pdfReader.numPages):
    pageObj = pdfReader.getPage(pageNum)
    # Lower-case the text as str, not as UTF-8 bytes: bytes.lower() only folds
    # ASCII letters, so upper-case "É", "Å", "Ä" or "Ö" would never match the
    # lower-case search words. str.split() also splits on non-ASCII whitespace.
    page_words.append((pageNum, pageObj.extractText().lower().split()))

for search_word in search_words:
    search_word_count = 0
    # Compare case-insensitively even if a search word is given with capitals;
    # the word is still recorded exactly as written in search_words.
    needle = search_word.lower()
    for pageNum, words in page_words:
        for word in words:
            # Substring match, so tokens that merely contain the name
            # (e.g. with attached punctuation or a suffix) count as hits.
            if needle in word:
                search_word_count += 1
                names.append(search_word)
                pagenumbers.append(pageNum)

    print("The word {} was found {} times".format(search_word, search_word_count))

The word naja was found 67 times
The word wallander was found 584 times
The word wallenberg was found 83 times
The word dahmén was found 297 times
The word bentzel was found 468 times


## Creating DataFrame

In [8]:
# One row per hit, then collapse repeated (Name, Page) pairs so that each page
# is kept at most once per name.
df = pd.DataFrame({'Name': names, 'Page': pagenumbers}).drop_duplicates()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 523 entries, 0 to 1498
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    523 non-null    object
 1   Page    523 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 12.3+ KB


### Creating index-like structure

In [9]:
# Gather all page numbers of a name into a single list (one row per name).
grouped_by_name = df.groupby('Name')
df_gr = grouped_by_name.agg(lambda pages: list(pages))
print('Appearances per person', df_gr, sep='\n', end='\n\n\n')

Appearances per person
                                                         Page
Name                                                         
bentzel     [10, 13, 19, 37, 60, 86, 88, 143, 150, 160, 21...
dahmén      [9, 13, 19, 24, 26, 33, 34, 35, 36, 37, 48, 60...
naja        [7, 13, 100, 212, 220, 221, 222, 223, 225, 227...
wallander   [7, 13, 14, 19, 23, 24, 34, 49, 51, 53, 61, 65...
wallenberg  [9, 15, 30, 33, 34, 88, 194, 197, 198, 199, 20...




### Creating range for consecutive page numbers

In [10]:
## From Stackoverflow with modifications:
## https://stackoverflow.com/questions/2154249/identify-groups-of-continuous-numbers-in-a-list

def find_ranges(iterable):
    """Collapse runs of consecutive numbers into "first-last" strings.

    Yields a number that has no consecutive neighbour unchanged, and a run of
    two or more consecutive numbers as the string "<first>-<last>".
    """
    for run in mit.consecutive_groups(iterable):
        members = list(run)
        first, last = members[0], members[-1]
        if len(members) > 1:
            yield str(first) + "-" + str(last)
        else:
            yield first

# Replace each name's page list with its range-collapsed version.
# NOTE: this overwrites df_gr['Page'] in place, so re-running the cell without
# rebuilding df_gr first feeds already formatted "a-b" strings back into
# find_ranges.
page_ranges = []
for pages in df_gr['Page']:
    page_ranges.append(list(find_ranges(pages)))
df_gr['Page'] = page_ranges

In [11]:
# Preview the first rows of the per-name index built above.
df_gr.head()

Unnamed: 0_level_0,Page
Name,Unnamed: 1_level_1
bentzel,"[10, 13, 19, 37, 60, 86, 88, 143, 150, 160, 21..."
dahmén,"[9, 13, 19, 24, 26, 33-37, 48, 60, 65-95, 138-..."
naja,"[7, 13, 100, 212, 220-223, 225, 227, 279, 315,..."
wallander,"[7, 13-14, 19, 23-24, 34, 49, 51, 53, 61, 65, ..."
wallenberg,"[9, 15, 30, 33-34, 88, 194, 197-202, 204-206, ..."


## Saving

In [12]:
# Write the per-name index to an Excel workbook; the path is relative, so the
# file is created in the current working directory.
df_gr.to_excel('auto_personregister.xlsx')