In [1]:
# Requirements

import PyPDF2
import pandas as pd 
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize


In [25]:
#  Extract text from PDF.  Save text to file.

pdf_file_name = "The Hitchhiker's Guide To The Galaxy.pdf"

with open(pdf_file_name, 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfFileReader(pdf_file)
    page_nums = pdf_reader.numPages
    pdf_text = ""
    for page_num in range(page_nums):
        page = pdf_reader.getPage(page_num)
        text = page.extractText()
        pdf_text += text
    with open(f'{pdf_file_name}.txt','w', encoding = "utf-8") as output:
        output.writelines(pdf_text)
        output.close()
    pdf_file.close()

In [56]:
#  Extract text and convert to Dataframe for further analysis

import string
special_characters = [':',',','.','!','?','(',')',';','"','~','-',"'"]

df = pd.DataFrame(pdf_text.split(), columns=['Words'])
df['Words'] = df['Words'].str.upper()
df['Words'] = df[df['Words'] != '']  # trying to remove blanks
for i in special_characters:
    df['Words'] = df['Words'].str.replace(i,'')

# Removing anything that doesn't include a letter
df_words = df[df['Words'].str.contains('[A-Za-z]')]

# Removing anything that doesn't include a number and move those to a different Dataframe
df_numbers = df[df['Words'].str.contains('[0-9]')]
df_numbers.rename(columns={'Words': 'Numbers'}, inplace=True)

# Add additional columns to your dataframe
stop_words = set(stopwords.words("english"))
stop_words = list(stop_words)
stop_words_upper = [x.upper() for x in stop_words]
df_words['Stop Word'] = df_words['Words'].isin(stop_words_upper)
df_words['Parts of Speech'] = [x[1] for x in nltk.pos_tag(df_words['Words'])]
POS = [
    ['CC','conjunction, cordinating'],
    ['CD','numeral, cardinal'],
    ['DT','determiner'],
    ['EX','existential there'],
    ['FW','foreign word'],
    ['IN','preposition or conjunction, subordinating'],
    ['JJ','adjective or numeral, ordinal'],
    ['JJR','adjective, comparative'],
    ['JJS','adjective, superlative'],
    ['LS','list item marker'],
    ['MD','modal auxiliary'],
    ['NN','noun, common, singular or mass'],
    ['NNP','noun, proper, singular'],
    ['NNPS','noun, proper, plural'],
    ['NNS','noun, common, plural'],
    ['PDT','pre-determiner'],
    ['POS','genitive marker'],
    ['PRP','pronoun, personal'],
    ['PRP$','pronoun, possessive'],
    ['RB','adverb'],
    ['RBR','adverb, comparative'],
    ['RBS','adverb, superlative'],
    ['RP','particle'],
    ['SYM','symbol'],
    ['TO','as preposition or infinitive marker'],
    ['UH','interjection'],
    ['VB','verb, base form'],
    ['VBD','verb, past tense'],
    ['VBG','verb, present participle or gerund'],
    ['VBN','verb, past participle'],
    ['VBP','verb, present tense, not 3rd person singular'],
    ['VBZ','verb, present tense, 3rd person singular'],
    ['WDT','WH-determiner'],
    ['WP','WH-pronoun'],
    ['WP$','H-pronoun, possessive'],
    ['WRB',' Wh-adverb'],
    ['``','opening quotation mark'],
    ["''",'']
    ]

POS_df = pd.DataFrame(POS, columns=['Parts of Speech', 'Parts of Speech Description'])
df_words = pd.merge(df_words,POS_df,how='left', on='Parts of Speech')

df_words['Length of Word'] = df_words['Words'].apply(len)

# Create your insights file
unique_words = str(df_words['Words'].nunique())
unique_numbers = str(df_numbers['Numbers'].nunique())

popular_nouns = df_words[(df_words['Stop Word'] == False) & (df_words['Parts of Speech'].str.contains('NN'))].filter(items=['Words']).value_counts()[:10].to_string(header=False)
popular_adjectives = df_words[(df_words['Stop Word'] == False) & (df_words['Parts of Speech'].str.contains('JJ'))].filter(items=['Words']).value_counts()[:10].to_string(header=False)
popular_verbs = df_words[(df_words['Stop Word'] == False) & (df_words['Parts of Speech'].str.contains('VB'))].filter(items=['Words']).value_counts()[:10].to_string(header=False)
popular_numbers = df_numbers.value_counts()[:10].to_string(header=False)

with open(f'{pdf_file_name} - Insight.txt', 'w') as output:
    output.write(pdf_file_name + "\n\n")
    output.write("Unique Words:  " + unique_words)
    output.write("\n")
    output.write("Unique Numbers:  " + unique_numbers)
    output.write("\n\n")
    output.write("Popular Nouns:  \n" + popular_nouns)
    output.write("\n\n")
    output.write("Popular Adjectives:  \n" + popular_adjectives)
    output.write("\n\n")
    output.write("Popular Verbs:  \n" + popular_verbs)
    output.write("\n\n")
    output.write("Popular Numbers:  \n" + popular_numbers)
    output.close()


  df['Words'] = df['Words'].str.replace(i,'')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numbers.rename(columns={'Words': 'Numbers'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_words['Stop Word'] = df_words['Words'].isin(stop_words_upper)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_words['Parts of Speech'] = [x[1] for x in nltk.pos_tag(df_words['Words'])]


In [60]:
df_words.groupby(['Parts of Speech Description']).count().filter(items=['Words'])

Unnamed: 0_level_0,Words
Parts of Speech Description,Unnamed: 1_level_1
Wh-adverb,14
WH-determiner,5
WH-pronoun,188
"adjective or numeral, ordinal",296
"adjective, comparative",39
"adjective, superlative",8
adverb,487
"adverb, comparative",3
"adverb, superlative",1
as preposition or infinitive marker,38
