# Find Keywords from the document
In this notebook, we are going to extract the keywords from the document shared in the link.

Original Document link is provided below.

Link: http://bit.ly/epo_keyword_extraction_document 

In [1]:
#Importing necessary packages
# For basic string,text operation import following
import re, string, unicodedata

# Natural Language toolkit (nltk) used for text processing
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer,PorterStemmer

#Importing necessary package for pdf to word conversion

import os
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
# From PDFInterpreter import both PDFResourceManager and PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
# Import this to raise exception whenever text extraction from PDF is not allowed
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.converter import PDFPageAggregator





In [2]:
# Extract the text of a PDF with pdfminer. The steps are:
# 1) Open the PDF file in binary mode.
# 2) Parse the file with a PDFParser object.
# 3) Hand the parsed content to a PDFDocument object.
# 4) Process the PDFDocument with a PDFPageInterpreter, which needs a
#    PDFResourceManager and a device (here a PDFPageAggregator).
# 5) Process the file page by page and collect the text of every text
#    box / text line found in each page layout.
#
# The result is left in `extracted_text` and aliased as `raw` for later cells.

# Put your pdf file path here (folder that contains Oreilly.pdf).
base_path = "C:/Users/Dipti_B/Desktop/ds_keyword_assignment"

# Let os.path.join insert the separator instead of concatenating by hand.
my_file = os.path.join(base_path, "Oreilly.pdf")
log_file = os.path.join(base_path, "Oreilly.txt")

password = ""

# Text fragments are collected in a list and joined once at the end, which
# avoids repeated string concatenation inside the page loop.
text_fragments = []

# The `with` block guarantees the file is closed even if parsing or text
# extraction raises part-way through.
with open(my_file, "rb") as fp:
    # Create parser object to parse the pdf content
    parser = PDFParser(fp)

    # Store the parsed content in PDFDocument object
    document = PDFDocument(parser, password)

    # Check if document is extractable, if not abort
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Resource manager object handed to both the device and the interpreter
    rsrcmgr = PDFResourceManager()

    # Set parameters for layout analysis (library defaults)
    laparams = LAParams()

    # The page aggregator is the device that exposes each page's layout
    # objects through get_result()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)

    # Interpreter that processes page content and feeds the device
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process the document page by page
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.get_result()
        # Of the objects in the layout, keep only text boxes and text lines
        for lt_obj in layout:
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                text_fragments.append(lt_obj.get_text())

extracted_text = "".join(text_fragments)
print(extracted_text)
raw = extracted_text

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/Dipti_B/Desktop/ds_keyword_assignment/Oreilly.pdf'

# Step 2:
Importing the data and visualizing it

This step is important for getting insight from the data.

In [None]:
# Show how many characters of text were extracted from the PDF.
# (Uncomment the next line to inspect the raw text itself.)
#print(raw)
raw_char_count = len(raw)
print(raw_char_count)

# Step 3:
Preprocessing the data

Because we want to find keywords in the data, we first have to clean and filter it for further processing.

The data here is text, so we have to remove white space, special characters, symbols, stopwords, etc.


In [None]:
# To find keywords we first have to separate every word out of the document.
# nltk's word tokenizer splits the raw text into a list of tokens.
tokens = nltk.word_tokenize(raw)

In [None]:
# Number of tokens produced by the tokenizer
# (the original run of this notebook reported 5331 tokens in total).
token_count = len(tokens)
print(token_count)

In [None]:
#We can see the tokens data
#print(tokens)

In [None]:
# Punctuation is noise when looking for keywords, so delete every
# punctuation character from every token.
import string

# Translation table that maps each punctuation character to None (= delete it).
table = str.maketrans(dict.fromkeys(string.punctuation))
stripped = list(map(lambda token: token.translate(table), tokens))

# Uncomment to print the first 100 tokens stored in `stripped`.
#print(stripped[:100])

In [None]:
# Keep only the tokens that consist of alphabetic characters; this also drops
# the empty strings left behind where a token was pure punctuation.

words = list(filter(str.isalpha, stripped))
#print(words[:100])

In [None]:
#printing the number of punctuation-free words
#so we can see how much unwanted material has been filtered out
#print(len(words))

In [None]:
# Convert every word to lower case for further processing.
# This step is also called normalization.

words_lower = list(map(str.lower, words))
#print(words_lower[:100])

In [None]:
# Load nltk's English stop-word list; these words are removed in the next cell.
# Uncomment the print below to inspect the list.
stop_words = list(stopwords.words('english'))
#print(stop_words)


In [None]:
# Filter out the stop words.
# The stop-word collection is materialised as a set so each membership test is
# a hash lookup instead of a scan over the whole stop-word list. (The original
# cell built this set but discarded it and tested against the list.)
stop_word_set = set(stopwords.words('english'))
words_stopw_rem = [w for w in words_lower if w not in stop_word_set]
#print(words_stopw_rem[:100])

In [None]:
#printing length of words after removing stop words
#print(len(words_stopw_rem))

In [None]:
# Lemmatizing converts each word to its dictionary form, so that different
# inflections of the same word are counted together.
lemmatizer = WordNetLemmatizer()
words_lemmatized = list(map(lemmatizer.lemmatize, words_stopw_rem))

In [None]:
#print(len(words_lemmatized))
#print(words_lemmatized)

In [None]:
#print(len(set(words_lemmatized)))
# Display the lemmatized words in descending alphabetical order; the sort is
# done on a copy so `words_lemmatized` keeps its document order.
words_descending = list(words_lemmatized)
words_descending.sort(reverse=True)
words_descending

# Step 4:
    
Getting insight from data

All preprocessing tasks are done; now we can explore this data to find the keywords, which is our final goal.

we can also calculate lexical richness of the text

the importance of a specific word, i.e. how frequently it is used

count of each word in this document



In [None]:
# Lexical richness = distinct words as a percentage of all words.
# The original run reported 28.9%, i.e. most words in the document are repeats.
distinct_word_count = len(set(words_lemmatized))
total_word_count = len(words_lemmatized)
distinct_word_count * 100 / total_word_count

In [None]:
# How often does one word occur, as a percentage of the whole text?
# Example word: 'java'
java_occurrences = words_lemmatized.count('java')
100 * java_occurrences / len(words_lemmatized)

In [None]:
# Count how many times each lemmatized word occurs in the document.
words_freqDist = nltk.probability.FreqDist(words_lemmatized)


In [None]:
# Print the 50 most common words, one "word:count" pair per line.
# The count is the number of times that particular word occurs in the document.

top_words = words_freqDist.most_common(50)
for token, count in top_words:
    print(u'{}:{}'.format(token, count))
   
    


# Step 5:

This is the final step

After finding the occurrence of each word now we can find the weight of each word

Here the document is related to Java language

so filtering words by a minimum length would remove 'c', which is itself a programming language

so printing the keywords according to their weights and saving the same in CSV format

this CSV file is stored in the same folder in which this notebook is saved



In [None]:
# Preparing to save the output in a CSV file.
# pandas is used to build and export the table of word frequencies.
# NOTE(review): Counter is imported here but is not used in the cells shown;
# confirm whether it is still needed.
import pandas as pd
from collections import Counter

# Alias the frequency distribution under a short name and display it
# (a bare last expression is rendered as the cell output).
d=words_freqDist
d

In [None]:
# Build a two-column table from the frequency distribution: the word ends up
# in the 'index' column and its raw occurrence count in 'Keywords'.
df = pd.DataFrame.from_dict(d, orient='index').reset_index()
df.columns = ['index', 'Keywords']

# Most frequent words first. The result is assigned (not sorted in place) so
# re-running this cell always rebuilds the same frame from `d`.
# The original also called sort_values(by='index') and discarded the result;
# that statement had no effect and has been removed.
df = df.sort_values(by='Keywords', ascending=False)

# Keep the names used by the original notebook; all three refer to the same frame.
df_new = df
f = df

In [None]:
# Convert each raw count into the word's weight: its share of all lemmatized
# words, as a percentage rounded to two decimals.
# NOTE: this overwrites f['Keywords'], so running this cell twice without
# re-running the previous cell would scale the values a second time.
f['Keywords'] = ((f['Keywords'] * 100) / len(words_lemmatized)).round(2)

print(f)

# Save it in CSV format (written to the notebook's working directory).
# The original referenced `sorted_df`, which was never defined and raised
# NameError; `f` is the frame already sorted by weight, so export that.
sorted_df = f
sorted_df.to_csv("keywords_1.csv")