# Import packages

In [180]:
import os
import pytesseract
from PIL import Image
import nltk
from nltk.corpus import words
from nltk.metrics.distance import (
    edit_distance,
    jaccard_distance,
    )
from nltk.util import ngrams
nltk.download('words')
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Daksh\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


# Declare Constants

In [195]:
IMG_PATH = "D:\\major-project\\dataset\\archive\\ocr_dataset\\sample\\sample7.jpg"
CORPUS_DATASET_PATH = "D:\\major-project\\dataset\\archive\\corpus\\coca-samples-text"
STOP_WORDS_DATASET_PATH = 'D:\\major-project\\dataset\\archive\\corpus\\stop_words\\stopwords.txt'

# Extract Text from Image

In [196]:
pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
output = pytesseract.image_to_string(IMG_PATH)
print(output)

July 17, 2019

Notice

An Orientation Programme for the newly admitted students will be held on
Saturday, July 20, 2019 at 10.30 a.m. in the lawn near the Seminar Room. All

Teachers and Staff are requested to attend the event.

This will be followed by a department-wise orientation in the respective rooms
allotted to the departments.



In [197]:
text = " ".join(output.replace('*', ' ').split('\n'))
print(text)

July 17, 2019  Notice  An Orientation Programme for the newly admitted students will be held on Saturday, July 20, 2019 at 10.30 a.m. in the lawn near the Seminar Room. All  Teachers and Staff are requested to attend the event.  This will be followed by a department-wise orientation in the respective rooms allotted to the departments. 


# Keyword Extraction

### Corpus Cleaning

In [184]:
def pre_process(text):
    
    #lowercase
    text = text.lower()
    
    #remove tags
    text = re.sub("","",text)
    
    #remove @xyx
    text = re.sub("@*", "", text)
    
    #remove html tags
    text = re.sub("<.*>", "", text)
    
    #remove special characters and digits
    #text = re.sub("(\\d|\\W)+", " ", text)
    
    return text

In [185]:
def find(s, ch):
    return [i for i, ltr in enumerate(s) if ltr == ch]

corpus = []
files = os.listdir(CORPUS_DATASET_PATH)
for path in files:
    with open(CORPUS_DATASET_PATH+"\\"+path, 'r', encoding= 'utf-8') as f:
        curr_corpus = " ".join(f.readlines())
        full_stop_indices = find(curr_corpus, '.')
        start_ind = 0
        for i in range(10, len(full_stop_indices), 10):
            ind = full_stop_indices[i]
            document = curr_corpus[start_ind: ind]
            start_ind = ind+1
            corpus.append(pre_process(document))
        



In [186]:
text = pre_process(text)
print(text)

kirori mal college accounts department  26 october, 2021  notice  it is hereby notified to the students that the last date for submission of online fees for under graduate ii/ilird year and post graduate final year has been extended up to 15.11.2021 the  students are required to deposit their college fees  through on-line payment by using the link: https://www.payumoney.com/webfronts/#/index/kirorimallcollege  


In [187]:
def get_stop_words(stop_file_path):

    with open(stop_file_path, 'r', encoding='utf-8') as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return frozenset(stop_set)

In [188]:
stopwords = get_stop_words(STOP_WORDS_DATASET_PATH)

In [189]:
cv = CountVectorizer(stop_words = stopwords)

In [190]:
word_count_vector = cv.fit_transform(corpus)

In [191]:
print("Vocab size: "+str(len(list(cv.vocabulary_.keys()))))

Vocab size: 116812


In [192]:
tfidf_transformer=TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

In [193]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]
    score_vals = []
    feature_vals = []
    
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

In [198]:
feature_names=cv.get_feature_names()
tf_idf_vector=tfidf_transformer.transform(cv.transform([text]))
sorted_items=sort_coo(tf_idf_vector.tocoo())
keywords=extract_topn_from_vector(feature_names,sorted_items,10)
print("\n=====Doc=====")
print(text)
print("\n===Keywords===")
for k in keywords:
    print(k,keywords[k])


=====Doc=====
July 17, 2019  Notice  An Orientation Programme for the newly admitted students will be held on Saturday, July 20, 2019 at 10.30 a.m. in the lawn near the Seminar Room. All  Teachers and Staff are requested to attend the event.  This will be followed by a department-wise orientation in the respective rooms allotted to the departments. 

===Keywords===
2019 0.398
orientation 0.375
july 0.28
allotted 0.228
seminar 0.21
programme 0.2
respective 0.194
requested 0.184
departments 0.18
lawn 0.177
