In [39]:
import platform
import re
import openai
import os
import tqdm

from io import BytesIO
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

print('Python: ', platform.python_version())
print('re: ', re.__version__)
print('nltk: ', nltk.__version__)



Python:  3.9.6
re:  2.2.1
nltk:  3.8.1


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\33669\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [40]:
def clean(text):
    '''
    Just a helper fuction to add a space before the punctuations for better tokenization
    '''
    filters = ["!", "#", "$", "%", "(", ")", "/", "*", ":", ";", "<", "=", ">", "?", "@", "[",
               "\\", "]", "_", "`", "{", "}", "~", ",","."]
    text = text.strip('\n ')
    text = text.replace('\n', ' ').replace('\r', ' ').replace('\br', ' ')
    for i in text:
        i = i.replace("'", ' ').replace("`", ' ').replace("°",'')
        if i in filters:
            text = text.replace(i, " "+i+' ')
    text = " ".join(text.split())
    return text


## Extract text from PDF files ##

def pdf2token(loc):
    escape_seq = '\n'
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    codec = 'utf-8'
    data = []
    laparams = LAParams(char_margin=30, line_margin=2, boxes_flow=1)
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    caching = True
    fp = open(loc, 'rb')
    pagenumber = 0
    pagenos=set()
    for pagenumber, page in enumerate(PDFPage.get_pages(fp,pagenos, maxpages=0,caching=True, check_extractable=True)):
        #Process pages one by one into interpreter
        if pagenumber is not None:
            temp_str = ''
            interpreter.process_page(page)
            if len(retstr.getvalue()) < 10:
               continue
            else:
                temp_str = retstr.getvalue().decode('ascii', 'ignore')  # add extracted text from bytesIO to data variable
                temp_str = temp_str.replace('\x0c', ' ')    # Remove useless character
                temp_str = re.sub('\n+', '\n', temp_str)
                data.append(temp_str)
        retstr.truncate(0)
        retstr.seek(0)
                
    print(f' >>>>>>NORMALSTART<<<<<< {escape_seq.join(data)} >>>>>>>>END<<<<<<<')
    #print(f' >>>>>>START<<<<<< {sent_text} >>>>>>>>END<<<<<<<')
    return(data)

In [41]:
########### MAIN Functions ############
escape_seq = '\n'
excel, ppt, word = [], [], []
err_files = []
err_log = []
file_names = []
page_wise_data = []
folder_path = input("Enter folder location: ")

# Walk through all files and subdirectories in the folder
for root, dirs, files in os.walk(folder_path):
    try:
        for file in files:
            if file.endswith(".pdf"):
                pdf_location = os.path.abspath(os.path.join(root, file))
                print(pdf_location)
                page_wise_data = pdf2token(pdf_location)   #Extract text from PDF
                file_names.append(file)
            elif file.endswith(".xlsx") or file.endswith(".csv") or file.endswith(".xlsb"):
                excel.append(file)
            elif file.endswith(".ppt"):
                ppt.append(file)
            elif file.endswith(".docx") or file.endswith(".xlsb"):
                word.append(file)
                
    except Exception as e:
        print(f'///////////////////////////\n\n |Random ERORR| {e}, continuing PDF extraction\n\n/////////////////////////////////////')
        err_files.append(pdf_location)
        err_log.append(e)
        continue   
    
print(f'Files detected in folder:\nX-X-X-X\n\nPDFs\n {escape_seq.join(file_names)} \nX-X-X-X\n Excel\n\n{escape_seq.join(excel)}\nX-X-X-X\nWord Docs\n\n{escape_seq.join(word)}  \nX-X-X-X\nPPTs\n\n{escape_seq.join(ppt)}')
errors = list(zip(pdf_location,err_log))
print(f'\nError logs ||::||\n\n {errors}')

C:\Data\gpt_docs\Company Policies & Procedures\Delegation of Authority  (DOA)\Havells DOA Version 1.17.pdf
 >>>>>>NORMALSTART<<<<<< Havells India Limited
Financial & Operational Delegation of Authority (DOA) 
V 1.17
1
 
Havells India Limited-Approval Matrix
4
3
S No. Index
1 Marcom
2 CSR & Sustainability
Fixed Assets; Capex
a)Annual Capex Budget including New Project.
b)Procurement & Disposal of Assets.
Procurement & Supplier Management 
a)Vendor On-boarding RM, Traded Goods, Consumables & Job Worker
b)Change in Rates, Payment terms
c)Purchase order , Service Order,
d)Air freight
Credit Control Management
a)Opening of New Branch & Sales Office
b)Appointment of Dealer, Distributor, Galaxy, Brand Store, MFR, RR, ECOM, CPC, CSD & Other. (Single SBU & Multiple SBU)
c) Extension/Unblocking- Dealer/Distributor- Product
d) Sales Invoicing, Credit Limit & Customer Overdue.
e)Commercial Return, Claim Settlement etc. 
f)Sales Return, Warranty Return, Out of Warranty Return & Transit Damage
g) Sa

In [43]:
print(page_wise_data[2])


ABSTRACT 
      The tool management enables to get the number of tools in an enterprise its associated components,        
its life , balance life , number of cavity etc. i.e. a history of tools , the expenses incurred due to repair  
on a particular tool or total expense on a particular business location. It also help to get the data to 
analyze the current capacity and to plan for future capacity of the components .The current automa-
tion in tool management in Havells have below features : 
  Proper Mapping of FA Asset Number to Tool Number and component to be manufactured.         
  Tool Life Cycle Management 
  Notification of End of Life  
  Addition of Tool Life on Refurbishing 
  Alert for Preventive Maintenance  
  Capture Tool Wise expenses on Repair, modification, R&D, Preventive Maintenance and Refur-
bishing. 
  Tool History Card  
The manual will help to understand the overall flow and guide for the steps and procedure to manage    
a tool .  
Author Amit Katiyar 
 
 
 


In [46]:

## USE OPENAI API TO GET EMBEDDING DATA ##


openai.api_key ="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"
response = openai.Embedding.create(
  model="text-embedding-ada-002",
  input=page_wise_data[2]
)
embeddings = response['data'][0]['embedding']

print(embeddings)


[-0.019713036715984344, -0.00952419638633728, 0.006858552806079388, -0.03552306070923805, -0.030686721205711365, 0.024860486388206482, -0.033401861786842346, 0.005087349098175764, -0.021480705589056015, -0.0017429209547117352, 0.01221105270087719, 0.003941900096833706, -0.008025214076042175, 0.01582416705787182, 0.002462361939251423, 0.006314110942184925, 0.0138090243563056, -0.02256958931684494, 0.016799919307231903, -8.3190891018603e-05, -0.022725144401192665, 0.01215448696166277, -0.006183303892612457, 0.005182803608477116, -0.014063568785786629, 0.0013284026645123959, 0.0374462865293026, -0.03563619405031204, 0.011624186299741268, 0.013292865827679634, 0.006869159173220396, -0.013073674403131008, -0.00409038458019495, -0.010938331484794617, -0.016191842034459114, -6.580421541002579e-06, 0.013420137576758862, -0.01812920533120632, 0.027504919096827507, -0.0017906479770317674, 0.04406443610787392, 0.01583830825984478, -0.0032631156500428915, 0.008838341571390629, -0.02863622643053531