In [1]:
# Uncomment and run the install commands in this section to install the required packages.
# See https://github.com/jsvine/pdfplumber for more explanations on how to use pdfplumber.

# !pip install pdfplumber
# !pip install nltk

In [2]:
import ast
import os
import re
import string

import nltk
import pdfplumber
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.stem import PorterStemmer
from nltk.stem import RegexpStemmer
from nltk.stem.wordnet import WordNetLemmatizer

In [3]:
# NLTK might require some additional downloads: I may have forgotten some of them, but each error caused by a
# missing component should advise a specific Python statement, such as the following, to download what is needed.

# nltk.download('words')

# https://www.sandwichpdf.com/ : text-searchable PDF editor. Input a PDF with text under an image, and it will
# transform the PDF into a text-searchable PDF file that can be processed with the following scripts.

In [4]:
# Use this function to know what is inside the different folders
def update_folder_contents():
    """List the relevant files currently present in the three working folders.

    The folders are looked up under the current working directory.

    Returns:
        A tuple ``(pdf_files, txt_files, dict_files)`` of file-name lists:
        the ``.pdf`` entries of ``manuals_pdf``, the ``.txt`` entries of
        ``manuals_txt`` and the ``.txt`` entries of ``manuals_dict``.
    """
    def _entries_ending_with(subfolder, suffix):
        # Same lookup for every folder: list it, keep the names with the suffix.
        return [entry for entry in os.listdir(os.getcwd() + subfolder)
                if entry.endswith(suffix)]

    pdf_names = _entries_ending_with("/manuals_pdf", '.pdf')
    txt_names = _entries_ending_with("/manuals_txt", '.txt')
    dict_names = _entries_ending_with("/manuals_dict", '.txt')
    return pdf_names, txt_names, dict_names

# Use this function to determine the PDF files that need their text to be extracted or the TXT files that must be transformed into dictionary
def check_new_works():
    """Work out which files still have to go through the pipeline.

    Returns:
        A tuple ``(pdf_to_txt_needed, txt_to_dict_needed)``:
        the PDF file names with no ``<stem>.txt`` among the text files, and
        the text file names with no ``<stem>_dict.txt`` among the dictionary
        files. Both lists keep the order returned by
        ``update_folder_contents``.
    """
    pdf_files, txt_files, dict_files = update_folder_contents()

    def _stem(name):
        # File name without its extension.
        return os.path.splitext(name)[0]

    pdf_to_txt_needed = [
        pdf_name for pdf_name in pdf_files
        if _stem(pdf_name) + '.txt' not in txt_files
    ]
    txt_to_dict_needed = [
        txt_name for txt_name in txt_files
        if _stem(txt_name) + '_dict.txt' not in dict_files
    ]
    return pdf_to_txt_needed, txt_to_dict_needed


In [5]:
# Convert a PDF file into a text file
def pdf_to_txt(filename):
    """Extract the text of ``manuals_pdf/<filename>`` into ``manuals_txt/<stem>.txt``.

    The text of every page that yields some text is joined with a single
    space and written as UTF-8. Nothing is done (besides printing
    "Already done") when the target text file already exists.

    Args:
        filename: name of the PDF file inside the ``manuals_pdf`` folder.

    Returns:
        None. Progress and problems are reported with ``print``.
    """
    txt_path = 'manuals_txt/' + os.path.splitext(filename)[0] + '.txt'

    # Look at the file system rather than at a notebook-level list of text
    # files: the function then works on a fresh kernel and never sees a stale
    # listing.
    if os.path.isfile(txt_path):
        print("Already done")
        return

    with pdfplumber.open('manuals_pdf/' + filename) as pdf:
        page_texts = []
        for page in pdf.pages:
            # Extract once per page; pages without text give a falsy result.
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)

    text = ' '.join(page_texts)
    if text == '':
        # The (empty) text file is still written below, as before.
        print("Report problem with pdf : " + str(filename))
    with open(txt_path, 'w', encoding='utf-8') as filehandle:
        filehandle.write(text)
    print("Done : " + str(filename))


In [6]:
# Clean the txt files we just created
# Word lists of interest
# English stop words and punctuation characters removed by clean().
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
# Normalisers tried by english_words_only() when a word is not recognised as is.
lemma = WordNetLemmatizer()
porter = PorterStemmer()
regex = RegexpStemmer('ing$|s$|e$|able$', min=4)
# Customise this list with the words to KEEP because they matter for the topic modelling tasks down the road.
# It MUST include the words we want to keep as well as the STEMS of the words we want to keep.
# Run the following section with a specific word to find out whether the word itself, its stem or its lemma
# needs to be added.
# Entries are compared against text that clean() has already lowercased and stripped of punctuation, so only
# lowercase, punctuation-free spellings can match: 'qr' and 'qrcode' are listed next to the original
# 'QR' / 'QR-code' spellings for that reason ('wifi' already covers 'Wi-fi').
custom_list = ['app','application','download','mail','email','bluetooth','pair','installation','setup','guide','howto','QR',
               "wifi","Wi-fi",'customize','QR-code','phone','smartphone','website','site','web','desktop','installing',
               'detects','device','devices','username','creating','syncs','downloading','restarts','storing','internet','instal',
               'qr','qrcode']
# Extra tokens dropped by clean() in addition to the NLTK stop words.
custom_stop_list = ['m','rain','may','w']

In [7]:
from nltk.stem import PorterStemmer

# Probe a single word: is the word itself, its Porter stem or its lemma part of the NLTK word list?
word_test = "requires"
porter_test = porter.stem(word_test)
lemma_test = lemma.lemmatize(word_test)
# Fetch the word list once and keep it as a set: every words.words() call returns the whole list, and a
# membership test on a list is a linear scan.
english_vocabulary = set(words.words())
print(word_test + " is an english word : "+str(word_test in english_vocabulary))
print(word_test + " porter's stem is " + porter_test + " ,which is an english word : "+str(porter_test in english_vocabulary))
print(word_test + " lemmatized word is " + lemma_test + " ,which is an english word : "+str(lemma_test in english_vocabulary))

# If at least one of the tests is True, then the word will be kept within the dictionary corresponding to the specific PDF file

# If all tests are False, we may consider adding the Porter stem or the lemma to the custom list if the word can be inflected
# We added "download" because the Porter stemmer reduces "downloading", "downloaded" and "downloads" to "download"
# This is not required for names that do not change, such as first names or product names...

requires is an english word : False
requires porter's stem is requir ,which is an english word : False
requires lemmatized word is requires ,which is an english word : False


In [8]:
# Remove digits, stop words ("the","a","in"...), and punctuation
def clean(doc):
    """Normalise raw text before the dictionary lookup.

    Steps, in order: drop every run of digits, lowercase and split on
    whitespace, discard tokens found in ``stop`` or ``custom_stop_list``,
    re-join with single spaces, then delete every character contained in
    ``exclude`` (punctuation).

    Punctuation is removed after the stop-word filter, so a token such as
    "the," is not recognised as a stop word, and hyphenated tokens are
    collapsed ("wi-fi" becomes "wifi").

    Args:
        doc: the text to clean, as a single string.

    Returns:
        The cleaned text as a single string of space-separated tokens.
    """
    # Requires the ``re`` module to be imported at the top of the notebook.
    without_digits = re.sub(r'\d+', '', doc)
    kept_tokens = [
        token for token in without_digits.lower().split()
        if token not in stop and token not in custom_stop_list
    ]
    stop_free = " ".join(kept_tokens)
    return ''.join(ch for ch in stop_free if ch not in exclude)

# Keep the english words recognized in the dictionary: might miss some words, that is why we keep track of the deleted words
def english_words_only(doc):
    """Keep the words recognised as English (or listed in ``custom_list``).

    Each word is tried as is, then as its Porter stem, then as its lemma;
    the first form found in the NLTK word list or in ``custom_list`` is
    kept. A word for which no form is found is recorded as deleted.

    Args:
        doc: list of word strings (typically the split output of ``clean``).

    Returns:
        A tuple ``(returned_doc, deleted_doc)``. ``returned_doc`` holds the
        kept form of every accepted occurrence, in document order,
        duplicates included. ``deleted_doc`` holds each rejected word once,
        in order of first appearance.
    """
    print("The doc is of length :"+str(len(doc)))

    # Build the lookup set once per call: testing membership against the
    # words.words() list for every token is a linear scan of the whole corpus.
    vocabulary = set(words.words())
    vocabulary.update(custom_list)

    # Tried in this order; the stem and the lemma are only computed when the
    # previous form was not recognised.
    normalisers = (lambda word: word, porter.stem, lemma.lemmatize)

    returned_doc, deleted_doc = [], []
    resolved = {}  # word -> kept form, or None when the word was rejected
    for word in doc:
        if word in resolved:
            kept = resolved[word]
        else:
            kept = None
            for normalise in normalisers:
                candidate = normalise(word)
                if candidate in vocabulary:
                    kept = candidate
                    break
            resolved[word] = kept
            if kept is None:
                # First time this unknown word is met: record it once.
                deleted_doc.append(word)
        if kept is not None:
            returned_doc.append(kept)
    return returned_doc,deleted_doc

In [9]:
# Transform a TXT file to a dictionary using the clean functions
def txt_to_dict(filename):
    """Turn ``manuals_txt/<filename>`` into its kept-word and removed-word files.

    The text is read as UTF-8, passed through ``clean`` and
    ``english_words_only``, and the two resulting lists are written with
    ``str()`` to ``manuals_dict/<stem>_dict.txt`` and
    ``manuals_removed/<stem>_removed.txt``.

    Args:
        filename: name of the text file inside the ``manuals_txt`` folder.

    Returns:
        None. The list sizes and a completion message are printed.
    """
    with open('manuals_txt/'+filename,'r',encoding='utf-8') as source:
        # Each newline becomes a space and the lines are joined with a space.
        new_text = ' '.join(line.replace("\n", " ") for line in source)

    tokens = clean(new_text).split(' ')
    super_cleaned, removed = english_words_only(tokens)
    print(len(super_cleaned),len(removed))

    stem = os.path.splitext(filename)[0]
    outputs = (
        ('manuals_dict/'+stem+'_dict.txt', super_cleaned),
        ('manuals_removed/'+stem+'_removed.txt', removed),
    )
    for target_path, word_list in outputs:
        with open(target_path,'w',encoding='utf-8') as target:
            target.write(str(word_list))
    print("Done : " + str(filename))
    

In [10]:
# Take a look at all the words that have been removed by the previous clean functions
# If some words were removed and should not have been, consider extending the custom_list
def reunite_removed():
    """Merge every removed-word file into ``all_removed_words.txt``.

    Reads the first line of each ``.txt`` file in ``manuals_removed``
    (expected to be the ``str()`` of a list, as written by ``txt_to_dict``),
    collects the distinct words in order of first appearance, writes that
    list to ``all_removed_words.txt`` and prints the total number of words
    read, the number of distinct words, and the distinct words themselves.

    Returns:
        None.

    Raises:
        ValueError or SyntaxError: when a file's first line is not a valid
            Python literal.
    """
    removed_files = [f for f in os.listdir(os.getcwd()+"/manuals_removed") if f.endswith('.txt')]
    all_deleted = []
    seen = set()  # mirrors all_deleted for constant-time duplicate checks
    total_len = 0
    for removed_file in removed_files:
        with open('manuals_removed/'+removed_file,'r',encoding='utf-8') as filehandle:
            # SECURITY: this used to be eval(), which executes whatever code
            # the file contains. literal_eval only accepts Python literals,
            # which is all these files are expected to hold.
            deleted = ast.literal_eval(filehandle.readline())
        total_len = total_len + len(deleted)
        for deleted_word in deleted:
            if deleted_word not in seen:
                seen.add(deleted_word)
                all_deleted.append(deleted_word)
    with open('all_removed_words.txt','w',encoding='utf-8') as filehandle:
        filehandle.write(str(all_deleted))
    print(total_len,len(all_deleted))
    print(all_deleted)

In [11]:
# Run this section to convert every NEW PDF file added to manuals_pdf that has no .txt counterpart yet.
# check_new_works() returns the PDF files without a matching .txt file and the TXT files without a matching
# dictionary file; update_folder_contents() refreshes the notebook-level file lists so that any code relying
# on them sees the current folder contents.
pdf_to_txt_needed, txt_to_dict_needed = check_new_works()
pdf_files, txt_files, dict_files = update_folder_contents()

# Run the conversion for each pending PDF file
for pdf_needed in pdf_to_txt_needed:
    pdf_to_txt(pdf_needed)
    
# Re-check the remaining work and refresh the file lists: pdf_to_txt_needed should now be empty
pdf_to_txt_needed, txt_to_dict_needed = check_new_works()
pdf_files, txt_files, dict_files = update_folder_contents()

In [None]:
# Run this section to convert every NEW TXT file added to manuals_txt that has not been converted to a dictionary yet.
# Be careful: this section might be time consuming, especially with long txt files, due to the use of the NLTK libraries.

# Recompute the pending work and refresh the notebook-level file lists before converting.
pdf_to_txt_needed, txt_to_dict_needed = check_new_works()
pdf_files, txt_files, dict_files = update_folder_contents()

# To test on only a few files, consider reducing the size of txt_to_dict_needed (see the commented examples
# below), and then check the removed words.

# txt_to_dict_needed = txt_to_dict_needed[:10]
# txt_to_dict_needed = ['amazon_echo_1st_gen.txt','Amazon_Echo_Dot_3rdGen_QSG_US_sw.txt','apple_tv_4thgen_sw.txt',
#                      'August_coorbell_cam_sw.txt','Belkin_wemo_switch.txt']
print(str(len(txt_to_dict_needed))+' Documents ready for dict transformation !')
# Convert each pending text file into its kept-word and removed-word files
for txt_needed in txt_to_dict_needed:
    txt_to_dict(txt_needed)
    
# Re-check the remaining work and refresh the file lists after the conversion
pdf_to_txt_needed, txt_to_dict_needed = check_new_works()
pdf_files, txt_files, dict_files = update_folder_contents()

# Merge all the per-file removed-word lists into all_removed_words.txt and print them for review
reunite_removed()