## Natural Language Processing (NLP) Application to Articles

This project demonstrates how NLP can be applied to energy-industry articles stored as PDF files. <br>

### Part 1: Preprocessing

This part describes the preprocessing steps: conversion from PDF to text, tokenization, and deletion of duplicate files.

### Import Libraries

In [1]:
import os
import re
import copy
import string
import hashlib
import operator
import pandas as pd
import numpy as np
from collections import OrderedDict
import datetime
from dateutil import parser
from calendar import month_name
import PyPDF2
from english_spelling import replace_gb2us
import nltk
_ = nltk.download('wordnet', quiet=True)
from nltk.stem.wordnet import WordNetLemmatizer

### Functions for text preprocessing

In [2]:
def preprocessing(text_list):
    '''
    Convert a list of strings into a single body text.

    Lines containing boilerplate markers (page numbers, section headings)
    and lines of one character or less are dropped, apostrophes are removed,
    and everything from the 'Client Helpdesk' line onwards is cut off.

    Returns a tuple (body_text, flag) where flag is 1 when a
    'Client Helpdesk' line was found and removed, otherwise 0.
    '''

    # In case text.split('\n') did not work (few, very long lines),
    # fall back to a list of words. The emptiness guard keeps max()
    # from raising on an empty list.
    if text_list and len(text_list) < 35 and len(max(text_list, key=len)) > 100:
        text_list = ' '.join(text_list).split()

    # Useless word list
    exclude = ['Page', 'Insight - ', 'Executive summary', 'Key take-aways:', 'Summary', 'INSIGHT']
    kept_lines = []
    for line in text_list:
        if len(line) <= 1:
            continue
        if any(marker in line for marker in exclude):
            continue
        kept_lines.append(line.replace("'", ''))

    # Remove Help Desk Information. The flag is raised only once the marker
    # has actually been located; list.index raises ValueError otherwise.
    flag = 0
    try:
        idx_ch = kept_lines.index('Client Helpdesk')
    except ValueError:
        pass
    else:
        flag = 1
        kept_lines = kept_lines[:idx_ch]

    body_text = ' '.join(kept_lines)
    return body_text, flag

In [3]:
def del_some_chars(body_text):
    ''' Delete punctuation and symbol characters from the body text '''
    # The text is filtered one character at a time, so only single characters
    # can ever be removed; a double hyphen disappears because each '-' does.
    char_exclude = ',.-+!?:;"()][@^*><`%$/'
    # One translation table, built once per call, replaces the per-character
    # set construction and strips everything in a single pass.
    return body_text.translate(str.maketrans('', '', char_exclude))

In [4]:
def del_non_ascii(body_text):
    ''' Keep only characters in string.printable, then squeeze runs of spaces '''
    allowed = set(string.printable)
    kept_chars = [ch for ch in body_text if ch in allowed]
    # Dropped characters can leave several spaces in a row; collapse them to one
    return re.sub(' +', ' ', ''.join(kept_chars))

In [5]:
def appleOrange(body_text):
    '''
    Split fused words such as 'appleOrange' into 'apple Orange'.

    For every space-separated word, the last capitalised run
    (an upper-case letter followed by lower-case letters) is moved to the
    end of the word, separated by a space. Words with no capitalised run,
    or that consist only of that run, are left untouched.
    '''
    capitalised = re.compile('([A-Z][a-z]+)')
    word_list = body_text.split(' ')
    for i, word in enumerate(word_list):
        runs = capitalised.findall(word)
        if not runs:
            continue
        tail = runs[-1]
        head = word.replace(tail, '')
        # Only split when something remains in front of the capitalised run;
        # otherwise the word would just gain a stray leading space.
        if head:
            word_list[i] = head + ' ' + tail
    # No leading space is introduced above, so nothing has to be sliced off
    # the front (slicing unconditionally could eat the first real character).
    return ' '.join(word_list)

In [6]:
def date_finder(body_text):
    '''
    Extract the issue date from the beginning of the body text.

    Only the first 100 characters are searched for a month name
    (case-insensitive) and a four-digit year. Every occurrence of
    '<month> <year> ' is then removed from the body text.

    Returns a tuple (body_text, ymd_str) where ymd_str is formatted as
    'YYYY/MM/DD' with the day set to the first of the month.
    Raises ValueError when no month or year is found, or when the two
    cannot be parsed as a date.
    '''

    header = body_text[0:100]

    # Extract Month
    month_pattern = '|'.join(month_name[1:])
    month_match = re.search(month_pattern, header, re.IGNORECASE)

    # Extract Year
    year_match = re.search(r'\d{4}', header)

    # Fail with a clear message instead of dereferencing None
    if month_match is None or year_match is None:
        raise ValueError('No month and year found in the first 100 characters of the text')

    month = month_match.group(0)
    year = year_match.group()
    date = datetime.datetime.strptime(' '.join([month, year]), '%B %Y')
    ymd_str = date.strftime('%Y/%m/%d')

    body_text = body_text.replace(month + ' ' + year + ' ', '')
    return body_text, ymd_str

In [7]:
def tokenizer(body_text):
    '''
    Lemmatize every word and drop unwanted tokens.

    Each space-separated word is lemmatized as a verb, then as a noun,
    an adjective and an adverb. Words that contain a digit, and words
    whose length is not between 2 and 9 characters, are removed.
    Returns the remaining words joined by single spaces.
    '''
    # One lemmatizer instance is shared by all words instead of creating
    # a new one for every word and every part of speech.
    lemmatizer = WordNetLemmatizer()
    pos_tags = ('v', 'n', 'a', 'r')  # verb, noun, adjective, adverb

    word_list = []
    for word in body_text.split(' '):
        for pos in pos_tags:
            word = lemmatizer.lemmatize(word, pos)
        if re.search(r'\d', word):       # Remove words that contain a number
            continue
        if 1 < len(word) < 10:           # Remove too short/long words
            word_list.append(word)
    return ' '.join(word_list)

In [8]:
def pdf_parser(path_file):
    '''
    Read a PDF file and return its cleaned body text.

    The text of all pages is concatenated and then passed once through the
    cleaning steps (boilerplate removal, symbol removal, date extraction,
    non-printable character removal, fused-word splitting, lower-casing,
    British-to-American spelling, lemmatization).

    Returns a tuple (body_text, ymd_str, flag): the cleaned text, the issue
    date as 'YYYY/MM/DD', and the flag returned by preprocessing().
    '''
    page_texts = []
    # The context manager closes the file even if text extraction fails
    with open(path_file, 'rb') as pdf_file:
        pdfReader = PyPDF2.PdfFileReader(pdf_file)  # Read a PDF file
        for i in range(pdfReader.numPages):
            # Extract text from a page object
            text_tmp = pdfReader.getPage(i).extractText()
            if i > 0:
                # The first line of every later page is dropped
                # (presumably a repeated page header - TODO confirm)
                text_tmp = '\n'.join(text_tmp.split('\n')[1:])
            page_texts.append(text_tmp)
    text = ''.join(page_texts)

    # Clean the complete document once, after all pages have been collected
    text_list = text.split('\n')
    body_text, flag = preprocessing(text_list)   # Preprocessing from text list
    body_text = del_some_chars(body_text)        # Delete some characters
    body_text, ymd_str = date_finder(body_text)  # Extract Date
    body_text = del_non_ascii(body_text)         # Delete non-ascii characters
    body_text = appleOrange(body_text)           # Split overlapping word into single words
    body_text = body_text.lower()                # Convert all the text into lower case
    body_text = replace_gb2us(body_text)         # Replace British English with American English
    body_text = tokenizer(body_text)             # Tokenizer
    return body_text, ymd_str, flag

### Read PDF files, pre-process text files, and save them.

In [9]:
def _group_and_subgroup(root):
    ''' Return the last two components of a folder path, split on backslash or forward slash '''
    parts = re.split(r'[\\/]', root)
    if len(parts) < 2:
        # A single-component path has no parent folder name to use as Group
        parts = [''] + parts
    return parts[-2], parts[-1]


def preprocess_pdfs(path_input, path_output, summary_path='./article_summary.csv'):
    '''
    Load PDF files, pre-process their text and save the results.

    Every '.pdf' file found under path_input is parsed with pdf_parser()
    and written to path_output as
    '<Group>_<SubGroup>_<index>_<YYYY.MM.DD>.txt', where Group and SubGroup
    are the last two components of the folder that contains the file.
    Files that cannot be parsed are written empty with the date
    '0000/00/00'. A summary table of all articles is saved to summary_path.
    '''
    # Make sure the output folder exists before any file is written
    if path_output:
        os.makedirs(path_output, exist_ok=True)

    article_list = []
    for root, dirs, files in os.walk(path_input):
        Group, SubGroup = _group_and_subgroup(root)
        idx = 0
        for pdf_name in files:
            if not pdf_name.endswith('.pdf'):
                continue

            # Extract body text and Issue Date
            path_file = os.path.join(root, pdf_name)
            try:
                body_text, ymd_str, flag = pdf_parser(path_file)
            except Exception as exc:
                # Best effort: keep going, but report which file failed
                print('Could not parse %s: %s' % (path_file, exc))
                body_text, ymd_str, flag = '', '0000/00/00', 0

            # Record Article Information and append it in a dictionary
            Title = pdf_name.replace('.pdf', '')
            dict_add = OrderedDict({'Group': Group, 'SubGroup': SubGroup, 'Date': ymd_str, 'Title': Title,
                                    'Length Body Text': len(body_text)})
            article_list.append(dict_add)

            idx += 1
            filename = Group + '_' + SubGroup + '_' + str(idx) + '_' + ymd_str.replace('/', '.') + '.txt'
            path_output_file = os.path.join(path_output, filename)
            with open(path_output_file, 'w') as out_file:
                out_file.write(body_text)

    # Save Preprocessing Results
    df = pd.DataFrame(article_list)
    df.to_csv(summary_path, index=False)

In [10]:
# Root folder that is walked for '.pdf' files (backslash-separated path)
path_input = '.\\Reports_PDF'
# Folder that receives one pre-processed '.txt' file per PDF
path_output = './articles_text'
# Convert every PDF under path_input and write the article summary CSV
preprocess_pdfs(path_input,path_output)

<br>
<br>
<br>

### Delete duplicated files
Reference: https://www.pythoncentral.io/finding-duplicate-files-with-python/

In [11]:
def findDup(parentFolder):
    ''' Walk parentFolder and group file paths by content hash: {hash: [paths]} '''
    paths_by_hash = {}
    for folder, _subfolders, names in os.walk(parentFolder):
        for name in names:
            full_path = os.path.join(folder, name)
            # Start a new list for an unseen hash, then record this path under it
            paths_by_hash.setdefault(hashfile(full_path), []).append(full_path)
    return paths_by_hash

In [12]:
def joinDicts(dict1, dict2):
    ''' Merge dict2 into dict1 in place, concatenating the values of shared keys '''
    for key, incoming in dict2.items():
        if key not in dict1:
            # New key: take the value over as is
            dict1[key] = incoming
            continue
        # Shared key: build a new combined value
        dict1[key] = dict1[key] + incoming

In [13]:
def hashfile(path, blocksize = 65536):
    '''
    Return the MD5 hex digest of the file at path.

    The file is read in chunks of blocksize bytes so that large files
    do not have to be held in memory at once.
    '''
    hasher = hashlib.md5()
    # The context manager closes the file even if reading raises
    with open(path, 'rb') as afile:
        # iter() with a sentinel keeps reading until an empty chunk signals EOF
        for buf in iter(lambda: afile.read(blocksize), b''):
            hasher.update(buf)
    return hasher.hexdigest()

In [14]:
def deleteDups(dict1):
    ''' Delete every file after the first in each group of paths sharing a hash '''
    duplicate_groups = [paths for paths in dict1.values() if len(paths) > 1]
    if not duplicate_groups:
        print('No duplicate files found.')
        return
    for paths in duplicate_groups:
        # The first path of each group is kept; the rest are removed
        for extra_path in paths[1:]:
            os.remove(extra_path)

In [15]:
# Remove duplicated text files from the output folder:
# files are grouped by content hash and only the first file of each group is kept.
dups = {}
folder = './articles_text/'
if os.path.exists(folder):
    # Find the duplicated files and append them to the dups
    joinDicts(dups, findDup(folder))
else:
    print('%s is not a valid path, please verify' % folder)
    # Same effect as sys.exit(), without needing the sys module in scope
    raise SystemExit

deleteDups(dups)