In [32]:
from collections import Counter
from pathlib import Path
import re
import shutil
import subprocess

import cufflinks as cf
import docx
from IPython.display import clear_output
import numpy as np
import pandas as pd
import plotly
from plotly.offline import iplot

import settings

# Number of documents in the corpus after the import step (15 203 files).
COURPUS_SIZE = 15203
# Configure cufflinks and plotly for offline plotting inside the notebook.
cf.go_offline()
plotly.offline.init_notebook_mode(connected=True)

# Obtaining the corpus

As of 2019-02-26, we have a corpus of almost 15,500 files related to the inspection acts. Most of them have the .doc or .docx extension; for our purpose, these are the files we are interested in. Every file should be named exclusively with the inspection act number. This number follows the regex pattern:

    "\d{2}-[A-Z]{2}-\d{4}-\d{5}-[A-Z]{2}"
    
While all the files contain this identifier in their name, they may also contain "noise" information; therefore, the first step is to normalize the file names. We may also have other types of files, such as .xlsx, .pdf or image files. The corpus will only contain files with the extension .doc or .docx, named exclusively with the inspection act number.

In [8]:
def import_corpus_files():
    """Copy the inspection-act Word documents into the corpus directory.

    Asks for interactive confirmation first, because the copy is slow and is
    meant to run only once. Every regular file in
    ``settings.INSPECTION_ACTS_ROUTE`` whose name contains an inspection act
    number and ends in ``.doc`` or ``.docx`` (any letter case) is copied to
    ``settings.CORPUS_DIRECTORY_ROUTE`` under the normalized name
    ``<act number><lower-cased extension>``. Everything else is skipped.

    Returns:
        None. Nothing is copied unless the answer is exactly ``"Y"``.
    """
    confirmation = input("This is a lenghty process that should be called"
                         " just once. Are you sure you want to execute it?"
                         " [Y/N]")
    if confirmation != "Y":
        # Clear the prompt first; clearing after printing would erase the
        # message before the user can read it.
        clear_output()
        print("aborting...")
        return
    destination_directory = Path(settings.CORPUS_DIRECTORY_ROUTE).absolute()
    source_directory = Path(settings.INSPECTION_ACTS_ROUTE)
    # copyfile does not create missing parent directories.
    destination_directory.mkdir(parents=True, exist_ok=True)
    # Both fragments are raw strings so that "\." reaches the regex engine
    # untouched. The act number stays case-sensitive; only the extension is
    # matched case-insensitively, so files such as "X.DOC" are not dropped.
    pattern = re.compile(r".*(\d{2}-[A-Z]{2}-\d{4}-\d{5}-[A-Z]{2})"
                         r".*(\.(?i:docx?))$")
    for file in source_directory.iterdir():
        # A directory whose name happens to match cannot be copied as a file.
        if not file.is_file():
            continue
        match = pattern.match(file.name)
        if match:
            normalized_name = match.group(1) + match.group(2).lower()
            # NOTE: two source files carrying the same act number and
            # extension map to the same target; the later copy overwrites
            # the earlier one.
            shutil.copyfile(file, destination_directory / normalized_name)
# One-off step: asks for confirmation, then copies the matching files into the corpus directory.
import_corpus_files()

After executing this piece of code, we have a corpus of 15 203 documents; almost 7 GB of data.

## Converting the corpus to a readable format

In order to read the files, we are going to use the [python-docx](https://github.com/python-openxml/python-docx) library. Now, we face the problem that this library works exclusively with .docx files.

We need to change all the .doc files in the corpus to .docx files. To achieve this, we are going to use the script ./from_doc_to_docx.vbs

In [2]:
def change_files_extensions():
    """Convert every ``.doc`` file of the corpus with the VBS script.

    For each regular file in ``settings.CORPUS_DIRECTORY_ROUTE`` whose suffix
    is ``.doc`` (any letter case), ``./from_doc_to_docx.vbs`` is run through
    ``wscript.exe`` with the file's absolute path as its only argument. The
    original file is deleted only when the script exits with status 0; files
    whose conversion failed are kept and listed at the end.

    Returns:
        None.
    """
    source_directory = Path(settings.CORPUS_DIRECTORY_ROUTE)
    script_address = str(Path("./from_doc_to_docx.vbs").absolute())
    failed_conversions = []
    # Snapshot the listing first: the loop deletes entries, and iterating a
    # directory while it changes yields unspecified results.
    for file in list(source_directory.iterdir()):
        if not file.is_file() or file.suffix.lower() != ".doc":
            continue
        result = subprocess.run(["wscript.exe", "//B", script_address,
                                 str(file.absolute())])
        # NOTE(review): this relies on the script reporting failure through
        # its exit status -- confirm against from_doc_to_docx.vbs.
        if result.returncode == 0:
            file.unlink()
        else:
            # Keep the source document so a failed conversion loses no data.
            failed_conversions.append(file.name)
    if failed_conversions:
        print("Conversion failed, original kept for: "
              + ", ".join(sorted(failed_conversions)))
# Runs the VBS conversion on every .doc file found in the corpus directory.
change_files_extensions()

## Examining the corpus



Now we have some questions: Which words are the most frequent? What is their distribution?

It'd be a good idea to start exploring the corpus. In order to read the files, we are going to use the [docx library](https://python-docx.readthedocs.io/en/latest/). The files may contain tables; however, for now we are going to focus on the paragraphs (titles and bullets included).

Since the corpus is too big, we are going to take a sample of 800 documents (around 5% of the whole corpus) chosen randomly:

In [30]:
def retrive_document_text(file):
    """Return the text of all paragraphs of a document as a single string.

    Args:
        file: The document to open; passed straight to ``docx.Document``.

    Returns:
        The ``text`` of each entry in the document's ``paragraphs``, in
        order, joined with single spaces.
    """
    paragraphs = docx.Document(file).paragraphs
    return " ".join(paragraph.text for paragraph in paragraphs)

# Sampling parameters, fixed so the exploration can be reproduced.
SAMPLE_SIZE = 800
SAMPLE_SEED = 0

corpus_directory = Path("./data/corpus")
# Sorted so that a given seed always selects the same files; restricted to
# .docx because that is the only format python-docx can open.
files_collection = sorted(
    file for file in corpus_directory.iterdir()
    if file.is_file() and file.suffix.lower() == ".docx"
)

# A truncated permutation draws distinct documents (no file is counted twice)
# and is bounded by the real number of files rather than a hard-coded size.
sample_rng = np.random.RandomState(SAMPLE_SEED)
sample_indexes = sample_rng.permutation(len(files_collection))[:SAMPLE_SIZE]

# Token -> number of occurrences across the sampled documents. Tokens are
# whitespace-delimited, so punctuation stays attached to them.
raw_corpus_dict = Counter()
for sample_index in sample_indexes:
    document_text = retrive_document_text(files_collection[sample_index])
    raw_corpus_dict.update(document_text.split())

# Summary only: printing the whole mapping would flood the cell output.
print(f"{len(sample_indexes)} documents sampled, "
      f"{len(raw_corpus_dict)} distinct tokens, "
      f"{sum(raw_corpus_dict.values())} tokens in total")



In [36]:
# Frequency table of the sampled tokens, ordered from most to least frequent.
corpus_series = pd.Series(raw_corpus_dict).sort_values(ascending=False)
# Show the 100 most frequent tokens.
corpus_series.iloc[:100]

de              100570
la               43591
en               36334
y                32721
el               28971
que              27001
con              24935
se               23981
del              20394
los              17089
a                16336
DE               15498
para             12970
por               9476
no                8241
las               7723
Se                7465
o                 6442
verificación      5996
su                5843
(                 5560
al                5538
muestra           5419
visita            5187
)                 5183
sanitaria         5038
Y                 5015
lo                4730
C.                4711
LA                4597
                 ...  
El                2224
interesado        2194
carácter          2156
fecha             2123
productos         2102
CON               2093
horas             2030
original          2029
análisis          1979
La                1979
Sanitaria,        1961
alcance           1938
ante       

In [41]:
# Bar chart of the 50 most frequent tokens; title and axis labels let the
# figure be read on its own.
corpus_series.iloc[:50].iplot(
    kind="bar",
    title="50 most frequent tokens in the sampled documents",
    xTitle="Token",
    yTitle="Occurrences",
)



In [21]:
# Scratch check: whitespace splitting leaves punctuation attached to the tokens.
"a, b, c".split()


['a,', 'b,', 'c']

In [26]:
# Reference: with no `sep`, any whitespace run is a separator and empty strings are dropped.
help(str.split)

Help on method_descriptor:

split(...)
    S.split(sep=None, maxsplit=-1) -> list of strings
    
    Return a list of the words in S, using sep as the
    delimiter string.  If maxsplit is given, at most maxsplit
    splits are done. If sep is not specified or is None, any
    whitespace string is a separator and empty strings are
    removed from the result.

