In [1]:
from pathlib import Path
import settings
import shutil
import numpy as np
import re
from IPython.display import clear_output
import docx
import subprocess

# Obtaining the corpus

Up until the day 2019-02-26, we have a corpus of almost 15500 files related to the inspection acts. Most of the have the .doc or .docx extension. For our purpose, these are the files we are interested on. Every file should be named exclusively with the inspection act number. This number is built with the following regex pattern:

    "\d{2}-[A-Z]{2}-\d{4}-\d{5}-[A-Z]{2}"
    
While all the files contain this identifier in their name, they may also contain "noise" information; therefore, the first step is to normalize the name of the files. We may also have  other type of files  as .xlsx, .pdf or image files. The corpus will only have files with extension .doc or . docx, named exclusively as the inspection act number.

In [8]:
def import_corpus_files():
    confirmation = input("This is a lenghty process that should be called"\
                        " just once. Are you sure you want to execute it?"\
                        " [Y/N]")
    if confirmation != "Y": 
        print("aborting...")
        clear_output()
        return
    destination_directory = Path(settings.CORPUS_DIRECTORY_ROUTE)
    source_directory = Path(settings.INSPECTION_ACTS_ROUTE)
    pattern = re.compile(r".*(\d{2}-[A-Z]{2}-\d{4}-\d{5}-[A-Z]{2})"\
                         ".*(\.(?:doc|docx))$")
    for file in source_directory.iterdir():
        match = pattern.match(file.name)
        if match:        
            shutil.copyfile(file, destination_directory.absolute()\
                    / Path(match.group(1) + match.group(2))) 
import_corpus_files()

## Converting the corpus to a readable format

In order to read the files, we are going to use the [python-docx](https://github.com/python-openxml/python-docx) library. Now, we face the problem that this library works exclusively with .docx files.

We need to change all the .doc files in the corpus to .docx files. To achieve this, we are going to use the script ./from_doc_to_docx.vbs

In [None]:
def change_files_extensions():
    source_directory = Path(settings.INSPECTION_ACTS_ROUTE)
    script_address = str(Path("./from_doc_to_docx.vbs").absolute())
    for file in source_directory.iterdir():        
        if file.suffix == ".doc":
            subprocess.run(["wscript.exe", script_address, 
                            str(file.absolute())])
            file.unlink()
change_files_extensions()

After executing this piece of code, we have a corpus of 15 203 documents; almost 7 GB of data.

## Examining the corpus

Now, we have some questions: What words are more frequent? What is their distribution?. It'd be a good idea to start exploring the corpus. Since it is toot big, we are going to take a sample of 800 documents (around 5% of all the corpus) chosen randomly:

In [23]:
from pathlib import Path


np.random.uniform
random_corpus = 

D:\projects\acts_preprocessing_with_nlp\data\corpus\c.txt


In [10]:
from pathlib import Path

corpus_directory = Path("./data/corpus")
files_collection = [file for file in corpus_directory.iterdir()]
files_collection[0:5]

[WindowsPath('data/corpus/13-MF-3309-05093-DD.docx'),
 WindowsPath('data/corpus/14-AF-3301-03251-CV.doc'),
 WindowsPath('data/corpus/14-AF-3301-03252-CV.doc'),
 WindowsPath('data/corpus/14-AF-3301-03253-CV.doc'),
 WindowsPath('data/corpus/14-AF-3301-04554-IV.doc')]

In [13]:
files_collection[0].read_text(encoding="UTF-8")[0:1000]

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xae in position 14: invalid start byte

In [None]:
help(file.iterdir)