In this notebook we will massage the data, removing inconsistencies and making it easier for
clustering algorithms to use.

In [1]:
import os
from glob import escape, glob

# Recursively collect every .txt path below the extracted archives.
# The directory part is escaped so that folder names containing glob
# metacharacters ('[', ']', '*', '?') are matched literally instead of being
# interpreted as patterns.
result = [
    path
    for dirpath, _dirnames, _filenames in os.walk("../raw/processed")
    for path in glob(os.path.join(escape(dirpath), '*.txt'))
]
print(result[0])

# Remove all files called "logFechamento.txt". The file name itself is compared
# (not just the end of the path) so that a file such as "catalogFechamento.txt"
# is not discarded by accident.
list_of_files = [item for item in result if os.path.basename(item) != "logFechamento.txt"]
print(list_of_files[0])

../raw/processed\0103DOM.zip\logFechamento.txt
../raw/processed\0103DOM.zip\AACAADM.0\Publicacao\AACAADM.0839.txt


In [2]:
# The text files are recorded with arbitrary encoding, so we need to cater for that.
import io

def read_hostile_text(path_txt, encodings=('utf-8', 'utf_16', 'latin_1', 'cp1250')):
    """Read a text file whose encoding is not known in advance.

    Each candidate encoding is tried in order and the text produced by the
    first one that decodes the whole file without error is returned.

    The order of the default candidates matters:

    * ``utf-8`` is strict, so it rejects most non-UTF-8 files.
    * ``utf_16`` is tried next; when a file is opened in text mode this codec
      raises ``UnicodeError`` unless the data starts with a byte-order mark,
      so it only claims files that really are BOM-prefixed UTF-16.
    * ``latin_1`` maps every possible byte to a character and therefore never
      fails. It acts as the catch-all, which means any encoding listed after
      it (``cp1250`` here) is only reached if the caller passes a list that
      omits ``latin_1``.

    Args:
        path_txt: Path of the file to read.
        encodings: Iterable of codec names to try, in order of preference.

    Returns:
        The decoded contents of the file, or ``None`` (after printing a
        message) if no candidate encoding could decode it.
    """
    for encoding in encodings:
        try:
            # The context manager closes the handle on success and on any error.
            with io.open(path_txt, "r", encoding=encoding) as file:
                return file.read()
        except UnicodeError:
            # Covers UnicodeDecodeError as well as the plain UnicodeError that
            # the utf_16 codec raises for a stream without a BOM.
            continue
    print('Could not decode', path_txt)
    return None

In [7]:
# Sanity check: decode one arbitrary file and inspect its contents by eye.
sample_path = list_of_files[101]
text = read_hostile_text(sample_path)
print(text)

010306 Publicacao 
((TITULO))COMISSÃO ESPECIAL DE ACOMPANHAMENTO DO PROGRAMA DE DESPESAS PARA APERFEIÇOAMENTO DOS INTEGRANTES DA CARREIRA DE PROCURADOR DO MUNICÍPIO DE SÃO PAULO

((TEXTO))((NG))ATA DA 7ª REUNIÃO ORDINÁRIA DA COMISSÃO ESPECIAL DE ACOMPANHAMENTO DO PROGRAMA DE DESPESAS PARA APERFEIÇOAMENTO DOS INTEGRANTES DA CARREIRA DE PROCURADOR DO MUNICÍPIO DE SÃO PAULO.((CL)) Aos vinte e três dias do mês de fevereiro de dois mil e dezesseis, às catorze horas, na sala de reuniões do gabinete da PGM, reuniu-se a Comissão Especial de Acompanhamento do Programa de Despesas para Aperfeiçoamento dos Integrantes da Carreira de Procurador do Município, sob a Presidência de LUCIANA SANT´ANA NARDI. Presentes os procuradores municipais DANILO DE ARRUDA G. PAIVA, CAYO CÉSAR CARLUCCI COELHO, DANIEL COLOMBO BRAGA, TATIANA ROBLES SEFERJAN, FELIPE RIGUEIRO NETO, FERNANDA DUTRA DRIGO DE ALMEIDA, LILIAN DAL MOLIN SCIASCIO, JERRY JACKSON FEITOSA e CARLA DAMAS DE PAULA RIBEIRO. Instalada a sessão, infor

In [4]:
# Decode every remaining file, in order: documents[i] is the text read from
# list_of_files[i] (or None when read_hostile_text could not decode it).
documents = [read_hostile_text(filename) for filename in list_of_files]

In [34]:
# Create a sub-list only with tenders: documents whose characters 7-9 are 'Lic'
# (the header line looks like '010306 Publicacao', so the section name starts
# at index 7; 'Lic' presumably abbreviates the tender section - TODO confirm).
# read_hostile_text() returns None for files it could not decode, so those
# entries are skipped instead of raising TypeError on the slice.
tenders = [record for record in documents if record is not None and record[7:10] == 'Lic']

In [36]:
# Number of documents that were classified as tenders.
len(tenders)

57871

In [59]:
import re

# Collected matches, one [index of the tender in `tenders`, matched text] pair each.
addresses = []

# Matches "Rua|Avenida|Alameda <single word>, <digits>" or "..., nº <digits>".
# NOTE(review): `\w*` only covers one-word street names and `nº \d*` also
# matches when no digits follow (e.g. 'Rua Lisboa, nº '), so some hits are
# incomplete addresses - the pattern is kept as is; tighten it before geocoding.
address_pattern = re.compile(r'(Rua \w*, \d+)|(Rua \w*, nº \d*)|(Avenida \w*, \d+)|(Avenida \w*, nº \d*)|(Alameda \w*, \d+)|(Alameda \w*, nº \d*)')

# enumerate() yields the position directly. The previous tenders.index(tender)
# rescanned the whole list for every match and returned the FIRST tender with
# equal text, i.e. the wrong index whenever two tenders are identical.
for tender_index, tender in enumerate(tenders):
    # findall() returns one tuple per match with a slot per alternative;
    # exactly one slot is non-empty.
    for groups in address_pattern.findall(tender):
        for item in groups:
            if item != '':
                addresses.append([tender_index, item])
                print(tender_index, item)

2 Rua Augusta, 435
3 Rua Augusta, 435
265 Avenida Guarapiranga, 1695
341 Alameda Iraé, 35
461 Rua Augusta, 35
462 Rua Augusta, 35
463 Rua Augusta, 435
464 Rua Augusta, 435
468 Rua Lisboa, nº 
515 Avenida Guarapiranga, 1695
516 Avenida Guarapiranga, 1695
517 Avenida Guarapiranga, 1
578 Alameda Iraé, 35
595 Rua Cajuru, 362
628 Rua Capricho, nº 872
1189 Rua Aurélia, 996
1189 Rua Aurélia, 996
1402 Rua Finlandia, 120
1587 Rua Campantes, 100
1603 Rua Florianópolis, 184
1845 Rua Bresser, 2
1845 Rua Bresser, nº 2572
1845 Rua Bresser, 2
1845 Rua Bresser, 2
1845 Rua Bresser, 2
1845 Rua Bresser, 2
1845 Rua Bresser, 2
1845 Rua Bresser, 2
1845 Rua Bresser, 2
1845 Rua Bresser, 2
1845 Rua Bresser, 2
1845 Rua Bresser, 2
1845 Rua Bresser, 2
1845 Rua Bresser, 2
1845 Rua Bresser, 2
1845 Rua Bresser, 2
1845 Rua Bresser, 2
1845 Rua Bresser, 2
1845 Rua Bresser, 2
1845 Rua Bresser, 2
1845 Rua Bresser, 2
1845 Rua Bresser, 2
1845 Rua Bresser, 2
1845 Rua Bresser, 2
1845 Rua Bresser, 2
1845 Rua Bresser, 2
1845 R

In [60]:
# Total number of [tender index, address] matches collected in `addresses`.
len(addresses)

1270

In [61]:
import os

import googlemaps
from datetime import datetime

# Never hardcode credentials in a notebook: the API key is read from the
# GOOGLE_MAPS_API_KEY environment variable (a KeyError here means it is unset).
# The key that used to be embedded in this cell has been exposed and should be
# revoked in the Google Cloud console.
gmaps = googlemaps.Client(key=os.environ['GOOGLE_MAPS_API_KEY'])

# Geocoding an address
geocode_result = gmaps.geocode('1600 Amphitheatre Parkway, Mountain View, CA')

ImportError: No module named 'googlemaps'

In [20]:
# NOTE(review): `tokenize` is the standard-library tokenizer for *Python source
# code*, not for natural-language text. Feeding it a document works only until
# the text stops looking like valid Python (e.g. unbalanced brackets), which is
# what the TokenError recorded in the next cell's output shows. A text-oriented
# split (str.split / re) is probably what is wanted here - TODO confirm intent.
from tokenize import tokenize
from io import BytesIO
g = tokenize(BytesIO(documents[0].encode('utf-8')).readline)  # lazy generator of TokenInfo tuples for the first document

In [21]:
# `g` is a generator, so tokenizing happens lazily while iterating: any
# tokenizer error is raised from this loop, after the tokens already printed.
for x in g:
    print(x)

TokenInfo(type=59 (ENCODING), string='utf-8', start=(0, 0), end=(0, 0), line='')
TokenInfo(type=2 (NUMBER), string='0', start=(1, 0), end=(1, 1), line='010306 Publicacao \n')
TokenInfo(type=2 (NUMBER), string='10306', start=(1, 1), end=(1, 6), line='010306 Publicacao \n')
TokenInfo(type=1 (NAME), string='Publicacao', start=(1, 7), end=(1, 17), line='010306 Publicacao \n')
TokenInfo(type=4 (NEWLINE), string='\n', start=(1, 18), end=(1, 19), line='010306 Publicacao \n')
TokenInfo(type=58 (NL), string='\n', start=(2, 0), end=(2, 1), line='\n')
TokenInfo(type=58 (NL), string='\n', start=(3, 0), end=(3, 1), line='\n')
TokenInfo(type=53 (OP), string='(', start=(4, 0), end=(4, 1), line='((TITULO))DECRETO  Nº 56.839,  DE  29  DE  FEVEREIRO  DE  2016\n')
TokenInfo(type=53 (OP), string='(', start=(4, 1), end=(4, 2), line='((TITULO))DECRETO  Nº 56.839,  DE  29  DE  FEVEREIRO  DE  2016\n')
TokenInfo(type=1 (NAME), string='TITULO', start=(4, 2), end=(4, 8), line='((TITULO))DECRETO  Nº 56.839,  DE  

TokenError: ('EOF in multi-line statement', (53, 0))

We now start the process of cleaning the data. This collection is particularly dirty, with many inconsistencies.
We will create several small helper functions, each dedicated to cleaning one aspect of the data.

In [None]:
def token_is_not_digit(token):
    """Tell whether ``token`` should be kept because it is not a pure number.

    Returns False only when ``str.isdigit`` is true for the token, i.e. it is
    non-empty and every character is a digit; True otherwise (including for
    the empty string).
    """
    if token.isdigit():
        return False
    return True

In [None]:
# Tokens are obtained by splitting each document on a single space (' '),
# which will not always be the right separator...
# TODO deal with colons, semicolons, \x96, brackets
# TODO deal with the special case where a word is spelled with spaces between each letter (e.g. 'D E C R E T A')
# Iterating a str directly yields individual characters, not space-separated
# tokens, so the document is split explicitly and the surviving tokens are
# re-joined with the same separator (joining with '' would glue words together).
# NOTE(review): token_is_valid is not defined in the visible part of this
# notebook (only token_is_not_digit is) - define it before running this cell.
documents = [
    ' '.join(token for token in document.split(' ') if token_is_valid(token))
    for document in documents
]