In [39]:
import joblib
import numpy as np
import pandas as pd
import string
import re

from xml.etree import ElementTree as ET
from xml.etree.ElementTree import ParseError
from pathlib import Path
from zipfile import ZipFile

In [2]:
from annoy import AnnoyIndex

In [27]:
file = Path('..', 'data', '2022_PIT_TodosArquivos.zip')

assert file.exists()

zf = ZipFile(file)

files_licitacao = [zf.open(f) for f in zf.namelist() if 'Licitacao' in f]

len(files_licitacao)

399

In [28]:
files_licitacao[20]

<zipfile.ZipExtFile name='2022_410170_Licitacao.zip' mode='r' compress_type=deflate>

In [29]:
def get_data(folder : Path):
    all_data = list()
    for f in folder.glob('*LicitacaoVencedor.xml'):
      try:
        tree = ET.parse(f)
        root = tree.getroot()
        print(f, end='\r')
        all_data.extend([{
            'idlicitacao' : child.attrib['idlicitacao'],
            'nrLote' : child.attrib['nrLote'],
            'nrItem' : child.attrib['nrItem'],
            'dsItem' : child.attrib['dsItem'],
            } for child in root])
      except ParseError as error:
        print(error, f)

    return pd.DataFrame(all_data)

In [34]:
data = list()
for file in files_licitacao:
    licitacao_vencedor = [f for f in ZipFile(file).namelist() if 'LicitacaoVencedor' in f]
    if len(licitacao_vencedor) == 1:
        file_licitacao_vencedor = ZipFile(file).open(licitacao_vencedor[0])
        try:
            tree = ET.parse(file_licitacao_vencedor)
            root = tree.getroot()
            data.extend([{
                    'idlicitacao' : child.attrib['idlicitacao'],
                    'nrLote' : child.attrib['nrLote'],
                    'nrItem' : child.attrib['nrItem'],
                    'dsItem' : child.attrib['dsItem'],
                    } for child in root])
        except ParseError as error:
            print(error, file)
    

In [35]:
df = pd.DataFrame(data)

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3587391 entries, 0 to 3587390
Data columns (total 4 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   idlicitacao  object
 1   nrLote       object
 2   nrItem       object
 3   dsItem       object
dtypes: object(4)
memory usage: 109.5+ MB


In [40]:
df['formatted'] = (df['dsItem']
                    .apply(lambda t: t.lower() )
                    .apply(lambda text :  text.translate(str.maketrans('áàãâäéèêëóòõôöíìîïúùüç', 'aaaaaeeeeoooooiiiiuuuc')))
                    .apply(lambda text : text.translate(str.maketrans(string.punctuation, len(string.punctuation) * ' ', 'ªº°')) )
                    .apply(lambda t : re.sub('\s+', ' ', t))
                    .apply(lambda t : re.sub('\s+pct\s{0,}', ' pacote ', t) )
                    .apply(lambda t : re.sub('\s+cx\s{0,}', ' caixa ', t) )
                    .apply(lambda t : re.sub('\s+uni\s{1,}|\s+und\s{0,}|\s+unds\s{0,}', ' unidade ', t) )
                    .apply(lambda t : re.sub('\s+c/', ' com', t) )
                    .apply(lambda t : re.sub('\s+p/', ' para', t) )
                    .apply(lambda t : re.sub('(\d{1,})\s{0,}litros{0,}|(\d{1,})\s{0,}lts{0,}', r'\1_litro', t))
                    .apply(lambda t : re.sub('(\d{1,})\s{0,}l', r'\1_litro', t))
                    .apply(lambda t : re.sub('(\d{1,})\s{0,}gramas{0,}|(\d{1,})\s{0,}gr\s{1,}', r'\1_grama', t))
                    .apply(lambda t : re.sub('(\d+)\s{0,}grs', r'\1_grama', t))
                    .apply(lambda t : re.sub('(\d{1,})\s{0,}kg|(\d{1,})\s{0,}kgs', r'\1_quilo', t))
                    .apply(lambda t : re.sub('(\d{1,})\s{0,}ml ', r'\1_ml', t) )
                    .apply(lambda t : re.sub('^\d+', '', t))
                    .apply(lambda t : re.sub('\s+\d+$', ' codigo', t) )
                )

In [42]:
df.head()

Unnamed: 0,idlicitacao,nrLote,nrItem,dsItem,formatted
0,1837502,48,1,MILHO BRANCO PARA CANJICA 500GR,milho branco para canjica 500gr
1,1837502,49,1,MILHO PARA PIPOCA 500 GR,milho para pipoca 500 gr
2,1837502,50,1,MILHO VERDE EM CONSERVA 200 G,milho verde em conserva 200 g
3,1837502,54,1,OVOS BANDEJA C 30 UNS,ovos bandeja c 30 uns
4,1837502,56,1,PEITO DE FRANGO SEM PELE E SEM OSSO,peito de frango sem pele e sem osso


In [41]:
tfidf_model = joblib.load(Path('models', 'tfidf_model.pkl'))

In [45]:
item = ['caneta esferografica azul']

vetor = tfidf_model.transform(item).toarray()

vetor.shape

(1, 334407)

In [None]:
dim = 334407
search_index = AnnoyIndex(dim, 'angular')

for i, row in enumerate(df.itertuples()):
    text = [row.formatted]
    vetor = tfidf_model.transform(text).toarray()
    vetor = vetor.reshape(-1)
    search_index.add_item(i, vetor)

In [None]:
search_index.build(10)
search_index.save('search_index.ann')

In [None]:
exemplo = [ 'pasta classificadora lombo regulavel grampo' ]
vetor = tfidf_model.transform(exemplo).toarray()

In [None]:
search_index.get_nns_by_vector(vetor, 10)