In [None]:
!pip install numpy nltk matplotlib pdfplumber sklearn pdf2image pytesseract Image gensim
!apt install tesseract-ocr tesseract-ocr-por poppler-utils

In [None]:
# Initializing helper functions

from pdf2image import convert_from_path as cPdf2img
import pdfplumber
from PIL import Image
from pytesseract import image_to_string as img2str
import re

#################################################
# Header written before every page: a 100-dash rule, "Page N", another rule.
page_id = '\n'.join(['-'*100, 'Page {}', '-'*100, ''])

# Clean-up substitutions, applied in this order to each page's text
regexes = [
  # drop a line break that is followed by a lowercase letter or a hyphen
  # (the following character is kept; no space is inserted)
  [r'(\n)([a-z]|-)', r'\2'],
  # drop whitespace that sits right before '.', ',' or ';'
  [r'(\s+)(\.|,|;)', r'\2'],
  # collapse every remaining whitespace run (newlines included) into one space
  [r'\s+', r' ']
]
##################################################

# Text extraction with pdfplumber
def usePdfplumber(filename, verbose=True, n_pages=None):
  """Extract the text layer of a PDF into '<name>_pplumber.txt'.

  Each page is cleaned with the module-level `regexes` and written after a
  `page_id` header carrying its 1-based page number.

  Args:
    filename: path to the PDF file.
    verbose: when true, print progress messages.
    n_pages: maximum number of pages to convert; None (or 0) converts all.
  """
  import os  # local import: this cell runs before the notebook's `import os`

  if verbose:
    print('Converting "{}" using pdfplumber'.format(filename))
  out_path = os.path.splitext(filename)[0] + '_pplumber.txt'
  # Both handles are released even if a page fails to convert.
  with pdfplumber.open(filename) as pdf, open(out_path, 'w') as fd:
    for i, page in enumerate(pdf.pages):
      # `i` is 0-based, so stop as soon as n_pages pages have been written.
      if n_pages and i >= n_pages:
        break
      if verbose:
        print("Converting page #{}".format(i+1))
      # extract_text() may yield None for a page without a text layer.
      text = page.extract_text() or ''
      for rPattern, subString in regexes:
        text = re.sub(rPattern, subString, text)
      fd.write(page_id.format(i+1))
      fd.write(text + '\n')

# Text extraction with OCR (pdf2image + pytesseract)
def usePytesseract(filename, verbose=True, n_pages=None):
  """OCR a PDF into '<name>_ptesseract.txt' using the Portuguese model.

  Pages are rasterised with pdf2image, recognised with pytesseract, cleaned
  with the module-level `regexes` and written after a `page_id` header
  carrying the 1-based page number.

  Args:
    filename: path to the PDF file.
    verbose: when true, print progress messages.
    n_pages: maximum number of pages to convert; None (or 0) converts all.
  """
  import os  # local import: this cell runs before the notebook's `import os`

  if verbose:
    print('Converting "{}" using pytesseract'.format(filename))
  out_path = os.path.splitext(filename)[0] + '_ptesseract.txt'
  # Rasterise only the pages that will be OCR'd instead of the whole document.
  pages = cPdf2img(filename, last_page=n_pages if n_pages else None)
  with open(out_path, 'w') as fd:
    for i, page in enumerate(pages):
      if verbose:
        print("Converting page #{}".format(i+1))
      text = img2str(page, lang='por')
      for rPattern, subString in regexes:
        text = re.sub(rPattern, subString, text)
      fd.write(page_id.format(i+1))
      fd.write(text + '\n')

In [None]:
import zipfile
from google.colab import drive

# Mount Google Drive at /gdrive; the dataset archive is read from there below.
drive.mount('/gdrive')

In [None]:
import os

# Location of the dataset archive on the mounted Drive
zipDir = '/gdrive/My Drive/Dataset/'

# Extraction target, relative to the current working directory
w_directory = 'processos'
og_dir = os.getcwd()

# exist_ok tolerates re-runs, while real failures (e.g. permissions) still surface.
os.makedirs(w_directory, exist_ok=True)

# Extract straight into the target folder: no chdir round-trip is needed, and
# the archive is closed even if extraction fails.
with zipfile.ZipFile(zipDir + 'Processos.zip', 'r') as zip_ref:
  zip_ref.extractall(w_directory)

In [None]:
# Root of the extracted archive: <og_dir>/<w_directory>/Processos
proc_dir = '/'.join([og_dir, w_directory, 'Processos'])
print(proc_dir)

In [None]:
# One entry per case folder directly under proc_dir (plain files are skipped).
# os.listdir returns names in arbitrary order; sorting keeps the index-based
# slices used by later cells (proc_dirs[100:105]) reproducible between runs.
proc_dirs = [
  proc_dir + '/' + dirname
  for dirname in sorted(os.listdir(proc_dir))
  if os.path.isdir(os.path.join(proc_dir, dirname))
]
print(proc_dirs[:5])

In [None]:
# Run OCR on the first pages of every PDF found in the selected case folders.
for proc_dirname in proc_dirs[100:105]:
  # The full list is collected before any conversion starts.
  files = []
  for filename in os.listdir(proc_dirname):
    if filename.endswith('pdf'):
      files.append('{}/{}'.format(proc_dirname, filename))
  for filename in files:
    # Alternative extractor: usePdfplumber(filename, n_pages=2)
    usePytesseract(filename, n_pages=2)

In [None]:
import os

# Drive folder that receives the per-case .txt copies. The trailing slash is
# required: later cells build paths with plain `base_dir + name` concatenation.
base_dir = '/gdrive/My Drive/Aux/'

In [None]:
from shutil import copyfile

# Map each selected case folder to the .txt files it currently contains.
procs = {}
for dirname in proc_dirs[100:105]:
  print('Listing txt files from "{}"'.format(dirname))
  f_txt = [filename for filename in os.listdir(dirname) if filename[-4:] == ".txt"]
  print(f_txt)
  procs[dirname] = f_txt

# Mirror those files under base_dir, one sub-folder per case.
for dirname, txt_files in procs.items():
  aux_name = dirname.split('/')[-1]
  # exist_ok tolerates re-runs; any other failure is raised here rather than
  # being swallowed and resurfacing later inside copyfile.
  os.makedirs(base_dir + aux_name, exist_ok=True)
  print('Copying files from {}...'.format(dirname))
  for txt_file in txt_files:
    print('...copying {}...'.format(txt_file))
    source = dirname + '/' + txt_file
    dest = base_dir + aux_name + '/' + txt_file
    copyfile(source, dest)
  print('...done.')

In [None]:
#tokenizers

#word tokenization
def w_tokenize(filename):
  """Read a converted text file and return its whitespace-separated tokens.

  The page headers are skipped: lines made up only of dashes, and
  "Page N" lines. Every other line is kept.

  Args:
    filename: path to the text file.

  Returns:
    List of tokens (str); an empty list when no content line is found.
  """
  kept_lines = []
  with open(filename, 'r') as fd:
    for line in fd:
      # Only a line consisting solely of dashes is a separator. A content line
      # that merely contains "--" must be kept: each page's text is written as
      # one line, so dropping it would discard the whole page.
      is_rule = re.fullmatch(r'-{2,}', line.strip()) is not None
      is_page_label = re.match(r'Page [0-9]+$', line) is not None
      if not is_rule and not is_page_label:
        kept_lines.append(line)
  # ''.split() is [], so an empty file needs no special case.
  return ''.join(kept_lines).split()

In [None]:
# Sub-directories of base_dir, as full paths (plain files are skipped)
aux_dirs = list(filter(
  os.path.isdir,
  (os.path.join(base_dir, dirname) for dirname in os.listdir(base_dir))
))

# Class labels; their order fixes the positions in the one-hot target vectors
classes = [
  'Despejo',
  'Embargos à Execução',
  'Execução de Título Extrajudicial',
  'Fato do Serviço',
  'Impugnação ao Cumprimento de Sentença',
  'Indenização e Obrigação de Fazer',
  'Interdito Proibitório',
  'Liquidação por Arbitramento',
  'Nulidade de Assembleia',
  'Obrigação de Fazer',
  'Plano de Saúde',
  'Poupança, Espurgos Inflacionários',
  'Prestação de Contas',
  'Prestação de Serviço',
  'Reintegração de Posse de Arrendamento Mercantil',
  'Responsabilidade do Fornecedor',
  'Revisão de Cláusula Contratual',
  'Suplementação Previdenciária',
  'Sustação de Protesto',
  'TOI',
  'Usucapião',
]

# Search vocabulary: every word of every label except a few connectives.
# Words shared by several labels appear more than once.
busca_classes = [
  word
  for val in classes
  for word in val.split()
  if word not in ('de', 'à', 'ao', 'e', 'do')
]

# Heuristic labelling: for each case folder, pick the class whose label words
# occur most often in the text; `files` maps folder -> (text, one-hot label).
files = {}
for dirname in aux_dirs:
  f_txt = [name for name in os.listdir(dirname) if name.endswith(".txt")]
  print('Palavras de {}'.format(dirname))

  # Tokens of every text file in the folder, concatenated
  words = []
  for name in f_txt:
    words.extend(w_tokenize('{}/{}'.format(dirname, name)))

  # re.match anchors only at the start, so this keeps every token that begins
  # with an upper-case letter.
  capitalized = [token for token in words if re.match('[A-ZÂÃÍÓÕÚÇ]+', token)]
  print(capitalized)
  clss = [token for token in capitalized if token in busca_classes]
  print(clss)

  # One counter per class plus a final slot for "Outros" (no class word found)
  c_count = [0] * (len(classes) + 1)
  if clss:
    for idx, c_name in enumerate(classes):
      label_words = c_name.split()
      c_count[idx] += sum(1 for token in clss if token in label_words)
    print(classes[c_count.index(max(c_count))])
  else:
    c_count[-1] = 1
    print('Outros')
  print(c_count)
  print('-'*10)

  text = ' '.join(words)
  # The first maximum wins on ties, as list.index returns the first match.
  winner = c_count.index(max(c_count))
  files[dirname] = (text, [int(pos == winner) for pos in range(len(c_count))])

In [1]:
def remove_punctuation(words):
    """Strip punctuation from each token and drop tokens left empty.

    Every character that is neither a word character nor whitespace is
    removed; tokens that become the empty string are discarded.
    """
    stripped = (re.sub(r'[^\w\s]', '', word) for word in words)
    return [token for token in stripped if token != '']

In [None]:
import nltk
#from gensim import corpora
#from gensim.models import TfidfModel
from keras.preprocessing.text import Tokenizer

from collections import Counter
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from numpy import array

# Every document is truncated / zero-padded to this many token ids
max_size = 1000

# Portuguese stop words
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')

def counter_word(text):
  """Count how often each whitespace-separated word occurs.

  Args:
    text: iterable of strings, one per document.

  Returns:
    collections.Counter mapping word -> number of occurrences.
  """
  count = Counter()
  for sentence in text:
    count.update(sentence.split())
  return count

# Stop words plus stray punctuation tokens; a set keeps the membership test cheap
ignored_tokens = set(stopwords + ['%', '"', "!", "'", '"', '&', "%,"])

# `files` maps folder -> (text, one-hot label), where `text` is a single string.
# It must be split into words first: iterating the string directly walks it
# character by character and yields a character-level corpus.
processed_corpus = []
y = []
for _, (text, clss) in files.items():
  kept = [word.lower() for word in text.split() if word.lower() not in ignored_tokens]
  processed_corpus.append(' '.join(kept))
  y.append(clss)

# Vocabulary size; also the input dimension of the embedding layer
counter = counter_word(processed_corpus)
num_words = len(counter)

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(processed_corpus)

# texts_to_sequences takes a list of documents and returns one id list per
# document; passing a bare string would treat each character as a document.
X = []
for sequence in tokenizer.texts_to_sequences(processed_corpus):
  sequence = sequence[:max_size]
  sequence += [0] * (max_size - len(sequence))  # id 0 is the padding value
  X.append(sequence)

In [None]:
#rede bi-lstm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, TimeDistributed, Bidirectional, Embedding
from tensorflow import convert_to_tensor
from numpy import array

# Width of the one-hot label vectors
n_classes = len(y[0])
epochs = 10

blstm_model = Sequential()
# 32-dimensional embedding per token id; inputs are max_size ids long
blstm_model.add(Embedding(num_words, 32, input_length=max_size))
# Bidirectional LSTM with max_size // 10 units per direction. It emits only the
# final state, so no TimeDistributed wrapper is involved.
blstm_model.add(Bidirectional(LSTM(max_size // 10)))
# One probability per class
blstm_model.add(Dense(n_classes, activation='softmax'))
# `metrics` is passed as a list, which every Keras version accepts
blstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the network
X = array(X)
y = array(y)
blstm_model.fit(X, y, epochs=epochs, batch_size=10, verbose=True, validation_split=.2, shuffle=True)

In [None]:
# Classify each selected case folder with the trained network.
procs = {}
# Same tokens that are dropped before vectorising the training documents
ignored_tokens = set(stopwords + ['%', '"', "!", "'", '"', '&', "%,"])
# The label vectors have one slot more than `classes`: the last one is "Outros"
class_names = classes + ['Outros']

for dirname in proc_dirs[100:105]:
  print('Listing txt files from "{}"'.format(dirname))
  f_txt = [filename for filename in os.listdir(dirname) if filename[-4:] == ".txt"]
  print(f_txt)
  procs[dirname] = f_txt

  # Start from an empty token list for every folder
  words = []
  for filename in f_txt:
    words += w_tokenize(dirname + '/' + filename)

  processed_text = [word.lower() for word in words if word.lower() not in ignored_tokens]
  # texts_to_sequences takes a list of documents; [0] picks the only one
  sequence = tokenizer.texts_to_sequences([' '.join(processed_text)])[0]
  sequence = sequence[:max_size]
  sequence += [0] * (max_size - len(sequence))

  # predict expects a batch, so the single sequence is wrapped in an array
  probabilities = blstm_model.predict(array([sequence]), verbose=0)
  print(class_names[int(probabilities[0].argmax())])