In [62]:
import numpy as np
import pandas as pd
import tensorflow_datasets as tfds
import os
from sklearn.datasets import load_files
import shutil
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
import mglearn
from zipfile import ZipFile

In [105]:
if os.path.isdir('decisions_allbps') == False:
    ZipFile("decisions_allbps.zip").extractall("decisions_allbps")

In [106]:
df = pd.read_csv('metadata.csv') 
df

Unnamed: 0,doc_id,publication_time,bp,doc_type,minister
0,19700101_ACO_1646_3420566,1970-01-01,[10],ACO,joaquim barbosa
1,19700101_AC_2287_3093972,1970-01-01,[4],AC,ellen gracie
2,19700101_AC_2459_3245342,1970-01-01,[3],AC,cármen lúcia
3,19700101_AC_2459_3258903,1970-01-01,[3],AC,cármen lúcia
4,19700101_AC_2507_3270033,1970-01-01,[17],AC,joaquim barbosa
...,...,...,...,...,...
29738,20181219_Rcl_31921_15339279527,2018-12-19,"[10, 37]",Rcl,gilmar mendes
29739,20181219_Rcl_32262_15339279531,2018-12-19,[10],Rcl,gilmar mendes
29740,20181219_Rcl_32695_15339271501,2018-12-19,[14],Rcl,rosa weber
29741,20181219_Rcl_32813_15339276760,2018-12-19,[10],Rcl,edson fachin


In [107]:
ids = list(df['doc_id'])
minister = list(df['minister'])

In [108]:
ministers_set = list(set(df['minister']))
ministers_set.pop(0)
ministers_set

['luiz fux',
 'ayres britto',
 'luís roberto barroso',
 'maurício corrêa',
 'dias toffoli',
 'teori zavascki',
 'cármen lúcia',
 'celso de mello',
 'menezes direito',
 'joaquim barbosa',
 'edson fachin',
 'marco aurélio',
 'cezar peluso',
 'eros grau',
 'ellen gracie',
 'ricardo lewandowski',
 'rosa weber',
 'alexandre de moraes',
 'gilmar mendes']

In [109]:
for element in ministers_set:
    print(element, '---->',minister.count(element))

luiz fux ----> 3251
ayres britto ----> 226
luís roberto barroso ----> 2209
maurício corrêa ----> 1
dias toffoli ----> 3610
teori zavascki ----> 1363
cármen lúcia ----> 3129
celso de mello ----> 2673
menezes direito ----> 272
joaquim barbosa ----> 731
edson fachin ----> 1787
marco aurélio ----> 356
cezar peluso ----> 223
eros grau ----> 262
ellen gracie ----> 427
ricardo lewandowski ----> 2504
rosa weber ----> 3048
alexandre de moraes ----> 791
gilmar mendes ----> 1980


In [110]:
ministers_set_copy = ministers_set[:]
for element in ministers_set_copy:
    if minister.count(element)<700:
        ministers_set.remove(element)
ministers_set

['luiz fux',
 'luís roberto barroso',
 'dias toffoli',
 'teori zavascki',
 'cármen lúcia',
 'celso de mello',
 'joaquim barbosa',
 'edson fachin',
 'ricardo lewandowski',
 'rosa weber',
 'alexandre de moraes',
 'gilmar mendes']

In [111]:
if os.path.isdir('minister') == False:
    os.mkdir('minister') 
    
for minis in ministers_set:
    if os.path.isdir('minister/'+str(minis)) == False:
        os.mkdir('minister/'+str(minis)) 

In [112]:
for index in range(len(ids)):
    if minister[index] in ministers_set:
        string = 'decisions_allbps/decisions/'+ids[index]+'.txt'
        shutil.copyfile(string, 'minister/'+str(minister[index])+'/'+ ids[index]+'.txt')

In [113]:
ministers = load_files("minister")
minister_train, label_train = ministers.data, ministers.target
print("type of text_train: {}".format(type(minister_train)))
print("length of text_train: {}".format(len(minister_train)))
print("text_train[1]:\n{}".format(minister_train[1].decode("utf-8")))

type of text_train: <class 'list'>
length of text_train: 27076
text_train[1]:
Decisão:

Vistos.

Cuida-se de reclamação constitucional eletrônica, com pedido de liminar, ajuizada pela Universidade de São Paulo – USP em face do Juízo da 2ª Vara do Trabalho de Santos, cuja decisão teria afrontado a autoridade do Supremo Tribunal Federal e a eficácia do que decidido no julgamento da ADC nº 16/DF.

Na peça vestibular, alega a reclamante que:

a) na origem, cuida-se de reclamação trabalhista movida por Jatnael da Silva Tomaz em face da empresa Corporação Gutty de Segurança Patrimonial e Vigilância Ltda, da Fazenda Pública do Estado de São Paulo e desta reclamante, pleiteando verbas trabalhistas referentes ao período laborado na aludida empresa;

b) a 2ª Vara do Trabalho de Santos “(...) afastou e declarou, por vias transversas, inconstitucional o §1º do art. 71 da Lei federal 8.666/93 – em total afronta à autoridade da decisão do Supremo Tribunal Federal, que declarou constitucional referid

In [114]:
label_set = list(set(label_train))
for element in label_set:
    print(element, '---->', list(label_train).count(element))

0 ----> 791
1 ----> 2673
2 ----> 3129
3 ----> 3610
4 ----> 1787
5 ----> 1980
6 ----> 731
7 ----> 3251
8 ----> 2209
9 ----> 2504
10 ----> 3048
11 ----> 1363


In [115]:
#take only 700 elements of each bp
minister_train_bal = []
label_train_bal = []

for minis in label_set:
    minister_only = [minister_train[index] for index in range(len(label_train)) if label_train[index]==minis]
    random_indexes = np.random.permutation(len(minister_only))
    minister_only = [minister_only[index] for index in random_indexes[:700]]
    minister_train_bal += minister_only
    label_train_bal += [minis for i in range(len(minister_only))]

In [139]:
minister_names = [minis.split() for minis in ministers_set]
flat_names = [item for sublist in minister_names for item in sublist]
cap_names = [minis.capitalize() for minis in flat_names]
up_names = [minis.upper() for minis in flat_names]
mask_list = flat_names+cap_names+up_names
mask_list

['luiz',
 'fux',
 'luís',
 'roberto',
 'barroso',
 'dias',
 'toffoli',
 'teori',
 'zavascki',
 'cármen',
 'lúcia',
 'celso',
 'de',
 'mello',
 'joaquim',
 'barbosa',
 'edson',
 'fachin',
 'ricardo',
 'lewandowski',
 'rosa',
 'weber',
 'alexandre',
 'de',
 'moraes',
 'gilmar',
 'mendes',
 'Luiz',
 'Fux',
 'Luís',
 'Roberto',
 'Barroso',
 'Dias',
 'Toffoli',
 'Teori',
 'Zavascki',
 'Cármen',
 'Lúcia',
 'Celso',
 'De',
 'Mello',
 'Joaquim',
 'Barbosa',
 'Edson',
 'Fachin',
 'Ricardo',
 'Lewandowski',
 'Rosa',
 'Weber',
 'Alexandre',
 'De',
 'Moraes',
 'Gilmar',
 'Mendes',
 'LUIZ',
 'FUX',
 'LUÍS',
 'ROBERTO',
 'BARROSO',
 'DIAS',
 'TOFFOLI',
 'TEORI',
 'ZAVASCKI',
 'CÁRMEN',
 'LÚCIA',
 'CELSO',
 'DE',
 'MELLO',
 'JOAQUIM',
 'BARBOSA',
 'EDSON',
 'FACHIN',
 'RICARDO',
 'LEWANDOWSKI',
 'ROSA',
 'WEBER',
 'ALEXANDRE',
 'DE',
 'MORAES',
 'GILMAR',
 'MENDES']

In [164]:
def masker(dataset, list_of_masked):
    '''Substitues all integers in a text dataset by an empty string
    Input: dataset made of texts
    Output: the same dataset made of texts, only with masked integers'''
    masked_set = []
    for index in range(len(dataset)):
        new_string = ''
        splitted_doc = dataset[index].split()
        for word in range(len(splitted_doc)):
            if splitted_doc[word] in list_of_masked:
                new_string += ''
            else:
                new_string += ' '+ splitted_doc[word]
        masked_set.append(new_string)
    return masked_set

In [165]:
uft_dataset = [minister_train_bal[index].decode("utf-8") for index in range(len(minister_train_bal))]

In [166]:
masked_dataset = masker(uft_dataset, mask_list)
masked_dataset[0]

' Documento assinado digitalmente conforme MP n° 2.200-2/2001 24/08/2001. O documento pode ser acessado pelo endereço http://www.stf.jus.br/portal/autenticacao/autenticarDocumento.asp sob o código 6B20-A5B4-B466-86D1 e senha 10D1-BF8B-8562-0F91 RECURSO EXTRAORDINÁRIO COM AGRAVO 1.173.343 RIO JANEIRO RELATOR : MIN. RECTE.(S) :PETROLEO BRASILEIRO S A PETROBRAS ADV.(A/S) :JULIANA CARNEIRO MARTINS MENEZES RECTE.(S) :FUNDACAO PETROBRAS SEGURIDADE SOCIAL PETROS ADV.(A/S) :RENATO LOBO GUIMARAES ADV.(A/S) :IANY PATRICIA DOS SANTOS RANGEL RECDO.(A/S) :JONAS BARCELLOS FIGUEIREDO ADV.(A/S) :SOLANGE LOPES PAROLA ADV.(A/S) :VERA LUCIA BOTELHO GASPAR RECDO.(A/S) :PETROLEO BRASILEIRO S A PETROBRAS ADV.(A/S) :JULIANA CARNEIRO MARTINS MENEZES RECDO.(A/S) :FUNDACAO PETROBRAS SEGURIDADE SOCIAL PETROS ADV.(A/S) :RENATO LOBO GUIMARAES ADV.(A/S) :RONNE CRISTIAN NUNES DECISÃO Trata-se Agravos em Recursos Extraordinários interpostos em face acórdão proferido pelo Tribunal Superior do Trabalho, assim ementado 

In [167]:
pipe = make_pipeline(TfidfVectorizer(min_df=5, norm=None),
LogisticRegression())
param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(masked_dataset, label_train_bal)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best cross-validation score: 0.96


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [171]:
vectorizer = grid.best_estimator_.named_steps["tfidfvectorizer"]
# transform the training dataset
X_train = vectorizer.transform(masked_dataset)
# find maximum value for each of the features over the dataset
max_value = X_train.max(axis=0).toarray().ravel()
sorted_by_tfidf = max_value.argsort()
# get feature names
feature_names = np.array(vectorizer.get_feature_names())

In [172]:
for minis in range(len(grid.best_estimator_.named_steps["logisticregression"].coef_)):
    test_coef = grid.best_estimator_.named_steps["logisticregression"].coef_[minis]
    idx = np.argpartition(test_coef, -40)[-40:]
    indices = idx[np.argsort((-test_coef)[idx])]
    
    most_important_feature = feature_names[indices]
    print(minis, '---->', most_important_feature)

0 ----> ['2017' 'interno' 'regimento' '2018' 'amparo' 'diante' 'vol' 'codificação'
 'fl' 'apelo' 'pagina' 'decido' '29' 'indicado' 'extremo' 'supremo'
 'antes' 'recorrido' 'doc' 'violado' '85' 'circunscrição' 'caberá'
 'metropolitana' 'debatida' 'linhas' '988' 'min' 'dje' 'julgado' 'federal'
 'corregedoria' 'relato' 'maringá' 'digitalmente' 'amaral' '3ª' '21'
 'assinado' 'relatório']
1 ----> ['grifei' 'presente' 'nº' 'ora' 'enunciado' 'constante' 'instruído'
 'despacho' 'transgressão' 'causa' 'versada' 'cópia' 'suprema' 'questão'
 'sendo' 'deverá' 'deduzida' 'em' 'manuel' 'caxias' 'transgredido'
 'prévias' 'rel' 'possui' 'conseqüência' 'na' 'especialmente'
 'desautoriza' 'proferindo' 'expostas' 'marcos' 'nele' 'apreciarei'
 'cautelar' 'sede' 'medida' 'min' 'requisitório' 'sp' 'rtj']
2 ----> ['relatora' 'inc' 'supremo' 'tribunal' 'fl' 'declaratória' 'descumprido'
 'república' 'descumprimento' 'ministra' 'requerimento' '2009' 'este'
 'potirendaba' 'havidos' 'contrariado' 'teria' 'alegado

In [173]:
shutil.rmtree('minister')