In [1]:
import numpy as np
import pandas as pd
import tensorflow_datasets as tfds
import os
from sklearn.datasets import load_files
import shutil
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
import mglearn
from zipfile import ZipFile

In [2]:
#unzip dataset
if os.path.isdir('decisions_allbps') == False:
    ZipFile("decisions_allbps.zip").extractall("decisions_allbps")

In [3]:
df = pd.read_csv('metadata.csv') 
df

Unnamed: 0,doc_id,publication_time,bp,doc_type,minister
0,19700101_ACO_1646_3420566,1970-01-01,[10],ACO,joaquim barbosa
1,19700101_AC_2287_3093972,1970-01-01,[4],AC,ellen gracie
2,19700101_AC_2459_3245342,1970-01-01,[3],AC,cármen lúcia
3,19700101_AC_2459_3258903,1970-01-01,[3],AC,cármen lúcia
4,19700101_AC_2507_3270033,1970-01-01,[17],AC,joaquim barbosa
...,...,...,...,...,...
29738,20181219_Rcl_31921_15339279527,2018-12-19,"[10, 37]",Rcl,gilmar mendes
29739,20181219_Rcl_32262_15339279531,2018-12-19,[10],Rcl,gilmar mendes
29740,20181219_Rcl_32695_15339271501,2018-12-19,[14],Rcl,rosa weber
29741,20181219_Rcl_32813_15339276760,2018-12-19,[10],Rcl,edson fachin


In [4]:
ids = list(df['doc_id'])
minister = list(df['minister'])

In [5]:
#remove nan minister entry
ministers_set = list(set(df['minister']))
ministers_set.pop(0)
ministers_set

['teori zavascki',
 'maurício corrêa',
 'dias toffoli',
 'cármen lúcia',
 'eros grau',
 'rosa weber',
 'menezes direito',
 'ayres britto',
 'luiz fux',
 'alexandre de moraes',
 'ellen gracie',
 'marco aurélio',
 'ricardo lewandowski',
 'edson fachin',
 'luís roberto barroso',
 'celso de mello',
 'joaquim barbosa',
 'gilmar mendes',
 'cezar peluso']

In [6]:
for element in ministers_set:
    print(element, '---->',minister.count(element))

teori zavascki ----> 1363
maurício corrêa ----> 1
dias toffoli ----> 3610
cármen lúcia ----> 3129
eros grau ----> 262
rosa weber ----> 3048
menezes direito ----> 272
ayres britto ----> 226
luiz fux ----> 3251
alexandre de moraes ----> 791
ellen gracie ----> 427
marco aurélio ----> 356
ricardo lewandowski ----> 2504
edson fachin ----> 1787
luís roberto barroso ----> 2209
celso de mello ----> 2673
joaquim barbosa ----> 731
gilmar mendes ----> 1980
cezar peluso ----> 223


In [7]:
#use only ministers that  have more than 700 citations
ministers_set_copy = ministers_set[:]
for element in ministers_set_copy:
    if minister.count(element)<700:
        ministers_set.remove(element)
ministers_set

['teori zavascki',
 'dias toffoli',
 'cármen lúcia',
 'rosa weber',
 'luiz fux',
 'alexandre de moraes',
 'ricardo lewandowski',
 'edson fachin',
 'luís roberto barroso',
 'celso de mello',
 'joaquim barbosa',
 'gilmar mendes']

In [8]:
#make a folder to store the data
if os.path.isdir('minister') == False:
    os.mkdir('minister') 
    
#make a folder for each limit, allowing then to use sklearn.datasets.load_files
for minis in ministers_set:
    if os.path.isdir('minister/'+str(minis)) == False:
        os.mkdir('minister/'+str(minis)) 

In [9]:
for index in range(len(ids)):
    if minister[index] in ministers_set:
        string = 'decisions_allbps/decisions/'+ids[index]+'.txt'
        shutil.copyfile(string, 'minister/'+str(minister[index])+'/'+ ids[index]+'.txt')

In [10]:
ministers = load_files("minister")
#creates train and label sets
minister_train, label_train = ministers.data, ministers.target
print("type of text_train: {}".format(type(minister_train)))
print("length of text_train: {}".format(len(minister_train)))
print("text_train[1]:\n{}".format(minister_train[1].decode("utf-8")))

type of text_train: <class 'list'>
length of text_train: 27076
text_train[1]:
Decisão:

Vistos.

Cuida-se de reclamação constitucional eletrônica, com pedido de liminar, ajuizada pela Universidade de São Paulo – USP em face do Juízo da 2ª Vara do Trabalho de Santos, cuja decisão teria afrontado a autoridade do Supremo Tribunal Federal e a eficácia do que decidido no julgamento da ADC nº 16/DF.

Na peça vestibular, alega a reclamante que:

a) na origem, cuida-se de reclamação trabalhista movida por Jatnael da Silva Tomaz em face da empresa Corporação Gutty de Segurança Patrimonial e Vigilância Ltda, da Fazenda Pública do Estado de São Paulo e desta reclamante, pleiteando verbas trabalhistas referentes ao período laborado na aludida empresa;

b) a 2ª Vara do Trabalho de Santos “(...) afastou e declarou, por vias transversas, inconstitucional o §1º do art. 71 da Lei federal 8.666/93 – em total afronta à autoridade da decisão do Supremo Tribunal Federal, que declarou constitucional referid

In [11]:
label_set = list(set(label_train))
for element in label_set:
    print(element, '---->', list(label_train).count(element))

0 ----> 791
1 ----> 2673
2 ----> 3129
3 ----> 3610
4 ----> 1787
5 ----> 1980
6 ----> 731
7 ----> 3251
8 ----> 2209
9 ----> 2504
10 ----> 3048
11 ----> 1363


In [12]:
#take only 700 elements of each bp
minister_train_bal = []
label_train_bal = []

#balances the dataset so every minister have same representativity
for minis in label_set:
    minister_only = [minister_train[index] for index in range(len(label_train)) if label_train[index]==minis]
    random_indexes = np.random.permutation(len(minister_only)) 
    minister_only = [minister_only[index] for index in random_indexes[:700]] #picks 700 random documents for each minister
    minister_train_bal += minister_only
    label_train_bal += [minis for i in range(len(minister_only))]

In [51]:
#make a list of words to mask
minister_names = [minis.split() for minis in ministers_set]
flat_names = [item for sublist in minister_names for item in sublist]
flat_names += ['carmen', 'luis', 'lucia']
#substitute gender specific words that can be used to relate to some ministers
flat_names += ['relatora', 'relator', 'ministro','ministra','minis', 'turma', 'primeira', 'segunda', 'min']
cap_names = [minis.capitalize() for minis in flat_names]
up_names = [minis.upper() for minis in flat_names]
mask_list = flat_names+cap_names+up_names
#add puntuation to everything
comma_list = [word+',' for word in mask_list]
dot_list = [word+'.' for word in mask_list]
mask_list += comma_list+dot_list
mask_list

['teori',
 'zavascki',
 'dias',
 'toffoli',
 'cármen',
 'lúcia',
 'rosa',
 'weber',
 'luiz',
 'fux',
 'alexandre',
 'de',
 'moraes',
 'ricardo',
 'lewandowski',
 'edson',
 'fachin',
 'luís',
 'roberto',
 'barroso',
 'celso',
 'de',
 'mello',
 'joaquim',
 'barbosa',
 'gilmar',
 'mendes',
 'carmen',
 'luis',
 'lucia',
 'relatora',
 'relator',
 'ministro',
 'ministra',
 'minis',
 'turma',
 'primeira',
 'segunda',
 'min',
 'Teori',
 'Zavascki',
 'Dias',
 'Toffoli',
 'Cármen',
 'Lúcia',
 'Rosa',
 'Weber',
 'Luiz',
 'Fux',
 'Alexandre',
 'De',
 'Moraes',
 'Ricardo',
 'Lewandowski',
 'Edson',
 'Fachin',
 'Luís',
 'Roberto',
 'Barroso',
 'Celso',
 'De',
 'Mello',
 'Joaquim',
 'Barbosa',
 'Gilmar',
 'Mendes',
 'Carmen',
 'Luis',
 'Lucia',
 'Relatora',
 'Relator',
 'Ministro',
 'Ministra',
 'Minis',
 'Turma',
 'Primeira',
 'Segunda',
 'Min',
 'TEORI',
 'ZAVASCKI',
 'DIAS',
 'TOFFOLI',
 'CÁRMEN',
 'LÚCIA',
 'ROSA',
 'WEBER',
 'LUIZ',
 'FUX',
 'ALEXANDRE',
 'DE',
 'MORAES',
 'RICARDO',
 'LEWANDOWS

In [43]:
def masker(dataset, list_of_masked):
    '''Substitues words that are in the masked list by an empty string
    Input: dataset made of texts
    Output: the same dataset made of texts, only with masked words'''
    masked_set = []
    for index in range('len(dataset)):
        new_string = ''
        splitted_doc = dataset[index].split()
        for word in range(len(splitted_doc)):
            if splitted_doc[word] in list_of_masked:
                new_string += ''
            else:
                new_string += ' '+ splitted_doc[word]
        masked_set.append(new_string)
    return masked_set

In [54]:
uft_dataset = [minister_train_bal[index].decode("utf-8") for index in range(len(minister_train_bal))]

In [55]:
masked_dataset = masker(uft_dataset, mask_list)
masked_dataset[0]

' Ementa e Acórdão 13/04/2018 AG.REG. NO RECURSO EXTRAORDINÁRIO COM AGRAVO 975.690 PERNAMBUCO : AGTE.(S) :IRMAOS LAPA LTDA ADV.(A/S) :RAIMUNDO SOUZA MEDEIROS JUNIOR AGDO.(A/S) :UNIÃO PROC.(A/S)(ES) :PROCURADOR-GERAL DA FAZENDA NACIONAL EMENTA: AGRAVO INTERNO. RECURSO EXTRAORDINÁRIO COM AGRAVO. FUNDAMENTAÇÃO A RESPEITO DA REPERCUSSÃO GERAL. INSUFICIÊNCIA. NORMA ANTERIOR À CONSTITUIÇÃO FEDERAL 1988. JUÍZO RECEPÇÃO OU NÃO RECEPÇÃO. AUSÊNCIA AFRONTA AO ART. 97 DA CF/1988 OU À SÚMULA VINCULANTE 10. OFENSA CONSTITUCIONAL REFLEXA. INADMISSIBILIDADE. ARE 748.371-RG/PE (REL. TEMA 660). MÁTERIA ANÁLOGA À DEBATIDA NO RE 602.883-RG/SP (REL. ELLEN GRACIE, TEMA 288). QUESTÃO INFRACONSTITUCIONAL. INTERPOSIÇÃO DO RECURSO PELA ALÍNEA “C” DO ART. 102, III. DESCABIMENTO. 1. Os Recursos Extraordinários somente serão conhecidos e julgados, quando essenciais e relevantes as questões constitucionais a serem analisadas, sendo imprescindível ao recorrente, em sua petição interposição recurso, a apresentação fo

In [56]:
pipe = make_pipeline(TfidfVectorizer(min_df=5, norm=None),
LogisticRegression())
param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(masked_dataset, label_train_bal)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best cross-validation score: 0.96


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [57]:
vectorizer = grid.best_estimator_.named_steps["tfidfvectorizer"]
# transform the training dataset
X_train = vectorizer.transform(masked_dataset)
# find maximum value for each of the features over the dataset
max_value = X_train.max(axis=0).toarray().ravel()
sorted_by_tfidf = max_value.argsort()
# get feature names
feature_names = np.array(vectorizer.get_feature_names())

In [58]:
for minis in range(len(grid.best_estimator_.named_steps["logisticregression"].coef_)):
    test_coef = grid.best_estimator_.named_steps["logisticregression"].coef_[minis]
    idx = np.argpartition(test_coef, -40)[-40:]
    indices = idx[np.argsort((-test_coef)[idx])]
    
    #shows 40 most important words for deciding belonging to each class
    most_important_feature = feature_names[indices]
    print(minis, '---->', most_important_feature)

0 ----> ['2017' 'interno' 'regimento' '2018' 'diante' 'amparo' 'vol' 'codificação'
 'fl' 'apelo' 'supremo' 'pagina' 'federal' 'caberá' 'rel' 'doc' 'extremo'
 'dje' '988' '85' 'decido' 'violado' 'recorrido' '2015' 'debatida' 'antes'
 'publicado' 'linhas' 'com' 'iii' 'indicado' 'ato' 'ofendeu' '29' 'não'
 '88' 'aplica' 'precedente' 'base' 'julgado']
1 ----> ['grifei' 'presente' 'nº' 'ora' 'enunciado' 'constante' 'causa' 'suprema'
 'sendo' 'versada' 'possui' 'em' 'prévias' 'sede' 'na' 'expostas'
 'conseqüência' 'transgressão' 'deduzida' 'proferindo' 'rel'
 'transgredido' 'sumular' 'medida' 'referência' 'desautoriza' 'cautelar'
 'nele' 'questão' 'formulou' 'apreciarei' 'prestados' 'df' 'assim'
 'evidencia' 'despacho' 'instruído' 'consubstanciado' 'esclarecimentos'
 'exame']
2 ----> ['supremo' 'inc' 'tribunal' 'fl' 'república' 'descumprido' 'declaratória'
 'descumprimento' 'requerimento' 'havidos' 'este' '2009' 'doc'
 'contrariado' 'agravante' 'nossos' 'os' 'teria' 'regimento' 'nascimento'


In [59]:
#deletes extra folder for splitting the dataset
shutil.rmtree('minister')