In [45]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn import metrics
from nltk.corpus import stopwords

In [46]:
df = pd.read_csv('../03_data_preparation/merged-dataset.csv', sep=';', encoding='utf-8')
df.head()

Unnamed: 0,expediente,name,acao,grupo_assunto,assunto,text
0,02.000.00030/2017,02000000302017_6761978_mandado_de_intimacao_-_...,9,25,531; 716; 972,PROCESSO:\nCLASSE:\nAUTOR:\nREU:\n\nPCTT: 92.1...
1,02.000.00035/2017,02000000352017_6774071_20170119_mandado_de_cit...,50,27,50; 254; 531; 558; 613; 620; 780,"\n\n \n\n \n\nPCTT 92,100.04\n\nURGENTE\n\n \..."
2,02.000.00136/2017,02000001362017_6855379_20170127_processo_integ...,47,26,716; 975,\n\nPODER JUDICIARIO\nTRIBUNAL REGIONAL FEDER...
3,02.000.00136/2017,02000001362017_7321216_20170210_peticao_inicia...,47,26,716; 975,", ae “8,\nCAIXA ree 625\nRun eB\n\nEXCELENTISS..."
4,02.000.00145/2017,02000001452017_6901508_20170131_mandado_intima...,43,11,246,03/02/2017Número: 1000055-73.2017.4.01.3200 ...


In [47]:
df.expediente.describe()

count                   858
unique                  659
top       08.000.00742/2017
freq                     16
Name: expediente, dtype: object

In [48]:
df1 = df.groupby('expediente')['text'].apply(lambda x: ' '.join(x))
df1.head()

expediente
02.000.00030/2017    PROCESSO:\nCLASSE:\nAUTOR:\nREU:\n\nPCTT: 92.1...
02.000.00035/2017     \n\n \n\n \n\nPCTT 92,100.04\n\nURGENTE\n\n \...
02.000.00136/2017     \n\nPODER JUDICIARIO\nTRIBUNAL REGIONAL FEDER...
02.000.00145/2017     03/02/2017Número: 1000055-73.2017.4.01.3200  ...
02.000.00186/2017     \n\nVara Unica\nSSJ Tefé\nFi.\n\nRubrica\n\n ...
Name: text, dtype: object

In [49]:
#columns = ('expediente', 'acao', 'grupo_assunto', 'assunto')
columns = ('expediente', 'acao')
df = df.loc[:,columns]
df.drop_duplicates(inplace=True)
df.set_index('expediente', inplace=True)

df = df.merge(df1.to_frame(), left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,acao,text
expediente,Unnamed: 1_level_1,Unnamed: 2_level_1
02.000.00030/2017,9,PROCESSO:\nCLASSE:\nAUTOR:\nREU:\n\nPCTT: 92.1...
02.000.00035/2017,50,"\n\n \n\n \n\nPCTT 92,100.04\n\nURGENTE\n\n \..."
02.000.00136/2017,47,\n\nPODER JUDICIARIO\nTRIBUNAL REGIONAL FEDER...
02.000.00145/2017,43,03/02/2017Número: 1000055-73.2017.4.01.3200 ...
02.000.00186/2017,50,\n\nVara Unica\nSSJ Tefé\nFi.\n\nRubrica\n\n ...


In [50]:
with open('../04_machine_learning/dataset.pkl', "wb") as f:
    clf = pickle.dump(df, f)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 659 entries, 02.000.00030/2017 to 16.000.01565/2017
Data columns (total 2 columns):
acao    659 non-null int64
text    659 non-null object
dtypes: int64(1), object(1)
memory usage: 35.4+ KB


In [51]:
data = df

In [52]:
x_train, x_test, y_train, y_test = train_test_split(data.text, data.acao, test_size = 0.2, random_state=42)
print(len(x_train))
print(len(x_test))

527
132


In [53]:
def process(x):
    return x.lower().translate(str.maketrans('', '', '0123456789_')).encode('ascii',errors='ignore').decode()

In [54]:
stopw = ['nao', 'ser', 'sera', 'serao']
vec = CountVectorizer(preprocessor = process,
                      stop_words   = stopwords.words('portuguese')+stopw,
                      max_df       = .6,
                      min_df       = 5,
                      ngram_range  = (2,2))
vec_train = vec.fit_transform(x_train)
vec_train.shape

(527, 27787)

In [55]:
analyze = vec.build_analyzer()
analyze("This is a Text doCument. toº analyze ? ; / \ .\n 123 \t")

['this is', 'is text', 'text document', 'document to', 'to analyze']

In [56]:
clf = MultinomialNB().fit(vec_train, y_train)

In [57]:
model = Pipeline([('vec', vec), ('clf', clf)])
predictions = model.predict(x_test)
print(model.score(x_test, y_test))
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

0.7424242424242424


In [58]:
print(metrics.classification_report(y_test, predictions))
print(metrics.confusion_matrix(y_test, predictions))

             precision    recall  f1-score   support

          1       1.00      1.00      1.00         2
         10       1.00      0.25      0.40         4
         14       0.00      0.00      0.00         5
         22       0.00      0.00      0.00         1
         28       0.00      0.00      0.00         2
         43       0.80      0.67      0.73         6
         47       0.44      0.50      0.47        16
         50       0.80      0.95      0.87        75
         60       0.00      0.00      0.00         1
         67       0.78      0.64      0.70        11
        227       0.67      1.00      0.80         2
        237       0.60      0.43      0.50         7

avg / total       0.70      0.74      0.71       132

[[ 2  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  0  0  0  0  1  1  0  1  0  0]
 [ 0  0  0  0  0  0  1  4  0  0  0  0]
 [ 0  0  0  0  0  0  0  1  0  0  0  0]
 [ 0  0  0  0  0  0  1  1  0  0  0  0]
 [ 0  0  0  0  0  4  2  0  0  0  0  0]
 [ 0  0  0  0  0  1  

  'precision', 'predicted', average, warn_for)


In [59]:
for i, label in enumerate(clf.classes_):
    print('\n' + str(label))
    print([vec.get_feature_names()[j] for j in np.argsort(clf.coef_[i])[-10:]])


1
['primeirograu processo', 'eletronicamente certificao', 'batista silva', 'saldo disponivel', 'poder judiciario', 'jus br', 'itabuna ba', 'alvara judicial', 'oab ba', 'goiania go']

2
['art cdc', 'tendo vista', 'registro imveis', 'defesa consumidor', 'cinco dias', 'prazo cinco', 'rua qd', 'qd lt', 'caixa postal', 'advogados associados']

7
['judiciaria formosa', 'crdito rural', 'custas iniciais', 'banco central', 'poder judiciario', 'desta cdula', 'rural pignoraticia', 'encargos financeiros', 'parte autora', 'formosa go']

8
['juiz direito', 'teixeira freitas', 'gov br', 'tjgo jus', 'seguro social', 'nacional seguro', 'instituto nacional', 'parte autora', 'poder judiciario', 'jus br']

9
['art cpc', 'concessdo crdito', 'conselho monetario', 'monetario nacional', 'meio ambiente', 'art inciso', 'federal procuradoria', 'ministrio publico', 'publico federal', 'crdito rural']

10
['brasilia df', 'df fi', 'circular susep', 'df fl', 'sistema financeiro', 'caixa seguradora', 'seguro habitaci

In [60]:
a = df.loc['02.000.00030/2017'].text
a += 'b'
a[-1:]

'b'