In [263]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cluster import KMeans
from sklearn import metrics

In [264]:
df = pd.read_csv('../03_data_preparation/merged-dataset.csv', sep=';', encoding='utf-8')
df.head()

Unnamed: 0,expediente,name,acao,grupo_assunto,assunto,text
0,02.000.00030/2017,02000000302017_6761978_mandado_de_intimacao_-_...,9,25,531; 716; 972,b'PROCESSO:\nCLASSE:\nAUTOR:\nREU:\n\nPCTT: 92...
1,02.000.00035/2017,02000000352017_6774071_20170119_mandado_de_cit...,50,27,50; 254; 531; 558; 613; 620; 780,"b"" \n\n \n\n \n\nPCTT 92,100.04\n\nURGENTE\n\n..."
2,02.000.00136/2017,02000001362017_6855379_20170127_processo_integ...,47,26,716; 975,b' \n\nPODER JUDICIARIO\nTRIBUNAL REGIONAL FED...
3,02.000.00136/2017,02000001362017_7321216_20170210_peticao_inicia...,47,26,716; 975,"b', ae \xe2\x80\x9c8,\nCAIXA ree 625\nRun eB\n..."
4,02.000.00145/2017,02000001452017_6901508_20170131_mandado_intima...,43,11,246,03/02/2017Número: 1000055-73.2017.4.01.3200 ...


In [265]:
df.expediente.describe()

count                   846
unique                  648
top       08.000.00742/2017
freq                     16
Name: expediente, dtype: object

In [266]:
df1 = df.groupby('expediente')['text'].apply(lambda x: ' '.join(x))
df1.head()

expediente
02.000.00030/2017    b'PROCESSO:\nCLASSE:\nAUTOR:\nREU:\n\nPCTT: 92...
02.000.00035/2017    b" \n\n \n\n \n\nPCTT 92,100.04\n\nURGENTE\n\n...
02.000.00136/2017    b' \n\nPODER JUDICIARIO\nTRIBUNAL REGIONAL FED...
02.000.00145/2017     03/02/2017Número: 1000055-73.2017.4.01.3200  ...
02.000.00186/2017    b' \n\nVara Unica\nSSJ Tef\xc3\xa9\nFi.\n\nRub...
Name: text, dtype: object

In [267]:
#columns = ('expediente', 'acao', 'grupo_assunto', 'assunto')
columns = ('expediente', 'acao')
df = df.loc[:,columns]
df.drop_duplicates(inplace=True)
df.set_index('expediente', inplace=True)

df = df.merge(df1.to_frame(), left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,acao,text
expediente,Unnamed: 1_level_1,Unnamed: 2_level_1
02.000.00030/2017,9,b'PROCESSO:\nCLASSE:\nAUTOR:\nREU:\n\nPCTT: 92...
02.000.00035/2017,50,"b"" \n\n \n\n \n\nPCTT 92,100.04\n\nURGENTE\n\n..."
02.000.00136/2017,47,b' \n\nPODER JUDICIARIO\nTRIBUNAL REGIONAL FED...
02.000.00145/2017,43,03/02/2017Número: 1000055-73.2017.4.01.3200 ...
02.000.00186/2017,50,b' \n\nVara Unica\nSSJ Tef\xc3\xa9\nFi.\n\nRub...


In [268]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 648 entries, 02.000.00030/2017 to 16.000.01296/2017
Data columns (total 2 columns):
acao    648 non-null int64
text    648 non-null object
dtypes: int64(1), object(1)
memory usage: 35.2+ KB


In [269]:
data = df

In [270]:
x_train, x_test, y_train, y_test = train_test_split(data.text, data.acao, test_size = 0.2, random_state=42)
print(len(x_train))
print(len(x_test))

518
130


In [271]:
vec = CountVectorizer()
vec_train = vec.fit_transform(x_train)
vec_train.shape

(518, 286902)

In [272]:
clf = MultinomialNB().fit(vec_train, y_train)

In [273]:
vec_test = vec.transform(x_test)
predictions = clf.predict(vec_test)
clf.score(vec_test, y_test)

0.6384615384615384

In [274]:
print(metrics.classification_report(y_test, predictions))
print(metrics.confusion_matrix(y_test, predictions))

             precision    recall  f1-score   support

          1       0.00      0.00      0.00         3
         10       0.00      0.00      0.00         1
         14       0.00      0.00      0.00         5
         22       0.00      0.00      0.00         2
         26       0.00      0.00      0.00         1
         28       0.00      0.00      0.00         1
         38       0.00      0.00      0.00         1
         43       0.60      0.43      0.50         7
         45       0.00      0.00      0.00         1
         47       0.40      0.25      0.31        16
         50       0.65      0.97      0.78        74
         67       1.00      0.44      0.62         9
        227       0.00      0.00      0.00         1
        237       0.00      0.00      0.00         8

avg / total       0.52      0.64      0.55       130

[[ 0  0  0  0  0  0  0  0  0  0  3  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  1  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  5  0  0  0]
 [ 0  0  0  0 

  'precision', 'predicted', average, warn_for)
