# Automatic URP Identification (Comparation Models)

In this notebook, we implemmented and tested some Machine and Deep learning models to compare with C-BLSTM. We use te <b>Sklearn</b>, <b>Keras</b> and <b>Tensorflow</b> <i>Python libraries</i> for this work. We use Support Vector Machine (<i>SVM</i>), Decision Trees, Multi-Layer Perceptron (<i>MLP</i>), Nave Bayes, Long Short-Term Memory (<i>LSTM</i>), Bidirectional LSTM (<i>BLSTM</i>) and Convolutional Neural Networks (<i>CNN</i>). For each model, <b>f1-score</b>, <b>precision</b> and <b>recall</b> are measured and apresented.

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
from os import listdir

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

import warnings
warnings.filterwarnings("ignore")

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from keras.preprocessing import text, sequence
from nltk.tokenize import word_tokenize

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, AveragePooling1D
from keras.layers import Embedding
from keras.layers import LSTM, Bidirectional, TimeDistributed
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D

Using TensorFlow backend.


### Import Database

In [6]:
mypath = 'Stem Data'

base = pd.DataFrame()
for file in listdir(mypath):
    print(file)
    new_entries_base = pd.read_excel(mypath+'/'+file)
    base = base.append(new_entries_base,ignore_index=True)


naoPru_Clean.xlsx
Pru_Clean.xlsx
twitter_Clean.xlsx


In [7]:
x = base['Postagens'].values
classes = ["PRU", "NPRU"]
y = base[classes].values

y_ml = []
for i in range(len(y)):
    if y[i][0] == 1 and y[i][1]==0:
        y_ml.append(1)
    else:
        y_ml.append(0)

In [8]:
new_x = pd.DataFrame()
lista = []
for c in x:
    lista.append(word_tokenize(str(c)))
new_x = lista

### Padding Sentences

In [9]:
max_features = 20000
maxlen = 45
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(new_x))
new_x = tokenizer.texts_to_sequences(new_x)
X_comp = sequence.pad_sequences(new_x, maxlen=maxlen)
X_comp[3]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 547,   8,
         2, 212,  21,  10, 547, 882])

## Dividing Database

In [8]:
xTrain, xTest, yTrain, yTest = train_test_split(X_comp, y_ml, test_size=0.2, random_state=42)
print(len(yTrain))
print(len(yTest))

3093
774


## Traditional Machine Learning Algorithms

### Support Vector Machine Classifier

In [18]:
svm = SVC(kernel= 'rbf', C = 1000, gamma = 0.0001)
svm.fit(xTrain,yTrain)

SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [19]:
y_svm = svm.predict(xTest)
print(classification_report(yTest, y_svm.round()))

             precision    recall  f1-score   support

          0       0.71      0.23      0.35       364
          1       0.76      0.96      0.85       906

avg / total       0.74      0.75      0.71      1270



In [23]:
print('Accuracy: ', accuracy_score(yTest, y_svm.round()))
print('ROC Curve: ', roc_auc_score(yTest, y_svm.round()))

Accuracy:  0.752755905511811
ROC Curve:  0.5974425684690585


### Decision Tree Classifier

In [24]:
dt = DecisionTreeClassifier()
dt.fit(xTrain, yTrain)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [25]:
y_dt = dt.predict(xTest)
print(classification_report(yTest, y_dt.round()))

             precision    recall  f1-score   support

          0       0.63      0.67      0.65       364
          1       0.86      0.84      0.85       906

avg / total       0.80      0.79      0.80      1270



In [26]:
print('Accuracy: ', accuracy_score(yTest, y_dt.round()))
print('ROC Curve: ', roc_auc_score(yTest, y_dt.round()))

Accuracy:  0.794488188976378
ROC Curve:  0.7573502656284113


### Gaussian Naive Bayes Classifier

In [27]:
nv = GaussianNB()
nv.fit(xTrain, yTrain)

GaussianNB(priors=None)

In [28]:
y_nv = nv.predict(xTest)
print(classification_report(yTest, y_nv.round()))

             precision    recall  f1-score   support

          0       0.29      0.99      0.44       364
          1       0.00      0.00      0.00       906

avg / total       0.08      0.29      0.13      1270



In [29]:
print('Accuracy: ', accuracy_score(yTest, y_nv.round()))
print('ROC Curve: ', roc_auc_score(yTest, y_nv.round()))

Accuracy:  0.28503937007874014
ROC Curve:  0.49725274725274726


### Multi-Layer Perceptron

In [30]:
mlp = MLPClassifier(hidden_layer_sizes=(250,100, ), activation='logistic', solver='adam', max_iter=100)
mlp.fit(xTrain, yTrain)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(250, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=100, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [31]:
y_mlp = mlp.predict(xTest)
print(classification_report(yTest, y_mlp.round()))

             precision    recall  f1-score   support

          0       0.49      0.55      0.52       364
          1       0.81      0.77      0.79       906

avg / total       0.72      0.70      0.71      1270



In [32]:
print('Accuracy: ', accuracy_score(yTest, y_mlp.round()))
print('ROC Curve: ', roc_auc_score(yTest, y_mlp.round()))

Accuracy:  0.7039370078740157
ROC Curve:  0.6577274822307936


## Deep Learning Algorithms

In [10]:
xTrain, xTest, yTrain, yTest = train_test_split(X_comp, y, test_size=0.2, random_state=42)
print(len(yTrain))
print(len(yTest))

3093
774


### Long Short-Term Memory

In [34]:
lstm = Sequential()
lstm.add(Embedding(max_features, 128, input_length=maxlen))
lstm.add(LSTM(64, recurrent_dropout=0.2, return_sequences=True))
lstm.add(Dropout(0.2))
lstm.add(LSTM(32))
lstm.add(Dropout(0.5))
lstm.add(Dense(2,activation='softmax'))
lstm.summary()
lstm.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 45, 128)           2560000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 45, 64)            49408     
_________________________________________________________________
dropout_1 (Dropout)          (None, 45, 64)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 66        
Total params: 2,621,890
Trainable params: 2,621,890
Non-trainable params: 0
_________________________________________________________________


In [35]:
lstm.fit(xTrain, yTrain, batch_size=32, epochs=5, validation_split=0.1, verbose=2)
y_lstm = lstm.predict(xTest)
print(classification_report(yTest, y_lstm.round()))

Train on 4569 samples, validate on 508 samples
Epoch 1/5
 - 73s - loss: 0.4456 - acc: 0.7960 - val_loss: 0.3180 - val_acc: 0.8661
Epoch 2/5
 - 59s - loss: 0.2512 - acc: 0.9076 - val_loss: 0.3100 - val_acc: 0.8780
Epoch 3/5
 - 30s - loss: 0.1716 - acc: 0.9396 - val_loss: 0.4300 - val_acc: 0.8642
Epoch 4/5
 - 30s - loss: 0.1326 - acc: 0.9567 - val_loss: 0.4440 - val_acc: 0.8622
Epoch 5/5
 - 30s - loss: 0.0984 - acc: 0.9670 - val_loss: 0.4565 - val_acc: 0.8583
             precision    recall  f1-score   support

          0       0.93      0.88      0.91       906
          1       0.74      0.84      0.79       364

avg / total       0.88      0.87      0.87      1270



### Bidirectional Long Short-Term Memory

In [36]:
blstm = Sequential()
blstm.add(Embedding(max_features, 128, input_length=maxlen))
blstm.add(Bidirectional(LSTM(64, recurrent_dropout=0.2, return_sequences=True)))
blstm.add(Dropout(0.2))
blstm.add(Bidirectional(LSTM(32)))
blstm.add(Dropout(0.5))
blstm.add(Dense(2,activation='softmax'))
blstm.summary()
blstm.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 45, 128)           2560000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 45, 128)           98816     
_________________________________________________________________
dropout_3 (Dropout)          (None, 45, 128)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 64)                41216     
_________________________________________________________________
dropout_4 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 130       
Total params: 2,700,162
Trainable params: 2,700,162
Non-trainable params: 0
_________________________________________________________________


In [37]:
blstm.fit(xTrain, yTrain, batch_size=32, epochs=5, validation_split=0.1, verbose=2)
y_blstm = blstm.predict(xTest)
print(classification_report(yTest, y_blstm.round()))

Train on 4569 samples, validate on 508 samples
Epoch 1/5
 - 56s - loss: 0.4230 - acc: 0.8142 - val_loss: 0.2970 - val_acc: 0.8858
Epoch 2/5
 - 52s - loss: 0.2313 - acc: 0.9160 - val_loss: 0.3220 - val_acc: 0.8720
Epoch 3/5
 - 58s - loss: 0.1502 - acc: 0.9494 - val_loss: 0.3943 - val_acc: 0.8602
Epoch 4/5
 - 54s - loss: 0.1041 - acc: 0.9672 - val_loss: 0.3867 - val_acc: 0.8898
Epoch 5/5
 - 55s - loss: 0.0729 - acc: 0.9783 - val_loss: 0.4132 - val_acc: 0.8898
             precision    recall  f1-score   support

          0       0.92      0.92      0.92       906
          1       0.80      0.80      0.80       364

avg / total       0.89      0.89      0.89      1270



### Convolutional Neural Network

In [11]:
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250

cnn = Sequential()
cnn.add(Embedding(max_features,embedding_dims,input_length=maxlen))
cnn.add(Dropout(0.2))
cnn.add(Conv1D(filters,kernel_size,padding='valid',activation='relu',strides=1))
cnn.add(GlobalMaxPooling1D())
cnn.add(Dense(hidden_dims, activation='relu'))
cnn.add(Dropout(0.2))
cnn.add(Dense(2, activation='sigmoid'))

cnn.summary()
cnn.compile(loss='binary_crossentropy', 
			  optimizer='adam',
              metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 45, 50)            1000000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 45, 50)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 43, 250)           37750     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 250)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               62750     
_________________________________________________________________
dropout_2 (Dropout)          (None, 250)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 502       
Total para

In [None]:
cnn.fit(xTrain, yTrain, batch_size=32, epochs=5, validation_split=0.1, verbose=2)
y_cnn = cnn.predict(xTest)
print(classification_report(yTest, y_cnn.round()))

Train on 2783 samples, validate on 310 samples
Epoch 1/5


In [None]:
y_pru = []
for i in range(len(yTest)):
    y_pru.append(yTest[i][1])
import scikitplot as skplt
skplt.metrics.plot_cumulative_gain(y_pru, y_cnn)
plt.show()