Conectar Drive

In [None]:
# Mount Google Drive inside the Colab runtime; the dataset path used when
# loading the data lives under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


Importações

In [None]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
import nltk
import matplotlib.pyplot as plt
from keras.utils import pad_sequences
from keras.metrics.accuracy_metrics import BinaryAccuracy
from keras import backend as K
from keras.metrics import Precision, Recall, BinaryAccuracy
from keras.layers import Input, Embedding, Conv1D, ReLU, GlobalAveragePooling1D, Dense, Concatenate, BatchNormalization, Reshape
from keras.models import Model, Sequential
from keras.optimizers import Adam

Carregamento de dados

In [None]:
# Location of the training CSV on the mounted Drive.
train_path = '/content/drive/MyDrive/Trabalhos/TCC/Train/dataset.csv'

# Read the full training set into a DataFrame.
dataframe = pd.read_csv(filepath_or_buffer=train_path)

# Display the names of the columns that were loaded.
dataframe.columns

Index(['Palavra', 'Senha', 'Relacionado'], dtype='object')

In [None]:
# Length, in characters, of the longest entry in the 'Senha' (password) column.
password_lengths = dataframe['Senha'].str.len()
password_lengths.max()

16

Pré-processamento de dados

In [None]:
# Fixed sequence length used when padding: the longest 'Senha' entry.
senha_lengths = dataframe['Senha'].str.len()
max_len = senha_lengths.max()

def tokenize_input(inp_string):
    """Split every string of an iterable into its individual characters.

    Args:
        inp_string: Iterable of strings (for example a pandas Series).

    Returns:
        A list holding, for each input item and in input order, a list of
        its single-character strings.
    """
    return [list(row) for row in inp_string]


# Hold out 20% of the rows as a test set, then 20% of the remainder as a
# validation set. The fixed random_state makes both splits repeatable.
X_temp, X_test, y_temp, y_test = train_test_split(
    dataframe[['Palavra', 'Senha']], dataframe['Relacionado'],
    test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.2, random_state=42)

# Force both text columns to str. astype returns a new frame, so nothing is
# assigned into a frame that was derived from another one.
X_train = X_train.astype({'Palavra': str, 'Senha': str})
X_val = X_val.astype({'Palavra': str, 'Senha': str})

# Character-level token lists for every split / column.
X_train_plv_tokenized = pd.Series(tokenize_input(X_train['Palavra']))
X_train_snh_tokenized = pd.Series(tokenize_input(X_train['Senha']))
# Fix: the validation words come from 'Palavra'. They were previously read
# from 'Senha', which made both validation inputs identical.
X_val_plv_tokenized = pd.Series(tokenize_input(X_val['Palavra']))
X_val_snh_tokenized = pd.Series(tokenize_input(X_val['Senha']))

# Fit the vocabulary exactly once, on training data only, and before any
# conversion. fit_on_texts rebuilds the character -> index table from the
# accumulated counts, so re-fitting between conversions can give the same
# character different integers in different arrays; fitting on validation
# data would also leak it into the vocabulary.
t = Tokenizer()
t.fit_on_texts(X_train_plv_tokenized.tolist() + X_train_snh_tokenized.tolist())


def _encode_chars(tokenized):
    """Map character lists to integer ids and pad them to max_len.

    Args:
        tokenized: pandas Series whose items are lists of characters.

    Returns:
        2-D integer array with one row per item, zero-padded at the end.
    """
    # Characters never seen during fitting are dropped (no oov_token is set).
    # Rows longer than max_len are truncated by pad_sequences; note that
    # max_len is derived from the 'Senha' column only.
    sequences = t.texts_to_sequences(tokenized.tolist())
    return pad_sequences(sequences, max_len, padding='post')


train_plv_seq = _encode_chars(X_train_plv_tokenized)
train_snh_seq = _encode_chars(X_train_snh_tokenized)
val_plv_seq = _encode_chars(X_val_plv_tokenized)
val_snh_seq = _encode_chars(X_val_snh_tokenized)

print(train_plv_seq)
print(train_snh_seq)
print(val_plv_seq)
print(val_snh_seq)

train_plv_seq.shape, train_snh_seq.shape, val_plv_seq.shape, val_snh_seq.shape


Configuração das equações de métricas

In [None]:
def recall_m(y_true, y_pred):
    """Recall of the rounded predictions on the tensors passed in.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of predicted scores.

    Returns:
        Scalar tensor: true positives / (actual positives + epsilon).
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    # epsilon keeps the division defined when there are no positive labels.
    return hits / (actual_positives + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision of the rounded predictions on the tensors passed in.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of predicted scores.

    Returns:
        Scalar tensor: true positives / (predicted positives + epsilon).
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged = K.sum(K.round(K.clip(y_pred, 0, 1)))
    # epsilon keeps the division defined when nothing is predicted positive.
    return hits / (flagged + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score built from precision_m and recall_m on the same tensors.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of predicted scores.

    Returns:
        Scalar tensor: 2 * (P * R) / (P + R + epsilon).
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    # epsilon avoids 0/0 when both precision and recall are zero.
    harmonic_half = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_half


Definição do modelo

In [None]:
# One input per branch. The per-sample shape is taken from the padded
# 2-D arrays (everything after the batch axis).
input_A = Input(name='input_A', shape=train_plv_seq.shape[1:])
input_B = Input(name='input_B', shape=train_snh_seq.shape[1:])

def get_cnn_block(depth):
  """Build a Conv1D -> BatchNormalization -> ReLU stack.

  Args:
      depth: Number of filters of the convolution.

  Returns:
      A Sequential model with the three layers, in that order.
  """
  layers = [
      Conv1D(filters=depth, kernel_size=3, strides=1),
      BatchNormalization(),
      ReLU(),
  ]
  return Sequential(layers)

# Filter count of the first convolutional block; later blocks use 2x, 4x, 8x.
DEPTH = 16

# Encoder shared by both inputs (the same instance is called twice below):
# add a channel axis, apply four conv blocks, average over positions and
# project to 64 features.
cnn = Sequential([
    Reshape((max_len, 1)),
    *[get_cnn_block(DEPTH * factor) for factor in (1, 2, 4, 8)],
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
])

feature_vector_A = cnn(input_A)
feature_vector_B = cnn(input_B)

# Join both feature vectors and classify with a small dense head.
concat = Concatenate()([feature_vector_A, feature_vector_B])
dense = Dense(64, activation='relu')(concat)
output = Dense(1, activation='sigmoid')(dense)

model = Model(inputs=[input_A, input_B], outputs=output)

# Built-in metrics plus the custom F1 function.
metrics = [
    Precision(name="precision"),
    Recall(name="recall"),
    BinaryAccuracy("binary_accuracy"),
    f1_m,
]

model.compile(
    loss="binary_crossentropy",
    metrics=metrics,
    optimizer=Adam(0.00001),
)

model.summary()

Model: "model_12"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_A (InputLayer)           [(None, 16)]         0           []                               
                                                                                                  
 input_B (InputLayer)           [(None, 16)]         0           []                               
                                                                                                  
 sequential_64 (Sequential)     (None, 64)           41760       ['input_A[0][0]',                
                                                                  'input_B[0][0]']                
                                                                                                  
 concatenate_12 (Concatenate)   (None, 128)          0           ['sequential_64[0][0]',   

Treinamento do modelo

In [None]:
# Inputs are passed as [words, passwords]; labels are reshaped to a column.
train_inputs = [train_plv_seq, train_snh_seq]
train_labels = y_train.values.reshape(-1, 1)
val_inputs = [val_plv_seq, val_snh_seq]
val_labels = y_val.values.reshape(-1, 1)

# Train for 100 epochs with batches of 32, evaluating on the validation
# split as well; the returned History keeps the per-epoch metrics.
history = model.fit(
    train_inputs,
    train_labels,
    epochs=100,
    batch_size=32,
    validation_data=(val_inputs, val_labels),
    verbose=1,
)

Representação de resultados

In [None]:
print(history.history.keys())


def plot_history_curve(metric_key, title, ylabel):
    """Draw one per-epoch series recorded in `history` on its own figure.

    Args:
        metric_key: Key into history.history (e.g. 'val_precision').
        title: Figure title.
        ylabel: Label of the y axis.
    """
    fig, ax = plt.subplots()
    ax.plot(history.history[metric_key])
    ax.set(title=title, ylabel=ylabel, xlabel='Época')
    plt.show()


# One training and one validation figure per metric, in the original order.
# The 'val_*' series are computed on the validation split passed to fit, so
# they are titled "Validação" (they were mislabelled "Teste").
for metric_key, ylabel in [('binary_accuracy', 'Acurácia'),
                           ('precision', 'Precisão'),
                           ('f1_m', 'F1Score')]:
    plot_history_curve(metric_key, f'{ylabel} - Treinamento', ylabel)
    plot_history_curve(f'val_{metric_key}', f'{ylabel} - Validação', ylabel)

# history.history values are plain Python lists, which have no .max()
# method; use the built-in max() instead.
print(max(history.history['binary_accuracy']))