In [1]:
# -*- Mode: Python; coding: utf-8 -*-
from __future__ import print_function
import os
import csv
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras import backend as K

#Variaveis de parametros
BASE_DIR = (r'C:\Users\leonardo\Desktop\ProjetoDeep\Glove')
GLOVE_DIR = BASE_DIR + r'\glove.6B'
TEXT_DATA_DIR = BASE_DIR + r'\emotion'
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 400000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.2

Using TensorFlow backend.


In [2]:
# Index mapping words
print('Indexing word vectors.')
embeddings_index = {}
f = open(os.path.join(BASE_DIR, r'wiki.pt.vec'), encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
    except:
        pass
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 590511 word vectors.


In [3]:
#Preparando os textos e classificações 5 classificações de sentimentos.
print('Processing text dataset')
dic = {'medo': '1', 'alegria': '2', 'tristeza': '3','raiva': '4', 'neutro': '5'}
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
with open(r'C:\Users\leonardo\Desktop\ProjetoDeep\Glove\emo2.csv', 'r', encoding='utf8', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar=' ')
    for linha in reader:
        label= str(linha[0])
        label_id = dic[label]
        text = linha[1]
        texts.append(text)
        labels.append(label_id)
csvfile.close()
print('Found %s texts.' % len(texts))

Processing text dataset
Found 39141 texts.


In [4]:
# Vetorizando os textos em um tensor 2D
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Found 44552 unique tokens.
Shape of data tensor: (39141, 1000)
Shape of label tensor: (39141, 6)


In [5]:
#Dividindo os dataset de teste e treinamento
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

print('Preparing embedding matrix.')

Preparing embedding matrix.


In [None]:
# prepare embedding matrix
num_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

print('Training model.')

Training model.


In [None]:
# Treinamento
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
print("Sequence aqui = ",sequence_input)
print("Embeded aqui = ",embedded_sequences)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(35)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
#preds = Dense(len(labels_index), activation='softmax')(x)
preds = Dense(6, activation='softmax')(x)
print(preds)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

model.fit(x_train, y_train,
          batch_size=15,
          epochs=100,
          validation_data=(x_val, y_val))
K.clear_session()

Sequence aqui =  Tensor("input_1:0", shape=(?, 1000), dtype=int32)
Embeded aqui =  Tensor("embedding_1/Gather:0", shape=(?, 1000, 300), dtype=float32)
Tensor("dense_2/Softmax:0", shape=(?, 6), dtype=float32)
Train on 31313 samples, validate on 7828 samples
Epoch 1/100