In [1]:
import pandas as pd
import numpy as np
import nltk
import unidecode
from utils import preprocessText, removeStopwords, lowerToken
from nltk.corpus import stopwords
from nltk import word_tokenize
from numpy import array, asarray, zeros

from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.preprocessing.text import Tokenizer

In [2]:
#First we need to download nltk stopwords
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\msego\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\msego\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
data = pd.read_csv("corpus/dataExample_0.csv", names=['text','sentiment'], header=0)
data2 = pd.read_csv("corpus/dataExample_1.csv", names=['text','sentiment'], header=0)
data3 = pd.read_csv("corpus/dataExample_2.csv", names=['text','sentiment'], header=0)
#Join all dataframes in a single dataframe
data = pd.concat([data,data3,data2])

#Check the data is correctly readed
print("Dataframe data")
data.head()

Dataframe data


Unnamed: 0,text,sentiment
0,Exijo un corto de Bucky y Sam viendo Rogers: T...,Neutro
1,Ya viendo lo mas nuevo de @MarvelLATAM \r\n#Ha...,Positivo
2,Hasta el momento #Hawkeye la mejor serie de @M...,Neutro
3,#TheEternals es la pelicula de @MarvelLATAM me...,Negativo
4,@MarvelLATAM @disneyplusla #Hawkeye Capitulo ...,Negativo


In [4]:
#Now we check the shape
print("Dataframe shape")
data.shape

Dataframe shape


(904, 2)

In [5]:
#Now we check the sentiment count
print("Dataframe sentiment count")
data.sentiment.value_counts()

Dataframe sentiment count


Neutro      415
Negativo    316
Positivo    173
Name: sentiment, dtype: int64

In [6]:
#Pre process tweets and save them as a new column in the dataframe
data['text_clean'] = data['text'].apply(lambda x: preprocessText(str(x)))
data.head()

Unnamed: 0,text,sentiment,text_clean
0,Exijo un corto de Bucky y Sam viendo Rogers: T...,Neutro,Exijo un corto de Bucky Sam viendo Rogers The ...
1,Ya viendo lo mas nuevo de @MarvelLATAM \r\n#Ha...,Positivo,Ya viendo lo mas nuevo de Hawkeye Revolucionan...
2,Hasta el momento #Hawkeye la mejor serie de @M...,Neutro,Hasta el momento Hawkeye la mejor serie de
3,#TheEternals es la pelicula de @MarvelLATAM me...,Negativo,TheEternals es la pelicula de menos Marvel
4,@MarvelLATAM @disneyplusla #Hawkeye Capitulo ...,Negativo,Hawkeye Capitulo Nada rescatable Peleas sin f...


In [7]:
#Transform sentences into tokens
tokens = [word_tokenize(sen) for sen in data.text_clean]

#Put all the words to lowercase
lower_tokens = [lowerToken(token) for token in tokens]

#Import spanish stopwords
stoplist = stopwords.words('spanish')

#Remove stopwords from sentences for better process
filtered_words = [removeStopwords(sen, stoplist) for sen in lower_tokens]

#Update processed text from dataframe with the new filtered sentences
data['text_clean'] = [' '.join(sen) for sen in filtered_words]
#Create a new column that will have the same words but as tokens
data['tokens'] = filtered_words

In [8]:
#Transform sentiment label to three columns in dataset for three outputs
pos = []
neg = []
neu = []

for sent in data.sentiment:
    if sent == 'P' or sent=='pos' or sent=='positivo':
        neu.append(0)
        pos.append(1)
        neg.append(0)
    elif sent == 'N' or sent=='neg' or sent=='negativo':
        pos.append(0)
        neg.append(1)
        neu.append(0)
    else:
        neu.append(1)
        pos.append(0)
        neg.append(0)
        
data['Pos'] = pos
data['Neg'] = neg
data['Neu'] = neu


In [9]:
#Redeclare dataframe with selected columns
data = data[['text_clean', 'tokens', 'sentiment', 'Pos', 'Neu', 'Neg']]
data.head()

Unnamed: 0,text_clean,tokens,sentiment,Pos,Neu,Neg
0,exijo corto bucky sam viendo rogers the musica...,"[exijo, corto, bucky, sam, viendo, rogers, the...",Neutro,0,1,0
1,viendo mas nuevo hawkeye revolucionando genero...,"[viendo, mas, nuevo, hawkeye, revolucionando, ...",Positivo,0,1,0
2,momento hawkeye mejor serie,"[momento, hawkeye, mejor, serie]",Neutro,0,1,0
3,theeternals pelicula menos marvel,"[theeternals, pelicula, menos, marvel]",Negativo,0,1,0
4,hawkeye capitulo rescatable peleas fuerza chic...,"[hawkeye, capitulo, rescatable, peleas, fuerza...",Negativo,0,1,0


In [10]:
#Split data for test and training 
data_train, data_test = train_test_split(data, test_size=0.10, random_state=42)

In [11]:
#Get total words in the train dataframe
all_training_words = [word for tokens in data_train["tokens"] for word in tokens]

#Get all the sentence lengths from train dataframe
training_sentence_lengths = [len(tokens) for tokens in data_train["tokens"]]

#Get all the words without duplicates in the train dataframe
TRAINING_VOCAB = sorted(list(set(all_training_words)))

print("%s words, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

5820 words, with a vocabulary size of 2699
Max sentence length is 29


In [12]:
#Get total words in the test dataframe
all_test_words = [word for tokens in data_test["tokens"] for word in tokens]

#Get all the sentence lengths from test dataframe
test_sentence_lengths = [len(tokens) for tokens in data_test["tokens"]]

#Get all the words without duplicates in the test dataframe
TEST_VOCAB = sorted(list(set(all_test_words)))

print("%s words, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

737 words, with a vocabulary size of 521
Max sentence length is 32


In [13]:
#Now we start using tokenizer for sentences

MAX_SEQUENCE_LENGTH = 50  #Max length that a sentence should have
EMBEDDING_DIM = 300   #Dimension of embedding (the same as the dimension of glove embeddings)

#Declare Tokenizer
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)

#Fit tokenizer with training data
tokenizer.fit_on_texts(data_train["text_clean"].tolist())

#Transform sentences from both datasets into sequences with tokenizer
training_sequences = tokenizer.texts_to_sequences(data_train["text_clean"].tolist())
test_sequences = tokenizer.texts_to_sequences(data_test["text_clean"].tolist())

#Pad the sequences adding 0s to reach the max sequence length
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 2699 unique tokens.


In [14]:
#load glove embeddings
embeddings_dictionary = dict()

#Open glove file
glove_file = open('glove/glove-sbwc.i25.vec', encoding="utf8")

#Iterate all lines in glove file 
for line in glove_file:
    #Split words
    records = line.split()
    
    #The first line should not be considered
    if len(records) == 2:
        continue
        
    #Save data in the dictionary
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions

#Close glove file
glove_file.close()

#Create an array with the glove dimension of the embeddings and total unique tokens
train_embedding_weights = zeros((len(train_word_index)+1, EMBEDDING_DIM))

#Save embedding weights using weights from glove if has the word, otherwise use a random array with the same dimension
for word, index in train_word_index.items():
    train_embedding_weights[index,:] = embeddings_dictionary[word] if word in embeddings_dictionary else np.random.rand(EMBEDDING_DIM)

print(train_embedding_weights.shape)

(2700, 300)


In [15]:
#Now we define the model
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    #Create the embedding layer of the model
    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embeddings],
                            input_length=max_sequence_length,
                            trainable=False)
    #Create the model as a sequential model using keras
    model = Sequential()
    #Add the embedding layer to the model
    model.add(embedding_layer)
    #Add a convolutional layer of one dimension and 512 filters with tanh activation
    model.add(Conv1D(512, 10, activation='tanh'))
    #Add a global pooling layer
    model.add(GlobalMaxPooling1D())
    #Add a dense layer with 3 outputs using softmax activation, as we have 3 possible answers
    model.add(Dense(3, activation='softmax'))
    #Compile model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    model.summary()
    return model

In [16]:
#Set the label names of the answers
label_names = ['Pos', 'Neu', 'Neg']
#Set training data to fit model
y_train = data_train[label_names].values
x_train = train_cnn_data

In [17]:
#Set number of epochs and batch size
num_epochs = 5
batch_size = 128

#Set test data to evaluate model
y_test = data_test[label_names].values
X_test = test_cnn_data

#Create model
model = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, 
                len(list(label_names)))

#Fit model and evaluate
hist = model.fit(x_train, y_train, epochs=num_epochs, validation_data=(X_test, y_test), shuffle=True, batch_size=batch_size)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 300)           810000    
                                                                 
 conv1d (Conv1D)             (None, 41, 512)           1536512   
                                                                 
 global_max_pooling1d (Globa  (None, 512)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 3)                 1539      
                                                                 
Total params: 2,348,051
Trainable params: 1,538,051
Non-trainable params: 810,000
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [18]:
#Test the model with example
text = "Estuvo chevere la película de Eternals 🙌, Spiderverse confirmado"
text = [preprocessText(t) for t in text]
text = tokenizer.texts_to_sequences(text)
print('----------------')
print(tokenizer.sequences_to_texts(text))
text = pad_sequences(text, maxlen=MAX_SEQUENCE_LENGTH)
predictions = model.predict(text)
for p in predictions:
    p = [round(num,5) for num in p]
    print(p)


----------------
['', 's', 't', 'u', 'v', '', '', 'c', 'h', '', 'v', '', '', '', '', '', '', '', 'p', '', '', '', 'c', 'u', '', '', '', 'd', '', '', '', 't', '', '', 'n', '', '', 's', '', '', '', '', 's', 'p', 'i', 'd', '', '', 'v', '', '', 's', '', '', 'c', '', 'n', '', 'i', '', '', '', 'd', '']
[0.26172, 0.47646, 0.26182]
[0.01593, 0.96941, 0.01466]
[0.01464, 0.97023, 0.01513]
[0.22705, 0.57324, 0.19972]
[0.08233, 0.83944, 0.07822]
[0.26172, 0.47646, 0.26182]
[0.26172, 0.47646, 0.26182]
[0.31734, 0.38315, 0.29951]
[0.26002, 0.46827, 0.27171]
[0.26172, 0.47646, 0.26182]
[0.08233, 0.83944, 0.07822]
[0.26172, 0.47646, 0.26182]
[0.26172, 0.47646, 0.26182]
[0.26172, 0.47646, 0.26182]
[0.26172, 0.47646, 0.26182]
[0.26172, 0.47646, 0.26182]
[0.26172, 0.47646, 0.26182]
[0.26172, 0.47646, 0.26182]
[0.11508, 0.74249, 0.14244]
[0.26172, 0.47646, 0.26182]
[0.26172, 0.47646, 0.26182]
[0.26172, 0.47646, 0.26182]
[0.31734, 0.38315, 0.29951]
[0.22705, 0.57324, 0.19972]
[0.26172, 0.47646, 0.26182]
[0

In [19]:
#export model
import pickle

with open('tokenizer3.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
model.save("cnn_model_3")


INFO:tensorflow:Assets written to: cnn_model3\assets
