In [14]:
import pandas as pd 
import spacy
from nltk.tokenize import RegexpTokenizer
from spacy.lang.en import stop_words
from nltk.corpus import stopwords

In [3]:
dataset_training = pd.read_csv("twitter_training.csv",names=["id","entity","sentiment","tweet"])
dataset_training.dropna(inplace=True)

In [4]:
dataset_training

Unnamed: 0,id,entity,sentiment,tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [5]:
dataset_training["sentiment"].value_counts()

Negative      22358
Positive      20655
Neutral       18108
Irrelevant    12875
Name: sentiment, dtype: int64

In [97]:
tweets = dataset_training['tweet']
sentiment = dataset_training['sentiment']

In [98]:
tweets

0        im getting on borderlands and i will murder yo...
1        I am coming to the borders and I will kill you...
2        im getting on borderlands and i will kill you ...
3        im coming on borderlands and i will murder you...
4        im getting on borderlands 2 and i will murder ...
                               ...                        
74677    Just realized that the Windows partition of my...
74678    Just realized that my Mac window partition is ...
74679    Just realized the windows partition of my Mac ...
74680    Just realized between the windows partition of...
74681    Just like the windows partition of my Mac is l...
Name: tweet, Length: 73996, dtype: object

# Pré-processamento

In [24]:
sample = tweets.iloc[68]

In [25]:
sample

"Going to finish up Borderlands 2 today. I've got some new events set up and am look forward to a good stream! Starting in about 20 hour!"

In [10]:
def tokenizer_tweet(tweet):
    tokenizer = RegexpTokenizer(r"\w+|[0-9]")
    tokenized_tweet = tokenizer.tokenize(tweet)
    return tokenized_tweet
    

In [11]:
sample_tokenized = tokenizer_tweet(sample)

In [12]:
sample_tokenized

[]

In [16]:
stop_words_nltk = set(stopwords.words('english'))
all_stopwords = stop_words.STOP_WORDS.union(stop_words_nltk) 

def remove_stopwords(tokens):
    tweet_without_stopwords = []
    for token in tokens:
        if token not in all_stopwords:
            tweet_without_stopwords.append(token)
    return tweet_without_stopwords

In [17]:
sample_without_stopwords = remove_stopwords(sample_tokenized)

In [18]:
def lower_tweet(tokens):
    return [token.lower() for token in tokens]

In [19]:
nlp = spacy.load("en_core_web_sm")

def lemmatize_tweet(tokens):
    tweet = " ".join(token for token in tokens)
    doc = nlp(tweet)
    return [token.lemma_ for token in doc]

In [20]:
def pipeline(tweet):
    new_tweet = tokenizer_tweet(tweet)
    new_tweet = remove_stopwords(new_tweet)
    new_tweet = lower_tweet(new_tweet)
    new_tweet = lemmatize_tweet(new_tweet)
    
    return new_tweet

In [26]:
pipeline(sample)

['go',
 'finish',
 'borderland',
 '2',
 'today',
 'I',
 'get',
 'new',
 'event',
 'set',
 'look',
 'forward',
 'good',
 'stream',
 'start',
 '20',
 'hour']

In [99]:
tweet_preprocess = tweets.apply(lambda x: pipeline(x))

# Word2vec

In [28]:
from gensim.models import Word2Vec
import os

In [29]:
if os.path.isfile("twweets_model_100.model"):
    w2v = Word2Vec.load("twweets_model_100.model")
else:
    w2v = Word2Vec(sentences=tweet_preprocess.values.tolist(),vector_size=100,min_count=1,window=5)
    w2v.train(tweet_preprocess.values.tolist(),total_examples=5,epochs=10)
    w2v.save("twweets_model_100.model")

In [40]:
weights = w2v.wv.vectors
vocab = w2v.wv.index_to_key

In [32]:
length_vocab = len(vocab)

In [100]:
qt_tokens = tweet_preprocess.apply(lambda x: len(x))

In [103]:
MAX_TOKENS = max(qt_tokens)

## Modelagem

In [66]:
from tensorflow.keras.layers import TextVectorization, LSTM, Dense, Embedding, Activation, Flatten, Input
from tensorflow.keras import Sequential, Model
from tensorflow.ragged import constant
from sklearn.model_selection import train_test_split

In [None]:
tweets_train, tweet_test, sentiment_train, sentiment_test = train_test_split(tweet_preprocess,sentiment,test_size=0.2)

In [115]:
t2v = TextVectorization(max_tokens=None, standardize=None, split=None, output_sequence_length=MAX_TOKENS+1, vocabulary=vocab)

In [54]:
X_train = constant(tweets_train.values.tolist())
X_test = constant(tweet_test.values.tolist())

In [57]:
y_train = sentiment_train.replace({"Negative":0,"Positive":4,"Neutral":3, "Irrelevant":2}).values
y_test = sentiment_test.replace({"Negative":0,"Positive":4,"Neutral":3, "Irrelevant":2}).values

In [125]:
X_train_vec = t2v(X_train)

In [126]:
X_train_vec

<tf.Tensor: shape=(59196, 199), dtype=int64, numpy=
array([[ 304,  220, 4779, ...,    0,    0,    0],
       [ 124,   14,    0, ...,    0,    0,    0],
       [2521, 1284,  739, ...,    0,    0,    0],
       ...,
       [   2, 2629,  193, ...,    0,    0,    0],
       [4309,   85, 1936, ...,    0,    0,    0],
       [   2,   10,   29, ...,    0,    0,    0]])>

In [127]:
"""model = Sequential()
model.add(Input(shape=X_train_vec.shape[1:]))
model.add(Embedding(weights=[weights],input_dim=weights.shape[0],input_length=MAX_TOKENS+1,output_dim=weights.shape[1]))
model.add(LSTM(50,return_sequences=True))
model.add(Flatten())
model.add(Dense(50))
model.add(Dense(4))
model.add(Activation(activation='softmax'))"""



input_model = Input(shape=X_train_vec.shape[1:])
tensor = Embedding(weights=[weights],input_length=MAX_TOKENS+1,input_dim=weights.shape[0],output_dim=weights.shape[1])(input_model)
tensor = LSTM(50,return_sequences=True)(tensor)
vec = Flatten()(tensor)
vec = Dense(50)(vec)
vec = Dense(4)(vec)
output_model = Activation(activation='softmax')(vec)

model = Model(input_model,output_model)

model.summary()

Model: "model_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_19 (InputLayer)       [(None, 199)]             0         
                                                                 
 embedding_16 (Embedding)    (None, 199, 100)          2726400   
                                                                 
 lstm_14 (LSTM)              (None, 199, 50)           30200     
                                                                 
 flatten_7 (Flatten)         (None, 9950)              0         
                                                                 
 dense_29 (Dense)            (None, 50)                497550    
                                                                 
 dense_30 (Dense)            (None, 4)                 204       
                                                                 
 activation_14 (Activation)  (None, 4)                 0  

In [128]:
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['sparse_categorical_accuracy'])

In [130]:
history = model.fit(X_train_vec,y_train,
                    batch_size=32,
                    epochs=10,
                    validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10

KeyboardInterrupt: 