# Tweet dataset

https://www.kaggle.com/maxjon/complete-tweet-sentiment-extraction-data

In [118]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Flatten, Concatenate, Input, LSTM
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

import pandas as pd

In [119]:
tweet_data = pd.read_csv('./data/tweet_dataset.csv')
tweet_data.head()

Unnamed: 0,textID,sentiment,author,text,old_text,aux_id,new_sentiment,selected_text
0,1956967341,empty,xoshayzers,i know i was listenin to bad habit earlier a...,@tiffanylue i know i was listenin to bad habi...,p1000000000,,
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...,Layin n bed with a headache ughhhh...waitin o...,c811396dc2,negative,headache
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...,Funeral ceremony...gloomy friday...,9063631ab1,negative,gloomy
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!,wants to hang out with friends SOON!,2a815f151d,positive,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,We want to trade with someone who has Houston...,@dannycastillo We want to trade with someone w...,82565a56d3,neutral,We want to trade with someone who has Houston ...


In [120]:
tweet_data = tweet_data.loc[:, ['text', 'sentiment']]
tweet_data.head()

Unnamed: 0,text,sentiment
0,i know i was listenin to bad habit earlier a...,empty
1,Layin n bed with a headache ughhhh...waitin o...,sadness
2,Funeral ceremony...gloomy friday...,sadness
3,wants to hang out with friends SOON!,enthusiasm
4,We want to trade with someone who has Houston...,neutral


In [121]:
pd.factorize(tweet_data['sentiment'])

(array([0, 1, 1, ..., 6, 9, 6], dtype=int64),
 Index(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
        'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
       dtype='object'))

In [122]:
category_list = pd.factorize(tweet_data['sentiment'])[1]
category_list

Index(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype='object')

In [123]:
tweet_data['sentiment'] = pd.factorize(tweet_data['sentiment'])[0]
tweet_data.head()

Unnamed: 0,text,sentiment
0,i know i was listenin to bad habit earlier a...,0
1,Layin n bed with a headache ughhhh...waitin o...,1
2,Funeral ceremony...gloomy friday...,1
3,wants to hang out with friends SOON!,2
4,We want to trade with someone who has Houston...,3


In [124]:
tweet_data['text'] = tweet_data['text'].str.replace("[^\w]", " ")
# tweet_data['text'] = tweet_data['text'].str.replace("[^\(0-9)]", " ")

In [125]:
tweet_data = tweet_data.dropna()

In [126]:
tweet_train, tweet_test, y_train, y_test = train_test_split(tweet_data['text'], tweet_data['sentiment'], random_state=123)

In [127]:
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

len(y_train[0]), len(y_test[0])

(13, 13)

In [128]:
stopwords = ['a', 'an']

X_train = []
for stc in tweet_train:
    token = []
    # print(stc)
    words = stc.split()
    for word in words:
        if word not in stopwords:
            token.append(word)
    X_train.append(token)


X_test = []
for stc in tweet_test:
    token = []
    for word in stc.split():
        if word not in stopwords:
            token.append(word)
    X_test.append(token)

In [129]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(25000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

print(len(tokenizer.word_index))

low_count = 0
for word, word_count in tokenizer.word_counts.items():
    if word_count == 1:
        low_count += 1
print(low_count)

27921
16939


In [130]:
max_length = 0
for data in X_train:
    if max_length < len(data):
        max_length = len(data)
print(max_length)

40


In [131]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = 40
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

In [132]:
model = Sequential()
model.add(Embedding(27921, 32))
model.add(LSTM(32))
model.add(Dense(13, activation='relu'))

In [133]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3)

Train on 29950 samples, validate on 9984 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1c7c5ee9a48>

In [134]:
sentence = "I can`t sleep"

token_stc = sentence.split()
encode_stc = tokenizer.texts_to_sequences([token_stc])
pad_stc = pad_sequences(encode_stc)

score = model.predict(pad_stc)
print(score)

[[0.         0.04797543 0.01669496 0.06538234 0.13466057 0.03877593
  0.07940554 0.03044652 0.         0.03297355 0.         0.03044399
  0.        ]]


In [135]:
print(category_list[score.argmax()], score[0, score.argmax()])

worry 0.13466057
