In [1]:
import os
import pandas as pd

# Empty accumulators; no cell visible in this notebook populates them, but
# they are kept so that anything relying on these names still finds them.
tweets = []
sentiments = []

# Get data
# The dataset directory defaults to the original location but can be pointed
# elsewhere through the TWEET_DATA_DIR environment variable, so the notebook
# is runnable on machines that do not share this home-directory layout.
data_dir = os.environ.get('TWEET_DATA_DIR', '/home/gkc/ProjectData/tweet-sentiment-extraction')
train_file = os.path.join(data_dir, 'train.csv')
test_file = os.path.join(data_dir, 'test.csv')

# Every column is cast to str so the text columns are uniformly strings.
# NOTE: astype(str) turns missing values into the literal string 'nan', so
# downstream null checks will report no missing data.
train_df = pd.read_csv(train_file).astype(str)
test_df = pd.read_csv(test_file).astype(str)

In [2]:
# Preview the first rows of the training frame (textID, text, selected_text, sentiment).
train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [3]:
# Preview the first rows of the test frame; unlike the training frame it has no selected_text column.
test_df.head()

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


In [4]:
# Summarise columns, dtypes and non-null counts. Because the frame was cast
# to str when it was loaded, any missing value is now the string 'nan' and is
# counted as non-null here.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27481 non-null  object
 1   text           27481 non-null  object
 2   selected_text  27481 non-null  object
 3   sentiment      27481 non-null  object
dtypes: object(4)
memory usage: 858.9+ KB


In [5]:
import numpy as np

# Separate and format data to feed into model
# NOTE(review): the model is trained on `selected_text` (the extracted phrase)
# but evaluated on the full tweet `text` - confirm this asymmetry is intended.
train_data = train_df['selected_text'].to_numpy()
train_labels = pd.get_dummies(train_df['sentiment'])
test_data = test_df['text'].to_numpy()
# One-hot encode the test labels and align them to the training columns, so
# the class order (and the number of columns) is identical for both splits
# even if a sentiment class happens to be absent from the test file.
test_labels = pd.get_dummies(test_df['sentiment']).reindex(
    columns=train_labels.columns, fill_value=0
)

In [6]:
# Build the class-index -> sentiment-name lookup used when decoding predictions.
# The categories come out in the same order as the one-hot label columns.
train_cats = train_df["sentiment"].astype(dtype='category')
cat_index = train_cats.cat.categories

cat_index

Index(['negative', 'neutral', 'positive'], dtype='object')

In [7]:
# Preview the one-hot encoded test labels (one column per sentiment class).
test_labels.head()

Unnamed: 0,negative,neutral,positive
0,0,1,0
1,0,0,1
2,1,0,0
3,0,0,1
4,0,0,1


In [8]:
# Preview the one-hot encoded training labels (one column per sentiment class).
train_labels.head()

Unnamed: 0,negative,neutral,positive
0,0,1,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0


In [9]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

# Establish hyperparameters

# Tokenizer settings
vocab_size = 10000       # passed as the Tokenizer's num_words and the Embedding's input size
oov_tok = "<OOV>"        # placeholder token for out-of-vocabulary words

# Sequence shaping
max_length = 120         # every sequence is padded/truncated to this many tokens
trunc_type = 'post'      # sequences that are too long are cut at the end

# Model settings
embedding_dim = 10       # width of each learned word vector

In [10]:
# Fit the tokenizer on the training text only.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_data)

# word -> integer id mapping learned from the training text; the extra 1
# accounts for index 0, which the tokenizer never assigns to a word.
word_index = tokenizer.word_index
total_words = 1 + len(word_index)

In [11]:
# Number of entries in the fitted word index plus one. This counts every word
# seen during fitting, which can exceed the `vocab_size` cap given to the tokenizer.
total_words

17832

In [12]:
# Encode both splits as integer sequences with the tokenizer fitted on the training text...
train_sequences = tokenizer.texts_to_sequences(train_data)
test_sequences = tokenizer.texts_to_sequences(test_data)

# ...then bring every sequence to exactly `max_length` tokens.
train_padded = pad_sequences(
    train_sequences,
    maxlen=max_length,
    truncating=trunc_type,
)
test_padded = pad_sequences(
    test_sequences,
    maxlen=max_length,
    truncating=trunc_type,
)

In [13]:
# Inspect the padded training matrix: zeros on the left are padding, token ids sit at the end of each row.
train_padded

array([[   0,    0,    0, ...,    2,  158,   47],
       [   0,    0,    0, ...,    0,  420,   72],
       [   0,    0,    0, ...,    0, 7032,   16],
       ...,
       [   0,    0,    0, ...,  396,   15,    6],
       [   0,    0,    0, ...,   30,  578,    7],
       [   0,    0,    0, ..., 2512,  210,  692]], dtype=int32)

In [14]:
# Build Model
# Embedding -> two stacked bidirectional GRUs -> 3-way softmax over the sentiment classes.
model = tf.keras.Sequential()
model.add(
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length)
)
# The first recurrent layer returns the full sequence so the second one can consume it.
model.add(
    tf.keras.layers.Bidirectional(
        tf.keras.layers.GRU(32, dropout=0.35, recurrent_dropout=0.35, return_sequences=True)
    )
)
# The second recurrent layer returns only its final state.
model.add(
    tf.keras.layers.Bidirectional(
        tf.keras.layers.GRU(32, dropout=0.35, recurrent_dropout=0.35)
    )
)
model.add(tf.keras.layers.Dense(3, activation='softmax'))

# Labels are one-hot encoded, hence categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 10)           100000    
_________________________________________________________________
bidirectional (Bidirectional (None, 120, 64)           8448      
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                18816     
_________________________________________________________________
dense (Dense)                (None, 3)                 195       
Total params: 127,459
Trainable params: 127,459
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train model
# The test split doubles as the validation set, so accuracy and loss on it
# are reported after every epoch.
num_epochs = 3
history = model.fit(
    x=train_padded,
    y=train_labels,
    epochs=num_epochs,
    validation_data=(test_padded, test_labels),
)
# Uncomment to persist the trained model:
#model.save('tweet_model.h5')

Epoch 1/3
Epoch 2/3

In [None]:
import matplotlib.pyplot as plt

# Plot accuracy and loss per epoch to gain a better understanding of the
# model's performance. Both curves are drawn for the training and the
# validation data so that over- or under-fitting is visible.

acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))

# Figure 1: accuracy
plt.plot(epochs, acc, 'r', label='Training accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc=0)

# Figure 2: loss. The loss series used to be extracted but never drawn, which
# left this second figure empty.
plt.figure()
plt.plot(epochs, loss, 'r', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(loc=0)

plt.show()

In [None]:
# Predict a fake tweet
def predict_tweet(tweet):
    """Print the model's sentiment prediction for a single tweet.

    The tweet is encoded with the notebook's fitted ``tokenizer``, padded to
    ``max_length`` exactly like the training data, and passed through
    ``model``. Two lines are printed: the raw class probabilities, then the
    predicted sentiment name looked up in ``cat_index``.

    Args:
        tweet: The tweet text to classify.

    Returns:
        None. The results are printed, not returned.
    """
    token_sequence = tokenizer.texts_to_sequences([tweet])[0]
    token_padded = pad_sequences([token_sequence], maxlen=max_length, truncating=trunc_type)
    predicted = model.predict(token_padded)
    # Take the most probable class from the probabilities already computed.
    # `Sequential.predict_classes` is gone from recent TensorFlow releases and
    # also ran the model a second time.
    predicted_class = predicted.argmax(axis=-1)
    predicted_cat = str(cat_index[predicted_class].tolist())

    print(predicted)
    print(predicted_cat)
    
# Smoke-test the predictor on a short made-up tweet.
fake_tweet = "i love pizza"
predict_tweet(fake_tweet)

In [None]:
# Visualize embeddings
# Export the learned word vectors and their labels in the layout expected by
# the TensorBoard Embedding Projector.
from tensorboard.plugins import projector

# Set up a logs directory, so Tensorboard knows where to look for files.
# This is the same directory the %tensorboard cell is pointed at.
log_dir = '/logs/tweet_model/'
os.makedirs(log_dir, exist_ok=True)

# Embedding matrix of the first layer: one row per token id.
embedding_matrix = model.layers[0].get_weights()[0]

# Save labels separately, one per line, in token-id order. Row 0 is the
# padding id and has no word, so labels start at id 1 to stay aligned with
# the weights saved below.
with open(os.path.join(log_dir, 'metadata.tsv'), "w", encoding="utf-8") as f:
    for token_id in range(1, embedding_matrix.shape[0]):
        # Ids beyond the fitted vocabulary (if any) get a placeholder label
        # so the number of labels always matches the number of vectors.
        label = tokenizer.index_word.get(token_id, "unknown #{}".format(token_id))
        f.write("{}\n".format(label))

# Save the weights we want to analyse as a variable, dropping the padding row
# (id 0), which is not in the metadata.
weights = tf.Variable(embedding_matrix[1:])
# Create a checkpoint from embedding, the filename and key are
# name of the tensor.
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(os.path.join(log_dir, "embedding.ckpt"))

# Set up config
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
embedding.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(log_dir, config)


In [None]:
%tensorboard --logdir /logs/tweet_model/