In [137]:
import os
import pandas as pd

tweets = []
sentiments = []

# Get data
train_file = '/home/gkc/ProjectData/tweet-sentiment-extraction/train.csv'
test_file = '/home/gkc/ProjectData/tweet-sentiment-extraction/test.csv'

train_df = pd.read_csv(train_file).astype(str)
test_df = pd.read_csv(test_file).astype(str)

In [138]:
train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [139]:
test_df.head()

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


In [183]:
import numpy as np

# Separate and format data to feed into model
train_data = train_df['selected_text'].to_numpy()
train_labels = pd.get_dummies(train_df['sentiment'])
test_data = train_df['text'].to_numpy()
test_labels = pd.get_dummies(train_df['sentiment'])

In [184]:
test_labels.head()

Unnamed: 0,negative,neutral,positive
0,0,1,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0


In [185]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

# Establish hyperparameters
vocab_size = 10000
embedding_dim = 16
max_length = 120
trunc_type='post'
oov_tok = "<OOV>"

In [186]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_data)
word_index = tokenizer.word_index
total_words = len(tokenizer.word_index) + 1

In [187]:
total_words

17832

In [188]:
word_index

{'<OOV>': 1,
 'i': 2,
 'to': 3,
 'the': 4,
 'a': 5,
 'you': 6,
 'it': 7,
 'my': 8,
 'and': 9,
 'is': 10,
 's': 11,
 'in': 12,
 't': 13,
 'for': 14,
 'of': 15,
 'me': 16,
 'that': 17,
 'on': 18,
 'so': 19,
 'have': 20,
 'but': 21,
 'm': 22,
 'good': 23,
 'not': 24,
 'just': 25,
 'be': 26,
 'day': 27,
 'with': 28,
 'at': 29,
 'was': 30,
 'love': 31,
 'can': 32,
 'happy': 33,
 'no': 34,
 'all': 35,
 'this': 36,
 'up': 37,
 'now': 38,
 'out': 39,
 'get': 40,
 'like': 41,
 'are': 42,
 'go': 43,
 'do': 44,
 'work': 45,
 'too': 46,
 'going': 47,
 'your': 48,
 'today': 49,
 'lol': 50,
 'what': 51,
 'got': 52,
 'don': 53,
 'we': 54,
 'one': 55,
 'time': 56,
 'thanks': 57,
 'u': 58,
 'miss': 59,
 'really': 60,
 'will': 61,
 'back': 62,
 'know': 63,
 'from': 64,
 'im': 65,
 'there': 66,
 'great': 67,
 'fun': 68,
 'see': 69,
 'its': 70,
 'am': 71,
 'sad': 72,
 'sorry': 73,
 'if': 74,
 'some': 75,
 'well': 76,
 'home': 77,
 'want': 78,
 'about': 79,
 'they': 80,
 'hope': 81,
 'had': 82,
 'bad': 83,

In [189]:
# Convert train and test data to sequences
train_sequences = tokenizer.texts_to_sequences(train_data)
train_padded = pad_sequences(sequences,maxlen=max_length, truncating=trunc_type)
test_sequences = tokenizer.texts_to_sequences(test_data)
test_padded = pad_sequences(sequences,maxlen=max_length, truncating=trunc_type)

In [190]:
train_padded

array([[   0,    0,    0, ...,    2,  158,   47],
       [   0,    0,    0, ...,    0,  420,   72],
       [   0,    0,    0, ...,    0, 7032,   16],
       ...,
       [   0,    0,    0, ...,  396,   15,    6],
       [   0,    0,    0, ...,   30,  578,    7],
       [   0,    0,    0, ..., 2512,  210,  692]], dtype=int32)

In [191]:
# Build Model
# temporary simplified model
# > add dropout, more layers, etc

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32)),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
])

model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                9600      
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 390       
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 21        
Total params: 170,011
Trainable params: 170,011
Non-trainable params: 0
_________________________________________________________________


In [193]:
# Train model
# temporary placeholder
# > train with gpu optimization for many epochs

num_epochs = 1
history = model.fit(train_padded, train_labels, epochs=num_epochs, validation_data=(test_padded, test_labels))



In [None]:
'''
TO DO:

- Improve model
- Improve training
- Reverse Word Index
- Predict tweet method
- Visualizations such as
    - most common words
    - word embeddings, clustering
    - model training graphs

'''