## Testing Dataset

In [18]:
import pandas as pd
# Load the airline tweets from a CSV file in the working directory into a DataFrame.
Tweet = pd.read_csv("Airlines.csv")

In [19]:
# Preview the first rows to check which columns were loaded.
Tweet.head()

Unnamed: 0,twsentiment,text,tweet_coord,tweet_created,tweet_location,user_timezone,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,neutral,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada),,,,,,
1,positive,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada),,,,,,
2,neutral,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada),,,,,,
3,negative,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada),,,,,,
4,negative,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada),,,,,,


In [20]:
import re
def tweet_to_words(raw_tweet):
    """
    Normalise a raw tweet into a lowercase, space-separated string of words.

    Every character other than an ASCII letter or ``@`` is replaced by a
    space, the text is lower-cased and split on whitespace, and every token
    that starts with ``@`` (a user mention) is dropped.  URLs are not
    removed: they are broken into letter-only fragments such as ``http``
    and ``co``.  An ``@`` in the middle of a token is kept as is.

    :param raw_tweet: the tweet text; ``None`` or a float ``NaN`` (what
        pandas stores for a missing cell) is treated as an empty tweet
    :return: the cleaned tweet as a single string, possibly empty
    """
    # Missing values would otherwise make re.sub raise a TypeError.
    is_nan = isinstance(raw_tweet, float) and raw_tweet != raw_tweet
    if raw_tweet is None or is_nan:
        return ""
    letters_only = re.sub("[^a-zA-Z@]", " ", raw_tweet)
    words = letters_only.lower().split()
    # Drop user mentions, i.e. tokens that begin with '@'.
    meaningful_words = [w for w in words if not w.startswith("@")]
    return " ".join(meaningful_words)

In [21]:
# Clean every tweet and keep the result next to the raw text in a new column
Tweet['clean_tweet'] = Tweet['text'].apply(tweet_to_words)

In [51]:
# Inspect the first cleaned tweets.
Tweet['clean_tweet'].head()

0                                            what said
1    plus you ve added commercials to the experienc...
2    i didn t today must mean i need to take anothe...
3    it s really aggressive to blast obnoxious ente...
4             and it s a really big bad thing about it
Name: clean_tweet, dtype: object

In [25]:
# Encode the sentiment as an integer class id:
# 'negative' -> 0, 'positive' -> 1, any other value (e.g. 'neutral') -> 2
Tweet['sentiment'] = Tweet['twsentiment'].apply(
    lambda label: {'negative': 0, 'positive': 1}.get(label, 2)
)

In [28]:
Tweet['sentiment'].head() # integer class labels: 0 = negative, 1 = positive, 2 = anything else (e.g. neutral)

0    2
1    1
2    2
3    0
4    0
Name: sentiment, dtype: int64

In [29]:
# Concatenate every cleaned tweet into one corpus string, then tokenise it on whitespace
all_text = ' '.join(Tweet['clean_tweet'].tolist())
words = all_text.split()

In [33]:
# Size of the joined corpus in characters.
len(all_text)

1250702

In [36]:
len(words) # total number of whitespace-separated tokens in the corpus

241782

In [38]:
from collections import Counter
# Count how many times each word occurs in the corpus
counts = Counter(words)

In [42]:
# Display the word -> occurrence-count mapping.
counts

Counter({'what': 803,
         'said': 178,
         'plus': 56,
         'you': 4345,
         've': 472,
         'added': 20,
         'commercials': 6,
         'to': 8454,
         'the': 5956,
         'experience': 198,
         'tacky': 1,
         'i': 6602,
         'didn': 195,
         't': 3289,
         'today': 419,
         'must': 36,
         'mean': 55,
         'need': 542,
         'take': 263,
         'another': 266,
         'trip': 206,
         'it': 2359,
         's': 1564,
         'really': 297,
         'aggressive': 2,
         'blast': 2,
         'obnoxious': 2,
         'entertainment': 24,
         'in': 2526,
         'your': 1712,
         'guests': 4,
         'faces': 3,
         'amp': 638,
         'they': 741,
         'have': 1603,
         'little': 68,
         'recourse': 5,
         'and': 3696,
         'a': 4542,
         'big': 72,
         'bad': 184,
         'thing': 69,
         'about': 505,
         'seriously': 79,
         'wou

In [43]:
numwords = 200  # Limit the number of words to use
# Rank the words from most to least frequent (the sort is stable, so ties keep
# the Counter's own order) and keep only the top `numwords`.
by_frequency = sorted(counts.items(), key=lambda item: item[1], reverse=True)
vocab = [pair[0] for pair in by_frequency[:numwords]]
# Ids start at 1, which leaves 0 unused by any word.
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))

In [45]:
# Display the retained vocabulary, most frequent word first.
vocab

['to',
 'i',
 'the',
 'a',
 'you',
 'for',
 'flight',
 'on',
 'and',
 't',
 'my',
 'is',
 'in',
 'it',
 'of',
 'me',
 'your',
 'that',
 'can',
 'have',
 'was',
 's',
 'not',
 'with',
 'at',
 'no',
 'this',
 'get',
 'we',
 'but',
 'from',
 'be',
 'co',
 'http',
 'are',
 'thanks',
 'cancelled',
 'now',
 'an',
 'just',
 'service',
 'do',
 'so',
 'been',
 'help',
 'what',
 'time',
 'm',
 'will',
 'they',
 'customer',
 'out',
 'up',
 'us',
 'our',
 'hours',
 'all',
 'when',
 'amp',
 'how',
 'flights',
 'plane',
 'there',
 'hold',
 'if',
 'thank',
 'why',
 'please',
 'still',
 'one',
 'would',
 'am',
 'need',
 'delayed',
 'back',
 'gate',
 'about',
 'call',
 'flightled',
 'had',
 'or',
 'bag',
 'as',
 'hour',
 've',
 'has',
 'after',
 'don',
 'got',
 'any',
 'like',
 'phone',
 'today',
 'late',
 'airline',
 'over',
 'more',
 'again',
 'by',
 'guys',
 'fly',
 're',
 'waiting',
 'know',
 'airport',
 'way',
 'u',
 'trying',
 'day',
 'great',
 'only',
 'did',
 'going',
 'then',
 'should',
 'wait

In [56]:
# Display the word -> integer id mapping (ids start at 1).
vocab_to_int

{'a': 4,
 'aa': 133,
 'about': 77,
 'after': 87,
 'again': 98,
 'agent': 150,
 'airline': 95,
 'airport': 105,
 'all': 57,
 'am': 72,
 'amp': 59,
 'an': 39,
 'and': 9,
 'another': 144,
 'any': 90,
 'are': 35,
 'as': 83,
 'at': 25,
 'back': 75,
 'bag': 82,
 'baggage': 178,
 'bags': 142,
 'be': 32,
 'because': 117,
 'been': 44,
 'before': 191,
 'being': 193,
 'but': 30,
 'by': 99,
 'call': 78,
 'can': 19,
 'cancelled': 37,
 'change': 118,
 'check': 129,
 'co': 33,
 'could': 157,
 'crew': 177,
 'customer': 51,
 'd': 194,
 'day': 109,
 'days': 174,
 'delay': 128,
 'delayed': 74,
 'did': 112,
 'didn': 197,
 'dm': 138,
 'do': 42,
 'don': 88,
 'due': 165,
 'email': 173,
 'even': 132,
 'ever': 163,
 'experience': 195,
 'first': 148,
 'flight': 7,
 'flighted': 182,
 'flightled': 79,
 'flights': 61,
 'fly': 101,
 'flying': 123,
 'for': 6,
 'from': 31,
 'gate': 76,
 'get': 28,
 'getting': 166,
 'go': 124,
 'going': 113,
 'good': 131,
 'got': 89,
 'great': 110,
 'guys': 100,
 'had': 80,
 'has': 86

In [52]:
# Encode each cleaned tweet as the list of its words' integer ids;
# words outside the vocabulary are silently skipped.
tweet_ints = [
    [vocab_to_int[token] for token in cleaned.split() if token in vocab_to_int]
    for cleaned in Tweet['clean_tweet']
]

In [54]:
# Display the integer-encoded tweets (one list of word ids per tweet).
tweet_ints

[[46],
 [5, 85, 1, 3, 195],
 [2, 197, 10, 93, 2, 73, 1, 145, 144, 187],
 [14, 22, 127, 1, 13, 17, 59, 50, 20],
 [9, 14, 22, 4, 127, 77, 14],
 [71, 4, 7, 6, 189, 18, 197, 10, 20, 27, 14, 22, 127, 3, 111, 77, 123],
 [161, 47, 2, 101, 27, 10, 124],
 [127, 4, 6, 63, 10, 33],
 [2, 197, 10, 30, 38, 2, 42, 194],
 [14, 21, 9, 39, 84, 5, 102, 168, 131, 1, 16],
 [112, 5, 104, 18, 12, 3, 15],
 [2],
 [27, 12, 4, 110, 77, 11, 187, 1, 59],
 [2, 48, 123, 17, 98, 107, 145, 57, 3, 31, 171, 34, 10, 33],
 [36],
 [12, 69],
 [43, 6, 11, 148, 7, 1, 2, 85, 30, 110, 77],
 [2, 31, 1, 126, 9, 10, 13, 11, 141, 165, 1, 183, 8, 15, 16, 45],
 [2, 123],
 [5, 104, 46, 71, 32, 68, 2, 149, 1, 101, 24, 111, 5],
 [67, 35, 17, 148, 13, 96, 97, 164, 188, 58, 57, 189, 35, 1],
 [2, 162, 27, 34, 10, 33],
 [2, 162, 3, 5, 35, 4, 131],
 [49, 5, 32],
 [5, 100, 53, 11, 2, 24, 11, 9, 5, 100, 11, 141, 2, 149],
 [2, 9, 14, 22, 44, 9, 24, 26, 190],
 [46, 25, 8, 43, 2, 104, 2, 10, 32, 6, 172, 125],
 [42, 5, 16, 88, 10, 29, 151, 32, 153

In [61]:
import numpy as np
# Copy the integer sentiment labels into a NumPy array (one entry per tweet)
labels = np.array(Tweet['sentiment'])
labels

array([2, 1, 2, ..., 2, 0, 2], dtype=int64)

In [62]:
# Histogram of encoded-tweet lengths: maps a length to the number of tweets with that length.
# A length of zero means none of the tweet's words are in the vocabulary.
tweet_len = Counter(map(len, tweet_ints))
print("Zero-length reviews: {}".format(tweet_len[0]))
# Iterating a Counter yields its keys, so max() gives the longest length observed.
print("Maximum tweet length: {}".format(max(tweet_len)))

Zero-length reviews: 148
Maximum tweet length: 29


In [63]:
# Drop every tweet that encoded to an empty sequence, applying the same positions
# to the labels and the DataFrame so that all three stay aligned.
tweet_idx = [pos for pos, encoded in enumerate(tweet_ints) if encoded]  # positions of non-empty tweets
labels = labels[tweet_idx]
Tweet = Tweet.iloc[tweet_idx]
tweet_ints = [tweet_ints[pos] for pos in tweet_idx]

In [65]:
# Build the model input: one fixed-width row per tweet, left-padded with zeros
# so that the word ids sit at the end of the row.
seq_len = max(tweet_len)  # longest encoded tweet (largest key of the length histogram)
features = np.zeros((len(tweet_ints), seq_len), dtype=int)
for i, row in enumerate(tweet_ints):
    kept = row[:seq_len]  # keep at most seq_len leading ids
    if not kept:
        # An empty row stays all zeros; slicing with -0 would select the whole
        # row and raise a broadcast error.
        continue
    features[i, seq_len - len(kept):] = kept

In [68]:
features # the model inputs: each row is left-padded with zeros and ends with the ids of the matching tweet_ints entry

array([[  0,   0,   0, ...,   0,   0,  46],
       [  0,   0,   0, ...,   1,   3, 195],
       [  0,   0,   0, ..., 145, 144, 187],
       ...,
       [  0,   0,   0, ...,   0,  68,   1],
       [  0,   0,   0, ...,  19, 122,  11],
       [  0,   0,   0, ...,   3, 172,   7]])

### Split features into train, validation, and test sets

In [71]:
# Fraction of the samples that goes to the training set.
split_frac = 0.8
# NOTE(review): the split is positional and nothing is shuffled beforehand, so
# if the CSV rows are ordered in any way the three sets may have different
# class distributions -- consider shuffling with a fixed seed.
split_idx = int(len(features) * split_frac)
train_x, val_x = features[:split_idx], features[split_idx:]
train_y, val_y = labels[:split_idx], labels[split_idx:]

# Halve the held-out remainder: the first half validates, the second half tests.
test_idx = int(len(val_x) * 0.5)
val_x, test_x = val_x[:test_idx], val_x[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

# Shapes of the feature matrices.
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

# Shapes of the matching label vectors.
print("Train set: \t\t{}".format(train_y.shape),
      "\nValidation set: \t{}".format(val_y.shape),
      "\nTest set: \t\t{}".format(test_y.shape))

			Feature Shapes:
Train set: 		(11593, 29) 
Validation set: 	(1449, 29) 
Test set: 		(1450, 29)
Train set: 		(11593,) 
Validation set: 	(1449,) 
Test set: 		(1450,)


## Model Building

In [72]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding
from keras.layers import LSTM
from keras.optimizers import RMSprop, SGD
from keras.utils import np_utils

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [75]:
 # Model
# Hyper-parameters
drop = 0.0  # recurrent dropout rate
nlayers = 1  # number of stacked recurrent layers, must be >= 1
RNN = LSTM  # recurrent layer class (GRU is the noted alternative)
nclasses = 3 # positive, neutral and negative
impl = 2  # forwarded as the recurrent layers' `implementation` argument

neurons = 64  # units per recurrent layer
embedding = 20  # size of the embedding vectors

model = Sequential()
# numwords + 1 possible input ids: word ids start at 1 and 0 is the padding value
model.add(Embedding(numwords + 1, embedding, input_length=seq_len))

# Stack the recurrent layers: every layer except the last one returns the full
# sequence so that the next recurrent layer can consume it.
for depth in range(nlayers):
    model.add(RNN(neurons, implementation=impl, recurrent_dropout=drop,
                  return_sequences=depth < nlayers - 1))

# Classification head: one softmax output per class.
model.add(Dense(nclasses))
model.add(Activation('softmax'))

## Training

In [77]:
# Optimiser: SGD with momentum.
learning_rate = 0.01
optimizer = SGD(lr=learning_rate, momentum=0.95)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

epochs = 50
batch_size = 100
verbose = 1

# One-hot encode the integer labels, as categorical_crossentropy expects.
# The width comes from nclasses so it always matches the model's output layer.
train_y_c = np_utils.to_categorical(train_y, nclasses)
val_y_c = np_utils.to_categorical(val_y, nclasses)

# Train, reporting loss and accuracy on the validation set after every epoch.
model.fit(train_x, train_y_c,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(val_x, val_y_c),
          verbose=verbose)

Train on 11593 samples, validate on 1449 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x26f7b66f9b0>

## Result

In [78]:
from sklearn.metrics import confusion_matrix, classification_report

# One-hot encode the test labels with the same width as the model's output layer.
test_y_c = np_utils.to_categorical(test_y, nclasses)
# evaluate() returns the loss followed by the compiled metrics (accuracy here).
score, acc = model.evaluate(test_x, test_y_c,
                            batch_size=batch_size,
                            verbose=verbose)
print()
print('Test ACC=', acc)

# Predicted class = index of the largest softmax output. This is what
# predict_classes computed for a multi-class model; that method was removed
# from later Keras releases, while predict + argmax works on every version.
test_pred = np.argmax(model.predict(test_x, verbose=verbose), axis=-1)

print()
print('Confusion Matrix')
print('-'*20)
# Rows are the true classes, columns the predicted ones.
print(confusion_matrix(test_y, test_pred))
print()
print('Classification Report')
print('-'*40)
print(classification_report(test_y, test_pred))
print()
print("Ending")


Test ACC= 0.8034482742178028

Confusion Matrix
--------------------
[[964  42 114]
 [ 36  80  19]
 [ 55  19 121]]

Classification Report
----------------------------------------
             precision    recall  f1-score   support

          0       0.91      0.86      0.89      1120
          1       0.57      0.59      0.58       135
          2       0.48      0.62      0.54       195

avg / total       0.82      0.80      0.81      1450


Ending
