## Twitter Sentiment Analysis
1. This is a Twitter data Sentiment Analysis Program<br>
2. This code is implemented using Deep Learning methods .<br>
3. I used <font color=#068DA9>Trax</font>.library for implementation.<br>
4. Trax is an end-to-end deep learning library focusing on clear code and speed. It is  <font color=#068DA9>actively used and maintained in the <b>Google Brain team.</b> </font><br>
5. <a href="https://trax-ml.readthedocs.io/en/latest/notebooks/trax_intro.html">Trax Documetation</a>

In [None]:
import os
import trax
import trax.fastmath.numpy as np
from trax import layers as tl

# import Layer from the utils.py file

In [None]:
from utils import *

  getpass.getpass = self._save_getpass
[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import numpy as np
all_positive_tweets,all_negative_tweets=load_tweets()
print(f"The number of positive tweets: {len(all_positive_tweets)}")
print(f"The number of negative tweets: {len(all_negative_tweets)}")

val_pos=all_positive_tweets[4000:]
train_pos=all_positive_tweets[:4000]

val_neg=all_negative_tweets[4000:]
train_neg=all_negative_tweets[:4000]


train_x=train_pos+train_neg
val_x=val_pos+val_neg


train_y_train=np.append(np.ones((len(train_pos),1)),np.zeros((len(train_neg),1)))
val_y=np.append(np.ones((len(val_pos),1)),np.zeros((len(val_neg),1)))

print(f"length of train_x {len(train_x)}")
print(f"length of val_x {len(val_x)}")

The number of positive tweets: 5000
The number of negative tweets: 5000
length of train_x 8000
length of val_x 2000


In [None]:
print("original tweet at training position 0")
print(train_pos[0])

print("Tweet at training position 0 after processing:")
process_tweet(train_pos[0])

original tweet at training position 0
#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
Tweet at training position 0 after processing:


['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']

In [None]:
Vocab = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2}

for tweet in train_x:
  tokens=process_tweet(tweet)
  for token in tokens:
    if token not in Vocab:
      Vocab[token]=len(Vocab)
print("Total words in vocab are",len(Vocab))

  getpass.getpass = self._save_getpass


Total words in vocab are 9088


In [None]:
def tweet_to_tensor(tweet, vocab_dict, unk_token='__UNK__', verbose=False):
  word_l =process_tweet(tweet)
  if verbose:
        print("List of words from the processed tweet:")
        print(word_l)
  tensor_l = []

  unk_ID =vocab_dict.get(unk_token)
  if verbose:
    print(f"The unique integer ID for the unk_token is {unk_ID}")

  for words in word_l:
    word_ID=vocab_dict.get(words,unk_ID)
    tensor_l.append(word_ID)
  return tensor_l

  getpass.getpass = self._save_getpass


In [None]:
print("Actual tweet is\n", val_pos[0])
print("\nTensor of tweet:\n", tweet_to_tensor(val_pos[0], vocab_dict=Vocab))

Actual tweet is
 Bro:U wan cut hair anot,ur hair long Liao bo
Me:since ord liao,take it easy lor treat as save $ leave it longer :)
Bro:LOL Sibei xialan

Tensor of tweet:
 [1064, 136, 478, 2351, 744, 8148, 1122, 744, 53, 2, 2671, 790, 2, 2, 348, 600, 2, 3488, 1016, 596, 4558, 9, 1064, 157, 2, 2]


In [None]:
import random as rnd

  getpass.getpass = self._save_getpass


In [None]:
def data_generator(data_pos, data_neg, batch_size, loop, vocab_dict, shuffle=False):
    assert batch_size % 2 == 0
    n_to_take = batch_size // 2

    pos_index = 0
    neg_index = 0

    len_data_pos = len(data_pos)
    len_data_neg = len(data_neg)

    pos_index_lines = list(range(len_data_pos))
    neg_index_lines = list(range(len_data_neg))

    if shuffle:
        rnd.shuffle(pos_index_lines)
        rnd.shuffle(neg_index_lines)

    stop = False

    # Loop indefinitely
    while not stop:
        batch = []
        for i in range(n_to_take):
          if pos_index >= len_data_pos:
              if not loop:
                  stop = True;
                  break;
              pos_index = 0
              if shuffle:
                  rnd.shuffle(pos_index_lines)
          tweet = data_pos[pos_index_lines[pos_index]]
          tensor = tweet_to_tensor(tweet, vocab_dict)
          batch.append(tensor)
          pos_index = pos_index + 1

        for i in range(n_to_take):
          if neg_index>=len_data_neg:
            if not loop:
              stop=True
              break
            neg_index=0
            if shuffle:
              rnd.shuffle(neg_index_lines)
          tweet=data_neg[neg_index_lines[neg_index]]
          tensor=tweet_to_tensor(tweet,vocab_dict)
          batch.append(tensor)
          neg_index=neg_index+1

        if stop:
          break
        pos_index+=n_to_take
        neg_index+=n_to_take
        max_len = max([len(t) for t in batch])
        tensor_pad_l = []

        for tensor in batch:
          n_pad=max_len-len(tensor)
          pad_l=[0]*n_pad
          tensor_pad=tensor+pad_l
          tensor_pad_l.append(tensor_pad)

        inputs=np.array(tensor_pad_l)
        target_pos=[1]*n_to_take
        target_neg=[0]*n_to_take
        target_l=target_pos+target_neg
        targets=np.array(target_l)
        example_weights = np.ones_like(targets)
        yield inputs, targets, example_weights




In [None]:
rnd.seed(30)

# Create the training data generator
def train_generator(batch_size, shuffle = False):
    return data_generator(train_pos, train_neg, batch_size, True, Vocab, shuffle)

# Create the validation data generator
def val_generator(batch_size, shuffle = False):
    return data_generator(val_pos, val_neg, batch_size, True, Vocab, shuffle)

# Create the validation data generator
def test_generator(batch_size, shuffle = False):
    return data_generator(val_pos, val_neg, batch_size, False, Vocab, shuffle)

# Get a batch from the train_generator and inspect.
inputs, targets, example_weights = next(train_generator(4, shuffle=True))

# this will print a list of 4 tensors padded with zeros
print(f'Inputs: {inputs}')
print(f'Targets: {targets}')
print(f'Example Weights: {example_weights}')

Inputs: [[2005 4450 3200    9    0    0    0    0    0    0    0]
 [4953  566 2000 1453 5173 3498  141 3498  130  458    9]
 [3760  109  136  582 2929 3968    0    0    0    0    0]
 [ 249 3760    0    0    0    0    0    0    0    0    0]]
Targets: [1 1 0 0]
Example Weights: [1 1 1 1]


In [None]:

tmp_data_gen = train_generator(batch_size = 4)
tmp_inputs, tmp_targets, tmp_example_weights = next(tmp_data_gen)

print(f"The inputs shape is {tmp_inputs.shape}")
print(f"The targets shape is {tmp_targets.shape}")
print(f"The example weights shape is {tmp_example_weights.shape}")

for i,t in enumerate(tmp_inputs):
    print(f"input tensor: {t}; target {tmp_targets[i]}; example weights {tmp_example_weights[i]}")

The inputs shape is (4, 14)
The targets shape is (4,)
The example weights shape is (4,)
input tensor: [3 4 5 6 7 8 9 0 0 0 0 0 0 0]; target 1; example weights 1
input tensor: [10 11 12 13 14 15 16 17 18 19 20  9 21 22]; target 1; example weights 1
input tensor: [5736 2900 3760    0    0    0    0    0    0    0    0    0    0    0]; target 0; example weights 1
input tensor: [ 857  255 3651 5737  306 4457  566 1229 2766  327 1201 3760    0    0]; target 0; example weights 1


In [None]:
from trax import fastmath

# usinng the numpy module from trax
np = fastmath.numpy

# using the fastmath.random module from trax
random = fastmath.random

In [None]:
tmp_key = random.get_prng(seed=1)
print("The random seed generated by random.get_prng")
display(tmp_key)

print("choose a matrix with 2 rows and 3 columns")
tmp_shape=(2,3)
display(tmp_shape)
tmp_weight = trax.fastmath.random.normal(key=tmp_key, shape=tmp_shape)

print("Weight matrix generated with a normal distribution with mean 0 and stdev of 1")
display(tmp_weight)

The random seed generated by random.get_prng


DeviceArray([0, 1], dtype=uint32)

choose a matrix with 2 rows and 3 columns


(2, 3)

Weight matrix generated with a normal distribution with mean 0 and stdev of 1


DeviceArray([[ 0.95730704, -0.9699289 ,  1.0070665 ],
             [ 0.3661903 ,  0.1729483 ,  0.29092234]], dtype=float32)

In [None]:
def classifier(vocab_size=len(Vocab), embedding_dim=256, output_dim=2, mode='train'):
  embed_layer=tl.Embedding(vocab_size=vocab_size,d_feature=embedding_dim)
  mean_layer=tl.Mean(axis=1)
  dense_output_layer=tl.Dense(n_units=output_dim)
  log_softmax_layer=tl.LogSoftmax()


  model=tl.Serial(
      embed_layer,
      mean_layer,
      dense_output_layer,
      log_softmax_layer
  )
  return model

  getpass.getpass = self._save_getpass


In [None]:
tmp_model=classifier()

In [None]:
print(type(tmp_model))
display(tmp_model)

<class 'trax.layers.combinators.Serial'>


Serial[
  Embedding_9088_256
  Mean
  Dense_2
  LogSoftmax
]

In [None]:
from trax.supervised import training
batch_size=16
rnd.seed(271)
train_task=training.TrainTask(
    labeled_data=train_generator(batch_size=batch_size, shuffle=True),
    loss_layer=tl.CrossEntropyLoss(),
    optimizer=trax.optimizers.Adam(0.01),
     n_steps_per_checkpoint=10,
)
eval_task=training.EvalTask(
    labeled_data=val_generator(batch_size=batch_size, shuffle=True),
    metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
)
model = classifier()

In [None]:
output_dir = '~/model/'
output_dir_expand = os.path.expanduser(output_dir)
print(output_dir_expand)

/root/model/


In [None]:
def train_model(classifier,train_task,eval_task,n_steps,output_dir):
  training_loop=training.Loop(
      classifier,
      train_task,
      eval_tasks = eval_task,
      output_dir=output_dir
  )
  training_loop.run(n_steps=n_steps)
  return training_loop

In [None]:
training_loop = train_model(model, train_task, eval_task, 100, output_dir_expand)

  with gzip.GzipFile(fileobj=f, compresslevel=compresslevel) as gzipf:



Step      1: Total number of trainable weights: 2327042
Step      1: Ran 1 train steps in 1.71 secs
Step      1: train CrossEntropyLoss |  0.69223136


  with gzip_lib.GzipFile(fileobj=f, compresslevel=2) as gzipf:


Step      1: eval  CrossEntropyLoss |  0.68210447
Step      1: eval          Accuracy |  0.50000000

Step     10: Ran 9 train steps in 3.99 secs
Step     10: train CrossEntropyLoss |  0.63012367
Step     10: eval  CrossEntropyLoss |  0.55799788
Step     10: eval          Accuracy |  0.87500000

Step     20: Ran 10 train steps in 2.22 secs
Step     20: train CrossEntropyLoss |  0.41321626
Step     20: eval  CrossEntropyLoss |  0.24554147
Step     20: eval          Accuracy |  1.00000000

Step     30: Ran 10 train steps in 1.83 secs
Step     30: train CrossEntropyLoss |  0.20119026
Step     30: eval  CrossEntropyLoss |  0.12441064
Step     30: eval          Accuracy |  1.00000000

Step     40: Ran 10 train steps in 1.43 secs
Step     40: train CrossEntropyLoss |  0.13968445
Step     40: eval  CrossEntropyLoss |  0.08694202
Step     40: eval          Accuracy |  1.00000000

Step     50: Ran 10 train steps in 1.02 secs
Step     50: train CrossEntropyLoss |  0.05359090
Step     50: eval  Cr

In [None]:
tmp_train_generator = train_generator(16)
tmp_batch = next(tmp_train_generator)
tmp_inputs, tmp_targets, tmp_example_weights = tmp_batch

print(f"The batch is a tuple of length {len(tmp_batch)} because position 0 contains the tweets, and position 1 contains the targets.")
print(f"The shape of the tweet tensors is {tmp_inputs.shape} (num of examples, length of tweet tensors)")
print(f"The shape of the labels is {tmp_targets.shape}, which is the batch size.")
print(f"The shape of the example_weights is {tmp_example_weights.shape}, which is the same as inputs/targets size.")

The batch is a tuple of length 3 because position 0 contains the tweets, and position 1 contains the targets.
The shape of the tweet tensors is (16, 15) (num of examples, length of tweet tensors)
The shape of the labels is (16,), which is the batch size.
The shape of the example_weights is (16,), which is the same as inputs/targets size.


In [None]:
tmp_pred = training_loop.eval_model(tmp_inputs)
print(f"The prediction shape is {tmp_pred.shape}, num of tensor_tweets as rows")
print("Column 0 is the probability of a negative sentiment (class 0)")
print("Column 1 is the probability of a positive sentiment (class 1)")
print()
print("View the prediction array")
tmp_pred

The prediction shape is (16, 2), num of tensor_tweets as rows
Column 0 is the probability of a negative sentiment (class 0)
Column 1 is the probability of a positive sentiment (class 1)

View the prediction array


DeviceArray([[-8.5379219e+00, -1.9598007e-04],
             [-9.6788101e+00, -6.2465668e-05],
             [-8.7531166e+00, -1.5783310e-04],
             [-8.5696316e+00, -1.8978119e-04],
             [-6.2401304e+00, -1.9514561e-03],
             [-7.9618835e+00, -3.4856796e-04],
             [-8.6638069e+00, -1.7261505e-04],
             [-5.0924158e+00, -6.1621666e-03],
             [-2.3238659e-03, -6.0657158e+00],
             [-3.2658577e-03, -5.7258863e+00],
             [-3.8337708e-04, -7.8665628e+00],
             [-4.7683716e-07, -1.4753685e+01],
             [-1.1482954e-02, -4.4726315e+00],
             [-1.2810230e-03, -6.6608448e+00],
             [-2.4194717e-03, -6.0253959e+00],
             [-5.3596497e-04, -7.5315137e+00]], dtype=float32)

In [None]:
tmp_is_positive = tmp_pred[:,1] > tmp_pred[:,0]
for i, p in enumerate(tmp_is_positive):
    print(f"Neg log prob {tmp_pred[i,0]:.4f}\tPos log prob {tmp_pred[i,1]:.4f}\t is positive? {p}\t actual {tmp_targets[i]}")

Neg log prob -8.5379	Pos log prob -0.0002	 is positive? True	 actual 1
Neg log prob -9.6788	Pos log prob -0.0001	 is positive? True	 actual 1
Neg log prob -8.7531	Pos log prob -0.0002	 is positive? True	 actual 1
Neg log prob -8.5696	Pos log prob -0.0002	 is positive? True	 actual 1
Neg log prob -6.2401	Pos log prob -0.0020	 is positive? True	 actual 1
Neg log prob -7.9619	Pos log prob -0.0003	 is positive? True	 actual 1
Neg log prob -8.6638	Pos log prob -0.0002	 is positive? True	 actual 1
Neg log prob -5.0924	Pos log prob -0.0062	 is positive? True	 actual 1
Neg log prob -0.0023	Pos log prob -6.0657	 is positive? False	 actual 0
Neg log prob -0.0033	Pos log prob -5.7259	 is positive? False	 actual 0
Neg log prob -0.0004	Pos log prob -7.8666	 is positive? False	 actual 0
Neg log prob -0.0000	Pos log prob -14.7537	 is positive? False	 actual 0
Neg log prob -0.0115	Pos log prob -4.4726	 is positive? False	 actual 0
Neg log prob -0.0013	Pos log prob -6.6608	 is positive? False	 actual 0

In [164]:
def compute_accuracy(preds,y,y_weights):
    # here Y_weights are array having shape of (len(y),1)
    #Therefore sum is total number of y_weights is total
    is_pos =  preds[:, 1] > preds[:, 0]
    is_pos_int = is_pos.astype(np.int32)
    correct = is_pos_int == y
    sum_weights = np.sum(y_weights)
    correct_float = correct.astype(np.float32)
    weighted_correct_float = correct_float * y_weights
    weighted_num_correct = np.sum(weighted_correct_float)
    accuracy = weighted_num_correct / sum_weights

    return accuracy, weighted_num_correct, sum_weights

In [165]:
def test_model(generator, model):

    accuracy = 0.
    total_num_correct = 0
    total_num_pred = 0
    for batch in generator:
        inputs = batch[0]
        targets = batch[1]
        example_weight = batch[2]
        pred = model(inputs)
        batch_accuracy, batch_num_correct, batch_num_pred = compute_accuracy(pred, targets, example_weight)
        total_num_correct += batch_num_correct
        total_num_pred += batch_num_pred
    accuracy = total_num_correct / total_num_pred
    return accuracy

In [166]:
model = training_loop.eval_model
accuracy = test_model(test_generator(16), model)

print(f'The accuracy of your model on the validation set is {accuracy:.4f}', )

The accuracy of your model on the validation set is 0.9950
