In [None]:
pip install trax==1.3.1

Collecting trax==1.3.1
[?25l  Downloading https://files.pythonhosted.org/packages/fe/d8/ad90a5c79804561bbbc5fd65a4cb6b6e735370225e777cfc46980a9dc479/trax-1.3.1-py2.py3-none-any.whl (347kB)
[K     |█                               | 10kB 18.6MB/s eta 0:00:01[K     |█▉                              | 20kB 1.7MB/s eta 0:00:01[K     |██▉                             | 30kB 2.3MB/s eta 0:00:01[K     |███▊                            | 40kB 2.6MB/s eta 0:00:01[K     |████▊                           | 51kB 2.0MB/s eta 0:00:01[K     |█████▋                          | 61kB 2.3MB/s eta 0:00:01[K     |██████▋                         | 71kB 2.5MB/s eta 0:00:01[K     |███████▌                        | 81kB 2.7MB/s eta 0:00:01[K     |████████▌                       | 92kB 2.9MB/s eta 0:00:01[K     |█████████▍                      | 102kB 2.8MB/s eta 0:00:01[K     |██████████▍                     | 112kB 2.8MB/s eta 0:00:01[K     |███████████▎                    | 122kB 2.8MB/s e

In [None]:
import os
import random as rnd
import trax
from trax import layers as tl
import io
import pandas as pd
import numpy as np
trax.supervised.trainer_lib.init_random_number_generators(31)
import trax.fastmath.numpy as np
from trax import layers as tl
from google.colab import files
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
import random as rnd
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer 
nltk.download('stopwords')
nltk.download('twitter_samples')
from nltk.corpus import twitter_samples 
import re
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize,RegexpTokenizer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


# **1. Loading the tweet data**

In [None]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')
print('There are {} positive tweets'.format(len(all_positive_tweets)))
print('There are {} negative tweets'.format(len(all_negative_tweets)))

# Split the positive tweets: the first 4000 go to training, the remainder to validation
val_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]

# Split the negative tweets the same way
val_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

# Combine train_pos and train_neg (positives first, then negatives)
train_x = train_pos + train_neg

# Combine val_pos and val_neg (positives first, then negatives)
val_x = val_pos + val_neg

# Labels for the training set (1 for positive and 0 for negative), in the same order as train_x
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))

# Labels for the validation set (1 for positive and 0 for negative), in the same order as val_x
val_y = np.append(np.ones(len(val_pos)), np.zeros(len(val_neg)))

print('Length of training set:', len(train_x))
print('Length of validation set:', len(val_x))

There are 5000 positive tweets
There are 5000 negative tweets
Length of training set: 8000
Length of validation set: 2000


# **2. Function to remove unwanted characters and return a tokenized list of words**

In [None]:
def process_tweet(tweet):
    """Clean a raw tweet and return its stemmed tokens.

    The text is first stripped of ``$``-prefixed words, a leading ``RT``
    marker, hyperlinks and ``#`` characters. It is then tokenized with
    ``TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)``.
    Tokens that are English stopwords or that occur inside
    ``string.punctuation`` are dropped; the rest are stemmed with the
    Porter stemmer.

    Args:
        tweet: the raw tweet text (a string).

    Returns:
        A list with the stemmed tokens that were kept, in their original order.
    """
    stemmer = PorterStemmer()
    # A set gives constant-time membership tests; the list returned by
    # stopwords.words() would otherwise be scanned once per token.
    stopwords_english = frozenset(stopwords.words('english'))

    # Remove words that start with a dollar sign (e.g. stock tickers)
    tweet = re.sub(r'\$\w*', '', tweet)
    # Remove the old-style retweet marker at the start of the tweet
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # Remove hyperlinks. NOTE: `.*` runs to the end of the line, so any text
    # that follows the link on the same line is removed as well.
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # Drop only the hash sign, keeping the hashtag word itself
    tweet = re.sub(r'#', '', tweet)

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # `word not in string.punctuation` is a substring test on the punctuation
    # string, so emoticons such as ':)' are kept.
    return [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]


In [None]:
print('Original positive tweet in training set at position 0 is:', train_pos[0], '\n')
print('Preprocessed positive tweet:', process_tweet(train_pos[0]), '\n')

print('Original negative tweet in training set at position 0 is:', train_neg[0], '\n')
print('Preprocessed negative tweet:', process_tweet(train_neg[0]))

Original positive tweet in training set at position 0 is: #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :) 

Preprocessed positive tweet: ['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)'] 

Original negative tweet in training set at position 0 is: hopeless for tmr :( 

Preprocessed negative tweet: ['hopeless', 'tmr', ':(']


# **3. Building the vocabulary**

In [None]:
# Ids 0-2 are reserved for the three special tokens below
vocab = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2}
# Build the vocabulary from the training data only; every new word gets the next free id
for tweet in train_x:
  processed_tweet = process_tweet(tweet)
  for word in processed_tweet:
    if word not in vocab:
      vocab[word] = len(vocab)

print('There are {} words in vocabulary'.format(len(vocab)))
# NOTE(review): this renders the entire dictionary; consider displaying only a sample
display(vocab)

There are 9092 words in vocabulary


{'__PAD__': 0,
 '__</e>__': 1,
 '__UNK__': 2,
 'followfriday': 3,
 'top': 4,
 'engag': 5,
 'member': 6,
 'commun': 7,
 'week': 8,
 ':)': 9,
 'hey': 10,
 'jame': 11,
 'odd': 12,
 ':/': 13,
 'pleas': 14,
 'call': 15,
 'contact': 16,
 'centr': 17,
 '02392441234': 18,
 'abl': 19,
 'assist': 20,
 'mani': 21,
 'thank': 22,
 'listen': 23,
 'last': 24,
 'night': 25,
 'bleed': 26,
 'amaz': 27,
 'track': 28,
 'scotland': 29,
 'congrat': 30,
 'yeaaah': 31,
 'yipppi': 32,
 'accnt': 33,
 'verifi': 34,
 'rqst': 35,
 'succeed': 36,
 'got': 37,
 'blue': 38,
 'tick': 39,
 'mark': 40,
 'fb': 41,
 'profil': 42,
 '15': 43,
 'day': 44,
 'one': 45,
 'irresist': 46,
 'flipkartfashionfriday': 47,
 'like': 48,
 'keep': 49,
 'love': 50,
 'custom': 51,
 'wait': 52,
 'long': 53,
 'hope': 54,
 'enjoy': 55,
 'happi': 56,
 'friday': 57,
 'lwwf': 58,
 'second': 59,
 'thought': 60,
 '’': 61,
 'enough': 62,
 'time': 63,
 'dd': 64,
 'new': 65,
 'short': 66,
 'enter': 67,
 'system': 68,
 'sheep': 69,
 'must': 70,
 'buy':

# **4. Converting tweet to a tensor**

In [None]:
def tweet_to_tensor(tweet, vocab_dict, unk_token='__UNK__', verbose=False):
  """Convert a tweet into the list of vocabulary ids of its processed tokens.

  Args:
    tweet: raw tweet text, passed to ``process_tweet``.
    vocab_dict: mapping from token to integer id; it must contain ``unk_token``.
    unk_token: key in ``vocab_dict`` whose id stands in for unknown tokens.
    verbose: when True, print the processed tokens before converting them.

  Returns:
    A list of integer ids, one per processed token.
  """
  tokens = process_tweet(tweet)

  if verbose:
    print('List of words from the processed tweet:', tokens)

  # Id substituted for every token that is missing from the vocabulary
  fallback_id = vocab_dict[unk_token]

  return [vocab_dict.get(token, fallback_id) for token in tokens]


print('Original tweet in validation set:', val_pos[0], '\n')
print('\ntensor of tweet:', tweet_to_tensor(val_pos[0], vocab_dict = vocab, unk_token='__UNK__', verbose = True))

Original tweet in validation set: Bro:U wan cut hair anot,ur hair long Liao bo
Me:since ord liao,take it easy lor treat as save $ leave it longer :)
Bro:LOL Sibei xialan 

List of words from the processed tweet: ['bro', 'u', 'wan', 'cut', 'hair', 'anot', 'ur', 'hair', 'long', 'liao', 'bo', 'sinc', 'ord', 'liao', 'take', 'easi', 'lor', 'treat', 'save', 'leav', 'longer', ':)', 'bro', 'lol', 'sibei', 'xialan']

tensor of tweet: [1065, 136, 479, 2351, 745, 8146, 1123, 745, 53, 2, 2672, 791, 2, 2, 349, 601, 2, 3489, 1017, 597, 4559, 9, 1065, 157, 2, 2]


# **5. Creating batch generator**

In [None]:
def data_generator(data_pos, data_neg, batch_size, loop, vocab_dict, shuffle=False):
  """Yield balanced, zero-padded batches of tweet tensors.

  Every batch holds ``batch_size // 2`` positive tweets followed by the same
  number of negative tweets. Each tweet is converted with ``tweet_to_tensor``
  and right-padded with 0 up to the length of the longest tweet in the batch.

  Args:
    data_pos: sequence of positive tweets.
    data_neg: sequence of negative tweets.
    batch_size: number of examples per batch; must be even.
    loop: if True, wrap around (reshuffling when ``shuffle`` is set) whenever
      a set is exhausted, so the generator never ends. If False, the
      generator ends as soon as either set cannot fill its half of a batch;
      the incomplete batch is discarded.
    vocab_dict: vocabulary passed to ``tweet_to_tensor``.
    shuffle: if True, visit the tweets in a random order drawn with ``rnd``.

  Yields:
    A tuple ``(inputs, targets, example_weights)``. ``inputs`` is the array of
    padded tensors, one row per tweet. ``targets`` holds 1 for the positive
    half and 0 for the negative half. ``example_weights`` is an array of ones
    shaped like ``targets``.
  """
  assert batch_size % 2 == 0

  n_to_take = batch_size // 2

  len_data_pos = len(data_pos)
  len_data_neg = len(data_neg)

  # Visiting order of each set; shuffled in place when requested
  pos_index_lines = list(range(len_data_pos))
  neg_index_lines = list(range(len_data_neg))

  if shuffle:
    rnd.shuffle(pos_index_lines)
    rnd.shuffle(neg_index_lines)

  # Position of the next unread entry in each visiting order
  pos_index = 0
  neg_index = 0

  stop = False

  while not stop:

    batch = []

    # Positive examples
    for _ in range(n_to_take):
      if pos_index >= len_data_pos:
        if not loop:
          stop = True
          break

        # Start a new pass over the positive tweets
        pos_index = 0
        if shuffle:
          rnd.shuffle(pos_index_lines)

      tweet = data_pos[pos_index_lines[pos_index]]
      batch.append(tweet_to_tensor(tweet, vocab_dict))
      pos_index += 1

    # The positive set ran out: do not touch the negative set at all
    if stop:
      break

    # Negative examples
    for _ in range(n_to_take):
      # `>=`, not `>`: neg_index == len_data_neg is already out of range
      if neg_index >= len_data_neg:
        if not loop:
          stop = True
          break

        # Start a new pass over the negative tweets
        neg_index = 0
        if shuffle:
          rnd.shuffle(neg_index_lines)

      tweet = data_neg[neg_index_lines[neg_index]]
      batch.append(tweet_to_tensor(tweet, vocab_dict))
      neg_index += 1

    if stop:
      break

    # The indices were already advanced once per example inside the loops
    # above; advancing them again here would skip n_to_take tweets per batch.

    # Pad every tensor with zeros up to the longest tweet of the batch
    max_len = max(len(tensor) for tensor in batch)
    tensor_pad_l = [tensor + [0] * (max_len - len(tensor)) for tensor in batch]

    inputs = np.array(tensor_pad_l)

    # Targets follow the batch layout: positives first, then negatives
    targets = np.array([1] * n_to_take + [0] * n_to_take)

    # Treat all examples as equally important
    example_weights = np.ones_like(targets)

    yield inputs, targets, example_weights

In [None]:
rnd.seed(30)

def train_generator(batch_size, shuffle = False):
  """Return a looping batch generator over the training tweets."""
  return data_generator(
      data_pos=train_pos,
      data_neg=train_neg,
      batch_size=batch_size,
      loop=True,
      vocab_dict=vocab,
      shuffle=shuffle,
  )

def validation_generator(batch_size, shuffle = False):
  """Return a looping batch generator over the validation tweets."""
  return data_generator(
      data_pos=val_pos,
      data_neg=val_neg,
      batch_size=batch_size,
      loop=True,
      vocab_dict=vocab,
      shuffle=shuffle,
  )

def test_generator(batch_size, shuffle = False):
  return data_generator(val_pos, val_neg, batch_size, False, vocab, shuffle)

inputs, targets, example_weights = next(train_generator(4, shuffle=True))

print('Input tensors:', inputs, '\n')
print('targets:', targets, '\n')
print('Weights:', example_weights, '\n')

Input tensors: [[2005 4451 3201    9    0    0    0    0    0    0    0]
 [4954  567 2000 1454 5174 3499  141 3499  130  459    9]
 [3761  109  136  583 2930 3969    0    0    0    0    0]
 [ 250 3761    0    0    0    0    0    0    0    0    0]] 

targets: [1 1 0 0] 

Weights: [1 1 1 1] 



In [None]:
#Create a data generator for training data
tmp_data_gen = train_generator(batch_size = 4)

tmp_inputs, tmp_targets, tmp_example_weights = next(tmp_data_gen)

print('Input shape:', tmp_inputs.shape)
print('target shape:', tmp_targets.shape)
print('Weights shape:', tmp_example_weights.shape, '\n')

for i, t in enumerate(tmp_inputs):
  print(f'Input tensor:{t}; target:{tmp_targets[i]}; weights:{tmp_example_weights[i]}')

Input shape: (4, 14)
target shape: (4,)
Weights shape: (4,) 

Input tensor:[3 4 5 6 7 8 9 0 0 0 0 0 0 0]; target:1; weights:1
Input tensor:[10 11 12 13 14 15 16 17 18 19 20  9 21 22]; target:1; weights:1
Input tensor:[5738 2901 3761    0    0    0    0    0    0    0    0    0    0    0]; target:0; weights:1
Input tensor:[ 858  256 3652 5739  307 4458  567 1230 2767  328 1202 3761    0    0]; target:0; weights:1


In [None]:
# use the fastmath module within trax
from trax import fastmath

# use the numpy module from trax
np = fastmath.numpy

# use the fastmath.random module from trax
random = fastmath.random

# **6. Creating a model**

In [None]:
def classifier(vocab_size = len(vocab), embedding_dim = 256, output_dim = 2, mode = 'train'):
  """Build the tweet classifier as a serial stack of Trax layers.

  Args:
    vocab_size: number of rows of the embedding table. The default,
      ``len(vocab)``, is evaluated once, when this function is defined.
    embedding_dim: size of each embedding vector.
    output_dim: number of units of the dense layer, one per class.
    mode: accepted but not used by this function.

  Returns:
    A ``tl.Serial`` model: Embedding -> Mean over axis 1 -> Dense -> LogSoftmax.
  """
  return tl.Serial(
      # One embedding vector per token id
      tl.Embedding(vocab_size=vocab_size, d_feature=embedding_dim),
      # Average the word embeddings along axis 1
      tl.Mean(axis=1),
      # One unit for each output class
      tl.Dense(n_units = output_dim),
      tl.LogSoftmax(),
  )

tmp_model = classifier()
print(type(tmp_model))
display(tmp_model)

<class 'trax.layers.combinators.Serial'>


Serial[
  Embedding_9092_256
  Mean
  Dense_2
  LogSoftmax
]

# **7. Training the model**

In [None]:
from trax.supervised import training

batch_size = 16
# Seed Python's random module, which data_generator uses for shuffling
rnd.seed(271)

# Training task: shuffled training batches, cross-entropy loss, Adam optimizer
train_task = training.TrainTask(
    labeled_data = train_generator(batch_size = batch_size, shuffle=True),
    loss_layer = tl.CrossEntropyLoss(),
    optimizer = trax.optimizers.Adam(0.01),
    n_steps_per_checkpoint = 10,
)

# Evaluation task: shuffled validation batches, reporting loss and accuracy
eval_task = training.EvalTask(
    labeled_data = validation_generator(batch_size=batch_size, shuffle=True),
    metrics = [tl.CrossEntropyLoss(), tl.Accuracy()],
)

# Fresh, untrained model built with the default arguments of classifier()
model = classifier()

In [None]:
output_dir = 'model/'
output_dir_expand = os.path.expanduser(output_dir)
print(output_dir_expand)

/root/model/


In [None]:
def train_model(classifier, train_task, eval_task, n_steps, output_dir):
  """Run a Trax training loop and return it.

  Args:
    classifier: the model handed to ``training.Loop``.
    train_task: the training task.
    eval_task: the evaluation task.
    n_steps: number of steps to run.
    output_dir: directory passed to ``training.Loop`` as ``output_dir``.

  Returns:
    The ``training.Loop`` object after ``run`` has completed.
  """
  loop = training.Loop(classifier, train_task, eval_task=eval_task, output_dir=output_dir)
  loop.run(n_steps=n_steps)
  return loop

In [None]:
training_loop = train_model(model, train_task, eval_task, 100, output_dir_expand)

Step      1: train CrossEntropyLoss |  0.71663344
Step      1: eval  CrossEntropyLoss |  0.80896854
Step      1: eval          Accuracy |  0.56250000
Step     10: train CrossEntropyLoss |  0.65261537
Step     10: eval  CrossEntropyLoss |  0.67353255
Step     10: eval          Accuracy |  0.43750000
Step     20: train CrossEntropyLoss |  0.41853198
Step     20: eval  CrossEntropyLoss |  0.32247627
Step     20: eval          Accuracy |  0.93750000
Step     30: train CrossEntropyLoss |  0.30088764
Step     30: eval  CrossEntropyLoss |  0.27893353
Step     30: eval          Accuracy |  1.00000000
Step     40: train CrossEntropyLoss |  0.17787032
Step     40: eval  CrossEntropyLoss |  0.18334462
Step     40: eval          Accuracy |  0.93750000
Step     50: train CrossEntropyLoss |  0.15455118
Step     50: eval  CrossEntropyLoss |  0.05755305
Step     50: eval          Accuracy |  1.00000000
Step     60: train CrossEntropyLoss |  0.12856857
Step     60: eval  CrossEntropyLoss |  0.15309972


# **8. Making a prediction on sample training data**

In [None]:
#prediction

# Create a generator object
tmp_train_generator = train_generator(16)

# get one batch
tmp_batch = next(tmp_train_generator)

tmp_inputs, tmp_targets, tmp_example_weights = tmp_batch
print(tmp_inputs.shape)
print(tmp_targets.shape)
print(tmp_example_weights)

(16, 15)
(16,)
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [None]:
# feed the tweet tensors into the model to get a prediction
tmp_pred = training_loop.eval_model(tmp_inputs)

print(f"The prediction shape is {tmp_pred.shape}, num of tensor_tweets as rows")
print("Column 0 is the probability of a negative sentiment (class 0)")
print("Column 1 is the probability of a positive sentiment (class 1)")
print("View the prediction array")
tmp_pred

The prediction shape is (16, 2), num of tensor_tweets as rows
Column 0 is the probability of a negative sentiment (class 0)
Column 1 is the probability of a positive sentiment (class 1)
View the prediction array


DeviceArray([[-3.7626619e+00, -2.3495674e-02],
             [-4.1175117e+00, -1.6419172e-02],
             [-4.3073044e+00, -1.3561487e-02],
             [-2.6957486e+00, -6.9877386e-02],
             [-4.0748644e+00, -1.7140627e-02],
             [-2.1877446e+00, -1.1897421e-01],
             [-6.6753860e+00, -1.2621880e-03],
             [-3.5692210e+00, -2.8582335e-02],
             [-2.3403168e-03, -6.0587635e+00],
             [-1.1348724e-03, -6.7817626e+00],
             [-8.5546970e-03, -4.7655659e+00],
             [-8.5353851e-05, -9.3677816e+00],
             [-2.8588772e-03, -5.8587508e+00],
             [-1.0375977e-03, -6.8714089e+00],
             [-2.4724007e-03, -6.0037174e+00],
             [-2.2265911e-03, -6.1084499e+00]], dtype=float32)

In [None]:
#turn probabilities into category predictions
tmp_is_positive = tmp_pred[:,1] > tmp_pred[:,0]
for i,p in enumerate(tmp_is_positive):
   print(f"Neg log prob {tmp_pred[i,0]:.4f}\tPos log prob {tmp_pred[i,1]:.4f}\t is positive? {p}\t actual {tmp_targets[i]}")

Neg log prob -3.7627	Pos log prob -0.0235	 is positive? True	 actual 1
Neg log prob -4.1175	Pos log prob -0.0164	 is positive? True	 actual 1
Neg log prob -4.3073	Pos log prob -0.0136	 is positive? True	 actual 1
Neg log prob -2.6957	Pos log prob -0.0699	 is positive? True	 actual 1
Neg log prob -4.0749	Pos log prob -0.0171	 is positive? True	 actual 1
Neg log prob -2.1877	Pos log prob -0.1190	 is positive? True	 actual 1
Neg log prob -6.6754	Pos log prob -0.0013	 is positive? True	 actual 1
Neg log prob -3.5692	Pos log prob -0.0286	 is positive? True	 actual 1
Neg log prob -0.0023	Pos log prob -6.0588	 is positive? False	 actual 0
Neg log prob -0.0011	Pos log prob -6.7818	 is positive? False	 actual 0
Neg log prob -0.0086	Pos log prob -4.7656	 is positive? False	 actual 0
Neg log prob -0.0001	Pos log prob -9.3678	 is positive? False	 actual 0
Neg log prob -0.0029	Pos log prob -5.8588	 is positive? False	 actual 0
Neg log prob -0.0010	Pos log prob -6.8714	 is positive? False	 actual 0


# **9. Computing accuracy on the batch**

In [None]:
def computing_accuracy(preds, y, y_weights):
  """Compute the weighted accuracy of two-column predictions.

  Args:
    preds: array with one row per example; a row is classified as positive
      when its column 1 is greater than its column 0.
    y: array of actual labels (1 for positive, 0 for negative).
    y_weights: array with the weight of each example.

  Returns:
    A tuple ``(accuracy, weighted_num_correct, sum_weights)`` where
    ``accuracy = weighted_num_correct / sum_weights``.
  """
  # Predicted class of every row, as np.int32 so it can be compared with y
  predicted_labels = (preds[:, 1] > preds[:, 0]).astype(np.int32)

  # 1.0 where the prediction matches the label, 0.0 elsewhere
  hits = (predicted_labels == y).astype(np.float32)

  # Numerator: the correct predictions, each scaled by its weight
  weighted_num_correct = np.sum(hits * y_weights)

  # Denominator: the total weight of the examples
  sum_weights = np.sum(y_weights)

  accuracy = weighted_num_correct / sum_weights

  return accuracy, weighted_num_correct, sum_weights


In [None]:
# Evaluate the trained model on one batch of 128 validation tweets
tmp_val_generator = validation_generator(128)
tmp_batch = next(tmp_val_generator)
tmp_inputs, tmp_targets, tmp_example_weights = tmp_batch
tmp_pred = training_loop.eval_model(tmp_inputs)
tmp_acc, tmp_num_correct, tmp_num_predictions = computing_accuracy(preds=tmp_pred, y=tmp_targets, y_weights=tmp_example_weights)

# The batch comes from validation_generator, so it is reported as a validation batch
print(f"Model's prediction accuracy on a single validation batch is: {100 * tmp_acc}%")
print(f"Weighted number of correct predictions {tmp_num_correct}; weighted number of total observations predicted {tmp_num_predictions}")

Model's prediction accuracy on a single training batch is: 92.1875%
Weighted number of correct predictions 118.0; weighted number of total observations predicted 128


# **10. Testing your model on validation data**

In [None]:
def test_model(generator, model):
    
    accuracy = 0.
    total_num_correct = 0
    total_num_pred = 0
    
    ### START CODE HERE (Replace instances of 'None' with your code) ###
    for batch in generator: 
        
        # Retrieve the inputs from the batch
        inputs = batch[0]
        
        # Retrieve the targets (actual labels) from the batch
        targets =  batch[1]
        
        # Retrieve the example weight.
        example_weight = batch[2]

        # Make predictions using the inputs
        pred = model(inputs)
        
        # Calculate accuracy for the batch by comparing its predictions and targets
        batch_accuracy, batch_num_correct, batch_num_pred = computing_accuracy(pred, targets, example_weight) 
        
        # Update the total number of correct predictions
        # by adding the number of correct predictions from this batch
        total_num_correct += batch_num_correct
        
        # Update the total number of predictions 
        # by adding the number of predictions made for the batch
        total_num_pred += batch_num_pred

    # Calculate accuracy over all examples
    accuracy = total_num_correct / total_num_pred
    
    ### END CODE HERE ###
    return accuracy

In [None]:
model = training_loop.eval_model
accuracy = test_model(test_generator(16), model)

print(f'The accuracy of your model on the validation set is {accuracy:.4f}', )

The accuracy of your model on the validation set is 0.9702


# **11. Testing your own sentence**

In [None]:
def predict(sentence):
  """Classify the sentiment of a single sentence.

  Reads the notebook globals ``vocab`` (to encode the sentence) and ``model``
  (to score it).

  Args:
    sentence: the text to classify.

  Returns:
    A tuple ``(preds, sentiment)``: ``preds`` is 1 when column 1 of the model
    output is greater than column 0 and 0 otherwise; ``sentiment`` is
    'positive' for 1 and 'negative' for 0.
  """
  token_ids = tweet_to_tensor(sentence, vocab_dict = vocab)

  # Add a leading batch axis so the single sentence forms a batch of size 1
  inputs = np.array(token_ids)[None, :]

  scores = model(inputs)

  # Pick the class whose score is larger
  preds = int(scores[0, 1] > scores[0, 0])

  sentiment = 'positive' if preds == 1 else "negative"

  return preds, sentiment
      

In [None]:
#positive tweet
sentence = "its a wonderful day today"
tmp_pred, tmp_sentiment = predict(sentence)
print(f"The sentiment of the sentence \n***\n\"{sentence}\"\n***\nis {tmp_sentiment}.")

#negative tweet
sentence = "very disappointing"
tmp_pred, tmp_sentiment = predict(sentence)
print(f"The sentiment of the sentence \n***\n\"{sentence}\"\n***\nis {tmp_sentiment}.")



The sentiment of the sentence 
***
"its a wonderful day today"
***
is positive.
The sentiment of the sentence 
***
"very disappointing"
***
is negative.
