# Sentiment Analysis with DeepNets using Trax 

In [1]:
import string
import re
import os
import nltk
import jax

from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, twitter_samples
from nltk.stem import PorterStemmer

import random as rnd
rnd.seed(30) 

import numpy as np

import trax
from trax import layers as tl
from trax.fastmath import numpy as tnp
from trax.supervised import training


n_devices = None  # use all available devices
random_seed = 31  

# init the hosts and the devices for computation 
process_count = training.init_host_and_devices(n_devices, random_seed)

devices = jax.devices()

if any(str(device).startswith("gpu") for device in devices):
    print("Trax detected GPUs!")
else:
    print("Trax is using CPU.")


2023-12-17 17:14:59.383793: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-17 17:14:59.383852: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-17 17:14:59.411695: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)


Trax is using CPU.




In [2]:
jax.devices(),jax.device_count()

([CpuDevice(id=0)], 1)

In [3]:
tweet_tokenizer = TweetTokenizer(
    preserve_case=False, # all tokens are converted to the lowercase 
    strip_handles=True,     # removes the Twitter handles 
    reduce_len=True # reduces the length of repeated char in the tokens
)

stopwords_english = stopwords.words('english')


### Data Source

In [4]:
stemmer = PorterStemmer()

def process_tweet(tweet):
    tweet = re.sub(r'\$\w*', '', tweet) # removes stock market tickers, that is starting with $
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and 
            word not in string.punctuation): 
            stem_word = stemmer.stem(word) 
            tweets_clean.append(stem_word)
    return tweets_clean

def load_tweets():
    all_positive_tweets = twitter_samples.strings('positive_tweets.json')
    all_negative_tweets = twitter_samples.strings('negative_tweets.json')  
    return all_positive_tweets, all_negative_tweets



In [5]:
all_positive_tweets, all_negative_tweets = load_tweets()

print(len(all_positive_tweets), len(all_negative_tweets))

val_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]

val_neg   = all_negative_tweets[4000:] 
train_neg  = all_negative_tweets[:4000] 

train_x = train_pos + train_neg
val_x = val_pos + val_neg

train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
val_y  = np.append(np.ones(len(val_pos)), np.zeros(len(val_neg)))

print(len(train_x), len(val_x))

5000 5000
8000 2000


In [6]:
print(train_pos[0])

process_tweet(train_pos[0])

#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)


['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']

### Buiilding Vocabulary

In [7]:
Vocab = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2} 

for tweet in train_x: 
    processed_tweet = process_tweet(tweet)
    for word in processed_tweet:
        if word not in Vocab: 
            Vocab[word] = len(Vocab)
    
print("Total words in vocab are",len(Vocab))
display(Vocab)

Total words in vocab are 9088


{'__PAD__': 0,
 '__</e>__': 1,
 '__UNK__': 2,
 'followfriday': 3,
 'top': 4,
 'engag': 5,
 'member': 6,
 'commun': 7,
 'week': 8,
 ':)': 9,
 'hey': 10,
 'jame': 11,
 'odd': 12,
 ':/': 13,
 'pleas': 14,
 'call': 15,
 'contact': 16,
 'centr': 17,
 '02392441234': 18,
 'abl': 19,
 'assist': 20,
 'mani': 21,
 'thank': 22,
 'listen': 23,
 'last': 24,
 'night': 25,
 'bleed': 26,
 'amaz': 27,
 'track': 28,
 'scotland': 29,
 'congrat': 30,
 'yeaaah': 31,
 'yipppi': 32,
 'accnt': 33,
 'verifi': 34,
 'rqst': 35,
 'succeed': 36,
 'got': 37,
 'blue': 38,
 'tick': 39,
 'mark': 40,
 'fb': 41,
 'profil': 42,
 '15': 43,
 'day': 44,
 'one': 45,
 'irresist': 46,
 'flipkartfashionfriday': 47,
 'like': 48,
 'keep': 49,
 'love': 50,
 'custom': 51,
 'wait': 52,
 'long': 53,
 'hope': 54,
 'enjoy': 55,
 'happi': 56,
 'friday': 57,
 'lwwf': 58,
 'second': 59,
 'thought': 60,
 '‚Äô': 61,
 'enough': 62,
 'time': 63,
 'dd': 64,
 'new': 65,
 'short': 66,
 'enter': 67,
 'system': 68,
 'sheep': 69,
 'must': 70,
 'buy

### Tweet to Tensor

In [8]:
def tweet_to_tensor(tweet, vocab_dict, unk_token='__UNK__', verbose=False):
    """Convert the Tweet to the Tensor

    Args:
        tweet (str): tweet
        vocab_dict (dict): words
        unk_token (str, optional): for unknown tokens. Defaults to '__UNK__'.
        verbose (bool, optional): print during runtime. Defaults to False.
    """

    word_l = process_tweet(tweet)

    if verbose:
        print(word_l)
    
    # Initialize the list that will contain the integer IDs of each word
    tensor_l = []

    unk_ID = vocab_dict[unk_token]

    if verbose:
        print("UNK ID ", unk_ID)
    
    for word in word_l:
        word_ID = vocab_dict[word] if word in vocab_dict else unk_ID
        tensor_l.append(word_ID)
    
    return tensor_l

In [9]:
print(val_pos[0])
print(tweet_to_tensor(val_pos[0], vocab_dict=Vocab))

Bro:U wan cut hair anot,ur hair long Liao bo
Me:since ord liao,take it easy lor treat as save $ leave it longer :)
Bro:LOL Sibei xialan
[1064, 136, 478, 2351, 744, 8148, 1122, 744, 53, 2, 2671, 790, 2, 2, 348, 600, 2, 3488, 1016, 596, 4558, 9, 1064, 157, 2, 2]


In [10]:
def data_generator(
        data_pos,
        data_neg,
        batch_size, 
        loop,
        vocab_dict,
        shuffle=False
):
    """
    Args:
        data_pos (set): pos examples
        data_neg (set): neg samples
        batch_size (int): must be even
        loop (bool): true or fase
        vocab_dict (dict): words dict
        shuffle (bool, optional):Defaults to False.
    
    Yield:
        inputs - subset of pos and neg sampls
        targets - labels
        example_weights - an array specifying
    """
    
    assert batch_size % 2 == 0
    
    # no of neg samples in each batch
    n_to_take = batch_size // 2

    # to iter the data 
    pos_index = 0
    neg_index = 0

    len_data_pos = len(data_pos)
    len_data_neg = len(data_neg)

    # list of data idexes
    pos_index_lines = list(range(len_data_pos))
    neg_index_lines = list(range(len_data_neg))

    if shuffle:
        rnd.shuffle(pos_index_lines)
        rnd.shuffle(neg_index_lines)
    
    stop = False

    while not stop:
        batch = [] # batch with pos and neg samples

        # pack the n_to_take pos sam'ples 
        for i in range(n_to_take):
            # if pos index does not go past the pos data input 
            if pos_index >= len_data_pos:
                if not loop: # if no loop, break once we reach the end of the dataset
                    stop = True
                    break;

                # if we want to re use the data, reset the index
                pos_index = 0

                if shuffle:
                    rnd.shuffle(pos_index_lines)
            
            tweet = data_pos[pos_index_lines[pos_index]]
            tensor = tweet_to_tensor(tweet, vocab_dict)

            batch.append(tensor)
            pos_index += 1
        
        # for the neg samples
            
        
        for i in range(n_to_take):
            if neg_index >= len_data_neg:
                if not loop:
                    stop = True

                    break;

                neg_index = 0

                if shuffle:
                    rnd.shuffle(neg_index_lines)
                
            
            tweet = data_neg[neg_index_lines[neg_index]]
            tensor = tweet_to_tensor(tweet, vocab_dict)
            batch.append(tensor)

            neg_index += 1
    
        if stop:
            break;

        # update the start index of the positive data, s that it's n_to_take positions after the current pos_index
        pos_index += n_to_take
        neg_index += n_to_take

        # get the max tweet length
        max_len = max([len(t) for t in batch])

        # padded version of the tensors
        tensor_pad_l = []
        
        for tensor in batch:
            n_pad = max_len - len(tensor)
            pad_l = [0] * n_pad
            tensor_pad = tensor + pad_l

            tensor_pad_l.append(tensor_pad)

        
        # convert the list of padded tensors to a numpy array
        inputs = np.array(tensor_pad_l)

        # list of targets
        target_pos = [1] * n_to_take
        target_neg = [0] * n_to_take

        target_l = target_pos + target_neg

        targets = np.array(target_l)

        # treat all examples equally 
        example_weights = np.ones_like(targets)


        yield inputs, targets, example_weights


In [11]:
def train_generator(batch_size, shuffle = False):
    return data_generator(train_pos, train_neg, batch_size, True, Vocab, shuffle)

def val_generator(batch_size, shuffle = False):
    return data_generator(val_pos, val_neg, batch_size, True, Vocab, shuffle)

def test_generator(batch_size, shuffle = False):
    return data_generator(val_pos, val_neg, batch_size, False, Vocab, shuffle)

inputs, targets, example_weights = next(train_generator(4, shuffle=True))

print(f'Inputs: \n {inputs}')
print(f'Targets: {targets}')
print(f'Example Weights: {example_weights}')

Inputs: 
 [[1230    9    0    0    0]
 [ 253  229  335  416   75]
 [  95   22   14   95 3760]
 [5927  507 3948 5432 3760]]
Targets: [1 1 0 0]
Example Weights: [1 1 1 1]


In [12]:
tmp_data_gen = train_generator(batch_size = 4)
tmp_inputs, tmp_targets, tmp_example_weights = next(tmp_data_gen)

print(f"inputs shape: {tmp_inputs.shape}")
print(f"targets shape: {tmp_targets.shape}")
print(f"example weights shape: {tmp_example_weights.shape}")

for i,t in enumerate(tmp_inputs):
    print(f"input tensor: {t}; target {tmp_targets[i]}; example weights {tmp_example_weights[i]}")

inputs shape: (4, 14)
targets shape: (4,)
example weights shape: (4,)
input tensor: [3 4 5 6 7 8 9 0 0 0 0 0 0 0]; target 1; example weights 1
input tensor: [10 11 12 13 14 15 16 17 18 19 20  9 21 22]; target 1; example weights 1
input tensor: [5736 2900 3760    0    0    0    0    0    0    0    0    0    0    0]; target 0; example weights 1
input tensor: [ 857  255 3651 5737  306 4457  566 1229 2766  327 1201 3760    0    0]; target 0; example weights 1


### Model

In [13]:
class Layer(object): 
    """Base class for layers"""
    def __init__(self):
        self.weights = None
    
    def forward(self, x): # any subclass inheriting inheriting from this must implement its own forward()
        raise NotImplementedError
    
    def init_weights_and_state(self, input_signature, random_key):
        pass # init the weights and state 

    def init(self, input_signature, random_key):
        self.init_weights_and_state(input_signature, random_key) # inits 

        return self.weights

    def __call__(self, x): # to be called as a function 
        return self.forward(x)


In [14]:
class Relu(Layer):
    def forward(self, x):
        activation = np.maximum(x, 0)
        return activation


In [15]:
np = trax.fastmath.numpy

random = trax.fastmath.random

In [16]:
class Dense(Layer):
    def __init__(self, n_units, init_stdev=0.1):
        self._n_units = n_units
        self._init_stdev = init_stdev
    
    def forward(self, x):
        dense = np.dot(x, self.weights)
        return dense
    
    def init_weights_and_state(self, input_signature, random_key):
        input_shape = input_signature.shape # info about the input 
        w = self._init_stdev * random.normal(key = random_key, shape = (input_shape[-1], self._n_units))
        self.weights = w

        return self.weights

### Classifier

In [25]:
def classifier(vocab_size=len(Vocab), embedding_dim=256, output_dim=2, mode='tran'):
    embed_layer = tl.Embedding( # embedding for the word representation 
        vocab_size=vocab_size,
        d_feature=embedding_dim
    )
    mean_layer = tl.Mean(axis=1) # mean along the axis=1. 

    dense_output_layer = tl.Dense(n_units=output_dim) # units / neurons 

    log_softmax_layer = tl.LogSoftmax()

    model = tl.Serial(
        embed_layer,
        mean_layer,
        dense_output_layer,
        log_softmax_layer
    )

    return model 

In [26]:
trax.supervised.training.TrainTask # training task for the neural network. 
# it has te data pipeline, loss function, and any metrix required for evaluation during training 

trax.supervised.training.EvalTask # to monitor the model's performance 

trax.supervised.training.Loop # manages the training and eval loop for the Nn. 

trax.optimizers



<module 'trax.optimizers' from '/home/jerlshin/env_ai/lib/python3.10/site-packages/trax/optimizers/__init__.py'>

In [27]:
batch_size = 16
rnd.seed(271)

train_task = training.TrainTask(
    labeled_data=train_generator(batch_size=batch_size, shuffle=True),
    loss_layer=tl.CrossEntropyLoss(),
    optimizer=trax.optimizers.Adam(0.01),
    n_steps_per_checkpoint=10
)


eval_task = training.EvalTask(
    labeled_data=val_generator(batch_size=batch_size, shuffle=True),
    metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
)

model = classifier()

In [33]:
def train_model(classifier, train_task, eval_task, n_steps, output_dir):
    training_loop = training.Loop(
        classifier, 
        train_task, 
        eval_tasks=eval_task,
        output_dir=output_dir
    )

    return training_loop

In [34]:
training_loop = train_model(model, train_task, eval_task, 100, "./Model")