# Import Libraries

In [76]:
import string
import numpy as np
import re
import random
import nltk
nltk.download('twitter_samples')
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, twitter_samples
from nltk.stem import PorterStemmer
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow import keras
from keras import layers

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load Data

In [2]:
# Essential objects
tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
stopwords_english = stopwords.words('english')
stemmer = PorterStemmer()

In [3]:
def load_tweets():
    all_positive_tweets = twitter_samples.strings('positive_tweets.json')
    all_negative_tweets = twitter_samples.strings('negative_tweets.json')  
    return all_positive_tweets, all_negative_tweets

def process_tweet(tweet):
    '''
    Input: 
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet
    
    '''
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and # remove stopwords
            word not in string.punctuation): # remove punctuation
            stem_word = stemmer.stem(word) # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean

In [4]:
def train_val_split():
    # Load positive and negative tweets
    all_positive_tweets, all_negative_tweets = load_tweets()

    # View the total number of positive and negative tweets.
    print(f"The number of positive tweets: {len(all_positive_tweets)}")
    print(f"The number of negative tweets: {len(all_negative_tweets)}")

    # Split positive set into validation and training
    val_pos = all_positive_tweets[4000:] # generating validation set for positive tweets
    train_pos = all_positive_tweets[:4000]# generating training set for positive tweets

    # Split negative set into validation and training
    val_neg = all_negative_tweets[4000:] # generating validation set for negative tweets
    train_neg = all_negative_tweets[:4000] # generating training set for nagative tweets
    
    # Combine training data into one set

    train_x = train_pos + train_neg 

    # Combine validation data into one set
    val_x  = val_pos + val_neg

    # Set the labels for the training set (1 for positive, 0 for negative)
    train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))

    # Set the labels for the validation set (1 for positive, 0 for negative)
    val_y  = np.append(np.ones(len(val_pos)), np.zeros(len(val_neg)))


    return train_pos, train_neg, train_x, train_y, val_pos, val_neg, val_x, val_y

In [5]:
train_pos, train_neg, train_x, train_y, val_pos, val_neg, val_x, val_y = train_val_split()

print(f"length of train_x {len(train_x)}")
print(f"length of val_x {len(val_x)}")

The number of positive tweets: 5000
The number of negative tweets: 5000
length of train_x 8000
length of val_x 2000


In [6]:
# Get vocab based on train_x only
def get_vocab(train_x):
    
    # Include special tokens started with pad, end of line and unk tokens
    vocab = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2} 

    for tweet in train_x:
        processed_tweet = process_tweet(tweet)
        for word in processed_tweet:
            if word not in vocab:
                vocab[word] = len(vocab)
    return vocab

vocab = get_vocab(train_x)
print("Total words in vocab are", len(vocab))
display(vocab)

Total words in vocab are 9088


{'__PAD__': 0,
 '__</e>__': 1,
 '__UNK__': 2,
 'followfriday': 3,
 'top': 4,
 'engag': 5,
 'member': 6,
 'commun': 7,
 'week': 8,
 ':)': 9,
 'hey': 10,
 'jame': 11,
 'odd': 12,
 ':/': 13,
 'pleas': 14,
 'call': 15,
 'contact': 16,
 'centr': 17,
 '02392441234': 18,
 'abl': 19,
 'assist': 20,
 'mani': 21,
 'thank': 22,
 'listen': 23,
 'last': 24,
 'night': 25,
 'bleed': 26,
 'amaz': 27,
 'track': 28,
 'scotland': 29,
 'congrat': 30,
 'yeaaah': 31,
 'yipppi': 32,
 'accnt': 33,
 'verifi': 34,
 'rqst': 35,
 'succeed': 36,
 'got': 37,
 'blue': 38,
 'tick': 39,
 'mark': 40,
 'fb': 41,
 'profil': 42,
 '15': 43,
 'day': 44,
 'one': 45,
 'irresist': 46,
 'flipkartfashionfriday': 47,
 'like': 48,
 'keep': 49,
 'love': 50,
 'custom': 51,
 'wait': 52,
 'long': 53,
 'hope': 54,
 'enjoy': 55,
 'happi': 56,
 'friday': 57,
 'lwwf': 58,
 'second': 59,
 'thought': 60,
 '’': 61,
 'enough': 62,
 'time': 63,
 'dd': 64,
 'new': 65,
 'short': 66,
 'enter': 67,
 'system': 68,
 'sheep': 69,
 'must': 70,
 'buy':

In [8]:
# Convert a tweet to a tensor (as input for model)
def tweet_to_tensor(tweet, vocab_dict=vocab, unk_token='__UNK__', verbose=False):
    '''
    Input: 
        tweet - A string containing a tweet
        vocab_dict - The words dictionary
        unk_token - The special string for unknown tokens
        verbose - Print info durign runtime
    Output:
        tensor_l - A python list
    '''
    # Process the tweet into a list of words, with stop words removed
    word_list = process_tweet(tweet)
    
    if verbose:
        print("List of words from the processed tweet:")
        print(word_list)
        
    # Initialize the list that will contain the unique integer IDs of each word
    tensor_list = []
    
    # Get unique integer ID of __UNK__ token
    unk_ID = vocab_dict[unk_token]
    
    if verbose:
        print(f"The unique integer ID for the unk_token is {unk_ID}")
        
    # for each word in the list:
    for word in word_list:
        
        # Get word's unique integer ID.
        # If word doesn't exist in the vocab dictionary, use unique ID for __UNK__ instead.        
        word_ID = vocab_dict.get(word, unk_ID)
            
        # Append the unique integer ID to the tensor list.
        tensor_list.append(word_ID)
    
    return tensor_list

# Convert array of tweets to array of tensors
def tweets_to_tensors(tweets, vocab_dict=vocab, unk_token='__UNK__', verbose=False):
    res = []
    for i, tweet in enumerate(tweets):
        tensor = tweet_to_tensor(tweet, vocab_dict=vocab_dict, unk_token=unk_token, verbose=verbose)
        res.append(tensor)
    return res

In [16]:
# Transform x values from list of strings to np.array of tensors of equal length
def transform_x(train_x, val_x):
    train_X = tweets_to_tensors(train_x)
    val_X = tweets_to_tensors(val_x)

    # Max size of input vector (max length of a sentence/tweet)
    MAX = max([len(tensor) for tensor in train_X+val_X])

    # Pad with zeros
    train_X_array = np.zeros((len(train_X), MAX))
    val_X_array = np.zeros((len(val_X), MAX))
    for i, tensor in enumerate(train_X):
        train_X_array[i,:len(tensor)] = tensor
    for i, tensor in enumerate(val_X):
        val_X_array[i,:len(tensor)] = tensor

    return train_X_array, val_X_array, MAX

train_X, val_X, MAX = transform_x(train_x, val_x)

print("Shape of train_X =", train_X.shape)
print("Shape of val_X =", val_X.shape)

Shape of train_X = (8000, 51)
Shape of val_X = (2000, 51)


In [17]:
# Transform y values from 1D array to 2D np.array
def transform_y(y):
    Y = y.reshape((len(y),1))
    Y = np.append(Y, np.flip(Y), axis=1)
    return Y

train_Y = transform_y(train_y)
val_Y = transform_y(val_y)

print("Shape of train_Y =", train_Y.shape)
print("Shape of val_Y =", val_Y.shape)

Shape of train_Y = (8000, 2)
Shape of val_Y = (2000, 2)


# Batch Generator
This doesn't end up getting used in the later sections, because Tensorflow automates the batch generating process.

In [103]:
def data_generator(data_pos, data_neg, batch_size, loop, vocab_dict, shuffle=False):
    '''
    Input: 
        data_pos - Set of positive examples (unprocessed tweets, can be from train or validation)
        data_neg - Set of negative examples (unprocessed tweets, can be from train or validation)
        batch_size - number of samples per batch. Must be even
        loop - True or False
        vocab_dict - The words dictionary
        shuffle - Shuffle the data order
    Yield:
        inputs - Subset of positive and negative examples
        targets - The corresponding labels for the subset
        example_weights - An array specifying the importance of each example
        
    '''     

    # Ensure batch_size is even number to allow for equal numbers of positive and negative samples, else assert an error
    assert batch_size % 2 == 0
    
    # Number of positive/negative examples in each batch is half of the batch size
    n_to_take = batch_size // 2
    
    # Use pos_index to walk through the data_pos array, and neg_index to walk through the data_neg array
    pos_index = 0
    neg_index = 0
    
    len_data_pos = len(data_pos)
    len_data_neg = len(data_neg)
    
    # Get arrays with the data indexes
    pos_index_lines = list(range(len_data_pos))
    neg_index_lines = list(range(len_data_neg))
    # Shuffle array if shuffle is set to True
    if shuffle:
        random.shuffle(pos_index_lines)
        random.shuffle(neg_index_lines)
        
    stop = False
    while not stop: 
        # create a batch with positive and negative examples
        batch = []
        
        # PART ONE: Pack n_to_take positive examples
        for i in range(n_to_take): # Start from 0 and increment i up to n_to_take
                    
            # If the positive index goes past the positive dataset ie. we've finished walking through the whole train data
            if pos_index >= len_data_pos: 
                # If loop is set to False, break once we reach the end of the dataset
                if not loop:
                    stop = True
                    break
                # If user wants to keep re-using the data, reset the index
                pos_index = 0
                if shuffle:
                    # Shuffle the index of the positive sample
                    random.shuffle(pos_index_lines)
                    
            # get the tweet as pos_index
            tweet = data_pos[pos_index_lines[pos_index]]
            # convert tweet into tensor of integers representing the processed words
            tensor = tweet_to_tensor(tweet, vocab_dict)
            # append the tensor to the batch list
            batch.append(tensor)
            
            # Increment pos_index by one
            pos_index = pos_index + 1


            
        # PART TWO: Pack n_to_take negative examples
        for i in range(n_to_take): # Using the same batch list, start from 0 and increment i up to n_to_take
            
            # If the negative index goes past the negative dataset,
            if neg_index >= len_data_neg:
                # If loop is set to False, break once we reach the end of the dataset
                if not loop:
                    stop = True 
                    break
                # If user wants to keep re-using the data, reset the index
                neg_index = 0
                if shuffle:
                    # Shuffle the index of the negative sample
                    random.shuffle(neg_index_lines)
                    
            # get the tweet as neg_index
            tweet = data_neg[neg_index_lines[neg_index]]
            # convert the tweet into tensors of integers representing the processed words
            tensor = tweet_to_tensor(tweet, vocab_dict)
            # append the tensor to the batch list
            batch.append(tensor)
            
            # Increment neg_index by one
            neg_index += 1


        if stop:
            break;

        # Get the max tweet length (the length of the longest tweet) (you will pad all shorter tweets to have this length)
        max_len = max([len(t) for t in batch]) 
        
        
        # Initialize the input_l, which will store the padded versions of the tensors
        tensor_pad_l = []
        # Pad shorter tweets with zeros
        for tensor in batch:
            # Get the number of positions to pad for this tensor so that it will be max_len long
            n_pad = max_len - len(tensor)
            # Generate a list of zeros, with length n_pad
            pad_l = [0 for i in range(n_pad)]
            # concatenate the tensor and the list of padded zeros
            tensor_pad = tensor + pad_l
            
            # append the padded tensor to the list of padded tensors
            tensor_pad_l.append(tensor_pad)
        # Convert list of padded tensors to a numpy array, as the model inputs
        inputs = np.array(tensor_pad_l)
        
        # Generate list of targets for the positive examples (a list of ones), length = no. of positive examples in the batch
        target_pos = [1 for i in range(batch_size//2)]
        # Generate the list of targets for the negative examples (a list of zeros), length = no. of negative examples in the batch
        target_neg = [0 for i in range(batch_size//2)]
        
        # Concatenate the positve and negative targets into a numpy array
        targets = np.array(target_pos + target_neg)

        # Example weights: Treat all examples equally importantly.
        example_weights = np.ones_like(targets)
        

        # note we use yield and not return
        yield inputs, targets, example_weights

# Model

Now we implement a neural networks classifier as below:

<img src = "images/nn.jpg" style="width:400px;height:250px;"/>

## Construction

In [68]:
# Function that returns an untrained model
def classifier(vocab_size=len(vocab), embedding_dim=256, output_dim=2):
            
    # Embedding layer 
    embed_layer = layers.Embedding(
        input_dim=vocab_size, 
        output_dim=embedding_dim
    )
    # Mean layer, to create an "average" word embedding
    class meanLayer(layers.Layer):
        def __init__(self):
            super(meanLayer, self).__init__()
            self.mean = None
        def call(self, inputs):
            self.mean = tf.reduce_mean(inputs, axis=1)
            return self.mean
    mean_layer = meanLayer()
    # Dense layer, one unit for each output, with softmax axtivation
    dense_output_layer = layers.Dense(input_dim=embedding_dim, units=output_dim, activation='softmax')
    
    # Combine all layers
    model = keras.Sequential([
        embed_layer, # embedding layer
        mean_layer, # mean layer
        dense_output_layer, # dense output layer with softmax activation
    ]) 
    
    # return the model of type
    return model

In [69]:
model = classifier()
model.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, None, 256)         2326528   
                                                                 
 mean_layer_12 (meanLayer)   (None, 256)               0         
                                                                 
 dense_12 (Dense)            (None, 2)                 514       
                                                                 
Total params: 2,327,042
Trainable params: 2,327,042
Non-trainable params: 0
_________________________________________________________________


In [70]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics='accuracy')

## Training

In [71]:
model.fit(train_X, train_Y, epochs=5, batch_size=16, shuffle=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x11daf85c950>

## Testing

In [83]:
def test_model(model, val_X, val_Y):
    Y_hat = model.predict(val_X)
    is_pos = Y_hat[:,0] > Y_hat[:,1]
    is_pos = is_pos.astype(np.int32)
    accuracy = accuracy_score(is_pos, val_Y[:,0])
    return accuracy

accuracy = test_model(model, val_X, val_Y)
accuracy



0.9955

In [102]:
def predict(model, tweet, MAX=MAX):
    tensor = tweet_to_tensor(tweet)
    assert len(tensor) <= MAX
    
    # Pad with 0s
    X = tensor + [0 for i in range(MAX - len(tensor))]
    X = np.array([tensor])
    
    Y = model.predict(X)
    pos_score = Y[0][0]

    return pos_score, "positive" if pos_score>=0.5 else "negative"

In [138]:
tweet = "I could not even sleep for the past few days"

prediction = predict(model, tweet)
prediction



(0.6548583, 'positive')

# Appendix??

In [7]:
relu = layers.ReLU() # threshold=0, max_value=None

# Inputs
x = np.array([-2, -1, 0, 1, 2])
print("x =", x)
# Outputs
y = relu(x)
print("y =", y)


x = [-2 -1  0  1  2]
y = tf.Tensor([0 0 0 1 2], shape=(5,), dtype=int32)


In [10]:
V = 100; m = 50; N = 20

# DNN
model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=64, activation='relu', input_shape=(V, m)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(units=32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(units=1, activation='relu'),  # regression
])

# RNN for NLP
model = tf.keras.Sequential([
    layers.Embedding(input_dim=V, output_dim=N, input_length=1000),
    layers.Bidirectional(layers.LSTM(32, return_sequences=True)),
    layers.Bidirectional(layers.LSTM(16)),
    layers.Dropout(0.3),
    layers.Dense(16, activation='relu'),
    layers.Dense(6, activation='softmax')  # multiclass classification
])