# Preprocessing
All code within this section is the same as that of the previous '06' notebook's.

## Import Libraries

In [1]:
import string
import numpy as np
import re
import random
import nltk
nltk.download('twitter_samples')
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, twitter_samples
from nltk.stem import PorterStemmer
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow import keras
from keras import layers

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load Data

In [2]:
# Essential objects
tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
stopwords_english = stopwords.words('english')
stemmer = PorterStemmer()

In [3]:
def load_tweets():
    all_positive_tweets = twitter_samples.strings('positive_tweets.json')
    all_negative_tweets = twitter_samples.strings('negative_tweets.json')  
    return all_positive_tweets, all_negative_tweets

def process_tweet(tweet):
    '''
    Input: 
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet
    
    '''
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and # remove stopwords
            word not in string.punctuation): # remove punctuation
            stem_word = stemmer.stem(word) # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean

In [4]:
def train_val_split():
    # Load positive and negative tweets
    all_positive_tweets, all_negative_tweets = load_tweets()

    # View the total number of positive and negative tweets.
    print(f"The number of positive tweets: {len(all_positive_tweets)}")
    print(f"The number of negative tweets: {len(all_negative_tweets)}")

    # Split positive set into validation and training
    val_pos = all_positive_tweets[4000:] # generating validation set for positive tweets
    train_pos = all_positive_tweets[:4000]# generating training set for positive tweets

    # Split negative set into validation and training
    val_neg = all_negative_tweets[4000:] # generating validation set for negative tweets
    train_neg = all_negative_tweets[:4000] # generating training set for nagative tweets
    
    # Combine training data into one set

    train_x = train_pos + train_neg 

    # Combine validation data into one set
    val_x  = val_pos + val_neg

    # Set the labels for the training set (1 for positive, 0 for negative)
    train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))

    # Set the labels for the validation set (1 for positive, 0 for negative)
    val_y  = np.append(np.ones(len(val_pos)), np.zeros(len(val_neg)))


    return train_pos, train_neg, train_x, train_y, val_pos, val_neg, val_x, val_y

In [5]:
train_pos, train_neg, train_x, train_y, val_pos, val_neg, val_x, val_y = train_val_split()

print(f"length of train_x {len(train_x)}")
print(f"length of val_x {len(val_x)}")

The number of positive tweets: 5000
The number of negative tweets: 5000
length of train_x 8000
length of val_x 2000


In [6]:
# Get vocab based on train_x only
def get_vocab(train_x):
    
    # Include special tokens started with pad, end of line and unk tokens
    vocab = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2} 

    for tweet in train_x:
        processed_tweet = process_tweet(tweet)
        for word in processed_tweet:
            if word not in vocab:
                vocab[word] = len(vocab)
    return vocab

vocab = get_vocab(train_x)
print("Total words in vocab are", len(vocab))
display(vocab)

Total words in vocab are 9088


{'__PAD__': 0,
 '__</e>__': 1,
 '__UNK__': 2,
 'followfriday': 3,
 'top': 4,
 'engag': 5,
 'member': 6,
 'commun': 7,
 'week': 8,
 ':)': 9,
 'hey': 10,
 'jame': 11,
 'odd': 12,
 ':/': 13,
 'pleas': 14,
 'call': 15,
 'contact': 16,
 'centr': 17,
 '02392441234': 18,
 'abl': 19,
 'assist': 20,
 'mani': 21,
 'thank': 22,
 'listen': 23,
 'last': 24,
 'night': 25,
 'bleed': 26,
 'amaz': 27,
 'track': 28,
 'scotland': 29,
 'congrat': 30,
 'yeaaah': 31,
 'yipppi': 32,
 'accnt': 33,
 'verifi': 34,
 'rqst': 35,
 'succeed': 36,
 'got': 37,
 'blue': 38,
 'tick': 39,
 'mark': 40,
 'fb': 41,
 'profil': 42,
 '15': 43,
 'day': 44,
 'one': 45,
 'irresist': 46,
 'flipkartfashionfriday': 47,
 'like': 48,
 'keep': 49,
 'love': 50,
 'custom': 51,
 'wait': 52,
 'long': 53,
 'hope': 54,
 'enjoy': 55,
 'happi': 56,
 'friday': 57,
 'lwwf': 58,
 'second': 59,
 'thought': 60,
 '’': 61,
 'enough': 62,
 'time': 63,
 'dd': 64,
 'new': 65,
 'short': 66,
 'enter': 67,
 'system': 68,
 'sheep': 69,
 'must': 70,
 'buy':

In [7]:
# Convert a tweet to a tensor (as input for model)
def tweet_to_tensor(tweet, vocab_dict=vocab, unk_token='__UNK__', verbose=False):
    '''
    Input: 
        tweet - A string containing a tweet
        vocab_dict - The words dictionary
        unk_token - The special string for unknown tokens
        verbose - Print info durign runtime
    Output:
        tensor_l - A python list
    '''
    # Process the tweet into a list of words, with stop words removed
    word_list = process_tweet(tweet)
    
    if verbose:
        print("List of words from the processed tweet:")
        print(word_list)
        
    # Initialize the list that will contain the unique integer IDs of each word
    tensor_list = []
    
    # Get unique integer ID of __UNK__ token
    unk_ID = vocab_dict[unk_token]
    
    if verbose:
        print(f"The unique integer ID for the unk_token is {unk_ID}")
        
    # for each word in the list:
    for word in word_list:
        
        # Get word's unique integer ID.
        # If word doesn't exist in the vocab dictionary, use unique ID for __UNK__ instead.        
        word_ID = vocab_dict.get(word, unk_ID)
            
        # Append the unique integer ID to the tensor list.
        tensor_list.append(word_ID)
    
    return tensor_list

# Convert array of tweets to array of tensors
def tweets_to_tensors(tweets, vocab_dict=vocab, unk_token='__UNK__', verbose=False):
    res = []
    for i, tweet in enumerate(tweets):
        tensor = tweet_to_tensor(tweet, vocab_dict=vocab_dict, unk_token=unk_token, verbose=verbose)
        res.append(tensor)
    return res

In [8]:
# Transform x values from list of strings to np.array of tensors of equal length
def transform_x(train_x, val_x):
    train_X = tweets_to_tensors(train_x)
    val_X = tweets_to_tensors(val_x)

    # Max size of input vector (max length of a sentence/tweet)
    MAX = max([len(tensor) for tensor in train_X+val_X])

    # Pad with zeros
    train_X_array = np.zeros((len(train_X), MAX))
    val_X_array = np.zeros((len(val_X), MAX))
    for i, tensor in enumerate(train_X):
        train_X_array[i,:len(tensor)] = tensor
    for i, tensor in enumerate(val_X):
        val_X_array[i,:len(tensor)] = tensor

    return train_X_array, val_X_array, MAX

train_X, val_X, MAX = transform_x(train_x, val_x)

print("Shape of train_X =", train_X.shape)
print("Shape of val_X =", val_X.shape)

Shape of train_X = (8000, 51)
Shape of val_X = (2000, 51)


In [9]:
# Transform y values from 1D array to 2D np.array
def transform_y(y):
    Y = y.reshape((len(y),1))
    Y = np.append(Y, np.flip(Y), axis=1)
    return Y

train_Y = transform_y(train_y)
val_Y = transform_y(val_y)

print("Shape of train_Y =", train_Y.shape)
print("Shape of val_Y =", val_Y.shape)

Shape of train_Y = (8000, 2)
Shape of val_Y = (2000, 2)


# Model

Now we implement a neural networks classifier as below:

<img src = "images/nn.jpg" style="width:400px;height:250px;"/>

## Construction

In [92]:
# Function that returns an untrained model
def GRNN(vocab_size=len(vocab), embedding_dim=256, n_GRU_layers=2, output_dim=2):
    """Returns a GRU neural network.
    Args:
        vocab_size (int, optional): Size of the vocabulary. Defaults to 256.
        embedding_dim (int, optional): Depth of embedding (n_units in the GRU cell). Defaults to 512.
        GRU_layers (int, optional): Number of GRU layers. Defaults to 2.
        mode (str, optional): 'train', 'eval' or 'predict', predict mode is for fast inference. Defaults to "train".

    Returns:
        trax.layers.combinators.Serial: A GRU language model as a layer that maps from a tensor of tokens to activations over a vocab set.
    """
    # Embedding layer 
    embed_layer = layers.Embedding(
        input_dim=vocab_size, 
        output_dim=embedding_dim
    )
    # GRU layers
    GRU_layers = None
    if n_GRU_layers > 1:
        GRU_layer = layers.GRU(units=embedding_dim, return_sequences=True, activation='tanh')
        GRU_layers = [GRU_layer] + [layers.GRU(units=embedding_dim, activation='tanh') for _ in range(n_GRU_layers-1)]
    else:
        GRU_layers = [layers.GRU(units=embedding_dim, activation='tanh')]
    # Dense layer, one unit for each output, with softmax axtivation
    dense_output_layer = layers.Dense(input_dim=embedding_dim, units=output_dim, activation='softmax')
    
    # Combine all layers
    model = keras.Sequential(
        [embed_layer] +
        GRU_layers +
        [dense_output_layer]
    )
    
    # return the model of type
    return model

In [93]:
model = GRNN(n_GRU_layers=1)
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, None, 256)         2326528   
                                                                 
 gru_8 (GRU)                 (None, 256)               394752    
                                                                 
 dense_8 (Dense)             (None, 2)                 514       
                                                                 
Total params: 2,721,794
Trainable params: 2,721,794
Non-trainable params: 0
_________________________________________________________________


In [94]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics='accuracy')

## Training

In [95]:
model.fit(train_X, train_Y, epochs=40, batch_size=32, shuffle=True)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x19590ec2550>

## Testing

In [96]:
def test_model(model, val_X, val_Y):
    Y_hat = model.predict(val_X)
    is_pos = Y_hat[:,0] > Y_hat[:,1]
    is_pos = is_pos.astype(np.int32)
    accuracy = accuracy_score(is_pos, val_Y[:,0])
    return accuracy

accuracy = test_model(model, val_X, val_Y)
accuracy



0.978

In [97]:
def predict(model, tweet, MAX=MAX):
    tensor = tweet_to_tensor(tweet)
    assert len(tensor) <= MAX
    
    # Pad with 0s
    X = tensor + [0 for i in range(MAX - len(tensor))]
    X = np.array([tensor])
    
    Y = model.predict(X)
    pos_score = Y[0][0]

    return pos_score, "positive" if pos_score>=0.5 else "negative"

In [99]:
tweet = "I felt very bad"

prediction = predict(model, tweet)
prediction



(0.99918336, 'positive')