In [1]:
import pandas as pd
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import tensorflow as tf
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense

import ssl 

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context


nltk.download('stopwords')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


2023-12-13 18:54:56.842851: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joeloscarsson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/joeloscarsson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joeloscarsson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/joeloscarsson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:

stopwords_default = stopwords.words('english')
print('Stopwords in NLTK: ',len(stopwords_default))

Stopwords in NLTK:  179


# Read data

In [3]:
train_df = pd.read_csv('../data/imdb_train_data_small.csv')
test_df = pd.read_csv('../data/imdb_test_data_small.csv')

In [4]:
test_df

Unnamed: 0,text,label
0,This movie has bad writing and bad editing. It...,0
1,I'm still laughing- Not! I'm still asking my m...,0
2,While I'm normally a big fan of John Turturro'...,0
3,<br /><br />The author tried to make a Kevin S...,0
4,Oh boy ! It was just a dream ! What a great id...,0
...,...,...
295,My wife and I struggle to find movies like thi...,1
296,"While watching this film recently, I constantl...",1
297,Trust the excellent and accurate Junagadh75 re...,1
298,Valley Girl is an exceptionally well made film...,1


# Tokenization

Create your own tokenization algorithm. Remember to handle upper/lower case, comma, punctioation and so on.
Each word should hava an integer connected to it. Word as key and integer as value in a dict is one way to do it.

Tensorflow have tokenization models, but try to bild it yourself.

In [5]:
test_df = pd.read_csv('../data/imdb_test_data_small.csv')

test_df
def tokenize(test_df): #-> int: 
    """ Tokenize the text column in the dataframe,
    returns a new column with the tokenized words"""
    
    test_df['text'] = test_df['text'].apply(lambda text: ''.join(c.lower() for c in text if c.isalpha() or c.isspace()).strip())

    text = test_df['text']

    test_df['tokenized_text'] = test_df['text'].apply(lambda row: nltk.word_tokenize(row))

    return test_df


text = tokenize(test_df)
new_df = pd.DataFrame({'tokenized_text': text['tokenized_text']})

tokenized_text = new_df['tokenized_text'].explode().reset_index()


# print(tokenized_text.head())

display(tokenized_text.head(10))
display(tokenized_text['index'].unique())

Unnamed: 0,index,tokenized_text
0,0,this
1,0,movie
2,0,has
3,0,bad
4,0,writing
5,0,and
6,0,bad
7,0,editing
8,0,it
9,0,is


array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

# Remove stopwords

In [6]:
stop_words = set(stopwords.words('english'))
#stop_words

In [7]:
def remove_stopwords(tokenized_text, stop_words):
    """ Remove stopwords from the tokenized text column"""

    tokenized_text['tokenized_text'] = tokenized_text['tokenized_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


    return tokenized_text


tokenized_text = remove_stopwords(tokenized_text.copy(), stop_words)

# Display the new column
tokenized_text = tokenized_text[tokenized_text['tokenized_text'].apply(len) > 0]
# rename column
tokenized_text = tokenized_text.rename(columns={'tokenized_text': 'word'})
tokenized_text.head(14)

Unnamed: 0,index,word
1,0,movie
3,0,bad
4,0,writing
6,0,bad
7,0,editing
10,0,difficult
12,0,follow
15,0,going
18,0,nothing
22,0,characters


# Lemmatization

Is about reducing words to their base form. 

* "running" → "run"
* "better" → "good"
* "mice" → "mouse"

In [8]:
lemmatizer = WordNetLemmatizer()

In [9]:
def lemmatize(lemmatizer, tokenized_text):
    """Lemmatize the tokenized 'word' column"""

    # Function to lemmatize a single word
    def lemmatize_word(word):
        return lemmatizer.lemmatize(word)


    tokenized_text['word'] = tokenized_text['word'].apply(lemmatize_word)

    return tokenized_text

lemmatized_df = lemmatize(lemmatizer, tokenized_text)
lemmatized_df.head(14)  # We can see that some words are lemmatized

Unnamed: 0,index,word
1,0,movie
3,0,bad
4,0,writing
6,0,bad
7,0,editing
10,0,difficult
12,0,follow
15,0,going
18,0,nothing
22,0,character


# Word embedding and sentiment analysis model
We want to create a model that can say if a movie review is bad or good.

- Preprocess the text
- Convert text to seqiuence of integers
- Create architecture that includes embeddings
- Build and train your models
- Evaluate preformance

Building models from scratch is not something you usually do, but those who would like to dig deeper into the math behind Simple RNN, LSTM and GRU can do it by creating the cells from scratch.

In [12]:
def pad_data(embedded_text):
    # All sentences should be of the same lenght, but if a sentence is shorter than the longest, pad it.
    return padded_text

## RNN with tensorflow modules
[Simple RNN cell](https://www.tensorflow.org/api_docs/python/tf/keras/layers/SimpleRNN)

[Embedding](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding)

In [13]:
def build_rnn_model():
    return model

## RNN from scratch

In [14]:
class RNNCell(tf.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.Wxh =
        self.Whh =
        self.bh =

    def __call__(self, x, h):
        h_next = 
        return h_next

SyntaxError: invalid syntax (177378052.py, line 4)

In [None]:
# RNN Model Class
class MyRNNModel(tf.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim=1, sequence_length=100):
        super().__init__()
        self.embedding =
        self.rnn_cell = RNNCell(embedding_dim, hidden_dim)
        self.Why = 
        self.by = 

    def __call__(self, x):
        x = 
        h = 

        # Process the input sequence
        for t in range(sequence_length):
            x_t = x[:, t, :]
            h = self.rnn_cell(x_t, h)

        y = 
        return tf.sigmoid(y)

In [None]:
def train_step(model, inputs, targets):
    clip_norm = 1.0
    with tf.GradientTape() as tape:
        predictions = model(inputs)
        loss = loss_function(targets, predictions)
        
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(model.trainable_variables)
    return loss

In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((padded_train_data, y)).batch(batch_size)
for epoch in range(NUM_EPOCHS):
    epoch_loss = 0
    epoch_accuracy = 0
    total_batches = 0

    for batch_inputs, batch_targets in train_dataset:
        loss = train_step(model, batch_inputs, batch_targets)
        epoch_loss += loss.numpy()

        # Calculate accuracy
        predictions = model(batch_inputs)
        accuracy = calculate_accuracy(batch_targets, predictions)
        epoch_accuracy += accuracy.numpy()

        total_batches += 1

    avg_loss = epoch_loss / total_batches
    avg_accuracy = epoch_accuracy / total_batches
    print(f'Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.4f}')

## LSTM

[LSTM Cell](https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTMCell)

In [None]:
def build_lstm_model():
    return model

## LSTM from scrtch

In [None]:
# LSTM Cell Class
class LSTMCell(tf.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        # Gates: input, forget, cell, output
        self.Wi =
        self.Wf =
        self.Wc =
        self.Wo =
        self.bi =
        self.bf =
        self.bc =
        self.bo =

    def __call__(self, x, h, c):
        combined = tf.concat([x, h], 1)

        i = 
        f = 
        o = 
        c_ = 

        c_new = 
        h_new =

        return h_new, c_new

In [None]:
# LSTM Model Class
class MyLSTMModel(tf.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding =
        self.lstm_cell = LSTMCell(embedding_dim, hidden_dim)
        self.Why =
        self.by =

    def __call__(self, x):
        x =
        h =
        c =

        for t in range(sequence_length):
            x_t = x[:, t, :]
            h, c = self.lstm_cell(x_t, h, c)

        y =
        return tf.sigmoid(y)

In [None]:
def train_step(model, inputs, targets):
    clip_norm = 1.0
    with tf.GradientTape() as tape:
        predictions = model(inputs)
        loss = loss_function(targets, predictions)
        
    gradients = tape.gradient(loss, model.trainable_variables)
    clipped_gradients = [tf.clip_by_norm(g, clip_norm) for g in gradients]
    optimizer.apply_gradients(model.trainable_variables)
    return loss

In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((padded_train_data, y)).batch(batch_size)
for epoch in range(NUM_EPOCHS):
    epoch_loss = 0
    epoch_accuracy = 0
    total_batches = 0

    for batch_inputs, batch_targets in train_dataset:
        loss = train_step(model, batch_inputs, batch_targets)
        epoch_loss += loss.numpy()

        # Calculate accuracy
        predictions = model(batch_inputs)
        accuracy = calculate_accuracy(batch_targets, predictions)
        epoch_accuracy += accuracy.numpy()

        total_batches += 1

    avg_loss = epoch_loss / total_batches
    avg_accuracy = epoch_accuracy / total_batches
    print(f'Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.4f}')

## GRU
[GRU Cell](https://www.tensorflow.org/api_docs/python/tf/keras/layers/GRUCell)

In [None]:
def build_gru_model():
    return model

## GRU from scratch

In [None]:
# GRU Cell Class
class GRUCell(tf.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        # Update gate parameters
        self.Wz =
        self.bz =

        # Reset gate parameters
        self.Wr =
        self.br =

        # Candidate hidden state parameters
        self.Wh =
        self.bh =
        
    def __call__(self, x, h):
        combined = tf.concat([x, h], 1)

        # Update gate
        z =

        # Reset gate
        r =

        # Candidate hidden state
        combined_reset =
        h_candidate =

        # New hidden state
        h_new =

        return h_new

In [None]:
# GRU Model Class
class MyGRUModel(tf.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding =
        self.gru_cell =
        self.Why =
        self.by =

    def __call__(self, x):
        x =
        h =

        for t in range(sequence_length):
            x_t = x[:, t, :]
            h = self.gru_cell(x_t, h)

        y =
        return tf.sigmoid(y)

In [None]:
def train_step(model, inputs, targets):
    clip_norm = 1.0
    with tf.GradientTape() as tape:
        predictions = model(inputs)
        loss = loss_function(targets, predictions)
        
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(model.trainable_variables)
    return loss

In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((padded_train_data, y)).batch(batch_size)
for epoch in range(NUM_EPOCHS):
    epoch_loss = 0
    epoch_accuracy = 0
    total_batches = 0

    for batch_inputs, batch_targets in train_dataset:
        loss = train_step(model, batch_inputs, batch_targets)
        epoch_loss += loss.numpy()

        # Calculate accuracy
        predictions = model(batch_inputs)
        accuracy = calculate_accuracy(batch_targets, predictions)
        epoch_accuracy += accuracy.numpy()

        total_batches += 1

    avg_loss = epoch_loss / total_batches
    avg_accuracy = epoch_accuracy / total_batches
    print(f'Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.4f}')