# IMDB Sentiment Analysis

In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
#import Libraries
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [13]:
!pip install --upgrade gensim
!pip install transformers



In [14]:
#reading the csv files
train = pd.read_csv('drive/My Drive/data/Train.csv')
test = pd.read_csv('drive/My Drive/data/Test.csv')
valid = pd.read_csv('drive/My Drive/data/Valid.csv')

In [15]:
train.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [None]:
print(train.shape)
print(test.shape)
print(valid.shape)

(40000, 2)
(5000, 2)
(5000, 2)


## Text Preprocessing

In [2]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
#download nltk resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### To-do:
- remove special characters e.g tags, etc.
- convert text to lowercase
- tokenize the text (split it into individual tokens or words)
- remove stop words
- stemming(removing the suffixes from words to create a stem) & lemmitization(convert word to its base form)

In [4]:
def preprocess_text(text):

    # Remove HTML tags
    text = re.sub('<[^>]*>', '', text)

    # Remove non-alphabetic characters and convert to lowercase
    text = re.sub('[^a-zA-Z]', ' ', text).lower()

    # Tokenize the text
    words = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    # Combine words back into a single string
    preprocessed_text = ' '.join(words)

    return preprocessed_text

In [5]:
#testing the function
preprocess_text("This is Melodie, and she's doing nlp for the first time <tgif>. Hoorahh!!")

'melodie nlp first time hoorahh'

#### Preprocessing our data

In [10]:
train['preprocessed_text'] = train['text'].apply(preprocess_text)
test['preprocessed_text'] = test['text'].apply(preprocess_text)
valid['preprocessed_text'] = valid['text'].apply(preprocess_text)

In [11]:
train.head()

Unnamed: 0,text,label,preprocessed_text
0,I grew up (b. 1965) watching and loving the Th...,0,grew b watching loving thunderbird mate school...
1,"When I put this movie in my DVD player, and sa...",0,put movie dvd player sat coke chip expectation...
2,Why do people who do not know what a particula...,0,people know particular time past like feel nee...
3,Even though I have great interest in Biblical ...,0,even though great interest biblical movie bore...
4,Im a die hard Dads Army fan and nothing will e...,1,im die hard dad army fan nothing ever change g...


## Vectorization & Models


In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec

In [None]:
X_train = train['preprocessed_text']
y_train = train['label']
X_test = test['preprocessed_text']
y_test = test['label']
X_valid = valid['preprocessed_text']
y_valid = valid['label']

#### SVM with Bag of Wordss and TFID Vectorizer

In [None]:
#we'll create a function that uses SVM to train data with Bag of Words, and TFID vectorization techniques

def train_svm_with_representations(train_data, test_data, representation):
    if representation == 'bow':
        vectorizer = CountVectorizer()
    elif representation == 'tfidf':
        vectorizer = TfidfVectorizer()
    else:
        raise ValueError("Invalid representation. Choose 'bow' or 'tfidf'.")

    X_train = vectorizer.fit_transform(train_data)
    X_test = vectorizer.transform(test_data)

    clf = SVC()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    return y_pred


###### Bag of Words

In [None]:
%%time
y_pred_bow = train_svm_with_representations(X_train, X_test, 'bow')
accuracy_bow = accuracy_score(y_test, y_pred_bow)
print(accuracy_bow)

0.8804


##### TFID Vectorizer

In [None]:
%%time
y_pred_tfidf = train_svm_with_representations(X_train, X_test, 'tfidf')
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
print(accuracy_tfidf)

0.9004


#### SVM with Custom Word2Vec Vectorizer

In [None]:
def get_word2vec_embeddings(data):
    tokenized_sentences = [sentence.split() for sentence in data]
    model = Word2Vec(tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)
    embeddings = np.array([np.mean([model.wv[word] for word in sentence], axis=0) for sentence in tokenized_sentences])

    return embeddings

def train_svm_with_word2vec(train_data, test_data):
    X_train = get_word2vec_embeddings(train_data)
    X_test = get_word2vec_embeddings(test_data)

    clf = SVC()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    return y_pred

In [None]:
%%time
y_pred_word2vec = train_svm_with_word2vec(X_train, X_test)
accuracy_word2vec = accuracy_score(y_test, y_pred_word2vec)
print(accuracy_word2vec)

0.5262
CPU times: user 3min 46s, sys: 1.14 s, total: 3min 47s
Wall time: 3min 22s


#### SVM with Pretrained Google News 300 Word2Vec Vectorizer

In [None]:
import gensim.downloader as api

def get_google_word2vec_embeddings(data):
    # Load the Google News Word2Vec model
    model = api.load("word2vec-google-news-300")

    tokenized_sentences = [sentence.split() for sentence in data]
    embeddings = []

    for sentence in tokenized_sentences:
        sentence_embeddings = []
        for word in sentence:
            if word in model:
                sentence_embeddings.append(model[word])
        if sentence_embeddings:
            embeddings.append(np.mean(sentence_embeddings, axis=0))
        else:
            embeddings.append(np.zeros(300))

    return np.array(embeddings)

def train_svm_with_google_word2vec(train_data, test_data):
    X_train = get_google_word2vec_embeddings(train_data)
    X_test = get_google_word2vec_embeddings(test_data)

    clf = SVC()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    return y_pred


In [None]:
y_pred_google_word2vec = train_svm_with_google_word2vec(X_train, X_test)
accuracy_google_word2vec = accuracy_score(y_test, y_pred_google_word2vec)
print(accuracy_google_word2vec)

0.8574


####  Recurrent Neural Networks

##### Vanilla RNN

Using preprocessed text

In [7]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder

In [None]:
# Tokenize the data
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train['preprocessed_text'])

In [None]:
X_train = tokenizer.texts_to_sequences(train['preprocessed_text'])
X_test = tokenizer.texts_to_sequences(test['preprocessed_text'])
X_valid = tokenizer.texts_to_sequences(valid['preprocessed_text'])

X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)
X_valid = pad_sequences(X_valid, maxlen=max_len)

y_train = to_categorical(train['label'])
y_test = to_categorical(test['label'])
y_valid = to_categorical(valid['label'])

In [None]:
# Build the model
model = Sequential()
model.add(Embedding(max_words, 128, input_length=max_len))
model.add(SimpleRNN(64))
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model
history = model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=25, batch_size=128)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)

print("Test accuracy:", accuracy)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Test accuracy: 0.7942000031471252


Using raw text

In [None]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train['text'])

X_train1 = tokenizer.texts_to_sequences(train['text'])
X_test1 = tokenizer.texts_to_sequences(test['text'])
X_valid1 = tokenizer.texts_to_sequences(valid['text'])

X_train1 = pad_sequences(X_train1, maxlen=max_len)
X_test1 = pad_sequences(X_test1, maxlen=max_len)
X_valid1 = pad_sequences(X_valid1, maxlen=max_len)

y_train1 = to_categorical(train['label'])
y_test1 = to_categorical(test['label'])
y_valid1 = to_categorical(valid['label'])

# Build the model
model = Sequential()
model.add(Embedding(max_words, 128, input_length=max_len))
model.add(SimpleRNN(64))
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Set up early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [None]:
# Train the model
history = model.fit(X_train1, y_train1, validation_data=(X_valid1, y_valid1), epochs=25, batch_size=128,
                    callbacks=[early_stop])

# Evaluate the model
loss, accuracy = model.evaluate(X_test1, y_test1)

print("Test accuracy:", accuracy)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Test accuracy: 0.8551999926567078


##### LSTM (Long Short Term Memory) RNN

Using Preprocessed text

In [None]:
from tensorflow.keras.layers import LSTM

In [None]:
# Build the model
lstm = Sequential()
lstm.add(Embedding(max_words, 128, input_length=max_len))
lstm.add(LSTM(64))
lstm.add(Dense(2, activation='softmax'))

# Compile the model
lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

history = lstm.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=25, batch_size=128,
                    callbacks=[early_stop])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25


In [None]:
# Evaluate the model
loss, accuracy = lstm.evaluate(X_test, y_test)

print("Test accuracy:", accuracy)

Test accuracy: 0.8790000081062317


Using Raw text

In [None]:
# Build the model
lstm = Sequential()
lstm.add(Embedding(max_words, 128, input_length=max_len))
lstm.add(LSTM(64))
model.add(Dropout(0.5))
lstm.add(Dense(2, activation='softmax'))

# Compile the model
lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

history = lstm.fit(X_train1, y_train1, validation_data=(X_valid1, y_valid1), epochs=25, batch_size=128,
                    callbacks=[early_stop])

# Evaluate the model
loss, accuracy = lstm.evaluate(X_test1, y_test1)

print("Test accuracy:", accuracy)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Test accuracy: 0.868399977684021


##### GRU (Gated Recurrent Unit) RNN

Using preprocessed text

In [None]:
from tensorflow.keras.layers import GRU

In [None]:
# Build the model
gru = Sequential()
gru.add(Embedding(max_words, 128, input_length=max_len))
gru.add(GRU(64))
gru.add(Dense(2, activation='softmax'))

# Compile the model
gru.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Fit the model
history = gru.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=25, batch_size=128,
                    callbacks=[early_stop])

# Evaluate the model
loss, accuracy = gru.evaluate(X_test, y_test)

print("Test accuracy:", accuracy)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Test accuracy: 0.8799999952316284


Using raw text

In [None]:
# Build the model
gru = Sequential()
gru.add(Embedding(max_words, 128, input_length=max_len))
gru.add(GRU(64))
gru.add(Dropout(0.5))
gru.add(Dense(2, activation='softmax'))

# Compile the model
gru.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Fit the model
history = gru.fit(X_train1, y_train1, validation_data=(X_valid1, y_valid1), epochs=25, batch_size=128,
                    callbacks=[early_stop])

# Evaluate the model
loss, accuracy = gru.evaluate(X_test1, y_test1)

print("Test accuracy:", accuracy)


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Test accuracy: 0.8561999797821045


##### Bi-Directional LSTM RNN

Using preprocessed text

In [None]:
from tensorflow.keras.layers import LSTM, Bidirectional

In [None]:
# Build the model
bi_lstm = Sequential()
bi_lstm.add(Embedding(max_words, 128, input_length=max_len))
bi_lstm.add(Bidirectional(LSTM(64)))
bi_lstm.add(Dense(2, activation='softmax'))

# Compile the model
bi_lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Fit the model
history = bi_lstm.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=25, batch_size=128,
                    callbacks=[early_stop])

# Evaluate the model
loss, accuracy = bi_lstm.evaluate(X_test, y_test)

print("Test accuracy:", accuracy)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Test accuracy: 0.881600022315979


Using raw text

In [33]:
# Build the model
bi_lstm = Sequential()
bi_lstm.add(Embedding(max_words, 128, input_length=max_len))
bi_lstm.add(GRU(64))
bi_lstm.add(Dropout(0.5))
bi_lstm.add(Dense(2, activation='softmax'))

# Compile the model
bi_lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Fit the model
history = bi_lstm.fit(X_train1, y_train1, validation_data=(X_valid1, y_valid1), epochs=25, batch_size=128,
                    callbacks=[early_stop])

# Evaluate the model
loss, accuracy = bi_lstm.evaluate(X_test1, y_test1)

print("Test accuracy:", accuracy)


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Test accuracy: 0.8687999844551086


#### Transformer Models

##### Pre-Trained DistilBERT Model

Using preprocessed text

In [8]:
import os
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer


# Detect TPU and set up distribution strategy
tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
tpu_strategy = tf.distribute.TPUStrategy(tpu)

model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [16]:
#prepare datasets
max_len = 512

X_train = tokenizer(text=train['preprocessed_text'].tolist(), add_special_tokens=True, max_length=max_len, truncation=True, padding=True, return_tensors='tf')
y_train = train['label'].values
X_test = tokenizer(text=test['preprocessed_text'].tolist(), add_special_tokens=True, max_length=max_len, truncation=True, padding=True, return_tensors='tf')
y_test = test['label'].values
X_valid = tokenizer(text=valid['preprocessed_text'].tolist(), add_special_tokens=True, max_length=max_len, truncation=True, padding=True, return_tensors='tf')
y_valid = valid['label'].values

X_train_np = X_train['input_ids'].numpy()
X_valid_np = X_valid['input_ids'].numpy()

# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((X_train_np, y_train)).batch(32)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test['input_ids'], X_test['attention_mask'])).batch(32)
val_dataset = tf.data.Dataset.from_tensor_slices((X_valid_np, y_valid)).batch(32)


In [17]:
with tpu_strategy.scope():
    model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Set up early stopping
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

batch_size = 32
epochs = 10
history = model.fit(train_dataset, epochs=epochs, validation_data=val_dataset, callbacks=[early_stop])

y_pred = model.predict(test_dataset)
y_pred_labels = np.argmax(y_pred.logits, axis=1)
accuracy = np.sum(y_pred_labels == y_test) / len(y_test)
print(f"Test accuracy: {accuracy:.2f}")

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Test accuracy: 0.88


Using raw text

In [13]:
#prepare datasets
max_len = 512

X_train1 = tokenizer(text=train['text'].tolist(), add_special_tokens=True, max_length=max_len, truncation=True, padding=True, return_tensors='tf')
y_train1 = train['label'].values
X_test1 = tokenizer(text=test['text'].tolist(), add_special_tokens=True, max_length=max_len, truncation=True, padding=True, return_tensors='tf')
y_test1 = test['label'].values
X_valid1 = tokenizer(text=valid['text'].tolist(), add_special_tokens=True, max_length=max_len, truncation=True, padding=True, return_tensors='tf')
y_valid1 = valid['label'].values

X_train_np1 = X_train1['input_ids'].numpy()
X_valid_np1 = X_valid1['input_ids'].numpy()

# Create TensorFlow datasets
train_dataset1 = tf.data.Dataset.from_tensor_slices((X_train_np1, y_train1)).batch(32)
test_dataset1 = tf.data.Dataset.from_tensor_slices((X_test1['input_ids'], X_test1['attention_mask'])).batch(32)
val_dataset1 = tf.data.Dataset.from_tensor_slices((X_valid_np1, y_valid1)).batch(32)


In [14]:
with tpu_strategy.scope():
    model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Set up early stopping
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

batch_size = 32
epochs = 10
history = model.fit(train_dataset1, epochs=epochs, validation_data=val_dataset1, callbacks=[early_stop])

y_pred = model.predict(test_dataset1)
y_pred_labels = np.argmax(y_pred.logits, axis=1)
accuracy = np.sum(y_pred_labels == y_test1) / len(y_test1)
print(f"Test accuracy: {accuracy:.2f}")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Test accuracy: 0.93


##### Pre-Trained Roberta Model

For this we'll just use raw text since Neural Networks tend to perform best with the raw text.

In [9]:
model_name = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [16]:
#prepare datasets
max_len = 512

X_train1 = tokenizer(text=train['text'].tolist(), add_special_tokens=True, max_length=max_len, truncation=True, padding=True, return_tensors='tf')
y_train1 = train['label'].values
X_test1 = tokenizer(text=test['text'].tolist(), add_special_tokens=True, max_length=max_len, truncation=True, padding=True, return_tensors='tf')
y_test1 = test['label'].values
X_valid1 = tokenizer(text=valid['text'].tolist(), add_special_tokens=True, max_length=max_len, truncation=True, padding=True, return_tensors='tf')
y_valid1 = valid['label'].values

X_train_np1 = X_train1['input_ids'].numpy()
X_valid_np1 = X_valid1['input_ids'].numpy()

# Create TensorFlow datasets
train_dataset1 = tf.data.Dataset.from_tensor_slices((X_train_np1, y_train1)).batch(16)
test_dataset1 = tf.data.Dataset.from_tensor_slices((X_test1['input_ids'], X_test1['attention_mask'])).batch(16)
val_dataset1 = tf.data.Dataset.from_tensor_slices((X_valid_np1, y_valid1)).batch(16)

In [17]:
with tpu_strategy.scope():
    model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    optimizer = tf.keras.optimizers.Adam(learning_rate=2e-6)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Set up early stopping
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

batch_size = 16
epochs = 10
history = model.fit(train_dataset1, epochs=epochs, validation_data=val_dataset1, callbacks=[early_stop])

y_pred = model.predict(test_dataset1)
y_pred_labels = np.argmax(y_pred.logits, axis=1)
accuracy = np.sum(y_pred_labels == y_test1) / len(y_test1)
print(f"Test accuracy: {accuracy:.2f}")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Test accuracy: 0.95


#### Comparing the Accuracy of all the models for our Sentiment Analysis:
- Accuracy of SVM with BOW: 88.04%
- Accuracy of SVM with Tf-IDF: 90.04%
- Accuracy of SVM with Custom Word2Vec: 52.62%
- Accuracy of SVM with Google Word2Vec: 85.74%
- Accuracy of Vanilla RNN Model: 85.51%
- Accuracy of LSTM Model: 87.90%
- Accuracy of GRU Model: 88.00%
- Accuracy of Bi-Directional LSTM Model: 88.16%
- Accuracy of DistilBERT Model: 93%
- Accuracy of Roberta Model: 95%