In [1]:
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
from nltk.corpus import stopwords   # to get collection of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import re


In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m77.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.2-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.5/268.5 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m112.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m68.6 MB/s[0m eta [36m0:00:

In [24]:
data = pd.read_csv('/content/IMDB Dataset.csv')
import random

# Assuming you have a list or array of 50,000 reviews called 'reviews'
sampled_data = data.sample(n=5000, random_state=42)
#If your system having hing high ram and amazing GPU you can increase the sample size

print(sampled_data )

                                                  review sentiment
33553  I really liked this Summerslam due to the look...  positive
9427   Not many television shows appeal to quite as m...  positive
199    The film quickly gets to a major chase scene w...  negative
12447  Jane Austen would definitely approve of this o...  positive
39489  Expectations were somewhat high for me when I ...  negative
...                                                  ...       ...
39885  One of eastwood's best movies after he had sep...  positive
17566  My blurred childhood memories have kept the ec...  negative
16062  I love Zombie-Movies and I love amateur-produc...  negative
48445  Chan is in New York and he gets involved with ...  positive
20382  My wife and I both thought this film a watered...  negative

[5000 rows x 2 columns]


Stop Word is a commonly used words in a sentence, usually a search engine is programmed to ignore this words (i.e. "the", "a", "an", "of", etc.)
Declaring the english stop words

In [6]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
english_stops = set(stopwords.words('english'))

**Data Preprocessing**

In [8]:
def load_dataset():
    #df = pd.read_csv('IMDB Dataset.csv')
    x_data = sampled_data ['review']       # Reviews/Input
    y_data = sampled_data ['sentiment']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    #x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case
    x_data = x_data.apply(lambda review: ' '.join([w.lower() for w in review]))   # convert to lowercase and join as string


    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()


Splitting The Data

In [11]:
x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

**Tokenizing the Data**

In [12]:
import tensorflow as tf

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

tokenized_data_train = tokenizer(x_train.to_list(),return_tensors = 'np',padding=True,truncation=True)
tokenized_data_test = tokenizer(x_val.to_list(),return_tensors = 'np',padding=True,truncation=True)

label_train = np.array(y_train)
label_test = np.array(y_val)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [13]:
model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])




In [15]:
# Train the model
model.fit(
    dict(tokenized_data_train),
    label_train,
    validation_data=(dict(tokenized_data_test),label_test),
    epochs=2,
    batch_size=10
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f978440bbb0>

Prediction

In [17]:
y_pred = model.predict(dict(tokenized_data_test))['logits']
y_pred[:5]



array([[-1.5500342,  1.1846002],
       [ 2.5752804, -2.4035392],
       [-1.2753054,  1.0895375],
       [ 2.6860976, -2.5132704],
       [-0.8292189,  0.642058 ]], dtype=float32)

In [18]:
y_test_prob = tf.nn.softmax(y_pred)
y_test_prob[:5]

<tf.Tensor: shape=(5, 2), dtype=float32, numpy=
array([[0.06096033, 0.9390397 ],
       [0.99316484, 0.00683514],
       [0.08589318, 0.9141068 ],
       [0.9945102 , 0.00548975],
       [0.18674861, 0.81325144]], dtype=float32)>

In [19]:
y_test_class_pred = np.argmax(y_test_prob,axis=1)
y_test_class_pred[:5]

array([1, 0, 1, 0, 1])

In [22]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test_class_pred,y_val)

0.875

**Applying Learning Decay** , It  can be done to improve model performance

In [None]:
from tensorflow.keras.optimizers.schedules import PolynomialDecay
batch_size = 8
num_epochs = 3
num_train_steps = (len(train_tokenized)// batch_size) * num_epochs
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5,
    end_learning_rate=0.,
    decay_steps=num_train_steps
)

In [None]:
from tensorflow.keras.optimizers import Adam

opt = Adam(learning_rate=lr_scheduler)
model.compile(loss=loss, optimizer=opt)