# Sentiment Analysis of Movie Reviews

In [68]:
!pip install matplotlib



Import the necessary libraries

In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Read dataset

In [70]:
# Load the tab-separated training file and show its (rows, columns) shape.
train_df = pd.read_csv(
    "../data/train.tsv",
    sep="\t",
)
train_df.shape

(156060, 4)

Remove duplicates

In [71]:
# NOTE(review): drop_duplicates() returns a new DataFrame and the result is not
# assigned here, so train_df is left unchanged (the shape displayed below is
# still the full 156060 rows). Assigning it back would keep only the first row
# per SentenceId -- confirm that this is intended, and that enough rows remain
# for the 30000-row sample drawn in the next cell, before doing so.
train_df.drop_duplicates(subset=["SentenceId"], keep="first")
train_df.shape

(156060, 4)

Create sample

In [72]:
# Draw a random sample of 30,000 rows from train_df; random_state=42 makes the draw reproducible.
sample_train = train_df.sample(n=30000, random_state=42)

# Preprocessing 

We need to preprocess the text source before feeding it to BERT. To do so, we download the BertTokenizer

In [73]:
from transformers import BertTokenizer

In [74]:
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [75]:
# Token length that every phrase is padded or truncated to by the tokenizer call below.
# NOTE(review): the training log at the end of this notebook shows an ETA of ~13 h
# per epoch; padding every phrase to 512 tokens is presumably a large part of that
# cost -- check the real token-length distribution before keeping this value.
# The Keras Input layers hard-code the same 512, so both must be changed together.
seq_len = 512
# Number of rows in the sampled training frame.
num_samples = len(sample_train)
num_samples

30000

In [76]:

# Tokenize every sampled phrase into fixed-length NumPy arrays:
# sequences longer than seq_len are truncated, shorter ones are padded up to it.
phrases = sample_train["Phrase"].tolist()
tokens = tokenizer(
    phrases,
    add_special_tokens=True,  # encode with the special tokens expected by the model
    truncation=True,
    max_length=seq_len,
    padding="max_length",
    return_tensors="np",
)

In [77]:
tokens.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [78]:
tokens["input_ids"],tokens["attention_mask"]
# input_ids holds the vocabulary indices of the generated tokens
# attention_mask holds 0s and 1s: 0 marks padded positions (so the model does not attend to them), 1 marks our real tokens

(array([[  101,   112,   188, ...,     0,     0,     0],
        [  101, 20359,  3789, ...,     0,     0,     0],
        [  101,  1104,  1292, ...,     0,     0,     0],
        ...,
        [  101,  1105,   169, ...,     0,     0,     0],
        [  101,  3085,  2618, ...,     0,     0,     0],
        [  101,  1188,  1156, ...,     0,     0,     0]]),
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]]))

In [79]:
#target variable
classes_arr = sample_train["Sentiment"].values
classes_arr

array([2, 4, 2, ..., 2, 2, 0], dtype=int64)

In [80]:
len(classes_arr)

30000

In [81]:
# Start from an all-zero matrix with one row per sample and one column per class.
# Our dataframe has 5 class labels {0, 1, 2, 3, 4}, so the column count is max label + 1.
num_classes = classes_arr.max() + 1
labels = np.zeros((num_samples, num_classes))
labels.shape

(30000, 5)

Build the one-hot labels

In [82]:
# For every row, set to 1 the column whose index is that row's class label.
row_idx = np.arange(num_samples)
labels[row_idx, classes_arr] = 1
labels

array([[0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       ...,
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.]])

"classes_arr" refer to which column must be 1

Building a Dataset

In [83]:
import tensorflow as tf

In [84]:
dataset = tf.data.Dataset.from_tensor_slices((tokens["input_ids"],tokens["attention_mask"],labels))

In [85]:
dataset.take(1)

<_TakeDataset element_spec=(TensorSpec(shape=(512,), dtype=tf.int32, name=None), TensorSpec(shape=(512,), dtype=tf.int32, name=None), TensorSpec(shape=(5,), dtype=tf.float64, name=None))>

In [86]:
def map_func(input_ids, masks, labels):
    """Repack an (input_ids, masks, labels) triple as a (features, labels) pair.

    Args:
        input_ids: token ids for one dataset element.
        masks: attention mask for the same element.
        labels: target for the same element, passed through untouched.

    Returns:
        A 2-tuple ``(features, labels)`` where ``features`` is a dict with the
        keys ``'input_ids'`` and ``'attention_mask'`` -- the same strings used
        as the names of the Keras Input layers later in the notebook.
    """
    features = dict(input_ids=input_ids, attention_mask=masks)
    return features, labels

# then we use the dataset map method to apply this transformation
dataset = dataset.map(map_func)

dataset.take(1)

<_TakeDataset element_spec=({'input_ids': TensorSpec(shape=(512,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(512,), dtype=tf.int32, name=None)}, TensorSpec(shape=(5,), dtype=tf.float64, name=None))>

Shuffle and batch the data

In [87]:
batch_size = 16
# Shuffle once, then batch.
# reshuffle_each_iteration=False keeps the shuffled order fixed across iterations:
# the train/validation split further down is made with take()/skip() on this
# dataset, and with the default behaviour (a new shuffle on every iteration) the
# two partitions would hold different examples each epoch, leaking validation
# examples into training.
# drop_remainder=True discards a final partial batch so every batch has exactly
# batch_size rows.
dataset = dataset.shuffle(10000, reshuffle_each_iteration=False).batch(batch_size, drop_remainder=True)

In [88]:
dataset.take(1)

<_TakeDataset element_spec=({'input_ids': TensorSpec(shape=(16, 512), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(16, 512), dtype=tf.int32, name=None)}, TensorSpec(shape=(16, 5), dtype=tf.float64, name=None))>

Split the data into training and validation sets

In [89]:
# Fraction of the batches used for training
# (the first accuracy figure, on 20,000 samples, was obtained with 0.9).
split = 0.8
n_batches = tokens["input_ids"].shape[0] / batch_size
# Number of batches assigned to the training set.
size = int(n_batches * split)
size

1500

In [90]:
train_ds = dataset.take(size)
val_ds = dataset.skip(size)

In [91]:
train_ds.take(1)

<_TakeDataset element_spec=({'input_ids': TensorSpec(shape=(16, 512), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(16, 512), dtype=tf.int32, name=None)}, TensorSpec(shape=(16, 5), dtype=tf.float64, name=None))>

In [92]:
train_ds

<_TakeDataset element_spec=({'input_ids': TensorSpec(shape=(16, 512), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(16, 512), dtype=tf.int32, name=None)}, TensorSpec(shape=(16, 5), dtype=tf.float64, name=None))>

Now we load the pretrained BERT model

In [93]:
from transformers import TFAutoModel

In [94]:
bert = TFAutoModel.from_pretrained("bert-base-cased")

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [95]:
bert.summary()

Model: "tf_bert_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108310272 
                                                                 
Total params: 108,310,272
Trainable params: 108,310,272
Non-trainable params: 0
_________________________________________________________________


Building the model architecture

In [96]:
'''
Building the model architecture: The input layers for the input tokens and attention masks are defined. The BERT model is applied to the inputs, and the final activations are extracted. Additional dense layers are added on top of the BERT embeddings to map them to the desired output classes. 
'''

'\nBuilding the model architecture: The input layers for the input tokens and attention masks are defined. The BERT model is applied to the inputs, and the final activations are extracted. Additional dense layers are added on top of the BERT embeddings to map them to the desired output classes. \n'

In [97]:
# Architecture, in three stages:
#   1. Input layers
#   2. BERT layer
#   3. Dense layers

# The input width reuses seq_len so it always matches the length the tokenizer padded to.
input_ids = tf.keras.layers.Input(shape=(seq_len,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(seq_len,), name='attention_mask', dtype='int32')

# We access the transformer layer inside our bert object through its bert
# attribute (bert.bert instead of bert).
# Index [1] of its output is the pooled, sequence-level output (one vector per
# input sequence); index [0] would be the per-token last hidden state.
embeddings = bert.bert(input_ids, attention_mask=mask)[1]

# Convert the pooled BERT embedding into probabilities over the 5 output classes.
x = tf.keras.layers.Dense(1024, activation='relu')(embeddings)
y = tf.keras.layers.Dense(5, activation='softmax', name='outputs')(x)

In [98]:
# Assemble the Keras model from the two input layers and the softmax output.
model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

# Freeze the BERT layer so its weights are not updated during training; only the
# dense layers on top are trained.
# The layer is addressed directly (it is the same bert.bert object that was
# called when the graph was built) instead of by its position in model.layers,
# which would silently point at a different layer if the architecture changed.
bert.bert.trainable = False

# Print out the model summary.
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                         

The model takes tokenized input sequences and attention masks as input. It passes the input through the BERT model to obtain embeddings. These embeddings are then passed through dense layers for further transformation. Finally, the model produces output probabilities for each sentiment class using a softmax activation function. The BERT layer is frozen, and only the additional dense layers are trained during model training.

In [99]:
# Training setup: Adam with a small learning rate, categorical cross-entropy on
# the one-hot labels, and categorical accuracy reported under the name 'accuracy'.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy(name='accuracy')

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[acc],
)

In [100]:
history = model.fit(train_ds, validation_data=val_ds, epochs=3)

Epoch 1/3
  99/1500 [>.............................] - ETA: 13:07:46 - loss: 1.3329 - accuracy: 0.4924

In [None]:
#0.5035