<a href="https://colab.research.google.com/github/MatteoGuglielmi-tech/Polarity-and-Subjectivity-Detection/blob/main/src/MyModel/BERT-Fine-Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT Embedding Fine Tuning

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
rootdir = '/content/gdrive/MyDrive/Colab Notebooks/Polarity-Subjectivity-Detection/'

In [21]:
import tensorflow as tf
import torch

# Get the GPU device name
device = tf.test.gpu_device_name()

if 'GPU' in device:
  print(f'GPU available : {device}')
  device = torch.device('cuda'+device[-2:])
else :
  device = torch.device("cpu")
  raise SystemError("GPU not found, use CPU instead")

GPU available : /device:GPU:0


In [22]:
device

device(type='cuda', index=0)

In [23]:
import pandas as pd


# loading dataset
movie_reviews = pd.read_csv(rootdir+'movie_rews.csv')
subj_obj_dataset = pd.read_csv(rootdir+'subj_obj_dataset.csv')

In [24]:
movie_reviews

Unnamed: 0.1,Unnamed: 0,text,pos,neg
0,0,films adapted comic books plenty success wheth...,1,0
1,1,every movie comes along suspect studio every i...,1,0
2,2,got mail works alot better deserves order make...,1,0
3,3,jaws rare film grabs attention shows single im...,1,0
4,4,moviemaking lot like general manager nfl team ...,1,0
...,...,...,...,...
1995,1995,anything stigmata taken warning releasing simi...,0,1
1996,1996,john boorman zardoz goofy cinematic debacle fu...,0,1
1997,1997,kids hall acquired taste took least season wat...,0,1
1998,1998,time john carpenter great horror director cour...,0,1


In [25]:
subj_obj_dataset

Unnamed: 0.1,Unnamed: 0,text,tag
0,0,"smart and alert , thirteen conversations about...",subj
1,1,"color , musical bounce and warm seas lapping o...",subj
2,2,it is not a mass-market entertainment but an u...,subj
3,3,a light-hearted french film about the spiritua...,subj
4,4,my wife is an actress has its moments in looki...,subj
...,...,...,...
9995,9995,"in the end , they discover that balance in lif...",obj
9996,9996,a counterfeit 1000 tomin bank note is passed i...,obj
9997,9997,enter the beautiful and mysterious secret agen...,obj
9998,9998,after listening to a missionary from china spe...,obj


### Major commands :
- .tokenize(sent)
- .convert_tokens_to_ids(tokenized_sent)
- .encode.plus() [source](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.tokenization_utils_base.PreTrainedTokenizerBase.batch_encode_plus)

In [112]:
# BERT model script from: huggingface.co
from transformers import BertTokenizer, BertModel
from typing import Tuple, List, Dict
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import logging
import gc

# to not see warning everytime
logging.set_verbosity_error()


tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
model = BertModel.from_pretrained("bert-base-uncased").to(device)

In [78]:
type(model)

transformers.models.bert.modeling_bert.BertModel

In [62]:
def embedding(dataset : pd.DataFrame, sentence_column : str) -> Tuple[List]:#Tuple[tf.Tensor]:
    embeddings = {
        'embedding' : [],
        'attention_mask' : []
        }
        
    for sent in dataset[sentence_column]:
        dic_sent_encoding = tokenizer.encode_plus(sent, # untokenized sentence
                                                add_special_tokens = True,  # add '[CLS]' and '[SEP]'
                                                truncation = True,  # truncate to maximum length
                                                padding = "max_length",  # pad to maximum admissible sentence
                                                return_attention_mask = True,  # return attention mask
                                                return_tensors = "pt") # returns tensorflow constant obj
    
        # extracting embeddings and attention masks in list form
        embeddings['embedding'].append(dic_sent_encoding['input_ids'])
        embeddings['attention_mask'].append(dic_sent_encoding['attention_mask'])

    # convert lists of tensors into tensors
    input_ids = torch.cat(embeddings['embedding'], axis=0)
    attention_masks = torch.cat(embeddings['attention_mask'], axis=0)
    
    return input_ids, attention_masks
    #return embeddings['embedding'], embeddings['attention_mask']

In [63]:
ids_mr, attention_masks_mr = embedding(dataset=movie_reviews, sentence_column='text')

In [None]:
ids_mr[0]

In [None]:
attention_masks_mr[0]

In [65]:
ids_subj_obj , attention_masks_subj_obj = embedding(dataset=subj_obj_dataset, sentence_column='text')

In [None]:
ids_subj_obj[0]

In [None]:
attention_masks_subj_obj[0]

In [66]:
print(type(ids_mr))
print(type(ids_mr[0]))

<class 'torch.Tensor'>
<class 'torch.Tensor'>


In [67]:
BATCH_SIZE = 128

In [81]:
def batching_data(dataset: pd.DataFrame, column_name: str ='text', batch_size: int=BATCH_SIZE) -> Tuple[torch.utils.data.DataLoader]:
    '''Function to batch sentences to fit BERT model and get embeddings

        Params:
        ------
            dataset : pd.DataFrame
                dataset to be batched
            column_name : str
                column of 'dataset' DataFrame to catch 
            batch_size : int
                batch size
        
        Return:
        ------
            tuple of dataloader to iterate over
    '''

    # getting encodings and attention masks for whole dataset
    ids, msk = embedding(dataset=dataset, sentence_column=column_name)
    # defining DataLoader that batches ids randomly
    ids_dataloader = DataLoader(dataset=ids, sampler=RandomSampler(ids), batch_size=batch_size, shuffle=False)

    # defining DataLoader that batches masks
    msk_dataloader = DataLoader(dataset=msk, batch_size=batch_size, shuffle=False)

    return ids_dataloader, msk_dataloader

In [77]:
ids_dataloader, _ = batching_data(dataset=movie_reviews)
for idx, ids in enumerate(ids_dataloader):
    print(idx, ids, len(ids))

0 tensor([[  101,  3152,  5967,  ...,  3533, 14913,   102],
        [  101,  2296,  3185,  ...,     0,     0,     0],
        [  101,  2288,  5653,  ...,     0,     0,     0],
        ...,
        [  101, 20705,  2419,  ...,     0,     0,     0],
        [  101,  2360,  5199,  ...,  2290,  3185,   102],
        [  101,  3923,  5722,  ...,  4066,  3291,   102]]) 128
1 tensor([[  101, 18269,  2310,  ...,     0,     0,     0],
        [  101,  2293,  2155,  ...,     0,     0,     0],
        [  101,  3666, 14008,  ...,     0,     0,     0],
        ...,
        [  101,  2204,  5933,  ...,     0,     0,     0],
        [  101, 12731,  4103,  ..., 12731,  4103,   102],
        [  101,  2411,  2056,  ...,     0,     0,     0]]) 128
2 tensor([[  101,  2111,  4906,  ...,  2744,  7616,   102],
        [  101,  3160,  2356,  ...,     0,     0,     0],
        [  101, 21072, 10338,  ...,  7601,  4183,   102],
        ...,
        [  101,  6812, 20349,  ..., 23176,  7828,   102],
        [  101, 1

In [110]:
# embedding taken from last layer of BERT
# avoid touching and computing gradients
# https://towardsdatascience.com/what-is-npy-files-and-why-you-should-use-them-603373c78883

def fine_tune_BERT(model, dataset: pd.DataFrame, sentence_column: str='text', batch_size: int=BATCH_SIZE, device: str=device) -> List:
    """Return Embeddings of dataset

        Params:
        ------
            model: transformers.models.bert.modeling_bert.BertModel
                BertModel to get embeddings from
            dataset : pd.Dataframe
                dataframe containing sentences to embed
            sentence_column : str
                column of 'dataset' to get
            batch_size : int
                batch size
        
        Return:
        ------
            Nothing
    """
    embs = []
    # getting dataloaders
    ids_dataloader, msk_dataloader = batching_data(dataset=dataset)
    # disabling gradients computation --> I'm using a pre-trained net. Don't want to rewrite weights
    with torch.no_grad():
        # iterating through batches
        for idx, (ids,msk) in enumerate(zip(ids_dataloader,msk_dataloader)):

            # move data to device 
            ids = ids.to(device)
            msk = msk.to(device)

            # 'forward pass'
            outputs = model(ids,msk)

            # extracting tensor at last layer : https://github.com/esrel/NLU.Lab.2022.Public/blob/master/notebooks/10_sequence_nn.ipynb
            last_hidden_state = outputs.last_hidden_state

            print(f"Done batch #{idx}")
            print(last_hidden_state[0].cpu().numpy().shape)
            embs.append(last_hidden_state)
            gc.collect()

    return embs

In [111]:
embs = fine_tune_BERT(model=model, dataset=movie_reviews)

Done batch #0
(512, 768)
Done batch #1
(512, 768)
Done batch #2
(512, 768)
Done batch #3
(512, 768)
Done batch #4
(512, 768)
Done batch #5
(512, 768)
Done batch #6
(512, 768)
Done batch #7
(512, 768)
Done batch #8
(512, 768)
Done batch #9
(512, 768)
Done batch #10
(512, 768)
Done batch #11
(512, 768)
Done batch #12
(512, 768)
Done batch #13
(512, 768)
Done batch #14
(512, 768)
Done batch #15
(512, 768)


In [116]:
torch.cuda.empty_cache()
gc.collect()

3295

In [115]:
embs

[tensor([[[-0.2096,  0.2979,  0.2162,  ..., -0.3315,  0.3169,  0.0640],
          [-0.1862, -0.2766,  0.7094,  ...,  0.2248,  1.2239,  0.0567],
          [ 0.3930,  0.1975,  0.6842,  ..., -0.1941, -0.3273, -0.7929],
          ...,
          [-0.7192,  0.4280,  0.7641,  ...,  0.3138, -0.3108, -1.2864],
          [ 0.0133, -0.0689,  0.0690,  ..., -0.3608,  0.9020, -0.4136],
          [ 0.4792,  0.5522,  0.0889,  ..., -0.0378,  0.1840,  0.0069]],
 
         [[-0.5665,  0.4084, -0.1526,  ..., -0.1960,  0.8564, -0.1071],
          [-0.0391, -0.7108, -0.0044,  ...,  0.4230,  0.6107, -0.9881],
          [ 0.7440,  1.0482,  0.1219,  ..., -0.4117,  0.3420, -0.3000],
          ...,
          [ 0.0103,  0.5418,  0.4878,  ..., -0.6001,  0.0078, -1.0462],
          [ 0.0190,  0.4100,  0.5398,  ..., -0.5565, -0.1267, -0.9473],
          [ 0.0309,  0.2969,  0.4816,  ..., -0.4762, -0.0287, -0.8630]],
 
         [[ 0.2285,  0.0806,  0.1325,  ..., -0.5113, -0.2019,  0.1697],
          [-0.7067,  0.5703,

In [89]:
embs[0]

tensor([[[-3.3980e-01,  7.6917e-02,  2.5262e-01,  ..., -3.3652e-01,
           1.1213e-01,  2.4475e-02],
         [-1.5973e-01,  3.1622e-01,  1.2962e+00,  ...,  2.1412e-01,
           5.0686e-01,  5.9950e-02],
         [-2.1416e-01,  6.5749e-01,  5.4433e-01,  ...,  5.0269e-01,
           8.0383e-01, -1.3324e+00],
         ...,
         [-3.5936e-01, -3.3734e-01,  3.4019e-01,  ..., -7.2257e-01,
           2.0028e-02,  1.9825e-03],
         [-8.2911e-01, -2.2191e-02,  5.6317e-02,  ...,  4.4762e-04,
           5.6780e-01, -1.2770e+00],
         [ 3.3751e-01,  5.8908e-01,  2.3102e-01,  ..., -1.6345e-01,
           8.2359e-02,  2.7212e-03]],

        [[ 6.3370e-02,  9.0062e-02,  3.2712e-01,  ..., -6.6207e-02,
           2.6030e-01, -4.9614e-02],
         [-1.9468e-01, -3.9446e-01,  6.6947e-01,  ...,  2.0789e-01,
           2.1416e-01,  7.0328e-02],
         [ 9.2438e-01, -4.2251e-01,  9.2635e-01,  ..., -3.9512e-01,
          -2.9785e-01,  1.4068e-02],
         ...,
         [ 3.5353e-01,  1

In [90]:
type(embs[0])

torch.Tensor

In [93]:
# since working in GPU, the tensors need to be copied to the cpu first
type(embs[0].cpu().numpy())

numpy.ndarray