<a href="https://colab.research.google.com/github/MatteoGuglielmi-tech/Polarity-and-Subjectivity-Detection/blob/main/src/MyModel/BERT-Fine-Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT Embedding Fine Tuning

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 14.5 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 47.0 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 52.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
rootdir = '/content/gdrive/MyDrive/Colab Notebooks/Polarity-Subjectivity-Detection/'

In [4]:
import tensorflow as tf
import torch

# Get the GPU device name
device = tf.test.gpu_device_name()

if 'GPU' in device:
  print(f'GPU available : {device}')
  device = torch.device('cuda'+device[-2:])
else :
  device = torch.device("cpu")
  raise SystemError("GPU not found, use CPU instead")

GPU available : /device:GPU:0


In [5]:
device

device(type='cuda', index=0)

In [6]:
import pandas as pd


# loading dataset
movie_reviews = pd.read_csv(rootdir+'movie_rews.csv')
subj_obj_dataset = pd.read_csv(rootdir+'subj_obj_dataset.csv')

In [7]:
movie_reviews

Unnamed: 0.1,Unnamed: 0,text,labels
0,0,films adapted comic books plenty success wheth...,1
1,1,every movie comes along suspect studio every i...,1
2,2,got mail works alot better deserves order make...,1
3,3,jaws rare film grabs attention shows single im...,1
4,4,moviemaking lot like general manager nfl team ...,1
...,...,...,...
1995,1995,anything stigmata taken warning releasing simi...,0
1996,1996,john boorman zardoz goofy cinematic debacle fu...,0
1997,1997,kids hall acquired taste took least season wat...,0
1998,1998,time john carpenter great horror director cour...,0


In [8]:
subj_obj_dataset

Unnamed: 0.1,Unnamed: 0,text,labels
0,0,"smart and alert , thirteen conversations about...",1
1,1,"color , musical bounce and warm seas lapping o...",1
2,2,it is not a mass-market entertainment but an u...,1
3,3,a light-hearted french film about the spiritua...,1
4,4,my wife is an actress has its moments in looki...,1
...,...,...,...
9995,9995,"in the end , they discover that balance in lif...",0
9996,9996,a counterfeit 1000 tomin bank note is passed i...,0
9997,9997,enter the beautiful and mysterious secret agen...,0
9998,9998,after listening to a missionary from china spe...,0


In [9]:
subjective = subj_obj_dataset[['text','labels']][:len(subj_obj_dataset)//2].sample(n=1000)

In [10]:
objective = subj_obj_dataset[['text', 'labels']][len(subj_obj_dataset)//2:].sample(n=1000)

In [11]:
subj_obj_dataset = pd.concat([subjective,objective], axis=0)

In [12]:
subj_obj_dataset

Unnamed: 0,text,labels
1806,the values that have held the enterprise crew ...,1
4754,movies like this are selling the old european ...,1
4850,succeeds very well in its primary aim of makin...,1
1759,coral reef adventure is a heavyweight film tha...,1
1821,"there's a neat twist , subtly rendered , that ...",1
...,...,...
5518,"directed by rob bowman ( "" the x-files "" ) , ""...",0
7488,"el toro&#180 ; s sons , cain and abel , were s...",0
5187,"in addition , the detective has to deal with a...",0
9040,"while mitchell is there , his mother shows up ...",0


### Major commands :
- .tokenize(sent)
- .convert_tokens_to_ids(tokenized_sent)
- .encode.plus() [source](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.tokenization_utils_base.PreTrainedTokenizerBase.batch_encode_plus)

In [13]:
# BERT model script from: huggingface.co
from transformers import BertTokenizer, BertModel
from typing import Tuple, List, Dict
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import logging
import gc

# to not see warning everytime
logging.set_verbosity_error()


tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
model = BertModel.from_pretrained("bert-base-uncased").to(device)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [14]:
type(model)

transformers.models.bert.modeling_bert.BertModel

In [15]:
type(tokenizer)

transformers.models.bert.tokenization_bert.BertTokenizer

In [24]:
def embedding(dataset : pd.DataFrame, sentence_column: str, tokenizer: BertTokenizer) -> Tuple[torch.Tensor]:  # Tuple[List]:
    '''Extract embedding information using passed tokenizer and model
        Params:
        ------
            dataset : pd.DataFrame
                dataframe containig sentences to be encoded
            sentence_column : str
                column in the beforementioned dataframe to be encoded
            tokenizer : transformers.models.bert.tokenization_bert.BertTokenizer
                tokenizer used to process sentences and extract related information
        Return:
        ------
            Return encodings and attention masks as pytorch tensors
    '''
    
    embeddings = {
        'embedding' : [],
        'attention_mask' : []
        }
        
    for sent in dataset[sentence_column]:
        dic_sent_encoding = tokenizer.encode_plus(sent, # untokenized sentence
                                                add_special_tokens = True,  # add '[CLS]' and '[SEP]'
                                                truncation = True,  # truncate to maximum length
                                                padding = "max_length",  # pad to maximum admissible sentence
                                                return_attention_mask = True,  # return attention mask
                                                return_tensors = "pt") # returns pytorch tensors
    
        # extracting embeddings and attention masks in list form
        embeddings['embedding'].append(dic_sent_encoding['input_ids'])
        embeddings['attention_mask'].append(dic_sent_encoding['attention_mask'])

    # convert lists of tensors into tensors
    #input_ids = torch.cat(embeddings['embedding'], axis=0)
    #attention_masks = torch.cat(embeddings['attention_mask'], axis=0)
    
    #return input_ids, attention_masks
    return embeddings['embedding'], embeddings['attention_mask']
    #return dic_sent_encoding

In [26]:
#embedding(movie_reviews, 'text', tokenizer)

In [50]:
#mr_encoding, attention_masks_mr = embedding(dataset=movie_reviews, sentence_column='text', tokenizer=tokenizer)

In [51]:
#type(mr_encoding), type(mr_encoding[0])

In [34]:
#to display a sample tensor uncomment following lines
#mr_encoding[0], attention_masks_mr[0]

In [52]:
#so_encoding, attention_masks_so = embedding(dataset=subj_obj_dataset, sentence_column='text', tokenizer=tokenizer)

In [46]:
#to display a sample tensor uncomment following lines
#so_encoding[0], attention_masks_so[0]

In [47]:
#to investigate size of a tensor uncomment following lones
#so_encoding[0].size(), attention_masks_so[0].size()

In [49]:
BATCH_SIZE = 128 # reduced due to RAM limitations

In [58]:
def batching_data(dataset: pd.DataFrame, tokenizer: BertTokenizer, column_name: str='text', batch_size: int=BATCH_SIZE) -> Tuple[torch.utils.data.DataLoader]:
    '''Function to batch sentences to fit BERT model and get embeddings

        Params:
        ------
            dataset : pd.DataFrame
                dataset to be batched
            column_name : str
                column of 'dataset' DataFrame to catch 
            batch_size : int
                batch size
        
        Return:
        ------
            tuple of dataloader to iterate over
    '''

    # getting encodings and attention masks for whole dataset
    emb, msk = embedding(dataset=dataset, sentence_column=column_name, tokenizer=tokenizer)

    # defining DataLoader that batches ids randomly
    #emb_dataloader = DataLoader(dataset=emb, batch_size=batch_size, shuffle=False)
    for e in zip(emb,msk):
        output = model(e[0].to(device),e[1].to(device))
    # defining DataLoader that batches masks
    #msk_dataloader = DataLoader(dataset=msk, batch_size=batch_size, shuffle=False)

    #return ids_dataloader, msk_dataloader
    return output

In [59]:
batching_data(movie_reviews, tokenizer)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.4424,  0.0517,  0.0861,  ..., -0.3419,  0.3107,  0.0740],
         [-0.1545, -0.0906,  0.7247,  ...,  0.1055,  1.3482, -0.1013],
         [-0.2433, -0.7178,  1.0836,  ...,  0.5467,  0.7199, -0.3362],
         ...,
         [ 0.2447,  0.2098,  0.5096,  ...,  0.0152, -0.0250, -0.5137],
         [ 0.1360,  0.2764,  0.5797,  ..., -0.0135, -0.0326, -0.5723],
         [ 0.1134,  0.0813,  0.5243,  ..., -0.0306,  0.0437, -0.4213]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-4.3934e-01, -4.4720e-01, -4.2942e-01,  1.4759e-01, -1.4939e-01,
         -1.2933e-01, -4.4358e-02,  3.8541e-01, -1.1615e-01, -9.9960e-01,
          1.2993e-01,  3.4936e-01,  9.4202e-01,  8.8197e-02,  6.2682e-01,
         -1.0838e-01,  4.3199e-01, -4.7832e-01,  3.3852e-01,  6.5896e-01,
          5.0559e-01,  9.9597e-01,  4.1468e-01,  3.0092e-01,  3.2507e-01,
          3.5818e-01, -4.3893e-01,  7.1569e-01,  7

In [None]:
#ids_dataloader, _ = batching_data(dataset=movie_reviews, tokenizer=tokenizer)
#for idx, ids in enumerate(ids_dataloader):
#    print(idx, ids, len(ids))
#    if idx==2:
#        break

In [None]:
# embedding taken from last layer of BERT
# avoid touching and computing gradients -> torch.no_grad()
# https://towardsdatascience.com/what-is-npy-files-and-why-you-should-use-them-603373c78883

def fine_tune_BERT( model: BertModel, dataset: pd.DataFrame, tokenizer: BertTokenizer,
                   sentence_column: str='text', batch_size: int=BATCH_SIZE, 
                   device: str=device, rootdir: str=rootdir, filename: str='pol') -> List[torch.Tensor]:
    """Return Embeddings of dataset

        Params:
        ------
            model: transformers.models.bert.modeling_bert.BertModel
                BertModel to get embeddings from
            dataset : pd.Dataframe
                dataframe containing sentences to embed
            tokenizer : transformers.models.bert.tokenization_bert.BertTokenizer
                tokenizer model to use
            sentence_column : str
                column of 'dataset' to get
            batch_size : int
                batch size
            device : str
                device to load data on
            rootdir : str
                root directory where to store embeddings
            filename : str
                name on which saving embeddings
        
        Return:
        ------
            List of embeddings
    """

    embs = []
    # getting dataloaders
    ids_dataloader, msk_dataloader = batching_data(dataset=dataset, tokenizer=tokenizer, batch_size=batch_size)
    # disabling gradients computation --> I'm using a pre-trained net. Don't want to rewrite weights
    with torch.no_grad():
        # iterating through batches
        for idx, (ids,msk) in enumerate(zip(ids_dataloader,msk_dataloader)):

            # move data to device 
            ids = ids.to(device)
            msk = msk.to(device)

            # 'forward pass'
            outputs = model(ids,msk)

            # extracting tensor at last layer : https://github.com/esrel/NLU.Lab.2022.Public/blob/master/notebooks/10_sequence_nn.ipynb
            last_hidden_state = outputs.last_hidden_state

            print(f"Done batch #{idx}")
            print(last_hidden_state.cpu().numpy().shape)
            embs.append(last_hidden_state)
        if len(embs)>0:
            print('Embs list has been successfully built')
        else:
            raise ValueError('List has not been filled')

    return embs

In [None]:
def save_embs(embs: List[torch.Tensor], filename: str) -> None:
    # collecting all embeddings together
    # each element of the list is a list of tensors concerning each batch
    to_save = []
    for b in embs: # getting one tensor at a time -> embs[i]
        for gs in b: # getting the t-th sample -> embs[i][t]
            for s in gs:
                # s is a tensor now
                to_save.append(s.cpu().numpy())
    
    np.save(rootdir+filename, to_save)

In [None]:
embs_per_batch_mr = fine_tune_BERT(model=model, dataset=movie_reviews, tokenizer=tokenizer)

Done batch #0
(64, 512, 768)
Done batch #1
(64, 512, 768)
Done batch #2
(64, 512, 768)
Done batch #3
(64, 512, 768)
Done batch #4
(64, 512, 768)
Done batch #5
(64, 512, 768)
Done batch #6
(64, 512, 768)
Done batch #7
(64, 512, 768)
Done batch #8
(64, 512, 768)
Done batch #9
(64, 512, 768)
Done batch #10
(64, 512, 768)
Done batch #11
(64, 512, 768)
Done batch #12
(64, 512, 768)
Done batch #13
(64, 512, 768)
Done batch #14
(64, 512, 768)
Done batch #15
(64, 512, 768)
Done batch #16
(64, 512, 768)
Done batch #17
(64, 512, 768)
Done batch #18
(64, 512, 768)
Done batch #19
(64, 512, 768)
Done batch #20
(64, 512, 768)
Done batch #21
(64, 512, 768)
Done batch #22
(64, 512, 768)
Done batch #23
(64, 512, 768)
Done batch #24
(64, 512, 768)
Done batch #25
(64, 512, 768)
Done batch #26
(64, 512, 768)
Done batch #27
(64, 512, 768)
Done batch #28
(64, 512, 768)
Done batch #29
(64, 512, 768)
Done batch #30
(64, 512, 768)
Done batch #31
(16, 512, 768)
Embs list has been successfully built


In [None]:
torch.cuda.empty_cache()
gc.collect()

166

In [None]:
#number of batches made by the DataLoader
#len(embs_per_batch_mr)

32

In [None]:
#padded sequence length (maximum in the dataset)
#len(embs_per_batch_mr[0])

64

In [None]:
#number of feature extracted per sentence by the BertModel
#len(embs_per_batch_mr[0][0])

512

In [None]:
#len(embs_per_batch_mr[0][0][0])

768

In [None]:
# uncomment if you want to display the embeddings
#embs_per_batch_mr[0][0][1]

In [None]:
embs_per_batch_subj_obj = fine_tune_BERT(model=model, dataset=subj_obj_dataset, tokenizer=tokenizer)

Done batch #0
(64, 512, 768)
Done batch #1
(64, 512, 768)
Done batch #2
(64, 512, 768)
Done batch #3
(64, 512, 768)
Done batch #4
(64, 512, 768)
Done batch #5
(64, 512, 768)
Done batch #6
(64, 512, 768)
Done batch #7
(64, 512, 768)
Done batch #8
(64, 512, 768)
Done batch #9
(64, 512, 768)
Done batch #10
(64, 512, 768)
Done batch #11
(64, 512, 768)
Done batch #12
(64, 512, 768)
Done batch #13
(64, 512, 768)
Done batch #14
(64, 512, 768)
Done batch #15
(64, 512, 768)
Done batch #16
(64, 512, 768)
Done batch #17
(64, 512, 768)
Done batch #18
(64, 512, 768)
Done batch #19
(64, 512, 768)
Done batch #20
(64, 512, 768)
Done batch #21
(64, 512, 768)
Done batch #22
(64, 512, 768)
Done batch #23
(64, 512, 768)
Done batch #24
(64, 512, 768)
Done batch #25
(64, 512, 768)
Done batch #26
(64, 512, 768)
Done batch #27
(64, 512, 768)
Done batch #28
(64, 512, 768)
Done batch #29
(64, 512, 768)
Done batch #30
(64, 512, 768)
Done batch #31
(16, 512, 768)
Embs list has been successfully built


In [None]:
save_embs(embs=embs_per_batch_mr, filename='pol.npy')

In [None]:
save_embs(embs=embs_per_batch_subj_obj, filename='subj_obj.npy')

In [None]:
torch.cuda.empty_cache()
gc.collect()

133

In [None]:
#subj_obj = {}

# 64*15+40 = 1000
#for i in range(1000,11000,1000):
    #subj_obj[str(i)] = subj_obj_dataset[i-1000:i]
    #embs_per_batch_subj_obj = fine_tune_BERT(model=model, dataset=subj_obj_dataset[i-1000:i], tokenizer=tokenizer)
    #save_embs(embs_per_batch_subj_obj, f'subj_obj_{i}.npy')

In [None]:
#save_embs(embs=embs_per_batch_subj_obj, filename='subj_obj.npy')

In [None]:
#subj_obj_embs = np.load(rootdir+'subj_obj_10000.npy')
#for i in range(1000, 11000, 1000):
#    subj_obj_embs = np.concatenate((subj_obj_embs, np.load(rootdir+f'subj_obj_{i}.npy')), axis=0)

#save_embs(dataset=subj_obj_embs, filename='subj_obj_embs.npy')