<a href="https://colab.research.google.com/github/MatteoGuglielmi-tech/Polarity-and-Subjectivity-Detection/blob/main/src/MyModel/BERT-Fine-Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT Embedding Fine Tuning

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
rootdir = '/content/gdrive/MyDrive/Colab Notebooks/Polarity-Subjectivity-Detection/'

In [4]:
import tensorflow as tf
import torch

# Get the GPU device name
device = tf.test.gpu_device_name()

if 'GPU' in device:
  print(f'GPU available : {device}')
  device = torch.device('cuda'+device[-2:])
else :
  device = torch.device("cpu")
  raise SystemError("GPU not found, use CPU instead")

GPU available : /device:GPU:0


In [5]:
device

device(type='cuda', index=0)

In [6]:
import pandas as pd


# loading dataset
movie_reviews = pd.read_csv(rootdir+'movie_rews.csv')
subj_obj_dataset = pd.read_csv(rootdir+'subj_obj_dataset.csv')

In [7]:
movie_reviews

Unnamed: 0.1,Unnamed: 0,text,pos,neg
0,0,films adapted comic books plenty success wheth...,1,0
1,1,every movie comes along suspect studio every i...,1,0
2,2,got mail works alot better deserves order make...,1,0
3,3,jaws rare film grabs attention shows single im...,1,0
4,4,moviemaking lot like general manager nfl team ...,1,0
...,...,...,...,...
1995,1995,anything stigmata taken warning releasing simi...,0,1
1996,1996,john boorman zardoz goofy cinematic debacle fu...,0,1
1997,1997,kids hall acquired taste took least season wat...,0,1
1998,1998,time john carpenter great horror director cour...,0,1


In [8]:
subj_obj_dataset

Unnamed: 0.1,Unnamed: 0,text,tag
0,0,"smart and alert , thirteen conversations about...",subj
1,1,"color , musical bounce and warm seas lapping o...",subj
2,2,it is not a mass-market entertainment but an u...,subj
3,3,a light-hearted french film about the spiritua...,subj
4,4,my wife is an actress has its moments in looki...,subj
...,...,...,...
9995,9995,"in the end , they discover that balance in lif...",obj
9996,9996,a counterfeit 1000 tomin bank note is passed i...,obj
9997,9997,enter the beautiful and mysterious secret agen...,obj
9998,9998,after listening to a missionary from china spe...,obj


### Major commands :
- .tokenize(sent)
- .convert_tokens_to_ids(tokenized_sent)
- .encode.plus() [source](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.tokenization_utils_base.PreTrainedTokenizerBase.batch_encode_plus)

In [9]:
# BERT model script from: huggingface.co
from transformers import BertTokenizer, BertModel
from typing import Tuple, List, Dict
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import logging
import gc

# to not see warning everytime
logging.set_verbosity_error()


tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
model = BertModel.from_pretrained("bert-base-uncased").to(device)

In [11]:
type(model)

transformers.models.bert.modeling_bert.BertModel

In [10]:
type(tokenizer)

transformers.models.bert.tokenization_bert.BertTokenizer

In [12]:
def embedding(dataset : pd.DataFrame, sentence_column: str, tokenizer: BertTokenizer) -> Tuple[tf.Tensor]:  # Tuple[List]:
    '''Extract embedding information using passed tokenizer and model
        Params:
        ------
            dataset : pd.DataFrame
                dataframe containig sentences to be encoded
            sentence_column : str
                column in the beforementioned dataframe to be encoded
            tokenizer : transformers.models.bert.tokenization_bert.BertTokenizer
                tokenizer used to process sentences and extract related information
        Return:
        ------
            Return encodings and attention masks as pytorch tensors
    '''
    
    embeddings = {
        'embedding' : [],
        'attention_mask' : []
        }
        
    for sent in dataset[sentence_column]:
        dic_sent_encoding = tokenizer.encode_plus(sent, # untokenized sentence
                                                add_special_tokens = True,  # add '[CLS]' and '[SEP]'
                                                truncation = True,  # truncate to maximum length
                                                padding = "max_length",  # pad to maximum admissible sentence
                                                return_attention_mask = True,  # return attention mask
                                                return_tensors = "pt") # returns pytorch tensors
    
        # extracting embeddings and attention masks in list form
        embeddings['embedding'].append(dic_sent_encoding['input_ids'])
        embeddings['attention_mask'].append(dic_sent_encoding['attention_mask'])

    # convert lists of tensors into tensors
    input_ids = torch.cat(embeddings['embedding'], axis=0)
    attention_masks = torch.cat(embeddings['attention_mask'], axis=0)
    
    return input_ids, attention_masks
    #return embeddings['embedding'], embeddings['attention_mask']

In [13]:
#ids_mr, attention_masks_mr = embedding(dataset=movie_reviews, sentence_column='text', tokenizer=tokenizer)

In [14]:
#type(ids_mr)

In [15]:
#ids_mr[0]

In [16]:
#to display a sample tensor uncomment following lines
#attention_masks_mr[0]

In [17]:
#attention_masks_mr[1]

In [18]:
#ids_subj_obj , attention_masks_subj_obj = embedding(dataset=subj_obj_dataset, sentence_column='text', tokenizer=tokenizer)

In [19]:
#ids_subj_obj[0]

In [20]:
#attention_masks_subj_obj[0]

In [13]:
BATCH_SIZE = 64 # reduced due to RAM limitations
BATCH_SIZE_SUBJ = 16

In [14]:
def batching_data(dataset: pd.DataFrame, tokenizer: BertTokenizer, column_name: str='text', batch_size: int=BATCH_SIZE) -> Tuple[torch.utils.data.DataLoader]:
    '''Function to batch sentences to fit BERT model and get embeddings

        Params:
        ------
            dataset : pd.DataFrame
                dataset to be batched
            column_name : str
                column of 'dataset' DataFrame to catch 
            batch_size : int
                batch size
        
        Return:
        ------
            tuple of dataloader to iterate over
    '''

    # getting encodings and attention masks for whole dataset
    ids, msk = embedding(dataset=dataset, sentence_column=column_name, tokenizer=tokenizer)
    # defining DataLoader that batches ids randomly
    ids_dataloader = DataLoader(dataset=ids, sampler=RandomSampler(ids), batch_size=batch_size, shuffle=False)

    # defining DataLoader that batches masks
    msk_dataloader = DataLoader(dataset=msk, batch_size=batch_size, shuffle=False)

    return ids_dataloader, msk_dataloader

In [24]:
#ids_dataloader, _ = batching_data(dataset=movie_reviews, tokenizer=tokenizer)
#for idx, ids in enumerate(ids_dataloader):
#    print(idx, ids, len(ids))
#    if idx==2:
#        break

In [15]:
# embedding taken from last layer of BERT
# avoid touching and computing gradients -> torch.no_grad()
# https://towardsdatascience.com/what-is-npy-files-and-why-you-should-use-them-603373c78883

def fine_tune_BERT( model: BertModel, dataset: pd.DataFrame, tokenizer: BertTokenizer,
                   sentence_column: str='text', batch_size: int=BATCH_SIZE, 
                   device: str=device, rootdir: str=rootdir, filename: str='pol') -> List[torch.Tensor]:
    """Return Embeddings of dataset

        Params:
        ------
            model: transformers.models.bert.modeling_bert.BertModel
                BertModel to get embeddings from
            dataset : pd.Dataframe
                dataframe containing sentences to embed
            tokenizer : transformers.models.bert.tokenization_bert.BertTokenizer
                tokenizer model to use
            sentence_column : str
                column of 'dataset' to get
            batch_size : int
                batch size
            device : str
                device to load data on
            rootdir : str
                root directory where to store embeddings
            filename : str
                name on which saving embeddings
        
        Return:
        ------
            List of embeddings
    """

    embs = []
    # getting dataloaders
    ids_dataloader, msk_dataloader = batching_data(dataset=dataset, tokenizer=tokenizer, batch_size=batch_size)
    # disabling gradients computation --> I'm using a pre-trained net. Don't want to rewrite weights
    with torch.no_grad():
        # iterating through batches
        for idx, (ids,msk) in enumerate(zip(ids_dataloader,msk_dataloader)):

            # move data to device 
            ids = ids.to(device)
            msk = msk.to(device)

            # 'forward pass'
            outputs = model(ids,msk)

            # extracting tensor at last layer : https://github.com/esrel/NLU.Lab.2022.Public/blob/master/notebooks/10_sequence_nn.ipynb
            last_hidden_state = outputs.last_hidden_state

            print(f"Done batch #{idx}")
            print(last_hidden_state.cpu().numpy().shape)
            embs.append(last_hidden_state)
        #if len(embs)>0:
        #    print('Embs list has been successfully built')
        #else:
        #    raise TypeError('List has not been filled')

    return embs

In [16]:
def save_embs(embs: List[torch.Tensor], filename: str) -> None:
    # collecting all embeddings together
    # each element of the list is a list of tensors concerning each batch
    to_save = []
    for b in embs: # getting one tensor at a time -> embs[i]
        for gs in b: # getting the t-th sample -> embs[i][t]
            for s in gs:
                # s is a tensor now
                to_save.append(s.cpu().numpy())
    
    np.save(rootdir+filename, to_save)

In [26]:
embs_per_batch_mr = fine_tune_BERT(model=model, dataset=movie_reviews, tokenizer=tokenizer)

Done batch #0
(64, 512, 768)
Done batch #1
(64, 512, 768)
Done batch #2
(64, 512, 768)
Done batch #3
(64, 512, 768)
Done batch #4
(64, 512, 768)
Done batch #5
(64, 512, 768)
Done batch #6
(64, 512, 768)
Done batch #7
(64, 512, 768)
Done batch #8
(64, 512, 768)
Done batch #9
(64, 512, 768)
Done batch #10
(64, 512, 768)
Done batch #11
(64, 512, 768)
Done batch #12
(64, 512, 768)
Done batch #13
(64, 512, 768)
Done batch #14
(64, 512, 768)
Done batch #15
(64, 512, 768)
Done batch #16
(64, 512, 768)
Done batch #17
(64, 512, 768)
Done batch #18
(64, 512, 768)
Done batch #19
(64, 512, 768)
Done batch #20
(64, 512, 768)
Done batch #21
(64, 512, 768)
Done batch #22
(64, 512, 768)
Done batch #23
(64, 512, 768)
Done batch #24
(64, 512, 768)
Done batch #25
(64, 512, 768)
Done batch #26
(64, 512, 768)
Done batch #27
(64, 512, 768)
Done batch #28
(64, 512, 768)
Done batch #29
(64, 512, 768)
Done batch #30
(64, 512, 768)
Done batch #31
(16, 512, 768)


In [17]:
torch.cuda.empty_cache()
gc.collect()

642

In [28]:
#number of batches made by the DataLoader
#len(embs_per_batch_mr)

32

In [29]:
#padded sequence length (maximum in the dataset)
#len(embs_per_batch_mr[0])

64

In [30]:
#number of feature extracted per sentence by the BertModel
#len(embs_per_batch_mr[0][0])

512

In [31]:
#len(embs_per_batch_mr[0][0][0])

768

In [None]:
# uncomment if you want to display the embeddings
#embs_per_batch_mr[0][0][1]

In [18]:
embs_per_batch_subj_obj = fine_tune_BERT(model=model, dataset=subj_obj_dataset, tokenizer=tokenizer, batch_size=16)

Done batch #0
(16, 512, 768)
Done batch #1
(16, 512, 768)
Done batch #2
(16, 512, 768)
Done batch #3
(16, 512, 768)
Done batch #4
(16, 512, 768)
Done batch #5
(16, 512, 768)
Done batch #6
(16, 512, 768)
Done batch #7
(16, 512, 768)
Done batch #8
(16, 512, 768)
Done batch #9
(16, 512, 768)
Done batch #10
(16, 512, 768)
Done batch #11
(16, 512, 768)
Done batch #12
(16, 512, 768)
Done batch #13
(16, 512, 768)
Done batch #14
(16, 512, 768)
Done batch #15
(16, 512, 768)
Done batch #16
(16, 512, 768)
Done batch #17
(16, 512, 768)
Done batch #18
(16, 512, 768)
Done batch #19
(16, 512, 768)
Done batch #20
(16, 512, 768)
Done batch #21
(16, 512, 768)
Done batch #22
(16, 512, 768)
Done batch #23
(16, 512, 768)
Done batch #24
(16, 512, 768)
Done batch #25
(16, 512, 768)
Done batch #26
(16, 512, 768)
Done batch #27
(16, 512, 768)
Done batch #28
(16, 512, 768)
Done batch #29
(16, 512, 768)
Done batch #30
(16, 512, 768)
Done batch #31
(16, 512, 768)
Done batch #32
(16, 512, 768)
Done batch #33
(16, 

RuntimeError: ignored

In [33]:
save_embs(embs=embs_per_batch_mr, filename='pol.npy')

In [21]:
subj_obj = {}

for i in range(1000,10000,1000):
    subj_obj[str(i)] = subj_obj_dataset[i-1000:i]
subj_obj

{'1000':      Unnamed: 0                                               text   tag
 0             0  smart and alert , thirteen conversations about...  subj
 1             1  color , musical bounce and warm seas lapping o...  subj
 2             2  it is not a mass-market entertainment but an u...  subj
 3             3  a light-hearted french film about the spiritua...  subj
 4             4  my wife is an actress has its moments in looki...  subj
 ..          ...                                                ...   ...
 995         995  with youthful high spirits , tautou remains ca...  subj
 996         996  despite the surface attractions -- conrad l . ...  subj
 997         997  it's a drawling , slobbering , lovable run-on ...  subj
 998         998  alas , the black-and-white archival footage of...  subj
 999         999  a movie that hovers somewhere between an acute...  subj
 
 [1000 rows x 3 columns],
 '2000':       Unnamed: 0                                               text

In [None]:
#save_embs(embs=embs_per_batch_subj_obj, filename='subj_obj.npy')

In [34]:
pol = np.load(rootdir+'pol.npy')

In [37]:
pol[0][:10]

array([-0.43667775,  0.39621976, -0.03247878,  0.51228744, -0.36099547,
       -0.07411188,  0.52061343,  0.02332104, -0.1497904 ,  0.10958889],
      dtype=float32)