<a href="https://colab.research.google.com/github/MatteoGuglielmi-tech/Polarity-and-Subjectivity-Detection/blob/main/src/MyModel/BERT-Fine-Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT Embedding Fine Tuning

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
rootdir = '/content/gdrive/MyDrive/Colab Notebooks/Polarity-Subjectivity-Detection/'

In [4]:
import tensorflow as tf
import torch

# Get the GPU device name
device = tf.test.gpu_device_name()

if 'GPU' in device:
  print(f'GPU available : {device}')
  device = torch.device('cuda'+device[-2:])
else :
  device = torch.device("cpu")
  raise SystemError("GPU not found, use CPU instead")

GPU available : /device:GPU:0


In [5]:
device

device(type='cuda', index=0)

In [6]:
import pandas as pd


# loading dataset
movie_reviews = pd.read_csv(rootdir+'movie_rews.csv')
subj_obj_dataset = pd.read_csv(rootdir+'subj_obj_dataset.csv')

In [7]:
movie_reviews

Unnamed: 0.1,Unnamed: 0,text,labels
0,0,films adapted comic books plenty success wheth...,1
1,1,every movie comes along suspect studio every i...,1
2,2,got mail works alot better deserves order make...,1
3,3,jaws rare film grabs attention shows single im...,1
4,4,moviemaking lot like general manager nfl team ...,1
...,...,...,...
1995,1995,anything stigmata taken warning releasing simi...,0
1996,1996,john boorman zardoz goofy cinematic debacle fu...,0
1997,1997,kids hall acquired taste took least season wat...,0
1998,1998,time john carpenter great horror director cour...,0


In [8]:
subj_obj_dataset

Unnamed: 0.1,Unnamed: 0,text,labels
0,0,"smart and alert , thirteen conversations about...",1
1,1,"color , musical bounce and warm seas lapping o...",1
2,2,it is not a mass-market entertainment but an u...,1
3,3,a light-hearted french film about the spiritua...,1
4,4,my wife is an actress has its moments in looki...,1
...,...,...,...
9995,9995,"in the end , they discover that balance in lif...",0
9996,9996,a counterfeit 1000 tomin bank note is passed i...,0
9997,9997,enter the beautiful and mysterious secret agen...,0
9998,9998,after listening to a missionary from china spe...,0


In [9]:
subjective = subj_obj_dataset[['text','labels']][:len(subj_obj_dataset)//2].sample(n=1000)

In [10]:
objective = subj_obj_dataset[['text', 'labels']][len(subj_obj_dataset)//2:].sample(n=1000)

In [11]:
subj_obj_dataset = pd.concat([subjective,objective], axis=0)

In [12]:
subj_obj_dataset

Unnamed: 0,text,labels
3050,"in gleefully , thumpingly hyperbolic terms , i...",1
3538,the first half bursts with a goofy energy prev...,1
2225,the last kiss will probably never achieve the ...,1
3984,"if this dud had been made in the '70s , it wou...",1
1802,watching queen of the damned is like reading a...,1
...,...,...
7372,"stands paralyzed , and she is only saved by th...",0
7212,when the singing veggies encounter some car tr...,0
6172,"on their way back , they drive through death v...",0
5781,"before the gang returns , jack and erin's conn...",0


### Major commands :
- .tokenize(sent)
- .convert_tokens_to_ids(tokenized_sent)
- .encode.plus() [source](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.tokenization_utils_base.PreTrainedTokenizerBase.batch_encode_plus)

In [13]:
# BERT model script from: huggingface.co
import transformers
from transformers import BertTokenizer, BertModel
from typing import Tuple, List, Dict
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import logging
import gc

# to not see warning everytime
logging.set_verbosity_error()


tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
model = BertModel.from_pretrained("bert-base-uncased").to(device)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [14]:
type(model)

transformers.models.bert.modeling_bert.BertModel

In [15]:
type(tokenizer)

transformers.models.bert.tokenization_bert.BertTokenizer

In [16]:
def embedding(dataset : pd.DataFrame, sentence_column: str, tokenizer: BertTokenizer) -> Tuple[torch.Tensor]:  # Tuple[List]:
    '''Extract embedding information using passed tokenizer and model
        Params:
        ------
            dataset : pd.DataFrame
                dataframe containig sentences to be encoded
            sentence_column : str
                column in the beforementioned dataframe to be encoded
            tokenizer : transformers.models.bert.tokenization_bert.BertTokenizer
                tokenizer used to process sentences and extract related information
        Return:
        ------
            Return encodings and attention masks as pytorch tensors
    '''
    
    embeddings = {
        'embedding' : [],
        'attention_mask' : []
        }
        
    for sent in dataset[sentence_column]:
        dic_sent_encoding = tokenizer.encode_plus(sent, # untokenized sentence
                                                add_special_tokens = True,  # add '[CLS]' and '[SEP]'
                                                truncation = True,  # truncate to maximum length
                                                padding = "max_length",  # pad to maximum admissible sentence
                                                return_attention_mask = True,  # return attention mask
                                                return_tensors = "pt") # returns pytorch tensors
    
        # extracting embeddings and attention masks in list form
        embeddings['embedding'].append(dic_sent_encoding['input_ids'])
        embeddings['attention_mask'].append(dic_sent_encoding['attention_mask'])

    # convert lists of tensors into tensors
    #input_ids = torch.cat(embeddings['embedding'], axis=0)
    #attention_masks = torch.cat(embeddings['attention_mask'], axis=0)
    
    #return input_ids, attention_masks
    return embeddings['embedding'], embeddings['attention_mask']
    #return dic_sent_encoding

In [None]:
#mr_encoding, attention_masks_mr = embedding(dataset=movie_reviews, sentence_column='text', tokenizer=tokenizer)

In [19]:
#to display a sample tensor uncomment following lines
#mr_encoding[0], attention_masks_mr[0]

In [20]:
#so_encoding, attention_masks_so = embedding(dataset=subj_obj_dataset, sentence_column='text', tokenizer=tokenizer)

In [21]:
#to display a sample tensor uncomment following lines
#so_encoding[0], attention_masks_so[0]

In [22]:
#to investigate size of a tensor uncomment following lones
#so_encoding[0].size(), attention_masks_so[0].size()

In [23]:
#BATCH_SIZE = 128 # reduced due to RAM limitations

In [24]:
def embeddings(dataset: pd.DataFrame, tokenizer: BertTokenizer, column_name: str='text')-> List:
    '''Function to batch sentences to fit BERT model and get embeddings

        Params:
        ------
            dataset : pd.DataFrame
                dataset to be batched
            column_name : str
                column of 'dataset' DataFrame to catch 
        
        Return:
        ------
            tuple of dataloader to iterate over
    '''
    embs = []
    # getting encodings and attention masks for whole dataset
    emb, msk = embedding(dataset=dataset, sentence_column=column_name, tokenizer=tokenizer)

    # defining DataLoader that batches ids randomly
    #emb_dataloader = DataLoader(dataset=emb, batch_size=batch_size, shuffle=False)
    with torch.no_grad():
        for e in zip(emb,msk):
            last_hidden_states = model(e[0].to(device),e[1].to(device))
            #print(last_hidden_states[0].shape)
            embs.append(last_hidden_states[0])


    # defining DataLoader that batches masks
    #msk_dataloader = DataLoader(dataset=msk, batch_size=batch_size, shuffle=False)

    #return ids_dataloader, msk_dataloader
    return embs

In [25]:
def save_embs(embs: List[torch.Tensor], filename: str) -> None:
    # collecting all embeddings together
    # each element of the list is a list of tensors concerning each batch
    to_save = []
    for b in embs: # getting one tensor at a time -> embs[i]
        #for gs in b: # getting the t-th sample -> embs[i][t]
        for s in b:
            # s is a tensor now
            to_save.append(s.cpu().numpy())
    
    np.save(rootdir+filename, to_save)

In [26]:
mr_embs = embeddings(movie_reviews, tokenizer)

In [27]:
subj_embs = embeddings(subj_obj_dataset, tokenizer)

In [28]:
torch.cuda.empty_cache()
gc.collect()

56

In [29]:
save_embs(embs=mr_embs, filename='pol_embs.npy')

In [30]:
save_embs(embs=subj_embs, filename='subj_obj_embs.npy')

In [31]:
#torch.cuda.empty_cache()
#gc.collect()

### Unused

In [32]:
#ids_dataloader, _ = batching_data(dataset=movie_reviews, tokenizer=tokenizer)
#for idx, ids in enumerate(ids_dataloader):
#    print(idx, ids, len(ids))
#    if idx==2:
#        break

In [33]:
# embedding taken from last layer of BERT
# avoid touching and computing gradients -> torch.no_grad()
# https://towardsdatascience.com/what-is-npy-files-and-why-you-should-use-them-603373c78883

def fine_tune_BERT( model: BertModel, dataset: pd.DataFrame, tokenizer: BertTokenizer,
                   sentence_column: str='text', batch_size: int=BATCH_SIZE, 
                   device: str=device, rootdir: str=rootdir, filename: str='pol') -> List[torch.Tensor]:
    """Return Embeddings of dataset

        Params:
        ------
            model: transformers.models.bert.modeling_bert.BertModel
                BertModel to get embeddings from
            dataset : pd.Dataframe
                dataframe containing sentences to embed
            tokenizer : transformers.models.bert.tokenization_bert.BertTokenizer
                tokenizer model to use
            sentence_column : str
                column of 'dataset' to get
            batch_size : int
                batch size
            device : str
                device to load data on
            rootdir : str
                root directory where to store embeddings
            filename : str
                name on which saving embeddings
        
        Return:
        ------
            List of embeddings
    """

    embs = []
    # getting dataloaders
    ids_dataloader, msk_dataloader = batching_data(dataset=dataset, tokenizer=tokenizer, batch_size=batch_size)
    # disabling gradients computation --> I'm using a pre-trained net. Don't want to rewrite weights
    with torch.no_grad():
        # iterating through batches
        for idx, (ids,msk) in enumerate(zip(ids_dataloader,msk_dataloader)):

            # move data to device 
            ids = ids.to(device)
            msk = msk.to(device)

            # 'forward pass'
            outputs = model(ids,msk)

            # extracting tensor at last layer : https://github.com/esrel/NLU.Lab.2022.Public/blob/master/notebooks/10_sequence_nn.ipynb
            last_hidden_state = outputs.last_hidden_state

            print(f"Done batch #{idx}")
            print(last_hidden_state.cpu().numpy().shape)
            embs.append(last_hidden_state)
        if len(embs)>0:
            print('Embs list has been successfully built')
        else:
            raise ValueError('List has not been filled')

    return embs

NameError: ignored

In [None]:
#subj_obj_embs = np.load(rootdir+'subj_obj_10000.npy')
#for i in range(1000, 11000, 1000):
#    subj_obj_embs = np.concatenate((subj_obj_embs, np.load(rootdir+f'subj_obj_{i}.npy')), axis=0)

#save_embs(dataset=subj_obj_embs, filename='subj_obj_embs.npy')

In [None]:
#save_embs(embs=embs_per_batch_subj_obj, filename='subj_obj.npy')

In [None]:
#subj_obj = {}

# 64*15+40 = 1000
#for i in range(1000,11000,1000):
    #subj_obj[str(i)] = subj_obj_dataset[i-1000:i]
    #embs_per_batch_subj_obj = fine_tune_BERT(model=model, dataset=subj_obj_dataset[i-1000:i], tokenizer=tokenizer)
    #save_embs(embs_per_batch_subj_obj, f'subj_obj_{i}.npy')