<a href="https://colab.research.google.com/github/MatteoGuglielmi-tech/Polarity-and-Subjectivity-Detection/blob/main/src/MyModel/BERT-Fine-Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT Embedding Fine Tuning

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 937 kB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 46.6 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 43.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.2 transformers-4.24.0


In [2]:
# Mount Google Drive at /content/gdrive so the project folder can be addressed by path.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# Project root on the mounted Drive. Other cells build paths with `rootdir + '...'`,
# so the trailing slash is required.
rootdir = '/content/gdrive/MyDrive/Colab Notebooks/Polarity-Subjectivity-Detection/'

In [4]:
import tensorflow as tf
import torch

# Pick the compute device with PyTorch directly: the BERT model runs on torch,
# so there is no need to initialise TensorFlow on the GPU just to read a device
# name and then rebuild the torch device string from its last two characters.
if torch.cuda.is_available():
  device = torch.device('cuda:0')
  print(f'GPU available : {device}')
else :
  device = torch.device("cpu")
  print("GPU not found, CPU will be instead")

GPU available : /device:GPU:0


In [5]:
# Display the device selected in the previous cell.
device

device(type='cuda', index=0)

In [6]:
import pandas as pd


def _read_dataset(relative_path):
    '''Read the CSV stored at `relative_path` below `rootdir` into a DataFrame.'''
    return pd.read_csv(rootdir+relative_path)


# polarity corpus (movie reviews)
movie_reviews = _read_dataset('Datasets/movie_rews.csv')                 # with labels
movie_rews_clean = _read_dataset('Datasets/movie_rews_clean.csv')        # without labels
movie_rews_raw = _read_dataset('Datasets/movie_rews_raw.csv')            # without labels and pre-processing

# subjectivity corpus
subj_obj = _read_dataset('Datasets/subj_obj_dataset.csv')                # with labels
subj_obj_clean = _read_dataset('Datasets/subj_obj_dataset_clean.csv')    # without labels

In [7]:
# Preview the labelled movie-review dataframe.
movie_reviews

Unnamed: 0.1,Unnamed: 0,text,labels
0,0,films adapted from comic books have had plenty...,1
1,1,every now and then movie comes along from susp...,1
2,2,you got mail works alot better than deserves o...,1
3,3,jaws rare film that grabs your attention befor...,1
4,4,moviemaking lot like being the general manager...,1
...,...,...,...
1995,1995,anything stigmata should taken warning against...,0
1996,1996,john boorman zardoz goofy cinematic debacle fu...,0
1997,1997,the kids the hall are acquired taste took leas...,0
1998,1998,there was time when john carpenter was great h...,0


In [8]:
#movie_rews_clean

In [9]:
#movie_rews_raw

In [10]:
# Preview the labelled subjectivity/objectivity dataframe.
subj_obj

Unnamed: 0.1,Unnamed: 0,text,labels
0,0,smart and alert thirteen conversations about o...,1
1,1,color musical bounce and warm seas lapping isl...,1
2,2,not mass market entertainment but uncompromisi...,1
3,3,light hearted french film about the spiritual ...,1
4,4,wife actress has its moments looking the comic...,1
...,...,...,...
9995,9995,the end they discover that balance life simila...,0
9996,9996,counterfeit 1000 tomin bank note passed bazaar,0
9997,9997,enter the beautiful and mysterious secret agen...,0
9998,9998,after listening missionary from china speak ch...,0


In [11]:
#subj_obj_clean

In [12]:
#subjective = subj_obj_dataset[['text','labels']][:len(subj_obj_dataset)//2].sample(n=1000)

In [13]:
#objective = subj_obj_dataset[['text', 'labels']][len(subj_obj_dataset)//2:].sample(n=1000)

In [14]:
#subj_obj_dataset = pd.concat([subjective,objective], axis=0)

In [15]:
#subj_obj_dataset

In [16]:
# Draw 1000 sentences from each half of the unlabelled subjectivity corpus.
# NOTE(review): this relies on the first half of `subj_obj_clean` being the
# subjective sentences and the second half the objective ones - confirm against
# the labelled file.
# The seeds are fixed so that the sampled subset, and therefore the embeddings
# saved later, are identical on every Restart & Run All.
SAMPLE_SEED = 42
half = len(subj_obj_clean)//2
text_only = subj_obj_clean[['text']]
subjective = text_only.iloc[:half].sample(n=1000, random_state=SAMPLE_SEED)
objective = text_only.iloc[half:].sample(n=1000, random_state=SAMPLE_SEED + 1)
# Subjective rows come first, objective rows second; downstream code depends on this order.
subj_obj_dataset = pd.concat([subjective,objective], axis=0)

In [17]:
# Preview the sampled subset (text column only).
subj_obj_dataset

Unnamed: 0,text
995,"with youthful high spirits , tautou remains ca..."
4376,meyjes' provocative film might be called an ex...
4672,stanley kwan has directed not only one of the ...
3356,"if this movie belonged to a sorority , it woul..."
2353,we've seen the hippie-turned-yuppie plot befor...
...,...
9541,"but josh , is not interested in call girls and..."
8091,"amudha , adopted by thiru and indira and growi..."
9889,when the king meets the god in a face-to-face ...
9007,but even greater dangers lie ahead when jim di...


### Major commands :
- .tokenize(sent)
- .convert_tokens_to_ids(tokenized_sent)
- .encode_plus() [source](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.tokenization_utils_base.PreTrainedTokenizerBase.batch_encode_plus)

In [18]:
# BERT model script from: huggingface.co
import transformers
from transformers import BertTokenizer, BertModel
from typing import Tuple, List, Dict
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import logging
import gc

# silence the transformers warnings that would otherwise be repeated
logging.set_verbosity_error()

# NOTE: from here on `device` is a plain string, no longer a torch.device
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

pretrained_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(pretrained_name, do_lower_case=True)
model = BertModel.from_pretrained(pretrained_name)
model = model.to(device)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [19]:
# from transformers import BertForSequenceClassification, AdamW, BertConfig
#model_class, tokenizer_class, pretrained_weights = (transformers.BertModel, transformers.BertTokenizer,'bert-base-uncased')
#
#tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
#model = model_class.from_pretrained(pretrained_weights)

In [20]:
def embedding(dataset : pd.DataFrame, 
              sentence_column: str, 
              tokenizer: BertTokenizer, 
              maxlen: int
              ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
    '''Tokenize every sentence of a dataframe column with the passed tokenizer.

        Despite its name, this function does not run the BERT model: it only
        produces the token ids and attention masks that the model consumes.

        Params:
        ------
            dataset : pd.DataFrame
                dataframe containing the sentences to be encoded
            sentence_column : str
                column of `dataset` holding the raw (untokenized) sentences
            tokenizer : transformers.models.bert.tokenization_bert.BertTokenizer
                tokenizer used to process sentences and extract related information
            maxlen : int
                every sentence is truncated or padded to exactly this many
                tokens (special tokens included)

        Return:
        ------
            Tuple `(input_ids, attention_masks)` of two lists with one entry per
            sentence, in dataframe order. Each entry is whatever the tokenizer
            returns under 'input_ids' / 'attention_mask' when asked for PyTorch
            tensors (presumably of shape (1, maxlen) - TODO confirm).
    '''

    input_ids = []
    attention_masks = []

    for sent in dataset[sentence_column]:
        encoding = tokenizer.encode_plus(sent,                          # untokenized sentence
                                         add_special_tokens = True,     # add '[CLS]' and '[SEP]'
                                         truncation = True,             # truncate to maximum length
                                         max_length = maxlen,           # due to RAM limitation
                                         padding = "max_length",        # pad to maximum admissible length
                                         return_attention_mask = True,  # return attention mask
                                         return_tensors = "pt")         # returns pytorch tensors

        # kept as per-sentence lists: callers feed the model one sentence at a time
        input_ids.append(encoding['input_ids'])
        attention_masks.append(encoding['attention_mask'])

    return input_ids, attention_masks

In [21]:
#mr_encoding, attention_masks_mr = embedding(dataset=movie_reviews, sentence_column='text', tokenizer=tokenizer)

In [22]:
#to display a sample tensor uncomment following lines
#mr_encoding[0], attention_masks_mr[0]

In [23]:
#so_encoding, attention_masks_so = embedding(dataset=subj_obj_dataset, sentence_column='text', tokenizer=tokenizer)

In [24]:
#to display a sample tensor uncomment following lines
#so_encoding[0], attention_masks_so[0]

In [25]:
#to investigate size of a tensor uncomment following lines
#so_encoding[0].size(), attention_masks_so[0].size()

In [26]:
#BATCH_SIZE = 128 # reduced due to RAM limitations

In [27]:
import sys

In [28]:
def embeddings(dataset: pd.DataFrame, 
               tokenizer: BertTokenizer, 
               model: BertModel, 
               maxlen: int, 
               column_name: str='text',
               device: str=device
               )-> List[torch.Tensor]:
    '''Run every sentence of a dataframe column through BERT, one at a time.

        Params:
        ------
            dataset : pd.DataFrame
                dataframe containing the sentences to embed
            tokenizer : transformers.models.bert.tokenization_bert.BertTokenizer
                tokenizer used to turn sentences into token ids and masks
            model : transformers.models.bert.modeling_bert.BertModel
                model the embeddings are taken from; it is switched to eval mode
                and is expected to already live on `device`
            maxlen : int
                number of tokens every sentence is truncated/padded to
            column_name : str
                column of `dataset` holding the sentences
            device : str
                device the inputs are moved to; the default is the value the
                module-level `device` had when this function was defined

        Return:
        ------
            List with one tensor per sentence, in dataframe order: the first
            element of the model output (the last hidden state for BertModel).
            The tensors are left on `device`.
    '''

    model.eval()

    # token ids and attention masks for the whole dataset, one entry per sentence
    ids_per_sentence, masks_per_sentence = embedding(dataset=dataset, sentence_column=column_name, tokenizer=tokenizer, maxlen=maxlen)

    embs = []
    # pure inference on a pre-trained network: no gradients needed
    with torch.no_grad():
        for sen, (input_ids, attention_mask) in enumerate(zip(ids_per_sentence, masks_per_sentence)):
            # keyword arguments so the mask can never be bound to the wrong parameter
            outputs = model(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
            last_hidden_state = outputs[0]
            # single-line progress indicator, overwritten in place through '\r'
            sys.stdout.write('\r'+"Sentence {} of shape {}".format(sen, last_hidden_state.shape))
            sys.stdout.flush()
            embs.append(last_hidden_state)

    return embs

In [29]:
import os, shutil

def clear_folder_content(folder):
    '''Delete everything inside `folder` while keeping the folder itself.

        Files and symbolic links are unlinked, sub-directories are removed
        recursively. The operation is best effort: an entry that cannot be
        deleted is reported on stdout and the remaining entries are still
        processed. A `folder` that does not exist (or is not a directory) is
        reported and treated as already empty instead of raising.

        Params:
        ------
            folder : str
                path of the directory to empty
    '''
    if not os.path.isdir(folder):
        print('Folder %s does not exist, nothing to clear.' % folder)
        return

    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)
        try:
            # links are tested together with files so that a link to a
            # directory is unlinked rather than followed by rmtree
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except OSError as e:
            print('Failed to delete %s. Reason: %s' % (file_path, e))

In [30]:
#clear_folder_content(rootdir+'BERT/Polarity-Embeddings')
#clear_folder_content(rootdir+'BERT/Subjectivity-Embeddings')

In [31]:
# start from empty output folders so files of a previous run cannot be mixed in
for truncated_dir in ('BERT/Polarity-Embeddings-Truncated',
                      'BERT/Subjectivity-Embeddings-Truncated'):
    clear_folder_content(rootdir+truncated_dir)

In [32]:
#clear_folder_content(rootdir+'BERT/Polarity-Embeddings-Truncated-Raw')
#clear_folder_content(rootdir+'BERT/Subjectivity-Embeddings-Truncated-Raw')

In [33]:
from tqdm import tqdm

In [34]:
def save_embs(embs: List[torch.Tensor], dest_folder: str, kind: str, group_size: int = 100) -> None:
    '''Save embeddings to disk as .npy files holding `group_size` sentences each.

        Files are written to `rootdir + 'BERT/<dest_folder>/'` and are named
        `bert_emb_<kind>_<k>.npy`, where `k` is the group number times
        `group_size` (100, 200, ... with the default). The number of files is
        derived from `len(embs)`, so nothing is dropped for longer inputs and
        no empty file is written for shorter ones; the last group may hold
        fewer than `group_size` sentences.

        Params:
        ------
            embs : List[torch.Tensor]
                one tensor per sentence; only index 0 along the first axis of
                each tensor is stored (assumes that axis has size 1, as in the
                outputs of `embeddings` - TODO confirm for other callers)
            dest_folder : str
                sub-folder of `rootdir + 'BERT/'` to write into; must exist
            kind : str
                tag inserted in the file names (e.g. 'pol', 'subj')
            group_size : int
                number of sentences stored in each file

        Raises:
        ------
            ValueError
                if `group_size` is not positive
    '''
    if group_size <= 0:
        raise ValueError('group_size must be a positive integer')

    n_groups = (len(embs) + group_size - 1) // group_size  # ceiling division
    for i in tqdm(range(n_groups), unit='group'):
        group = embs[i*group_size:(i+1)*group_size]
        # drop the leading axis and move to host memory before converting to numpy
        to_save = [b[0].cpu().numpy() for b in group]
        np.save(rootdir+'BERT/{}/bert_emb_{}_{}.npy'.format(dest_folder, kind, (i+1)*group_size), to_save)

In [35]:
# polarity corpus: every review is truncated/padded to 100 tokens
mr_embs = embeddings(
    dataset=movie_rews_clean,
    tokenizer=tokenizer,
    model=model,
    maxlen=100,
)

Sentence 1999 of shape torch.Size([1, 100, 768])

In [36]:
# Collect unreachable Python objects first, so that GPU tensors they still hold
# are freed, and only then release PyTorch's unused cached blocks; in the
# opposite order the blocks freed by the collector would stay cached.
gc.collect()
torch.cuda.empty_cache()

64

In [37]:
#mr_embs_raw = embeddings(dataset=movie_rews_raw, tokenizer=tokenizer, model=model, maxlen=512)

In [38]:
# subjectivity corpus: same 100-token budget as the polarity corpus
subj_embs = embeddings(
    dataset=subj_obj_dataset,
    tokenizer=tokenizer,
    model=model,
    maxlen=100,
)

Sentence 1999 of shape torch.Size([1, 100, 768])

In [39]:
# Number of embedded reviews (one list entry is appended per sentence).
print(len(mr_embs))

2000


In [40]:
#mr_embs[0].size()

In [41]:
#mr_embs[0][0].size()

In [42]:
# Sanity check: each stored tensor was reported with shape [1, 100, 768], so indexing
# with [0] only drops the leading axis and the broadcast comparison should be all True.
mr_embs[0][0] == mr_embs[0]

tensor([[[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         ...,
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True]]], device='cuda:0')

In [43]:
# written to disk in files of 100 encoded sentences each
save_embs(
    embs=mr_embs,
    dest_folder='Polarity-Embeddings-Truncated',
    kind='pol',
)

100%|██████████| 20/20 [00:02<00:00,  7.80group/s]


In [44]:
#save_embs(embs=mr_embs_raw, dest_folder='Polarity-Embeddings-Truncated-Raw', kind='pol')

In [45]:
# same grouping as above, for the subjectivity embeddings
save_embs(
    embs=subj_embs,
    dest_folder='Subjectivity-Embeddings-Truncated',
    kind='subj',
)

100%|██████████| 20/20 [00:02<00:00,  7.13group/s]


In [46]:
# read back the first two saved polarity groups to inspect what was written
bert_emb_100, bert_emb_200 = (
    np.load(rootdir+relative_path)
    for relative_path in (
        'BERT/Polarity-Embeddings-Truncated/bert_emb_pol_100.npy',
        'BERT/Polarity-Embeddings-Truncated/bert_emb_pol_200.npy',
    )
)

In [47]:
# Shape of the first saved group.
bert_emb_100.shape

(100, 100, 768)

In [48]:
# Shape of the second saved group.
bert_emb_200.shape

(100, 100, 768)

In [49]:
# Sanity check: the two files hold different sentences, so the element-wise
# comparison of their first five entries is expected to be False.
bert_emb_100[:5]==bert_emb_200[:5]

array([[[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ...,
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]],

       [[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ...,
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]],

       [[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ...,
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, Fal

In [50]:
# Collect unreachable Python objects first, so that GPU tensors they still hold
# are freed, and only then release PyTorch's unused cached blocks; in the
# opposite order the blocks freed by the collector would stay cached.
gc.collect()
torch.cuda.empty_cache()

22

### Unused

In [51]:
#ids_dataloader, _ = batching_data(dataset=movie_reviews, tokenizer=tokenizer)
#for idx, ids in enumerate(ids_dataloader):
#    print(idx, ids, len(ids))
#    if idx==2:
#        break

In [52]:
# embedding taken from last layer of BERT
# avoid touching and computing gradients -> torch.no_grad()
# https://towardsdatascience.com/what-is-npy-files-and-why-you-should-use-them-603373c78883

#def fine_tune_BERT( model: BertModel, dataset: pd.DataFrame, tokenizer: BertTokenizer,
#                   sentence_column: str='text', batch_size: int=BATCH_SIZE, 
#                   device: str=device, rootdir: str=rootdir, filename: str='pol') -> List[torch.Tensor]:
#    """Return Embeddings of dataset
#
#        Params:
#        ------
#            model: transformers.models.bert.modeling_bert.BertModel
#                BertModel to get embeddings from
#            dataset : pd.Dataframe
#                dataframe containing sentences to embed
#            tokenizer : transformers.models.bert.tokenization_bert.BertTokenizer
#                tokenizer model to use
#            sentence_column : str
#                column of 'dataset' to get
#            batch_size : int
#                batch size
#            device : str
#                device to load data on
#            rootdir : str
#                root directory where to store embeddings
#            filename : str
#                name on which saving embeddings
#        
#        Return:
#        ------
#            List of embeddings
#    """
#
#    embs = []
#    # getting dataloaders
#    ids_dataloader, msk_dataloader = batching_data(dataset=dataset, tokenizer=tokenizer, batch_size=batch_size)
#    # disabling gradients computation --> I'm using a pre-trained net. Don't want to rewrite weights
#    with torch.no_grad():
#        # iterating through batches
#        for idx, (ids,msk) in enumerate(zip(ids_dataloader,msk_dataloader)):
#
#            # move data to device 
#            ids = ids.to(device)
#            msk = msk.to(device)
#
#            # 'forward pass'
#            outputs = model(ids,msk)
#
#            # extracting tensor at last layer : https://github.com/esrel/NLU.Lab.2022.Public/blob/master/notebooks/10_sequence_nn.ipynb
#            last_hidden_state = outputs.last_hidden_state
#
#            print(f"Done batch #{idx}")
#            print(last_hidden_state.cpu().numpy().shape)
#            embs.append(last_hidden_state)
#        if len(embs)>0:
#            print('Embs list has been successfully built')
#        else:
#            raise ValueError('List has not been filled')
#
#    return embs

In [53]:
#subj_obj_embs = np.load(rootdir+'subj_obj_10000.npy')
#for i in range(1000, 11000, 1000):
#    subj_obj_embs = np.concatenate((subj_obj_embs, np.load(rootdir+f'subj_obj_{i}.npy')), axis=0)

#save_embs(dataset=subj_obj_embs, filename='subj_obj_embs.npy')

In [54]:
#save_embs(embs=embs_per_batch_subj_obj, filename='subj_obj.npy')

In [55]:
#subj_obj = {}

# 64*15+40 = 1000
#for i in range(1000,11000,1000):
    #subj_obj[str(i)] = subj_obj_dataset[i-1000:i]
    #embs_per_batch_subj_obj = fine_tune_BERT(model=model, dataset=subj_obj_dataset[i-1000:i], tokenizer=tokenizer)
    #save_embs(embs_per_batch_subj_obj, f'subj_obj_{i}.npy')

In [56]:
#save_embs(embs=mr_embs, filename='pol_embs.npy')

In [57]:
#save_embs(embs=subj_embs, filename='subj_obj_embs.npy')