<a href="https://colab.research.google.com/github/MuraliB123/MLOPS/blob/master/BERT_LR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import torch

In [None]:
!pip install datasets -q

In [None]:
!pip install transformers -q

In [None]:
from transformers import BertModel, BertTokenizerFast

# Tokenizer and encoder are loaded from the same 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
# output_hidden_states=True is required later: the embedding code reads the
# per-layer hidden states from the model output.
model = BertModel.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=True,
)

In [None]:
import datasets

# NOTE: despite the variable name, this loads the "test" split of FiNER-139.
finer_train = datasets.load_dataset("nlpaueb/finer-139", split="test")

In [None]:
# Keep only the first 30 examples so the rest of the notebook runs quickly.
# The name is rebound in place, so the full split is no longer reachable
# without re-running the loading cell.
finer_train = finer_train.select(range(30))

In [None]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize pre-split words and expand word-level tags to token-level labels.

    The tokenizer may split one word into several sub-word tokens, so the
    per-word tags in ``examples["ner_tags"]`` are re-aligned to the tokens.

    Args:
        examples: A batch with a ``"tokens"`` entry (one list of words per
            example) and a ``"ner_tags"`` entry (one list of per-word labels
            per example).
        label_all_tokens: If True (default), every sub-word token inherits the
            label of the word it came from. If False, only the first token of
            each word keeps the label and the following pieces get -100.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one label list
        per example, aligned position-by-position with the produced tokens.
        Tokens that map to no source word (e.g. special tokens) get -100.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        # word_ids[k] is the index of the original word token k came from,
        # or None when the token does not come from any word.
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First piece of a word, or a continuation piece that should
                # inherit the word's label.
                label_ids.append(label[word_idx])
            else:
                # Continuation piece with label_all_tokens=False: mark as ignored.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# batched=True hands the function whole batches, which is why it indexes
# examples["tokens"] / examples["ner_tags"] as lists of per-example lists.
sample_tokenised_dataset = finer_train.map(tokenize_and_align_labels, batched=True)


In [None]:
# Show the columns and row count after tokenization.
print(sample_tokenised_dataset)

Dataset({
    features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 30
})


In [None]:
# input_ids - integer encoding of the tokens
# labels    - corresponding tags

In [None]:
import torch
from collections import OrderedDict

def bert_text_preparation(tokenized_text,indexed_tokens,tokenizer):
    """Package one sequence as batch-of-one tensors for the model.

    Args:
        tokenized_text: Token strings of the sequence; returned unchanged.
        indexed_tokens: Token ids of the sequence.
        tokenizer: Not used by this function; kept for call compatibility.

    Returns:
        A tuple ``(tokenized_text, tokens_tensor, segments_tensor)`` where
        ``tokens_tensor`` holds the ids with a leading batch dimension of 1 and
        ``segments_tensor`` is a same-shaped tensor filled with ones.
    """
    ones_row = [1] * len(indexed_tokens)
    # Wrapping each row in an outer list adds the batch dimension.
    ids_batch = torch.tensor([indexed_tokens])
    ones_batch = torch.tensor([ones_row])
    return tokenized_text, ids_batch, ones_batch

def get_bert_embeddings(tokens_tensor, segments_tensor, model):
    """Return one contextual vector per token: the sum of the last four hidden layers.

    Args:
        tokens_tensor: Token ids for a single sequence, shape [1, x].
        segments_tensor: Tensor of the same shape, forwarded to the model as
            its attention mask (see the note in the body).
        model: Callable accepting ``input_ids`` and ``attention_mask`` keyword
            arguments whose output, indexed at 2, is the sequence of per-layer
            hidden states (each of shape [1, x, hidden]).

    Returns:
        A list of x tensors, each of shape [hidden], in token order.

    Raises:
        ValueError: If the hidden states do not have a batch size of exactly 1.
    """
    with torch.no_grad():
        # The second positional parameter of Hugging Face's BertModel.forward is
        # attention_mask, not token_type_ids, so the original positional call
        # already used this tensor as the attention mask. Passing it by keyword
        # keeps that behaviour and makes it visible.
        outputs = model(input_ids=tokens_tensor, attention_mask=segments_tensor)
        hidden_states = outputs[2]

    # Only the last four layers contribute, so stack just those:
    # [4, batch, x, hidden]
    last_four = torch.stack(tuple(hidden_states[-4:]), dim=0)
    if last_four.dim() != 4 or last_four.size(1) != 1:
        raise ValueError(
            "get_bert_embeddings expects hidden states for a single sequence "
            "(batch size 1), got stacked shape {}".format(tuple(last_four.shape))
        )

    # Sum over the layer axis, then drop the batch axis -> [x, hidden]
    token_vecs = last_four.sum(dim=0).squeeze(0)

    # One tensor per token, matching the list the callers index into.
    return list(token_vecs.unbind(dim=0))

In [None]:
input_ids    = sample_tokenised_dataset["input_ids"]
input_tags   = sample_tokenised_dataset["labels"]

# Three parallel lists: entry k of each one describes the same token.
context_embeddings = []
final_tokens = []
token_labels   = []

for sent_idx in range(len(input_ids)):
    tags = input_tags[sent_idx]
    ids  = input_ids[sent_idx]
    # Decoding the ids one at a time yields one string per token.
    sentence = tokenizer.batch_decode(ids)

    tokenized_text, tokens_tensor, segments_tensor = bert_text_preparation(sentence, ids, tokenizer)
    list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensor, model)

    # Token string, embedding and label at the same position belong together,
    # so walk them in lock-step. Positions labelled -100 (tokens that map to no
    # source word, such as the leading/trailing special tokens) are skipped for
    # all three lists at once, which keeps them the same length by construction.
    for token, token_vec, tag in zip(tokenized_text, list_token_embeddings, tags):
        if tag == -100:
            continue
        final_tokens.append(token)
        context_embeddings.append(token_vec)
        token_labels.append(tag)


In [None]:
# Sanity check: the three lists are combined column-wise into a DataFrame
# afterwards, so their lengths must be equal.
print(len(context_embeddings),len(final_tokens),len(token_labels))

1792 1792 1792


In [None]:
# One row per token: its string, its contextual embedding and its label.
# (Named token_columns rather than `hash` so the builtin hash() is not shadowed.)
token_columns = {'token' : final_tokens,'embedding' : context_embeddings,'label' : token_labels}
dataset = pd.DataFrame(token_columns)
print(dataset.head())

     token                                          embedding  label
0      the  [tensor(-2.4772), tensor(-4.8490), tensor(1.76...      0
1  changes  [tensor(-2.8018), tensor(1.4222), tensor(2.313...      0
2       in  [tensor(-5.0774), tensor(0.6679), tensor(-2.34...      0
3      the  [tensor(-1.6575), tensor(-1.2954), tensor(1.04...      0
4     fair  [tensor(4.8718), tensor(-0.2470), tensor(3.063...      0


In [None]:
from sklearn.decomposition import PCA
# Targets: one label per token.
Y = dataset["label"]
# Stack the per-token embedding vectors into a single 2-D matrix
# of shape [n_tokens, embedding_dim].
embedding_matrix = np.stack([np.asarray(embedding) for embedding in dataset["embedding"]])
# Reduce to 10 components. random_state pins the result in case scikit-learn
# selects its randomized SVD solver for an input of this size; the seed matches
# the one used for the train/test split.
pca = PCA(n_components=10, random_state=1234)
X = pca.fit_transform(embedding_matrix)

In [None]:
# Show the PCA-reduced feature matrix (NumPy abbreviates large arrays with "...").
print(X)

[[ 16.3946      -4.8062944    0.7270676  ...  -5.1078415   -1.4797348
    3.8995507 ]
 [ 13.587046     2.594197    10.825192   ...  -0.39473566  -6.480942
    8.210312  ]
 [ 19.653809     1.6561687   -1.7764463  ...   2.5668957   -9.5782
   10.238576  ]
 ...
 [-40.628166    19.958746   -12.894397   ...  -5.8652463   12.640239
    5.40856   ]
 [ -7.0131874   18.649908   -22.009874   ...  -3.2943673    8.68596
    7.8456564 ]
 [-17.449858   -16.172169    -0.29137462 ...   3.4184458   -2.55471
    2.6662385 ]]


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Hold out 20% of the rows for evaluation; random_state makes the shuffle repeatable.
# NOTE(review): rows are individual tokens, so tokens from one sentence can end up
# on both sides of the split — confirm this is acceptable for the evaluation.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1234)

In [None]:
from sklearn.multiclass import OneVsRestClassifier

# One-vs-rest logistic regression for the multi-class token labels.
# - OneVsRestClassifier replaces LogisticRegression(multi_class='ovr'), which
#   scikit-learn has deprecated.
# - The estimator is bound to `classifier`, not `model`, so the BERT encoder
#   stored in `model` stays available if the embedding cell is re-run.
classifier = OneVsRestClassifier(LogisticRegression())
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
# NOTE(review): plain accuracy can look high when one label dominates —
# consider per-class metrics before drawing conclusions.
accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9916434540389972
