### Experimenting with BERTScore

In [None]:
import sys
# NOTE(review): absolute, machine-specific path to a local bert_score checkout.
# Unless the package is installed some other way, the import below only
# resolves on a machine where this directory exists.
sys.path.append(r"D:\COURS\A4\S8 - ESILV\Stage\Work\Repositories\bert_score")

# Aliased so that call sites read `bscore(...)`.
from bert_score.score import score as bscore

In [None]:
# Score one sentence against another, declaring the language as English.
# NOTE(review): the first list is presumably the candidates and the second the
# references, and the result presumably holds precision/recall/F1 — confirm
# against the bert_score documentation.
o = bscore(["I am Marius"], ["My name is marius"], lang="en")

### Experimenting with the RoBERTa tokenizer

In [None]:
from transformers import RobertaTokenizer

In [None]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
test = tokenizer("I am Marius")

In [None]:
out = tokenizer("I am Marius")

In [None]:
out

In [None]:
test.input_ids

### Experimenting with AutoModel class

In [None]:
from transformers import AutoModel
from transformers import RobertaTokenizer

In [None]:
model = AutoModel.from_pretrained("roberta-large")
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

In [None]:
sentence = "I am Marius"
# Ask for PyTorch tensors: without `return_tensors` the tokenizer returns plain
# Python lists, which the model's forward pass cannot consume.
tokens = tokenizer(sentence, return_tensors="pt")
token_ids = tokens["input_ids"]
masks = tokens["attention_mask"]

In [None]:
type(masks)

In [None]:
# Forward pass that also requests the hidden states (output_hidden_states=True).
# NOTE: rebinds `out`, which an earlier cell used for the tokenizer result.
# NOTE(review): not wrapped in torch.no_grad() — confirm gradients are not needed here.
out = model(token_ids, attention_mask=masks, output_hidden_states=True)

### BERT Embedding - Discovery

In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

# Load pretrained BERT configured to return hidden states on every forward pass.
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states=True)
# Put the model in evaluation mode; only inference is done in this notebook section.
model.eval()

In [None]:
# Tokenize a sentence that is wrapped by hand in BERT's [CLS] / [SEP] markers.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
text = "My name is Marius."
marked_text = f"[CLS]{text}[SEP]"
tokenized_text = tokenizer.tokenize(marked_text)
print(tokenized_text)

In [None]:
# Look up the vocabulary id of every token and print token/id pairs in two columns.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
for piece, vocab_id in zip(tokenized_text, indexed_tokens):
    print(f'{piece:<12}{vocab_id:>6,}')

In [None]:
# Mark every token as belonging to sentence 1 (the input is a single sentence).
segment_ids = [1 for _token in tokenized_text]
print(segment_ids)

In [None]:
# Wrap each id list in an outer list so the tensors get a leading batch axis of size 1.
tokens_tensor, segments_tensor = (
    torch.tensor([ids]) for ids in (indexed_tokens, segment_ids)
)
print(tokens_tensor, segments_tensor, sep="\n")

In [None]:
# Inference only: run the forward pass without gradient tracking.
with torch.no_grad():
    # NOTE(review): `segments_tensor` is passed positionally. BertModel's second
    # positional parameter is presumably `attention_mask`, not `token_type_ids` —
    # confirm, and prefer passing it by keyword to make the intent explicit.
    outputs = model(tokens_tensor, segments_tensor)
# Show which fields the model output exposes.
outputs.keys()

In [None]:
# Take the last field of the model output as the per-layer hidden states
# (the model was loaded with output_hidden_states=True).
# NOTE(review): positional access depends on which optional outputs are enabled;
# `outputs.hidden_states` is presumably the explicit equivalent — confirm.
bert_hidden_states = outputs[-1]
len(bert_hidden_states)

In [None]:
token_embeddings = torch.stack(bert_hidden_states, dim=0)
token_embeddings.size()

In [None]:
# There is only one sentence, so drop the size-1 batch axis (axis 1 after stacking).
token_embeddings = torch.squeeze(token_embeddings, dim=1)
token_embeddings.size()

In [None]:
# Swap axes 0 and 1 so that iterating the tensor yields one entry per token.
# NOTE: `token_embeddings` is reassigned, so re-running this cell swaps the axes back.
token_embeddings = token_embeddings.permute(1, 0, 2)
token_embeddings.size()

#### Word Vectors

In [None]:
# Concatenate the last four layers (last layer first) into one vector per token.
token_vec_cat = [
    torch.cat(tuple(token[-depth] for depth in (1, 2, 3, 4)), dim=0)
    for token in token_embeddings
]
print("Shape:", len(token_vec_cat),"x", len(token_vec_cat[0]))

In [None]:
# Sanity check: report every token whose concatenated vector is not 3072 long.
# The comparison has to sit outside len(): `len(vec != 3072)` measures the
# element-wise comparison tensor, so the condition was true for every token.
for i, vec in enumerate(token_vec_cat):
    if len(vec) != 3072:
        print(i)
        print(len(vec))

In [None]:
# Convert the per-token tensors into a single 2-D NumPy array, one row per token.
test = np.array([token.tolist() for token in token_vec_cat])

In [None]:
test.shape

In [None]:
# Sum the last four layers into one vector per token.
token_vec_sum = [token[-4:].sum(dim=0) for token in token_embeddings]
print("Shape:", len(token_vec_sum),"x", len(token_vec_sum[0]))

#### Sentence Vectors

In [None]:
# Sentence vector: average, over tokens, the second-to-last layer of the first
# (and only) sequence in the batch.
second_to_last_layer = bert_hidden_states[-2]
token_vecs = second_to_last_layer[0]
sentence_embedding = token_vecs.mean(dim=0)
sentence_embedding.size()

### Vector Visualization

In [None]:
from umap import UMAP
import plotly.express as px

In [None]:
# Reduce the token vectors held in `test` to 3 components with UMAP.
# random_state is fixed, presumably so that the projection is repeatable.
umap3D = UMAP(n_components=3, init='random', random_state=0)
proj3D = umap3D.fit_transform(test)

In [None]:
f = px.scatter_3d(proj3D, x=0, y=1, z=2)
f.show()

### BERT Sentence-level Embedding - Paragraphs 

In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

In [None]:
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
corpus = "Wikipedia is a multilingual, free, online encyclopedia written and maintained by a community of volunteers, known as Wikipedians, through open collaboration and using a wiki-based editing system called MediaWiki. Wikipedia is the largest and most-read reference work in history. It is consistently one of the 10 most popular websites ranked by Similarweb and formerly Alexa; as of 2023, Wikipedia was ranked the 5th most popular site in the world according to Semrush. It is hosted by the Wikimedia Foundation, an American non-profit organization funded mainly through donations."
delimiter="."
print(corpus)

In [None]:
# Split the corpus into sentences on the delimiter. Blank fragments are dropped:
# the corpus ends with the delimiter, so a plain split leaves a trailing empty
# piece that would otherwise become a spurious "." sentence.
splited = [sentence.strip() + delimiter
           for sentence in corpus.split(delimiter)
           if sentence.strip()]
# The character length of the longest sentence is used as a generous padding
# budget, capped so that max_len + 1 never exceeds BERT's 512 positions.
max_len = min(max(len(x) for x in splited), 511)

input_ids = []
attention_masks = []
for sentence in splited:
    encoded = tokenizer.encode_plus(sentence,
                                    add_special_tokens=True,
                                    max_length=max_len+1,
                                    padding='max_length',  # replaces the deprecated pad_to_max_length=True
                                    return_attention_mask=True,
                                    return_tensors='pt',
                                    truncation=True)
    input_ids.append(encoded["input_ids"])
    attention_masks.append(encoded["attention_mask"])

# Stack the per-sentence (1, max_len+1) tensors into one batch.
inputs_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

with torch.no_grad():
    output = model(inputs_ids, attention_mask=attention_masks)
# Use the final-layer vector at position 0 (the [CLS] slot) as the sentence embedding.
hidden_state = output.last_hidden_state
cls_emb = hidden_state[:,0,:]
cls_emb = cls_emb.detach().numpy()
np.shape(cls_emb)

In [None]:
cls_emb

In [None]:
len(splited)

### BERT Word-level Embedding - Paragraphs

In [None]:
from transformers import BertTokenizer, BertModel
import torch
from umap import UMAP
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

corpus = "Wikipedia is a multilingual, free, online encyclopedia written and maintained by a community of volunteers, known as Wikipedians, through open collaboration and using a wiki-based editing system called MediaWiki. Wikipedia is the largest and most-read reference work in history. It is consistently one of the 10 most popular websites ranked by Similarweb and formerly Alexa; as of 2023, Wikipedia was ranked the 5th most popular site in the world according to Semrush. It is hosted by the Wikimedia Foundation, an American non-profit organization funded mainly through donations."
delimiter="."
print(corpus)

In [None]:
len(splited[0])

In [None]:
# Cut the corpus into chunks of at most `input_size` space-separated words and
# encode every chunk as one fixed-length model input.
# NOTE(review): a word can expand into several sub-word tokens, so a chunk may
# exceed the token budget and then be silently shortened by `truncation=True`.
input_size = 512 - 1
corpusWords = corpus.split(" ")
splited = [" ".join(corpusWords[i:i+input_size]) for i in range(0, len(corpusWords), input_size)]

input_ids = []
attention_masks = []
for sentence in splited:
    encoded = tokenizer.encode_plus(sentence,
                                    add_special_tokens=True,
                                    max_length=input_size+1,
                                    padding='max_length',  # replaces the deprecated pad_to_max_length=True
                                    return_attention_mask=True,
                                    return_tensors='pt',
                                    truncation=True)
    input_ids.append(encoded["input_ids"])
    attention_masks.append(encoded["attention_mask"])

# `input_ids` stays a list of (1, input_size+1) tensors (a later cell reads it);
# the concatenated batch is stored under `inputs_ids`.
inputs_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

with torch.no_grad():
    output = model(inputs_ids, attention_mask=attention_masks)

In [None]:
"""
splited = [sentence+delimiter for sentence in corpus.split(".")]
max_len = max(len(x) for x in splited)

input_ids = []
attention_masks = []
for sentence in splited:
    encoded = tokenizer.encode_plus(sentence, 
                                    add_special_tokens=True,
                                    max_length=max_len+1,
                                    pad_to_max_length=True,
                                    return_attention_mask=True,
                                    return_tensors='pt',
                                    truncation=True)
    input_ids.append(encoded["input_ids"])
    attention_masks.append(encoded["attention_mask"])

#inputs_ids = torch.Tensor(len(input_ids),1, max_len+1)
#torch.cat(input_ids, out=inputs_ids)
inputs_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

with torch.no_grad():
    output = model(inputs_ids, attention_mask=attention_masks)
"""

In [None]:
hidden_states = output.hidden_states

In [None]:
token_embeddings = torch.stack(hidden_states, dim=0)
print(token_embeddings.size())
#token_embeddings = torch.squeeze(token_embeddings, dim=1)
#print(token_embeddings.size())


In [None]:
# Move the stacked-layer axis (axis 0 after torch.stack) to third position, so
# that iterating yields chunks, then tokens, then that token's per-layer vectors.
# Assumes each hidden state is (batch, tokens, hidden) — TODO confirm.
# NOTE: `token_embeddings` is reassigned; re-running this cell permutes the axes again.
token_embeddings = token_embeddings.permute(1,2,0,3)
print(token_embeddings.size())

In [None]:
# One vector per token: its last four layers concatenated, last layer first.
embs = [
    torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
    for batch in token_embeddings
    for token in batch
]

In [None]:
embs[0].size()

In [None]:
# Convert the token vectors into a single 2-D NumPy array, one row per token.
test = np.array([token.tolist() for token in embs])

In [None]:
# Reduce the token vectors to 3 components with UMAP. The result is transposed
# so that proj3D[0], proj3D[1] and proj3D[2] are the three coordinate arrays.
umap3D = UMAP(n_components=3, init='random', random_state=0)
proj3D = umap3D.fit_transform(test).T

In [None]:
def flatten(l):
    """Return one list holding the items of every sub-sequence of `l`, in order."""
    merged = []
    for sublist in l:
        merged.extend(sublist)
    return merged
# Turn the per-chunk id tensors into one flat sequence of token strings.
temp = flatten([batch.tolist() for batch in input_ids])
labels = tokenizer.convert_ids_to_tokens(np.array(temp).reshape(-1))

In [None]:
# Gather the three coordinate arrays and the token strings under named keys.
data = dict(
    x=proj3D[0],
    y=proj3D[1],
    z=proj3D[2],
    labels=labels,
)

In [None]:
# Keep only real tokens: drop padding and the [CLS] / [SEP] markers.
# `data` is rebuilt from `proj3D` and `labels` instead of being filtered in
# place, so re-running this cell gives the same result, and no per-item
# list-membership scan is needed.
special_tokens = {"[PAD]", "[CLS]", "[SEP]"}
token_indexes = [i for i, label in enumerate(labels) if label not in special_tokens]
data = {"x": [proj3D[0][i] for i in token_indexes],
        "y": [proj3D[1][i] for i in token_indexes],
        "z": [proj3D[2][i] for i in token_indexes],
        "labels": [labels[i] for i in token_indexes]}

In [None]:
# One trace per token, so each point carries its token as hover text and legend name.
traces = [
    go.Scatter3d(
        x=[x_val],
        y=[y_val],
        z=[z_val],
        mode='markers',
        marker=dict(size=6),
        text=[label],
        name=label
    )
    for x_val, y_val, z_val, label in zip(data['x'], data['y'], data['z'], data['labels'])
]

# Title and axis captions
scene_axes = dict(
    xaxis=dict(title='X'),
    yaxis=dict(title='Y'),
    zaxis=dict(title='Z')
)
layout = go.Layout(title='3D Scatter Plot', scene=scene_axes)

# Assemble the figure and render it
fig = go.Figure(data=traces, layout=layout)
fig.show()

### BERT Sentence Transformers

In [None]:
from sentence_transformers import SentenceTransformer
# NOTE: rebinds the notebook-level name `model`, replacing the BERT model loaded earlier.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Sentences to encode
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.']

# Encode all sentences with a single call to model.encode()
embeddings = model.encode(sentences)

# Print each sentence next to its embedding.
# The loop variable `embedding` stays bound after the loop (last sentence's
# vector) and is read by the following cell.
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

In [None]:
embedding.shape

### Embedding a corpus

In [6]:
from transformers import BertTokenizer, BertModel
import torch
from umap import UMAP
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

In [7]:
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

corpus = "Wikipedia is a multilingual, free, online encyclopedia written and maintained by a community of volunteers, known as Wikipedians, through open collaboration and using a wiki-based editing system called MediaWiki. Wikipedia is the largest and most-read reference work in history. It is consistently one of the 10 most popular websites ranked by Similarweb and formerly Alexa; as of 2023, Wikipedia was ranked the 5th most popular site in the world according to Semrush. It is hosted by the Wikimedia Foundation, an American non-profit organization funded mainly through donations."
delimiter="."
print(corpus)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Wikipedia is a multilingual, free, online encyclopedia written and maintained by a community of volunteers, known as Wikipedians, through open collaboration and using a wiki-based editing system called MediaWiki. Wikipedia is the largest and most-read reference work in history. It is consistently one of the 10 most popular websites ranked by Similarweb and formerly Alexa; as of 2023, Wikipedia was ranked the 5th most popular site in the world according to Semrush. It is hosted by the Wikimedia Foundation, an American non-profit organization funded mainly through donations.


In [8]:
def tokenizeCorpus(corpus, model=BertModel.from_pretrained('bert-base-uncased', 
                                                           output_hidden_states=True), 
                           tokenizer = BertTokenizer.from_pretrained('bert-base-uncased'), 
                           model_input_size=512):
    """Run `model` over `corpus`, split into fixed-size token chunks.

    The corpus is tokenized once, cut into chunks that fit the model together
    with the tokenizer's special tokens, and every chunk is padded to
    `model_input_size`. Chunking is done on tokens rather than on
    space-separated words, so no text is lost to truncation when a word expands
    into several sub-word tokens. A chunk boundary may fall between two
    sub-word pieces of the same word.

    Note: the default `model` and `tokenizer` are created once, when this
    function is defined, and are shared by every call that relies on them.

    Args:
        corpus: text to embed.
        model: callable invoked as `model(input_ids, attention_mask=...)`.
        tokenizer: tokenizer matching `model`.
        model_input_size: token positions per chunk, special tokens included.

    Returns:
        A tuple `(output, labels)`: the model output for the batch of chunks,
        and the flat list of token strings (padding and special tokens
        included) in the same chunk-major order as the model input.

    Raises:
        ValueError: if `model_input_size` leaves no room for real tokens.
    """
    # Positions left for real tokens once the special tokens are accounted for.
    chunk_size = model_input_size - tokenizer.num_special_tokens_to_add(pair=False)
    if chunk_size <= 0:
        raise ValueError("model_input_size is too small to hold the special tokens")

    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(corpus))

    rows = []
    masks = []
    # max(..., 1) guarantees one (empty) chunk even for an empty corpus.
    for start in range(0, max(len(token_ids), 1), chunk_size):
        ids = tokenizer.build_inputs_with_special_tokens(token_ids[start:start + chunk_size])
        padding = model_input_size - len(ids)
        rows.append(ids + [tokenizer.pad_token_id] * padding)
        masks.append([1] * len(ids) + [0] * padding)

    inputs_ids = torch.tensor(rows)
    attention_masks = torch.tensor(masks)

    labels = tokenizer.convert_ids_to_tokens(inputs_ids.flatten().tolist())
    with torch.no_grad():
        output = model(inputs_ids, attention_mask=attention_masks)
    return output, labels

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
def vectorizeCorpus(model_output, allStates=True):
    """Build one embedding vector per token from a model output.

    With `allStates=True` the vector is the concatenation of the last four
    hidden-state layers, last layer first. With `allStates=False` only
    `last_hidden_state` is available, so the vector is that single layer
    (previously this path raised an IndexError when indexing the missing layers).
    If fewer than four layers are present, all of them are used.

    Args:
        model_output: object exposing `hidden_states` (sequence of tensors) and
            `last_hidden_state`. Assumes each tensor is (batch, tokens, hidden)
            — TODO confirm for models other than the one used in this notebook.
        allStates: whether to use all hidden states or only the last one.

    Returns:
        List of 1-D tensors, one per token position, in batch-major order.
    """
    if allStates:
        hidden_states = model_output.hidden_states
    else:
        hidden_states = [model_output.last_hidden_state]
    # (layers, batch, tokens, hidden) -> (batch, tokens, layers, hidden)
    token_embeddings = torch.stack(tuple(hidden_states), dim=0).permute(1, 2, 0, 3)
    n_layers = min(4, token_embeddings.size(2))
    # Take the last n layers and reverse them so the last layer comes first.
    top_layers = token_embeddings[:, :, -n_layers:, :].flip(dims=[2])
    # Merge (layers, hidden) into one feature axis and (batch, tokens) into rows.
    flat = top_layers.reshape(-1, n_layers * token_embeddings.size(-1))
    return list(flat)

In [10]:
def visualizeCorpus(embs, labels):
    """Project token embeddings to 3-D with UMAP and show an interactive scatter plot.

    The projection is fitted on every embedding; points labelled "[PAD]",
    "[CLS]" or "[SEP]" are then left out of the plot. Each remaining token is
    drawn as its own trace so that it appears by name in the legend.

    Args:
        embs: sequence of 1-D tensors, one per token.
        labels: token strings aligned one-to-one with `embs`.

    Raises:
        ValueError: if `embs` and `labels` differ in length, since the points
            and their labels could not be paired reliably.
    """
    if len(embs) != len(labels):
        raise ValueError(
            "embs and labels must have the same length "
            "(got {} and {})".format(len(embs), len(labels))
        )

    formated_embs = np.array([token.tolist() for token in embs])
    umap3D = UMAP(n_components=3, init='random', random_state=0)
    # Transposed so that rows 0, 1, 2 are the x, y, z coordinate arrays.
    proj3D = umap3D.fit_transform(formated_embs).T

    # Set membership keeps the filter linear in the number of tokens.
    special_tokens = {"[PAD]", "[CLS]", "[SEP]"}
    token_indexes = [i for i, label in enumerate(labels) if label not in special_tokens]

    traces = [
        go.Scatter3d(
            x=[proj3D[0][i]],
            y=[proj3D[1][i]],
            z=[proj3D[2][i]],
            mode='markers',
            marker=dict(size=6),
            text=[labels[i]],
            name=labels[i]
        )
        for i in token_indexes
    ]

    # Create layout
    layout = go.Layout(
        title='3D Scatter Plot',
        scene=dict(
            xaxis=dict(title='X'),
            yaxis=dict(title='Y'),
            zaxis=dict(title='Z')
        )
    )

    # Create figure
    fig = go.Figure(data=traces, layout=layout)

    # Show the plot
    fig.show()

In [None]:
# End-to-end run on the sample corpus: embed, build per-token vectors, plot.
# o: model output, l: token labels, v: per-token vectors.
o, l = tokenizeCorpus(corpus)
v = vectorizeCorpus(o)
visualizeCorpus(v, l)

### Test on billsum

In [2]:
import pandas as pd

In [3]:
# Turn the Google Drive share link into a direct-download URL (the file id is
# the second-to-last path segment), then load the JSON-lines file and keep only
# the "text" and "summary" columns.
share_link = "https://drive.google.com/file/d/1Wd0M3qepNF6B4YwFYrpo7CaSERpudAG_/view?usp=share_link"
file_id = share_link.split('/')[-2]
dataset_url = 'https://drive.google.com/uc?export=download&id=' + file_id
dataset = pd.read_json(dataset_url, lines=True).loc[:, ["text", "summary"]]

In [4]:
elem0 = dataset.iloc[0, 0]
print(elem0)

SECTION 1. SHORT TITLE.

    This Act may be cited as the ``National Science Education Tax 
Incentive for Businesses Act of 2007''.

SEC. 2. CREDITS FOR CERTAIN CONTRIBUTIONS BENEFITING SCIENCE, 
              TECHNOLOGY, ENGINEERING, AND MATHEMATICS EDUCATION AT THE 
              ELEMENTARY AND SECONDARY SCHOOL LEVEL.

    (a) In General.--Subpart D of part IV of subchapter A of chapter 1 
of the Internal Revenue Code of 1986 (relating to business related 
credits) is amended by adding at the end the following new section:

``SEC. 45O. CONTRIBUTIONS BENEFITING SCIENCE, TECHNOLOGY, ENGINEERING, 
              AND MATHEMATICS EDUCATION AT THE ELEMENTARY AND SECONDARY 
              SCHOOL LEVEL.

    ``(a) In General.--For purposes of section 38, the elementary and 
secondary science, technology, engineering, and mathematics (STEM) 
contributions credit determined under this section for the taxable year 
is an amount equal to 100 percent of the qualified STEM contributions 
of the taxp

In [11]:
# Same pipeline as for the sample corpus, applied to the first bill text.
# o: model output, l: token labels, v: per-token vectors.
o, l = tokenizeCorpus(elem0)
v = vectorizeCorpus(o)
visualizeCorpus(v, l)



### HDBSCAN (DBSCAN temporary)

In [None]:
! pip install hdbscan