In [21]:
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

paragraph = "This is a sample sentence.\n\nTwo sentences are form a paragraph."

# Add the special tokens for all the sentences.

sentences = paragraph.split(".")

for i in range(len(sentences)):
    sentences[i] = "[CLS] " + sentences[i] + " [SEP]"

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

print ("Tokenize the first sentence:")
print (tokenized_texts[0])

print ("Tokenize the second sentence:")
print (tokenized_texts[1])

# Now, we flatten the list.

flattened_list = [y for x in tokenized_texts for y in x]

print ("Flatten the list:")
print (flattened_list)


Tokenize the first sentence:
['[CLS]', 'this', 'is', 'a', 'sample', 'sentence', '[SEP]']
Tokenize the second sentence:
['[CLS]', 'two', 'sentences', 'are', 'form', 'a', 'paragraph', '[SEP]']
Flatten the list:
['[CLS]', 'this', 'is', 'a', 'sample', 'sentence', '[SEP]', '[CLS]', 'two', 'sentences', 'are', 'form', 'a', 'paragraph', '[SEP]', '[CLS]', '[SEP]']


In [22]:
from transformers import DistilBertModel, DistilBertTokenizer
import torch

# Load the pre-trained model and tokenizer
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Encode the sentences
encoded_inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Get the embeddings
with torch.no_grad():
    outputs = model(**encoded_inputs)

# Extract the embeddings
embeddings = outputs.last_hidden_state

# Print the embeddings
print(embeddings)


tensor([[[-0.2112, -0.0557,  0.1161,  ..., -0.0561,  0.0115,  0.3577],
         [-0.1860, -0.0776,  0.1181,  ..., -0.0676,  0.0334,  0.3792],
         [-0.7086, -0.4587,  0.0367,  ..., -0.2837, -0.0278,  0.5087],
         ...,
         [ 0.9535,  0.1016, -0.3307,  ...,  0.2683, -0.8601, -0.1683],
         [ 0.9535,  0.1017, -0.3307,  ...,  0.2682, -0.8601, -0.1683],
         [-0.1686, -0.2987,  0.0707,  ...,  0.0899,  0.0276,  0.4075]],

        [[-0.2322,  0.0139,  0.0940,  ..., -0.0257,  0.1141,  0.2558],
         [-0.2110,  0.0233,  0.0991,  ..., -0.0474,  0.1314,  0.2686],
         [-0.2774, -0.3501,  0.0289,  ..., -0.1392,  0.2158, -0.0511],
         ...,
         [ 0.2917, -0.1940, -0.1034,  ...,  0.2925,  0.0747, -0.2758],
         [ 0.9654,  0.2049, -0.2678,  ...,  0.2712, -0.7869, -0.1975],
         [ 0.9654,  0.2050, -0.2677,  ...,  0.2712, -0.7869, -0.1975]],

        [[-0.2519,  0.0543,  0.0957,  ..., -0.0635,  0.0188,  0.1685],
         [-0.2274,  0.0480,  0.0948,  ..., -0

In [58]:
def preprocess(raw_text, tokenizer, max_pos=512):
    """
    - Remove \n
    - Sentence Tokenize
    - Add [SEP] [CLS] as sentence boundary
    """

    sentences = raw_text.split("\n\n")
    sentences = ["[CLS] " + sent + " [SEP]" for sent in sentences]
    paragraph = " ".join(sentences)

    return paragraph, len(sentences)

def load_text(paragraph, tokenizer, max_pos=512, device="cpu"):
    sep_vid = tokenizer.vocab["[SEP]"]
    cls_vid = tokenizer.vocab["[CLS]"]

    paragraph = paragraph.strip().lower()
    paragraph = paragraph.replace("[cls]", "[CLS]")
    paragraph = paragraph.replace("[sep]", "[SEP]")

    paragraph_subtokens = tokenizer.tokenize(paragraph)
    paragraph_subtokens = ["[CLS]"] + paragraph_subtokens + ["[SEP]"]

    print(paragraph)

    subtokens_idxs = tokenizer.convert_tokens_to_ids(paragraph_subtokens)  

    print(subtokens_idxs)

    subtokens_idxs = subtokens_idxs[:-1][:max_pos]

    subtokens_idxs[-1] = sep_vid

    _segs = [-1] + [i for i, t in enumerate(subtokens_idxs) if t == sep_vid]
    segs = [_segs[i] - _segs[i-1] for i in range(1, len(_segs))]

    segments_ids = []
    for i, s in enumerate(segs):
        if (i % 2 == 0):
            segments_ids += s * [0]
        else:
            segments_ids += s * [1]

    src = torch.tensor(subtokens_idxs)[None, :].to(device)
    mask_src = (1 - (src == 0).float()).to(device)
    cls_ids = [[i for i, t in enumerate(subtokens_idxs) if t == cls_vid]]
    clss = torch.tensor(cls_ids).to(device)
    mask_cls = 1 - (clss == -1).float()
    clss[clss == -1] = 0

    segs = torch.tensor(segments_ids).to(device)
    src_text = [[sent.replace("[SEP]", "").strip() for sent in paragraph.split("[CLS]")]]

    return src, mask_src, segs, clss, mask_cls, src_text


In [59]:
paragraph = "This.\n\nTwo sentences."
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

paragraph, nb_sent = preprocess(paragraph, tokenizer)

print(paragraph)
print(nb_sent)

src, mask_src, segs, clss, mask_cls, src_txt = load_text(paragraph, tokenizer)

print(src)
print(mask_src)
print(segs)
print(clss)
print(mask_cls)

[CLS] This. [SEP] [CLS] Two sentences. [SEP]
2
[CLS] this. [SEP] [CLS] two sentences. [SEP]
[101, 101, 2023, 1012, 102, 101, 2048, 11746, 1012, 102, 102]
tensor([[  101,   101,  2023,  1012,   102,   101,  2048, 11746,  1012,   102]])
tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])
tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
tensor([[0, 1, 5]])
tensor([[1., 1., 1.]])


In [60]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [64]:
import nltk
from nltk import sent_tokenize

nltk.download('punkt')

def preprocess(text):
    sentences = sent_tokenize(text)
    tokenized_text = ['[CLS] ' + sent + ' [SEP]' for sent in sentences]
    return tokenized_text

[nltk_data] Downloading package punkt to /home/jose/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [65]:
example_text = "This is a sample sentence. The model is able to predict the sentiment of the sentence."

tokenized_text = preprocess(example_text)

print(tokenized_text)

['[CLS] This is a sample sentence. [SEP]', '[CLS] The model is able to predict the sentiment of the sentence. [SEP]']


In [70]:
def initialize_segment_embeddings(model):
    ea_embedding = torch.nn.Embedding(2, model.config.dim)
    eb_embedding = torch.nn.Embedding(2, model.config.dim)
    return ea_embedding, eb_embedding

def encode_sentences(sentences, tokenizer):
    input_ids = []
    attention_masks = []
    segment_ids = []  # Keep track of segments (odd or even)

    for i, sent in enumerate(sentences):
        encoded_dict = tokenizer.encode_plus(
                            sent,
                            add_special_tokens = False,
                            max_length = 128,
                            pad_to_max_length = True,
                            return_attention_mask = True,
                            return_tensors = 'pt',
                       )
        
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
        segment_ids.append(torch.full((1, 128), i % 2))  # 0 for even, 1 for odd sentences
    
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    segment_ids = torch.cat(segment_ids, dim=0)
    
    return input_ids, attention_masks, segment_ids

def add_segment_embeddings(embeddings, segment_ids, ea_embedding, eb_embedding):
    # Get the batch size and sequence length from embeddings
    batch_size, seq_length, hidden_size = embeddings.size()

    # Expand segment embeddings to match the dimensions of BERT embeddings
    ea_embeddings = ea_embedding(segment_ids).view(batch_size, seq_length, hidden_size)
    eb_embeddings = eb_embedding(segment_ids).view(batch_size, seq_length, hidden_size)

    # Add segment embeddings to the original embeddings
    enhanced_embeddings = embeddings + torch.where(segment_ids.unsqueeze(-1) == 0, ea_embeddings, eb_embeddings)

    return enhanced_embeddings

In [67]:
def get_embeddings(input_ids, attention_masks, model):
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_masks)
    return outputs.last_hidden_state

In [68]:
def extract_sentence_embeddings(embeddings):
    # Assuming that the first token of each sentence is [CLS]
    return embeddings[:,0,:]

In [71]:
def pipeline(text, model, tokenizer, ea_embedding, eb_embedding):
    sentences = preprocess(text)
    input_ids, attention_masks, segment_ids = encode_sentences(sentences, tokenizer)
    embeddings = get_embeddings(input_ids, attention_masks, model)
    embeddings = add_segment_embeddings(embeddings, segment_ids, ea_embedding, eb_embedding)
    sentence_embeddings = extract_sentence_embeddings(embeddings)
    return sentences, sentence_embeddings

sentences, sentence_embeddings = pipeline(example_text, model, tokenizer, *initialize_segment_embeddings(model))

print(sentences)
print(sentence_embeddings)

['[CLS] This is a sample sentence. [SEP]', '[CLS] The model is able to predict the sentiment of the sentence. [SEP]']
tensor([[ 0.1718,  1.3256, -0.1499,  ...,  0.8166,  0.3480, -0.3955],
        [-2.4270, -0.9475, -2.0156,  ...,  0.0617,  0.4753,  0.7546]],
       grad_fn=<SliceBackward0>)


In [72]:
import torch.nn as nn

# Define the linear layer
linear_layer = nn.Linear(in_features=sentence_embeddings.size(1), out_features=1)

# Apply the linear layer to the sentence embeddings
classification_output = linear_layer(sentence_embeddings)

# Apply sigmoid activation function
probabilities = nn.functional.sigmoid(classification_output)

# Print the probabilities
print(probabilities)


tensor([[0.2615],
        [0.4197]], grad_fn=<SigmoidBackward0>)


In [77]:
text = 'a X b X c'
text = text.split(' X ')[1:]
print(" ".join(text))

b c
