# Content to Embedding

In [1]:
import torch

# Prefer the GPU whenever PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


# Load the dataset

In [2]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Fetch the fake-news corpus from the Hugging Face Hub into ./dataset,
# reusing an existing cached copy when one is present.
dataset = load_dataset(
    "GonzaloA/fake_news",
    cache_dir="dataset",
    download_mode="reuse_cache_if_exists",
)

Repo card metadata block was not found. Setting CardData to empty.
Generating train split: 100%|██████████| 24353/24353 [00:00<00:00, 38969.44 examples/s]
Generating validation split: 100%|██████████| 8117/8117 [00:00<00:00, 37654.04 examples/s]
Generating test split: 100%|██████████| 8117/8117 [00:00<00:00, 32389.51 examples/s]


In [4]:
# Show what load_dataset returned: its Python type, its repr, and the split names.
print(
    f"Dataset Type: {type(dataset)}",
    f"{dataset}",
    f"Dataset keys: {dataset.keys()}",
    sep="\n",
)

Dataset Type: <class 'datasets.dataset_dict.DatasetDict'>
DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'label'],
        num_rows: 24353
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'label'],
        num_rows: 8117
    })
    test: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'label'],
        num_rows: 8117
    })
})
Dataset keys: dict_keys(['train', 'validation', 'test'])


In [5]:
# Pull the three splits out of the DatasetDict in one step.
train_dataset, val_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

# Each split should be an individual Dataset object.
print(
    f"Train dataset type: {type(train_dataset)}",
    f"Validation dataset type: {type(val_dataset)}",
    f"Test dataset type: {type(test_dataset)}",
    sep="\n",
)

Train dataset type: <class 'datasets.arrow_dataset.Dataset'>
Validation dataset type: <class 'datasets.arrow_dataset.Dataset'>
Test dataset type: <class 'datasets.arrow_dataset.Dataset'>


In [7]:
# Look at the first training record: field names, headline, body text and label.
first_record = train_dataset[0]
print(
    f"{first_record.keys()}",
    f"Title: {first_record['title']}",
    f"Text: {first_record['text']}",
    f"Label: {first_record['label']}",
    sep="\n",
)

dict_keys(['Unnamed: 0', 'title', 'text', 'label'])
Title:  ‘Maury’ Show Official Facebook Posts F*CKED UP Caption On Guest That Looks Like Ted Cruz (IMAGE)
Text: Maury is perhaps one of the trashiest shows on television today. It s right in line with the likes of the gutter trash that is Jerry Springer, and the fact that those shows are still on the air with the shit they air really is a sad testament to what Americans find to be entertaining. However, Maury really crossed the line with a Facebook post regarding one of their guest s appearance with a vile, disgusting caption on Tuesday evening.There was a young woman on there doing one of their episodes regarding the paternity of her child. However, on the page, the show posted an image of the woman, who happens to bear a striking resemblance to Senator and presidential candidate Ted Cruz. The caption from the Maury Show page read: The Lie Detector Test determined .that was a LIE!  Ted Cruz is just NOT that SEXY! As if that weren t ho

# Load Tokenizer and Model

In [8]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [9]:
# Checkpoint id shared by the tokenizer and the model. Keeping it in one place
# guarantees both are loaded from the same checkpoint. (The original used
# f-strings with no placeholders and repeated the literal twice.)
MODEL_NAME = "distilbert/distilbert-base-uncased"

# The tokenizer turns raw text into input ids and an attention mask.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the model with a sequence-classification head and move it to `device`.
# The head's weights are not in the checkpoint (see the warning printed below),
# so only the hidden states are meaningful until the model is fine-tuned.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Encode the first training article as PyTorch tensors, letting the tokenizer
# truncate over-long text, and place the tensors on the same device as the model.
inputs = tokenizer(
    train_dataset[0]['text'],
    truncation=True,
    return_tensors="pt",
).to(device)

# Show the encoded fields: token ids plus the attention mask.
print(
    f"Input keys: {inputs.keys()}",
    f"Input ids: {inputs['input_ids']}",
    f"Attention mask: {inputs['attention_mask']}",
    sep="\n",
)

Input keys: dict_keys(['input_ids', 'attention_mask'])
Input ids: tensor([[  101,  5003, 13098,  2003,  3383,  2028,  1997,  1996, 11669, 10458,
          3065,  2006,  2547,  2651,  1012,  2009,  1055,  2157,  1999,  2240,
          2007,  1996,  7777,  1997,  1996,  9535,  3334, 11669,  2008,  2003,
          6128, 17481,  1010,  1998,  1996,  2755,  2008,  2216,  3065,  2024,
          2145,  2006,  1996,  2250,  2007,  1996,  4485,  2027,  2250,  2428,
          2003,  1037,  6517,  9025,  2000,  2054,  4841,  2424,  2000,  2022,
         14036,  1012,  2174,  1010,  5003, 13098,  2428,  4625,  1996,  2240,
          2007,  1037,  9130,  2695,  4953,  2028,  1997,  2037,  4113,  1055,
          3311,  2007,  1037, 25047,  1010, 19424, 14408,  3258,  2006,  9857,
          3944,  1012,  2045,  2001,  1037,  2402,  2450,  2006,  2045,  2725,
          2028,  1997,  2037,  4178,  4953,  1996,  6986, 11795,  3012,  1997,
          2014,  2775,  1012,  2174,  1010,  2006,  1996,  3931, 

## Get the embeddings of the content

In [12]:
# Ask the model to return the activations of every layer alongside its usual output.
model.config.output_hidden_states = True

# Inference only: skip autograd bookkeeping during the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

# One tensor per layer; the final entry is the output of the last layer.
hidden_states = outputs.hidden_states
last_hidden_state = hidden_states[-1]

# Collapse the token axis by averaging to get a single vector per input text.
embeddings = torch.mean(last_hidden_state, dim=1)

In [17]:
# Report how many layer outputs came back and the shape of the final one.
print(
    f"There are {len(hidden_states)} hidden states",
    f"Shape of the last hidden state: {last_hidden_state.shape}",
    sep="\n",
)

There are 7 hidden states
Shape of the last hidden state: torch.Size([1, 512, 768])


In [18]:
# Show the pooled embedding: its shape first, then the values.
print(
    f"Embeddings shape: {embeddings.shape}",
    f"Embeddings: {embeddings}",
    sep="\n",
)

Embeddings shape: torch.Size([1, 768])
Embeddings: tensor([[-3.1861e-02,  4.9893e-02,  2.0683e-01,  2.3420e-02,  3.4852e-02,
         -8.9138e-02,  1.2603e-01,  6.1797e-01, -1.6187e-02, -8.3815e-02,
          7.8477e-02, -3.5581e-01, -2.5342e-01,  3.8328e-01, -2.8318e-01,
          4.2140e-01,  2.3357e-01,  1.2756e-01, -1.7217e-01,  3.9825e-01,
          2.3975e-01, -1.0083e-01,  3.2119e-02,  3.6854e-01,  2.3199e-01,
         -1.9419e-02,  6.1420e-02, -1.5400e-01, -1.5163e-01, -4.9244e-02,
          5.3825e-01, -1.7541e-01, -8.5408e-02, -1.3305e-01, -8.4796e-02,
         -2.1256e-01,  6.0817e-02, -8.8527e-02, -2.5797e-02,  2.2620e-01,
         -6.4063e-01, -2.0540e-01, -2.6151e-02,  8.0417e-03, -3.1432e-01,
         -2.7633e-01,  3.7468e-01,  4.8099e-02,  8.5311e-02,  8.4348e-03,
         -2.2293e-01,  4.1168e-01, -1.1108e-01,  5.8610e-02,  2.3323e-01,
          3.8766e-01, -1.2526e-01, -3.7300e-01, -5.1305e-01, -2.0898e-01,
          1.3066e-01,  8.2867e-02, -3.6408e-02, -5.3573e-01, 

# Create Class Model

Now that we know how to get the embeddings of the content, we can create a dataset class that takes the texts and labels and returns the embedding of each text together with its label.

In [46]:
# Check which class the training split is before wrapping it in a custom Dataset.
train_split_type = type(train_dataset)
print(train_split_type)

<class 'datasets.arrow_dataset.Dataset'>


In [29]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class CustomDataset(Dataset):
    """Map-style dataset that converts each text into a fixed-size embedding.

    Each lookup tokenizes one text, runs it through the pretrained model and
    returns the final-layer hidden state at sequence position 0 (the [CLS]
    token for BERT-style tokenizers) together with the matching label.

    Only the hidden states are used; the classification head that
    ``AutoModelForSequenceClassification`` adds is ignored.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of labels, aligned with ``texts``.
        model_name: Hugging Face checkpoint used for both tokenizer and model.
        max_length: Number of tokens every input is padded or truncated to.
    """

    def __init__(self, texts, labels, model_name='bert-base-uncased', max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(cls_embedding, label)`` for the sample at ``idx``.

        ``cls_embedding`` is a 1-D tensor of size ``hidden_dim`` taken from the
        last hidden layer at sequence position 0.
        """
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize to a (1, max_length) batch, which is what the model expects.
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )

        # A sequence-classification output has no `last_hidden_state`
        # attribute, so the hidden states must be requested explicitly.
        with torch.no_grad():
            outputs = self.model(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                output_hidden_states=True,
            )

        # hidden_states holds one tensor per layer; the final entry is the
        # last layer's output with shape (1, seq_len, hidden_dim).
        last_hidden_state = outputs.hidden_states[-1]

        # Pool by taking the first token of the only sequence in the batch.
        cls_embedding = last_hidden_state[0, 0]  # Shape: (hidden_dim,)

        return cls_embedding, label


In [30]:
# Wrap the training texts and their labels in the embedding dataset.
new_dataset = CustomDataset(
    texts=train_dataset['text'],
    labels=train_dataset['label'],
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# show the first element
print(f"Dataset length: {len(new_dataset)}")
print(f"Dataset first element: {new_dataset[0]}")

Dataset length: 24353
Dataset first element: (tensor([ 3.4876e-02, -5.6218e-01,  2.0681e-01, -5.0859e-02, -6.7468e-02,
        -8.2454e-01,  4.2402e-01,  5.0450e-01,  6.3672e-02, -2.4987e-01,
         2.3391e-01,  3.1432e-02, -2.4666e-01,  5.9632e-02,  4.0116e-01,
         3.6769e-01, -5.6225e-02,  6.0175e-01,  5.6815e-01,  2.3334e-01,
        -4.0697e-01, -5.5781e-01,  7.2422e-01, -5.2590e-01, -1.6810e-02,
        -2.6917e-01, -1.9714e-01, -2.8483e-01, -5.2153e-01, -4.6544e-02,
        -2.3110e-01,  6.0273e-02, -3.3898e-01, -3.2351e-01,  4.3331e-01,
        -4.3363e-01,  1.3181e-01,  9.5156e-02,  2.9911e-01,  2.3036e-01,
        -2.7992e-01,  4.2920e-01, -7.2668e-02,  2.4537e-02, -5.7243e-02,
        -2.2982e-01, -4.0438e+00,  4.4943e-01,  1.1087e-01, -2.8877e-01,
         5.0105e-01, -3.7387e-01, -2.3249e-01,  4.1634e-01,  4.5004e-01,
         4.4103e-01, -8.9959e-01, -8.5779e-02,  1.3592e-01, -1.5080e-01,
         1.4691e-01, -2.4806e-01, -1.0559e-01, -5.5534e-01, -1.4635e-01,
     