In [3]:
import torch

# Run on the GPU when CUDA is usable; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load Dataset

In [6]:
from datasets import load_dataset

# Load the GonzaloA/fake_news dataset, caching it under the local `datasets`
# directory and reusing a cached copy when one already exists.
dataset = load_dataset(
    "GonzaloA/fake_news",
    cache_dir="datasets",
    download_mode="reuse_cache_if_exists",
)

Repo card metadata block was not found. Setting CardData to empty.


Generating train split:   0%|          | 0/24353 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8117 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8117 [00:00<?, ? examples/s]

In [8]:
# Show the DatasetDict summary, then pull out each split by name.
print(f"Dataset: {dataset}")
train_dataset, val_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

Dataset: DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'label'],
        num_rows: 24353
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'label'],
        num_rows: 8117
    })
    test: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'label'],
        num_rows: 8117
    })
})


In [10]:
# Use the body of the first test article as the running example.
first_test_row = test_dataset[0]
text = first_test_row["text"]
print(f"Text: {text}")

Text: JOE DIGENOVA has been around D.C for decades and has seen it all. He probably didn t see his one coming. The incoming president  was set-up to be taken down. A soft coup is in the works and DiGenova has this to say about it:"It's very clear that they conspired to frame the incoming President of the United States."  Joe diGenova on allegations of anti-Trump bias at FBI and TheJusticeDept #Tucker https://t.co/qUNjAenzJc pic.twitter.com/VDlhb45Ghi  G. Ashley Hawkins (@g_ashleyhawkins) December 16, 2017DiGenova on Tucker Carlson tonight: Inside the FBI and Department of Justice under Obama was a brazen plot to do two things. To exonerate Hillary Clinton because of an animous for Donald Trump, and then if she lost to frame the incoming president for either a criminal act or impeachment. This is one of the most disgusting performances by the senior officials at the FBI and the Department of Justice that everyone of these agents should be fired and the people who are still in the Justic

# Direct Text Classification (Pipeline)

In [4]:
# Base checkpoint name used to build the model id; one of:
# ['bert-base-uncased', 'distilbert-base-uncased', 'roberta-base']
model_name = "roberta-base"

In [12]:
# High-level helper: build a text-classification pipeline around the selected
# checkpoint, placed on `device` and with truncation enabled.
from transformers import pipeline

pipe = pipeline(
    "text-classification",
    model=f"LittleFish-Coder/{model_name}-fake-news-tfg",
    device=device,
    truncation=True,
)

In [13]:
# Classify the sample text; as the cell's last expression, the result is displayed.
pipe(text)

[{'label': 'fake', 'score': 0.9998623132705688}]

# Tokenizer and Pretrained Model

In [16]:
# Load the tokenizer and the sequence-classification model directly
from transformers import AutoModelForSequenceClassification, AutoTokenizer

checkpoint = f"LittleFish-Coder/{model_name}-fake-news-tfg"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

## Predict via tokenizer & model

### Tokenize the text and get the class

In [20]:
# Encode the sample as PyTorch tensors, with truncation enabled.
inputs = tokenizer(text, truncation=True, return_tensors="pt")

In [22]:
# Forward pass without gradient tracking; only the logits are needed here.
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Take the argmax along the last (class) dimension rather than over the
# flattened tensor, so the index is always a class id. `.item()` still
# requires exactly one example and raises otherwise, instead of silently
# returning an index that is not a class id.
predicted_class_id = logits.argmax(dim=-1).item()

# Map the class id back to its label via the model config.
prediction = model.config.id2label[predicted_class_id]
print(f"Prediction: {prediction}")

Prediction: fake


### Get the embedding of the text

In [23]:
# Ask for the hidden states on this call only, instead of setting
# `model.config.output_hidden_states`, which would change the behavior of
# every later forward pass of this model object.
with torch.no_grad():  # inference only, no gradients needed
    outputs = model(**inputs, output_hidden_states=True)

# `outputs.hidden_states` is a sequence of per-layer hidden states; its final
# entry is the last layer's output.
hidden_states = outputs.hidden_states
last_hidden_state = hidden_states[-1]

# Mean-pool over dim=1 (the token axis) to get one vector per input.
# NOTE(review): this is an unmasked mean, which is only appropriate while the
# input carries no padding tokens — confirm before batching several texts.
embeddings = last_hidden_state.mean(dim=1)

In [25]:
# Report the pooled embedding's shape, then its values.
print(
    f"Embeddings shape: {embeddings.shape}",
    f"Embeddings: {embeddings}",
    sep="\n",
)

Embeddings shape: torch.Size([1, 768])
Embeddings: tensor([[-2.9920e-02, -1.7140e-01,  2.7610e-01,  1.0799e-01, -1.0619e+00,
          1.6642e+00, -3.8511e-01, -6.5864e-01, -2.8680e-01, -4.4714e-01,
         -4.9551e-01, -3.3066e-01, -1.0937e-01,  5.8195e-01,  9.2182e-01,
          3.2029e-01, -4.4218e-02,  4.2980e-01, -8.0234e-01, -8.4733e-01,
          2.7803e-01,  4.9765e-01,  9.9182e-01,  7.6151e-01, -4.2630e-01,
         -1.5596e+00,  6.4133e-01, -1.2850e-01, -1.7808e-01, -7.2685e-02,
         -1.3189e-01, -4.5664e-01, -6.0735e-02,  1.3563e-01,  8.8216e-01,
          1.4637e+00,  1.5257e+00, -1.4174e-01,  4.6739e-01,  2.6380e-01,
         -1.2061e+00,  1.7454e+00, -1.1629e+00, -8.5190e-01,  7.8971e-01,
          4.3815e-01,  1.1670e+00,  1.4467e+00,  9.8973e-01, -1.3201e-01,
         -1.8164e-01, -1.9481e+00, -1.2132e+00,  2.3281e-01, -1.2659e+00,
         -7.7241e-02, -8.6417e-01,  2.3744e-01, -3.7228e-01, -6.6503e-01,
         -6.3943e-01,  1.2179e+00,  9.7103e-01,  5.4843e-01, 

# Embedding Encoder

In [26]:
from transformers import AutoModel

# Reload the tokenizer and load the checkpoint through AutoModel, i.e. as the
# base model used as an encoder rather than with the classification head.
encoder_checkpoint = f"LittleFish-Coder/{model_name}-fake-news-tfg"
tokenizer = AutoTokenizer.from_pretrained(encoder_checkpoint)
model = AutoModel.from_pretrained(encoder_checkpoint)

Some weights of RobertaModel were not initialized from the model checkpoint at LittleFish-Coder/roberta-base-fake-news-tfg and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Tokenize the sample for the encoder: truncation on, PyTorch tensors out.
inputs = tokenizer(text, truncation=True, return_tensors="pt")

In [28]:
# Inference only: disable autograd so no graph is built and the pooled
# embedding does not track gradients, as in the earlier inference cells.
with torch.no_grad():
    # Mean over dim=1 (the token axis) of the encoder's last hidden state.
    embeddings = model(**inputs).last_hidden_state.mean(dim=1)

In [29]:
# Print the encoder embedding's shape followed by the tensor itself.
print(
    f"Embeddings shape: {embeddings.shape}",
    f"Embeddings: {embeddings}",
    sep="\n",
)

Embeddings shape: torch.Size([1, 768])
Embeddings: tensor([[-2.9920e-02, -1.7140e-01,  2.7610e-01,  1.0799e-01, -1.0619e+00,
          1.6642e+00, -3.8511e-01, -6.5864e-01, -2.8680e-01, -4.4714e-01,
         -4.9551e-01, -3.3066e-01, -1.0937e-01,  5.8195e-01,  9.2182e-01,
          3.2029e-01, -4.4218e-02,  4.2980e-01, -8.0234e-01, -8.4733e-01,
          2.7803e-01,  4.9765e-01,  9.9182e-01,  7.6151e-01, -4.2630e-01,
         -1.5596e+00,  6.4133e-01, -1.2850e-01, -1.7808e-01, -7.2685e-02,
         -1.3189e-01, -4.5664e-01, -6.0735e-02,  1.3563e-01,  8.8216e-01,
          1.4637e+00,  1.5257e+00, -1.4174e-01,  4.6739e-01,  2.6380e-01,
         -1.2061e+00,  1.7454e+00, -1.1629e+00, -8.5190e-01,  7.8971e-01,
          4.3815e-01,  1.1670e+00,  1.4467e+00,  9.8973e-01, -1.3201e-01,
         -1.8164e-01, -1.9481e+00, -1.2132e+00,  2.3281e-01, -1.2659e+00,
         -7.7241e-02, -8.6417e-01,  2.3744e-01, -3.7228e-01, -6.6503e-01,
         -6.3943e-01,  1.2179e+00,  9.7103e-01,  5.4843e-01, 