In [1]:
import torch

# Prefer the GPU when PyTorch reports one is available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using {} device".format(device))

Using cuda device


# Load Dataset

In [2]:
from datasets import load_dataset

# Load the GonzaloA/fake_news dataset, caching it under ./dataset and reusing
# that cache when it already exists.
dataset = load_dataset(
    "GonzaloA/fake_news",
    cache_dir="dataset",
    download_mode="reuse_cache_if_exists",
)

  from .autonotebook import tqdm as notebook_tqdm
Repo card metadata block was not found. Setting CardData to empty.
Generating train split: 100%|██████████| 24353/24353 [00:00<00:00, 39114.98 examples/s]
Generating validation split: 100%|██████████| 8117/8117 [00:00<00:00, 33091.15 examples/s]
Generating test split: 100%|██████████| 8117/8117 [00:00<00:00, 33993.05 examples/s]


In [3]:
# Print the split summary, then bind each split to its own name.
print("Dataset:", dataset)
train_dataset, val_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

Dataset: DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'label'],
        num_rows: 24353
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'label'],
        num_rows: 8117
    })
    test: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'label'],
        num_rows: 8117
    })
})


In [4]:
# Peek at the first training record: its keys, title, body text and label.
first_train = train_dataset[0]
print("First training sample")
print("Keys:", first_train.keys())
print("Title:", first_train["title"])
print("Text:", first_train["text"])
print("Label:", first_train["label"])

First training sample
Keys: dict_keys(['Unnamed: 0', 'title', 'text', 'label'])
Title:  ‘Maury’ Show Official Facebook Posts F*CKED UP Caption On Guest That Looks Like Ted Cruz (IMAGE)
Text: Maury is perhaps one of the trashiest shows on television today. It s right in line with the likes of the gutter trash that is Jerry Springer, and the fact that those shows are still on the air with the shit they air really is a sad testament to what Americans find to be entertaining. However, Maury really crossed the line with a Facebook post regarding one of their guest s appearance with a vile, disgusting caption on Tuesday evening.There was a young woman on there doing one of their episodes regarding the paternity of her child. However, on the page, the show posted an image of the woman, who happens to bear a striking resemblance to Senator and presidential candidate Ted Cruz. The caption from the Maury Show page read: The Lie Detector Test determined .that was a LIE!  Ted Cruz is just NOT that

In [5]:
# Use the body of the first test article as the running example for the
# inference and embedding cells that follow.
first_test = test_dataset[0]
text = first_test["text"]
print("Text:", text)

Text: JOE DIGENOVA has been around D.C for decades and has seen it all. He probably didn t see his one coming. The incoming president  was set-up to be taken down. A soft coup is in the works and DiGenova has this to say about it:"It's very clear that they conspired to frame the incoming President of the United States."  Joe diGenova on allegations of anti-Trump bias at FBI and TheJusticeDept #Tucker https://t.co/qUNjAenzJc pic.twitter.com/VDlhb45Ghi  G. Ashley Hawkins (@g_ashleyhawkins) December 16, 2017DiGenova on Tucker Carlson tonight: Inside the FBI and Department of Justice under Obama was a brazen plot to do two things. To exonerate Hillary Clinton because of an animous for Donald Trump, and then if she lost to frame the incoming president for either a criminal act or impeachment. This is one of the most disgusting performances by the senior officials at the FBI and the Department of Justice that everyone of these agents should be fired and the people who are still in the Justic

# Direct Text Classification (Pipeline)

In [6]:
# Base architecture name, interpolated into the checkpoint id used below.
# Candidates: ['bert-base-uncased', 'distilbert-base-uncased', 'roberta-base']
model_name = "distilbert-base-uncased"

In [9]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Build a text-classification pipeline around the fine-tuned checkpoint from the
# Hugging Face model hub, with truncation enabled, running on the selected device.
pipe = pipeline(
    "text-classification",
    model=f"LittleFish-Coder/{model_name}-fake-news-tfg",
    device=device,
    truncation=True,
)

In [10]:
# Classify the sample article; the cell output is a list holding one dict
# with the predicted 'label' and its 'score'.
pipe(text)

[{'label': 'fake', 'score': 0.9999011754989624}]

# Tokenizer and Pretrained Model

In [11]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Both the tokenizer and the sequence-classification model come from the same
# fine-tuned checkpoint, so resolve its id once.
checkpoint = f"LittleFish-Coder/{model_name}-fake-news-tfg"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

## Predict via tokenizer & model

### Tokenize the text and get the class

In [12]:
# Encode the sample article as PyTorch tensors, with truncation enabled.
inputs = tokenizer(
    text,
    truncation=True,
    return_tensors="pt",
)

In [14]:
# Show the encoded model inputs produced by the tokenizer.
print("Input:", inputs)

Input: {'input_ids': tensor([[  101,  3533, 10667, 16515,  3567,  2038,  2042,  2105,  1040,  1012,
          1039,  2005,  5109,  1998,  2038,  2464,  2009,  2035,  1012,  2002,
          2763,  2134,  1056,  2156,  2010,  2028,  2746,  1012,  1996, 14932,
          2343,  2001,  2275,  1011,  2039,  2000,  2022,  2579,  2091,  1012,
          1037,  3730,  8648,  2003,  1999,  1996,  2573,  1998, 10667, 16515,
          3567,  2038,  2023,  2000,  2360,  2055,  2009,  1024,  1000,  2009,
          1005,  1055,  2200,  3154,  2008,  2027,  9530, 13102, 27559,  2000,
          4853,  1996, 14932,  2343,  1997,  1996,  2142,  2163,  1012,  1000,
          3533, 10667, 16515,  3567,  2006,  9989,  1997,  3424,  1011,  8398,
         13827,  2012,  8495,  1998,  1996, 29427,  6610,  3207, 13876,  1001,
          9802, 16770,  1024,  1013,  1013,  1056,  1012,  2522,  1013, 24209,
          2078,  3900,  2368,  2480,  3501,  2278, 27263,  1012, 10474,  1012,
          4012,  1013,  1058, 1

In [18]:
# Run the classifier on the tokenized sample; no_grad disables gradient
# tracking since this is inference only.
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Take the argmax along the class axis rather than over the flattened tensor.
# For the single-sample batch used here the result is unchanged; with a larger
# batch, .item() now raises instead of silently yielding a flattened index
# that is not a class id.
predicted_class_id = logits.argmax(dim=-1).item()

# Map the numeric class id to its human-readable label from the model config.
prediction = model.config.id2label[predicted_class_id]
print(f"Output: {outputs}")
print(f"Logits: {logits}")
print(f"Prediction: {prediction}")

Output: SequenceClassifierOutput(loss=None, logits=tensor([[ 4.6103, -4.6113]]), hidden_states=None, attentions=None)
Logits: tensor([[ 4.6103, -4.6113]])
Prediction: fake


### Get the embedding of the text

In [19]:
# Request the per-layer hidden states for this call only. Passing the flag as a
# forward argument avoids mutating model.config, which would otherwise change
# what every later (or re-run) forward pass of this model returns.
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model(**inputs, output_hidden_states=True)

# Per-layer hidden states returned by the model
hidden_states = outputs.hidden_states

# The last layer's hidden state is the final entry
last_hidden_state = hidden_states[-1]

# Mean-pool over the token axis to get one vector per input. The attention mask
# weights the average so that padding tokens (present once inputs are batched
# with padding) do not dilute the embedding; for the single unpadded sample
# used here this matches a plain mean over dim=1 up to floating-point rounding.
token_mask = inputs["attention_mask"].unsqueeze(-1).to(last_hidden_state.dtype)
embeddings = (last_hidden_state * token_mask).sum(dim=1) / token_mask.sum(dim=1).clamp(min=1)

In [23]:
# Inspect the collection of per-layer hidden states: container type, number of
# entries, and the raw tensors.
print("hidden states type:", type(hidden_states))
print("hidden states Length:", len(hidden_states))
print("hidden states:", hidden_states)

hidden states type: <class 'tuple'>
hidden states Length: 7
hidden states: (tensor([[[ 0.3543, -0.1496, -0.2336,  ...,  0.1597,  0.0914,  0.1433],
         [-1.0742,  0.2854, -1.6506,  ...,  0.4961,  0.9174, -0.3135],
         [-0.7164,  0.1168,  0.3597,  ...,  0.6972, -0.0320,  0.7682],
         ...,
         [ 0.4378, -0.0189,  0.0179,  ...,  0.1552,  0.2591, -0.2083],
         [ 0.1024,  0.0415,  0.4841,  ...,  0.2031,  0.3825, -0.3255],
         [-0.2721, -0.4431, -0.0198,  ..., -0.1932,  0.5117, -1.5857]]]), tensor([[[-0.2305,  0.1327, -0.2107,  ...,  0.2218,  0.0292,  0.0870],
         [-0.6776,  0.1769, -1.3328,  ...,  0.0223,  0.7583, -0.4918],
         [-0.5049, -0.0610,  0.2447,  ...,  0.6225,  1.6254,  1.4309],
         ...,
         [ 0.2436,  0.1279, -0.1877,  ..., -0.0023,  0.8627, -0.0630],
         [-0.1404,  0.0988,  0.5467,  ...,  0.0445,  0.2219,  0.1201],
         [-0.5331, -0.2343, -0.1652,  ..., -0.1417, -0.0838, -0.5035]]]), tensor([[[-0.3181, -0.1545, -0.4090,  

In [24]:
# Inspect the final layer's hidden state: type, shape, and values.
print("last hidden state type: {}".format(type(last_hidden_state)))
print("last hidden state shape: {}".format(last_hidden_state.shape))
print("last hidden state: {}".format(last_hidden_state))

last hidden state type: <class 'torch.Tensor'>
last hidden state shape: torch.Size([1, 512, 768])
last hidden state: tensor([[[ 0.1969, -0.4562, -0.7499,  ...,  0.6036,  0.2989, -0.2620],
         [ 0.0748, -0.1579, -0.8113,  ..., -0.1256,  0.4523, -0.4735],
         [-0.3517, -0.5193,  0.0441,  ..., -0.0964,  0.1647,  0.6966],
         ...,
         [ 0.2468,  0.1859, -0.3148,  ...,  0.2002,  0.5649, -0.0689],
         [ 0.0435, -0.3555, -0.5899,  ...,  0.5562,  0.5851,  0.1449],
         [ 0.4628, -0.2911, -0.6976,  ...,  0.6346,  0.4035, -0.3816]]])


In [25]:
# Show the pooled embedding's shape followed by its values.
print("Embeddings shape:", embeddings.shape)
print("Embeddings:", embeddings)

Embeddings shape: torch.Size([1, 768])
Embeddings: tensor([[ 1.2506e-01, -2.1919e-01, -2.2023e-01,  3.3304e-01,  7.4512e-01,
          2.3137e-01,  3.9668e-01,  5.3508e-01, -1.9261e-01, -1.1161e-01,
          1.3692e-01, -1.2280e-02, -3.0342e-01, -6.8824e-02, -5.1451e-01,
         -3.3084e-02,  1.6914e-01,  3.7226e-01, -4.6562e-01,  5.5116e-01,
          4.2993e-01,  2.3005e-01, -3.7719e-02,  1.1942e-01,  2.2736e-01,
          3.0696e-01, -2.1824e-01, -7.5034e-01,  9.4149e-02,  3.3059e-01,
         -8.5560e-02, -5.0753e-01,  5.1240e-01, -6.2063e-01, -4.4819e-01,
         -4.0813e-02, -1.7134e-01,  3.6643e-01,  5.5913e-01,  3.3241e-01,
         -2.9266e-01,  2.7584e-01, -1.5998e-01,  3.1492e-01, -5.1416e-01,
         -4.2035e-01,  4.3773e-02, -2.4468e-01, -5.0865e-01, -2.3225e-01,
         -2.7917e-01,  4.5818e-01, -4.2036e-01,  3.4395e-01,  2.4133e-01,
          4.9281e-01,  6.1817e-02, -8.0673e-01, -6.3471e-01, -5.5167e-01,
          3.1047e-01,  3.8040e-01, -1.6843e-01, -4.4576e-01, 

# Embedding Encoder

In [30]:
# Base architecture name, interpolated into the checkpoint id of the encoder
# loaded in the next cell.
model_name = "distilbert-base-uncased"

In [31]:
from transformers import AutoModel, AutoTokenizer

# Load the tokenizer and the model via AutoModel (rather than
# AutoModelForSequenceClassification) from the same fine-tuned checkpoint.
checkpoint = f"LittleFish-Coder/{model_name}-fake-news-tfg"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

In [32]:
# Re-encode the sample article with the freshly loaded tokenizer, as PyTorch
# tensors with truncation enabled.
inputs = tokenizer(
    text,
    truncation=True,
    return_tensors="pt",
)

In [33]:
# Encode the sample with the bare encoder. no_grad matches the earlier
# inference cells and keeps the embedding free of an autograd graph.
with torch.no_grad():
    token_states = model(**inputs).last_hidden_state

# Mean-pool over the token axis, weighted by the attention mask so that padding
# tokens (present once inputs are batched with padding) are excluded; for the
# single unpadded sample used here this matches a plain mean over dim=1 up to
# floating-point rounding.
token_mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
embeddings = (token_states * token_mask).sum(dim=1) / token_mask.sum(dim=1).clamp(min=1)

In [34]:
# Show the encoder embedding's shape followed by its values.
print("Embeddings shape: {}".format(embeddings.shape))
print("Embeddings: {}".format(embeddings))

Embeddings shape: torch.Size([1, 768])
Embeddings: tensor([[ 1.2506e-01, -2.1919e-01, -2.2023e-01,  3.3304e-01,  7.4512e-01,
          2.3137e-01,  3.9668e-01,  5.3508e-01, -1.9261e-01, -1.1161e-01,
          1.3692e-01, -1.2280e-02, -3.0342e-01, -6.8824e-02, -5.1451e-01,
         -3.3084e-02,  1.6914e-01,  3.7226e-01, -4.6562e-01,  5.5116e-01,
          4.2993e-01,  2.3005e-01, -3.7719e-02,  1.1942e-01,  2.2736e-01,
          3.0696e-01, -2.1824e-01, -7.5034e-01,  9.4149e-02,  3.3059e-01,
         -8.5560e-02, -5.0753e-01,  5.1240e-01, -6.2063e-01, -4.4819e-01,
         -4.0813e-02, -1.7134e-01,  3.6643e-01,  5.5913e-01,  3.3241e-01,
         -2.9266e-01,  2.7584e-01, -1.5998e-01,  3.1492e-01, -5.1416e-01,
         -4.2035e-01,  4.3773e-02, -2.4468e-01, -5.0865e-01, -2.3225e-01,
         -2.7917e-01,  4.5818e-01, -4.2036e-01,  3.4395e-01,  2.4133e-01,
          4.9281e-01,  6.1817e-02, -8.0673e-01, -6.3471e-01, -5.5167e-01,
          3.1047e-01,  3.8040e-01, -1.6843e-01, -4.4576e-01, 