# GonzaloA/fake_news

https://huggingface.co/datasets/GonzaloA/fake_news

Columns:
- `Unnamed: 0`: int (row id; removed before upload)
- `title`: str
- `text`: str
- `label`: int (binary)
    - `0`: fake
    - `1`: real

In [None]:
# import package
from datasets import DatasetDict, load_dataset

In [2]:
# Download (if needed) and load the source dataset from the Hugging Face Hub
dataset = load_dataset(path="GonzaloA/fake_news")

Repo card metadata block was not found. Setting CardData to empty.
Generating train split: 100%|██████████| 24353/24353 [00:00<00:00, 77580.21 examples/s]
Generating validation split: 100%|██████████| 8117/8117 [00:00<00:00, 84721.50 examples/s]
Generating test split: 100%|██████████| 8117/8117 [00:00<00:00, 91056.64 examples/s]


# Add BERT Embeddings

In [3]:
import torch

# Prefer the GPU when CUDA is usable; otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [4]:
from transformers import AutoTokenizer, AutoModel

# Tokenizer and encoder come from the same checkpoint; the encoder is moved to `device`.
BERT_CHECKPOINT = "google-bert/bert-base-uncased"

bert_tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT, clean_up_tokenization_spaces=True)
bert_model = AutoModel.from_pretrained(BERT_CHECKPOINT)
bert_model = bert_model.to(device)

In [5]:
def generate_bert_embeddings(text):
    """Return the first-token ([CLS]) BERT embedding of one text or of several texts.

    Args:
        text: A single string, or a sequence (list/tuple) of strings that are
            encoded together. Inputs longer than 512 tokens are truncated.

    Returns:
        numpy.ndarray: For a single string, a flat 1-D vector (768 values for
        this checkpoint, per the original notebook comments). For a sequence
        of n strings, a 2-D array with one row per text, in input order.
    """
    # Remember whether one string was passed so that case keeps returning a
    # flat vector, exactly as before.
    is_single = isinstance(text, str)
    texts = [text] if is_single else list(text)

    # Tokenize the input; padding only has an effect when several texts are encoded together
    inputs = bert_tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = bert_model(**inputs)

    # Hidden state at position 0 of every sequence -> (n, hidden_size)
    embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

    # Only flatten for a single text. Flattening unconditionally used to fuse
    # the rows of a multi-text call into one long vector.
    return embeddings.flatten() if is_single else embeddings

In [6]:
# Batched `map` callback that attaches a BERT embedding to every row
def add_bert_embeddings(batch):
    """Add a 'bert_embeddings' entry holding one embedding per item of batch['text'].

    The texts are encoded one at a time, in order; the batch is modified in
    place and returned.
    """
    per_text_vectors = []
    for article in batch['text']:
        per_text_vectors.append(generate_bert_embeddings(article))
    batch['bert_embeddings'] = per_text_vectors
    return batch

# Run the BERT embedding step over all splits, one batch of rows per call
dataset = dataset.map(function=add_bert_embeddings, batched=True)

Map: 100%|██████████| 24353/24353 [03:57<00:00, 102.41 examples/s]
Map: 100%|██████████| 8117/8117 [01:19<00:00, 102.60 examples/s]
Map: 100%|██████████| 8117/8117 [01:19<00:00, 101.90 examples/s]


# Add RoBERTa Embeddings

In [7]:
import torch

# Prefer the GPU when CUDA is usable; otherwise run on the CPU
# (same rule as in the BERT section).
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
from transformers import AutoTokenizer, AutoModel

# Tokenizer and encoder come from the same checkpoint; the encoder is moved to `device`.
ROBERTA_CHECKPOINT = "roberta-base"

roberta_tokenizer = AutoTokenizer.from_pretrained(ROBERTA_CHECKPOINT, clean_up_tokenization_spaces=True)
roberta_model = AutoModel.from_pretrained(ROBERTA_CHECKPOINT)
roberta_model = roberta_model.to(device)

In [9]:
def generate_roberta_embeddings(text):
    """Return the first-token RoBERTa embedding of one text or of several texts.

    The first position is the one the original notebook refers to as the
    "cls token" embedding.

    Args:
        text: A single string, or a sequence (list/tuple) of strings that are
            encoded together. Inputs longer than 512 tokens are truncated.

    Returns:
        numpy.ndarray: For a single string, a flat 1-D vector (768 values for
        this checkpoint, per the original notebook comments). For a sequence
        of n strings, a 2-D array with one row per text, in input order.
    """
    # Remember whether one string was passed so that case keeps returning a
    # flat vector, exactly as before.
    is_single = isinstance(text, str)
    texts = [text] if is_single else list(text)

    # Tokenize the input; padding only has an effect when several texts are encoded together
    inputs = roberta_tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = roberta_model(**inputs)

    # Hidden state at position 0 of every sequence -> (n, hidden_size)
    embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

    # Only flatten for a single text. Flattening unconditionally used to fuse
    # the rows of a multi-text call into one long vector.
    return embeddings.flatten() if is_single else embeddings

In [10]:
# Batched `map` callback that attaches a RoBERTa embedding to every row
def add_roberta_embeddings(batch):
    """Add a 'roberta_embeddings' entry holding one embedding per item of batch['text'].

    The texts are encoded one at a time, in order; the batch is modified in
    place and returned.
    """
    per_text_vectors = []
    for article in batch['text']:
        per_text_vectors.append(generate_roberta_embeddings(article))
    batch['roberta_embeddings'] = per_text_vectors
    return batch

# Run the RoBERTa embedding step over all splits, one batch of rows per call
dataset = dataset.map(function=add_roberta_embeddings, batched=True)

Map: 100%|██████████| 24353/24353 [03:55<00:00, 103.44 examples/s]
Map: 100%|██████████| 8117/8117 [01:18<00:00, 102.90 examples/s]
Map: 100%|██████████| 8117/8117 [01:18<00:00, 103.24 examples/s]


# Reformat the dataset

In the original dataset, `label` is `0` for fake news and `1` for real news.

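For our research we invert this encoding, so that `0` means real news and `1` means fake news, which is more convenient for our use. Run the conversion cell below only once: running it again on the already-converted dataset flips the labels back.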

In [11]:
# Invert the `label` encoding
# after conversion -> 0: real, 1: fake
def format_label(batch):
    """Swap the label encoding of a batch: 0 becomes 1, any other value becomes 0.

    The batch is modified in place and also returned, as `map` expects.
    """
    swapped = []
    for original in batch['label']:
        if original == 0:
            swapped.append(1)
        else:
            swapped.append(0)
    batch['label'] = swapped
    return batch

# Apply the label swap to all splits. Re-running this cell on the
# already-converted `dataset` swaps the labels back.
dataset = dataset.map(function=format_label, batched=True)

Map: 100%|██████████| 24353/24353 [00:00<00:00, 97807.12 examples/s] 
Map: 100%|██████████| 8117/8117 [00:00<00:00, 87633.02 examples/s]
Map: 100%|██████████| 8117/8117 [00:00<00:00, 97575.50 examples/s]


In [12]:
# Drop the `Unnamed: 0` column (the name is case-sensitive)
dataset = dataset.remove_columns(column_names=['Unnamed: 0'])

# Upload to HuggingFace Hub

In [13]:
# Upload the dataset to the Hugging Face Hub under the target repo id
dataset.push_to_hub(repo_id='LittleFish-Coder/Fake_News_TFG')   # type:ignore

Creating parquet from Arrow format: 100%|██████████| 25/25 [00:01<00:00, 12.83ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:16<00:00, 16.82s/it]
Creating parquet from Arrow format: 100%|██████████| 9/9 [00:00<00:00, 13.66ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.75s/it]
Creating parquet from Arrow format: 100%|██████████| 9/9 [00:00<00:00, 13.97ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:05<00:00,  5.30s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/LittleFish-Coder/Fake_News_TFG/commit/2403efdc5db94f87eb92904238befb7fb5b5291e', commit_message='Upload dataset', commit_description='', oid='2403efdc5db94f87eb92904238befb7fb5b5291e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LittleFish-Coder/Fake_News_TFG', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LittleFish-Coder/Fake_News_TFG'), pr_revision=None, pr_num=None)