# GonzaloA/fake_news

https://huggingface.co/datasets/GonzaloA/fake_news

Columns:
- `id`: int
- `title`: str
- `text`: str
- `label`: binary class label (`0` or `1`)
    - `0`: fake
    - `1`: real

In [3]:
# import package
from datasets import DatasetDict, load_dataset

In [4]:
# download the dataset from the Hugging Face Hub and load it
dataset = load_dataset(path="GonzaloA/fake_news")

Repo card metadata block was not found. Setting CardData to empty.


# Add BERT Embeddings

In [8]:
import torch

# Run on the GPU when torch can see one; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [9]:
from transformers import AutoTokenizer, AutoModel

# Checkpoint id shared by the tokenizer and the encoder, so both always refer to the same model
MODEL_NAME = "google-bert/bert-base-uncased"

# Tokenizer matching the checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, clean_up_tokenization_spaces=True)

# Encoder weights, moved to the device selected earlier in the notebook
model = AutoModel.from_pretrained(MODEL_NAME).to(device)

In [12]:
def generate_bert_embeddings(text):
    """Return the BERT [CLS] embedding of `text` as a flat NumPy array.

    The text is tokenized (truncated to at most 512 tokens), run through the
    module-level `model` without gradient tracking, and the hidden state at
    sequence position 0 is moved to the CPU and flattened.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512,
    )
    encoded = encoded.to(device)

    # inference only, so skip gradient tracking
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # position 0 on the sequence axis is the [CLS] token
    cls_vector = hidden_states[:, 0, :]

    # bring to host memory and collapse to 1-D, e.g. (1, 768) -> (768,)
    return cls_vector.cpu().numpy().flatten()

In [13]:
# Attach a BERT embedding to every example of a batch
def add_embeddings(batch):
    """Add an `embeddings` entry holding one embedding per item of `batch['text']`.

    Each text is embedded individually with `generate_bert_embeddings`; the
    batch is modified in place and returned.
    """
    vectors = []
    for text in batch['text']:
        vectors.append(generate_bert_embeddings(text))
    batch['embeddings'] = vectors
    return batch

# Compute the embeddings for all splits, one batch of examples at a time
dataset = dataset.map(function=add_embeddings, batched=True)

Map: 100%|██████████| 24353/24353 [03:59<00:00, 101.71 examples/s]
Map: 100%|██████████| 8117/8117 [01:19<00:00, 101.65 examples/s]
Map: 100%|██████████| 8117/8117 [01:20<00:00, 101.44 examples/s]


# Reformat the dataset

In the original dataset, `label` is `0` for fake news and `1` for real news.

In our research, we swap the encoding so that `0` denotes real news and `1` denotes fake news, which is more convenient for our downstream use.

In [23]:
# swap the encoding of the `label` column
# after the swap -> 0: real, 1: fake
def format_label(batch):
    """Invert the labels of a batch: 0 becomes 1, every other value becomes 0.

    The batch is modified in place and returned.

    Note: for 0/1 labels, applying this twice restores the original encoding,
    so the cell that maps it over the dataset should be run exactly once.
    """
    flipped = []
    for label in batch['label']:
        flipped.append(int(label == 0))
    batch['label'] = flipped
    return batch

# Apply the label swap to all splits in batches (run once: a second run swaps the labels back)
dataset = dataset.map(function=format_label, batched=True)

Map: 100%|██████████| 24353/24353 [00:00<00:00, 134864.96 examples/s]
Map: 100%|██████████| 8117/8117 [00:00<00:00, 115736.11 examples/s]
Map: 100%|██████████| 8117/8117 [00:00<00:00, 112335.44 examples/s]


In [None]:
# remove the `Unnamed: 0` column from every split that still has it.
# `remove_columns` raises a ValueError when the requested column is absent, so the
# membership check keeps this cell safe to re-run after the column is already gone.
COLUMN_TO_DROP = 'Unnamed: 0'
dataset = DatasetDict({
    split_name: (
        split.remove_columns([COLUMN_TO_DROP])
        if COLUMN_TO_DROP in split.column_names
        else split
    )
    for split_name, split in dataset.items()
})

# Upload to HuggingFace Hub

In [29]:
# upload all splits to the Hugging Face Hub repository
dataset.push_to_hub(repo_id='LittleFish-Coder/Fake_News_TFG')   # type:ignore

Creating parquet from Arrow format: 100%|██████████| 25/25 [00:01<00:00, 21.29ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.03s/it]
Creating parquet from Arrow format: 100%|██████████| 9/9 [00:00<00:00, 22.73ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.30s/it]
Creating parquet from Arrow format: 100%|██████████| 9/9 [00:00<00:00, 22.67ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.04it/s]
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/LittleFish-Coder/Fake_News_TFG/commit/1cb744655f0b440f6c71a16f49fadb6ca27d0f4d', commit_message='Upload dataset', commit_description='', oid='1cb744655f0b440f6c71a16f49fadb6ca27d0f4d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LittleFish-Coder/Fake_News_TFG', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LittleFish-Coder/Fake_News_TFG'), pr_revision=None, pr_num=None)