# Ahren09/MMSoc_PolitiFact

https://huggingface.co/datasets/Ahren09/MMSoc_PolitiFact

Splits:
- `train`: 381
- `test`: 102

Columns:
- `image`
- `text`: str
- `label`: int
    - `0`: real
    - `1`: fake
- `split`

In [2]:
# import package
from datasets import DatasetDict, load_dataset

In [3]:
# Download (or reuse the local cache of) the source dataset from the Hugging Face Hub
SOURCE_REPO_ID = "Ahren09/MMSoc_PolitiFact"
dataset = load_dataset(SOURCE_REPO_ID)

Generating train split: 100%|██████████| 381/381 [00:00<00:00, 2176.06 examples/s]
Generating test split: 100%|██████████| 102/102 [00:00<00:00, 4930.61 examples/s]


In [4]:
# Show the DatasetDict summary: split names, feature columns, and row counts
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['image', 'text', 'label', 'split'],
        num_rows: 381
    })
    test: Dataset({
        features: ['image', 'text', 'label', 'split'],
        num_rows: 102
    })
})


In [5]:
# Number of examples in each split
train_size = len(dataset["train"])  # type: ignore
test_size = len(dataset["test"])    # type: ignore

print(f'train size: {train_size}')
print(f'test size: {test_size}')

train size: 381
test size: 102


# Add BERT Embeddings

In [6]:
import torch

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [7]:
from transformers import AutoTokenizer, AutoModel

# Single source of truth for the checkpoint so the tokenizer and the model always match.
BERT_CHECKPOINT = "google-bert/bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(
    BERT_CHECKPOINT,
    clean_up_tokenization_spaces=True,
)

# Load the encoder weights and move them to the selected device.
model = AutoModel.from_pretrained(BERT_CHECKPOINT)
model = model.to(device)

In [8]:
def generate_bert_embeddings(text):
    """Return the BERT [CLS] embedding of ``text`` as a flat NumPy array.

    The text is tokenized with the notebook-level ``tokenizer`` (truncated to
    at most 512 tokens) and passed through the notebook-level ``model`` on
    ``device`` with gradient tracking disabled.

    Args:
        text: The input text to embed.

    Returns:
        A 1-D NumPy array holding the last-layer hidden state at sequence
        position 0, i.e. shape (768,) for a single input string.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512,
    ).to(device)

    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Position 0 of the sequence holds the [CLS] token.
    cls_vector = hidden_states[:, 0, :]   # (1, 768)

    # Move to host memory and drop the leading batch axis.
    return cls_vector.cpu().numpy().flatten()   # (768,)

In [9]:
# Attach a BERT embedding to every example of a batch
def add_embeddings(batch):
    """Add an ``'embeddings'`` column to a batch of examples.

    Args:
        batch: A mapping of column name to a list of values; its ``'text'``
            entry is read one item at a time.

    Returns:
        The same ``batch`` object, with ``batch['embeddings']`` set to a list
        containing one ``generate_bert_embeddings`` result per text.
    """
    vectors = []
    for text in batch['text']:
        vectors.append(generate_bert_embeddings(text))

    batch['embeddings'] = vectors
    return batch

# Run add_embeddings over every split; batched=True hands it a batch of rows per call
dataset = dataset.map(function=add_embeddings, batched=True)

Map: 100%|██████████| 381/381 [00:05<00:00, 64.98 examples/s]
Map: 100%|██████████| 102/102 [00:01<00:00, 74.04 examples/s]


# Reformat the dataset

In [10]:
# Drop the columns that are not needed in the published dataset: ['image', 'split'].
# Only columns that are still present are removed, so re-running this cell is a
# no-op instead of raising on columns that were already dropped.
COLUMNS_TO_DROP = ('image', 'split')


def _drop_present_columns(split_data):
    """Return ``split_data`` without whichever of COLUMNS_TO_DROP it still has."""
    present = [col for col in COLUMNS_TO_DROP if col in split_data.column_names]
    return split_data.remove_columns(present) if present else split_data


dataset = DatasetDict({
    split_name: _drop_present_columns(split_data)
    for split_name, split_data in dataset.items()   # type: ignore
})

# Upload to HuggingFace Hub

In [12]:
# Publish every split of the processed dataset to the Hugging Face Hub
TARGET_REPO_ID = 'LittleFish-Coder/Fake_News_PolitiFact'
dataset.push_to_hub(TARGET_REPO_ID)   # type:ignore

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 20.60ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.28s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 98.70ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.74s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/LittleFish-Coder/Fake_News_PolitiFact/commit/c0109ec67b4d6f078a0d601753384c5b6671cd4f', commit_message='Upload dataset', commit_description='', oid='c0109ec67b4d6f078a0d601753384c5b6671cd4f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LittleFish-Coder/Fake_News_PolitiFact', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LittleFish-Coder/Fake_News_PolitiFact'), pr_revision=None, pr_num=None)