# Ahren09/MMSoc_GossipCop

https://huggingface.co/datasets/Ahren09/MMSoc_GossipCop

Splits:
- `train`: 9988 rows
- `test`: 2672 rows

Columns:
- `image`
- `text`: str
- `label`: int
    - `0`: real
    - `1`: fake
- `split`

In [3]:
# import package
from datasets import DatasetDict, load_dataset

In [2]:
# Fetch the GossipCop dataset from the Hugging Face Hub
dataset = load_dataset(path="Ahren09/MMSoc_GossipCop")

Generating train split: 100%|██████████| 9988/9988 [00:05<00:00, 1863.72 examples/s]
Generating test split: 100%|██████████| 2672/2672 [00:01<00:00, 1917.61 examples/s]


In [5]:
# Show the loaded object: its splits, their features and row counts
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['image', 'text', 'label', 'split'],
        num_rows: 9988
    })
    test: Dataset({
        features: ['image', 'text', 'label', 'split'],
        num_rows: 2672
    })
})


In [4]:
# Report how many examples each split holds
for split_name in ("train", "test"):
    print(f'{split_name} size: {len(dataset[split_name])}')   # type: ignore

train size: 9988
test size: 2672


# Add BERT Embeddings

In [6]:
import torch

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [7]:
from transformers import AutoTokenizer, AutoModel

# Single checkpoint id shared by the tokenizer and the model, so the two
# cannot drift apart when the checkpoint is changed.
BERT_CHECKPOINT = "google-bert/bert-base-uncased"

# Tokenizer and encoder are loaded from the same checkpoint; the encoder is
# moved to the device selected earlier in the notebook.
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT, clean_up_tokenization_spaces=True)
model = AutoModel.from_pretrained(BERT_CHECKPOINT).to(device)

In [8]:
def generate_bert_embeddings(text):
    """Return the BERT [CLS] embedding of ``text`` as a flat NumPy array.

    The text is tokenized (truncated to at most 512 tokens), passed through
    the notebook-level ``model`` without gradient tracking, and the last-layer
    hidden state at position 0 is moved to the CPU and flattened.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512,
    )
    encoded = encoded.to(device)

    # Inference only, so skip building the autograd graph
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Position 0 of each sequence is the [CLS] token
    cls_vector = hidden_states[:, 0, :]

    # Collapse to one dimension before returning
    return cls_vector.cpu().numpy().flatten()

In [9]:
# Generate BERT embeddings
def add_embeddings(batch, inference_batch_size=32):
    """Add an ``embeddings`` column holding the [CLS] vector of each text.

    ``Dataset.map(..., batched=True)`` hands this function many rows at once.
    Instead of one forward pass per row, the texts are encoded in mini-batches
    of ``inference_batch_size`` so the model processes several rows per call.
    Shorter texts in a mini-batch are padded; the tokenizer's attention mask
    is forwarded to the model together with the input ids.

    Args:
        batch: mapping of column name -> list of values; must contain 'text'.
        inference_batch_size: number of texts per forward pass. Lower it if
            the device runs out of memory.

    Returns:
        The same ``batch`` with an added 'embeddings' entry: one 1-D NumPy
        array per input text, in the original order.
    """
    texts = batch['text']
    embeddings = []

    for start in range(0, len(texts), inference_batch_size):
        chunk = texts[start:start + inference_batch_size]
        inputs = tokenizer(
            chunk,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)

        # Inference only, so no gradients are needed
        with torch.no_grad():
            outputs = model(**inputs)

        # Position 0 is the [CLS] token; iterating the 2-D array yields one
        # 1-D vector per text in the chunk.
        cls_vectors = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.extend(cls_vectors)

    batch['embeddings'] = embeddings
    return batch

# Apply the function to all splits
dataset = dataset.map(add_embeddings, batched=True)

Map: 100%|██████████| 9988/9988 [01:51<00:00, 89.98 examples/s]
Map: 100%|██████████| 2672/2672 [00:29<00:00, 91.11 examples/s]


# Reformat the dataset

In [None]:
# Drop the columns that are not kept in the published dataset: 'image' and 'split'
dataset = dataset.remove_columns(column_names=['image', 'split'])

# Upload to HuggingFace Hub

In [13]:
# Publish the processed dataset to the Hugging Face Hub
dataset.push_to_hub(repo_id='LittleFish-Coder/Fake_News_GossipCop')   # type:ignore

Creating parquet from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 17.36ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:08<00:00,  8.19s/it]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 19.48ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.39s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/LittleFish-Coder/Fake_News_GossipCop/commit/6413470b2da8fa407184ea601a8a90d3843f0f24', commit_message='Upload dataset', commit_description='', oid='6413470b2da8fa407184ea601a8a90d3843f0f24', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LittleFish-Coder/Fake_News_GossipCop', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LittleFish-Coder/Fake_News_GossipCop'), pr_revision=None, pr_num=None)