# Ahren09/MMSoc_GossipCop

https://huggingface.co/datasets/Ahren09/MMSoc_GossipCop

Splits (number of rows):
- `train`: 9988
- `test`: 2672

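Columns:
- `image`
- `text`: str
- `label`: int
    - `0`: real
    - `1`: fake
- `split`

Only `text` and `label` are kept in the re-uploaded dataset; `image` and `split` are dropped in the "Reformat the dataset" section.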

In [5]:
# import package
from datasets import DatasetDict, load_dataset

In [6]:
# Fetch the source dataset from the Hugging Face Hub.
source_repo_id = "Ahren09/MMSoc_GossipCop"
dataset = load_dataset(source_repo_id)

In [7]:
# Show the available splits with their column names and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['image', 'text', 'label', 'split'],
        num_rows: 9988
    })
    test: Dataset({
        features: ['image', 'text', 'label', 'split'],
        num_rows: 2672
    })
})


In [8]:
# Report the number of examples in each split.
train_rows = len(dataset["train"])   # type: ignore
test_rows = len(dataset["test"])     # type: ignore
print(f'train size: {train_rows}')
print(f'test size: {test_rows}')

train size: 9988
test size: 2672


# Add BERT Embeddings

In [9]:
import torch

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [10]:
from transformers import AutoTokenizer, AutoModel

# The tokenizer and the encoder are loaded from the same checkpoint, so name it once.
bert_checkpoint = "google-bert/bert-base-uncased"

bert_tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint, clean_up_tokenization_spaces=True)

# Load the encoder weights, then move the model to the selected device.
bert_model = AutoModel.from_pretrained(bert_checkpoint)
bert_model = bert_model.to(device)

In [11]:
def generate_bert_embeddings(text):
    """Return the BERT [CLS] embedding of one text, or of each text in a list.

    Args:
        text: A single string, or a list/tuple of strings that are tokenized
            together (padded to a common length) and encoded in one forward pass.

    Returns:
        numpy.ndarray: shape (hidden_size,) when `text` is a single string, or
        (len(text), hidden_size) when `text` is a list of strings. hidden_size
        is 768 for the bert-base checkpoint loaded in this notebook.
    """
    is_single_text = isinstance(text, str)

    # Tokenize the input text; anything beyond 512 tokens is truncated
    inputs = bert_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

    # Get the BERT embeddings; no gradients are needed for feature extraction
    with torch.no_grad():
        outputs = bert_model(**inputs)

    # get the cls token embedding: position 0 of every sequence in the batch
    embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()   # (batch, 768)

    # A lone string produces a batch of one, so drop the batch axis -> (768,).
    # For a list keep one row per text: flattening here would concatenate all
    # embeddings into a single (batch * 768,) vector.
    if is_single_text:
        return embeddings[0]
    return embeddings

In [12]:
# Generate BERT embeddings
def add_bert_embeddings(batch):
    """Attach a 'bert_embeddings' column to a batch of examples.

    Each entry of batch['text'] is embedded on its own, i.e. one call to
    generate_bert_embeddings (and one forward pass) per text.

    Args:
        batch: Mapping of column name to a list of values; must contain 'text'.

    Returns:
        The same mapping with the extra 'bert_embeddings' list, aligned with 'text'.
    """
    vectors = []
    for text in batch['text']:
        vectors.append(generate_bert_embeddings(text))
    batch['bert_embeddings'] = vectors
    return batch

# Run over every split; `map` on the whole dataset handles train and test in turn
dataset = dataset.map(add_bert_embeddings, batched=True)

Map: 100%|██████████| 9988/9988 [02:08<00:00, 77.51 examples/s]
Map: 100%|██████████| 2672/2672 [00:36<00:00, 73.45 examples/s]


# Add RoBERTa Embeddings

In [13]:
import torch

# Same device selection as in the BERT section: GPU if available, else CPU.
has_gpu = torch.cuda.is_available()
device = 'cuda' if has_gpu else 'cpu'

In [None]:
from transformers import AutoTokenizer, AutoModel

# The tokenizer and the encoder are loaded from the same checkpoint, so name it once.
roberta_checkpoint = "roberta-base"

roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_checkpoint, clean_up_tokenization_spaces=True)

# Load the encoder weights, then move the model to the selected device.
roberta_model = AutoModel.from_pretrained(roberta_checkpoint)
roberta_model = roberta_model.to(device)

In [15]:
def generate_roberta_embeddings(text):
    """Return the RoBERTa first-token embedding of one text, or of each text in a list.

    The first token of every sequence is used as the sentence-level
    representation (the position the original code calls the "cls token").

    Args:
        text: A single string, or a list/tuple of strings that are tokenized
            together (padded to a common length) and encoded in one forward pass.

    Returns:
        numpy.ndarray: shape (hidden_size,) when `text` is a single string, or
        (len(text), hidden_size) when `text` is a list of strings. hidden_size
        is 768 for the roberta-base checkpoint loaded in this notebook.
    """
    is_single_text = isinstance(text, str)

    # Tokenize the input text; anything beyond 512 tokens is truncated
    inputs = roberta_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

    # Get the RoBERTa embeddings; no gradients are needed for feature extraction
    with torch.no_grad():
        outputs = roberta_model(**inputs)

    # get the cls token embedding: position 0 of every sequence in the batch
    embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()   # (batch, 768)

    # A lone string produces a batch of one, so drop the batch axis -> (768,).
    # For a list keep one row per text: flattening here would concatenate all
    # embeddings into a single (batch * 768,) vector.
    if is_single_text:
        return embeddings[0]
    return embeddings

In [16]:
# Generate RoBERTa embeddings
def add_roberta_embeddings(batch):
    """Attach a 'roberta_embeddings' column to a batch of examples.

    Each entry of batch['text'] is embedded on its own, i.e. one call to
    generate_roberta_embeddings (and one forward pass) per text.

    Args:
        batch: Mapping of column name to a list of values; must contain 'text'.

    Returns:
        The same mapping with the extra 'roberta_embeddings' list, aligned with 'text'.
    """
    vectors = []
    for text in batch['text']:
        vectors.append(generate_roberta_embeddings(text))
    batch['roberta_embeddings'] = vectors
    return batch

# Run over every split; `map` on the whole dataset handles train and test in turn
dataset = dataset.map(add_roberta_embeddings, batched=True)

Map: 100%|██████████| 9988/9988 [02:08<00:00, 77.87 examples/s]
Map: 100%|██████████| 2672/2672 [00:32<00:00, 82.14 examples/s]


# Reformat the dataset

In [17]:
# Drop the columns that are not part of the re-uploaded dataset: the raw
# 'image' and the 'split' column. The 'text' column itself is kept.
unused_columns = ['image', 'split']
dataset = dataset.remove_columns(unused_columns)

# Upload to HuggingFace Hub

In [18]:
# Publish all splits, including the added embedding columns, to the Hub.
# NOTE(review): presumably requires being logged in with write access to this repo — confirm.
target_repo_id = 'LittleFish-Coder/Fake_News_GossipCop'
dataset.push_to_hub(target_repo_id)

Creating parquet from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 11.30ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:11<00:00, 11.11s/it]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 13.70ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.24s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/LittleFish-Coder/Fake_News_GossipCop/commit/13c7cfbff4fa8f41a0d132b072cae1cbecfa8749', commit_message='Upload dataset', commit_description='', oid='13c7cfbff4fa8f41a0d132b072cae1cbecfa8749', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LittleFish-Coder/Fake_News_GossipCop', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LittleFish-Coder/Fake_News_GossipCop'), pr_revision=None, pr_num=None)