# Ahren09/MMSoc_PolitiFact

https://huggingface.co/datasets/Ahren09/MMSoc_PolitiFact

Split:
- `train`: 381
- `test`: 102

Columns:
- `image`
- `text`: str
- `label`: int
    - `0`: real
    - `1`: fake

In [2]:
# import package
from datasets import DatasetDict, load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# load and download the dataset from huggingface
dataset = load_dataset("Ahren09/MMSoc_PolitiFact")

Generating train split: 100%|██████████| 381/381 [00:00<00:00, 2130.28 examples/s]
Generating test split: 100%|██████████| 102/102 [00:00<00:00, 3984.46 examples/s]


In [4]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['image', 'text', 'label', 'split'],
        num_rows: 381
    })
    test: Dataset({
        features: ['image', 'text', 'label', 'split'],
        num_rows: 102
    })
})


In [5]:
print(f'train size: {len(dataset["train"])}')   # type: ignore
print(f'test size: {len(dataset["test"])}') # type: ignore

train size: 381
test size: 102


# Add BERT Embeddings

In [6]:
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [7]:
from transformers import AutoTokenizer, AutoModel

bert_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased", clean_up_tokenization_spaces=True)
bert_model = AutoModel.from_pretrained("google-bert/bert-base-uncased").to(device)

In [8]:
def generate_bert_embeddings(text):
    # Tokenize the input text
    inputs = bert_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    
    # Get the BERT embeddings
    with torch.no_grad():
        outputs = bert_model(**inputs)
    
    # get the cls token embedding
    embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()   # (1, 768)

    # flatten the embeddings
    embeddings = embeddings.flatten()   # (768,)
    
    return embeddings

In [9]:
# Generate BERT embeddings
def add_bert_embeddings(batch):
    batch['bert_embeddings'] = [generate_bert_embeddings(text) for text in batch['text']]
    return batch

# Apply the function to all splits
dataset = dataset.map(add_bert_embeddings, batched=True)

Map: 100%|██████████| 381/381 [00:05<00:00, 64.73 examples/s]
Map: 100%|██████████| 102/102 [00:01<00:00, 73.98 examples/s]


# Add RoBERTa Embeddings

In [10]:
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [11]:
from transformers import AutoTokenizer, AutoModel

roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base", clean_up_tokenization_spaces=True)
roberta_model = AutoModel.from_pretrained("roberta-base").to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def generate_roberta_embeddings(text):
    # Tokenize the input text
    inputs = roberta_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    
    # Get the RoBERTa embeddings
    with torch.no_grad():
        outputs = roberta_model(**inputs)
    
    # get the cls token embedding
    embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()   # (1, 768)

    # flatten the embeddings
    embeddings = embeddings.flatten()   # (768,)
    
    return embeddings

In [13]:
# Generate RoBERTa embeddings
def add_roberta_embeddings(batch):
    batch['roberta_embeddings'] = [generate_roberta_embeddings(text) for text in batch['text']]
    return batch

# Apply the function to all splits
dataset = dataset.map(add_roberta_embeddings, batched=True)

Map: 100%|██████████| 381/381 [00:05<00:00, 75.67 examples/s]
Map: 100%|██████████| 102/102 [00:01<00:00, 75.50 examples/s]


# Reformat the dataset

In [14]:
# remove the redundant text column: ['image', 'split']
dataset = dataset.remove_columns(['image', 'split'])

# Upload to HuggingFace Hub

In [15]:
# push the dataset to the hub 
dataset.push_to_hub('LittleFish-Coder/Fake_News_PolitiFact')

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 14.13ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.16s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 65.00ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.03s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/LittleFish-Coder/Fake_News_PolitiFact/commit/31082faa74918560ba28c90885c1ba86c5a638a5', commit_message='Upload dataset', commit_description='', oid='31082faa74918560ba28c90885c1ba86c5a638a5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LittleFish-Coder/Fake_News_PolitiFact', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LittleFish-Coder/Fake_News_PolitiFact'), pr_revision=None, pr_num=None)