# Ahren09/MMSoc_GossipCop

https://huggingface.co/datasets/Ahren09/MMSoc_GossipCop

Splits (number of examples):
- `train`: 9988
- `test`: 2672

Columns:
- `image`
- `text`: str
- `label`: int
    - `0`: real
    - `1`: fake
- `split`

In [1]:
# import package
from datasets import DatasetDict, load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the source dataset (all of its splits) from the Hugging Face Hub
SOURCE_REPO_ID = "Ahren09/MMSoc_GossipCop"
dataset = load_dataset(SOURCE_REPO_ID)

In [3]:
# Show the available splits together with their column names and row counts
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['image', 'text', 'label', 'split'],
        num_rows: 9988
    })
    test: Dataset({
        features: ['image', 'text', 'label', 'split'],
        num_rows: 2672
    })
})


In [4]:
# Report the number of examples in each split
for split_name in ("train", "test"):
    print(f'{split_name} size: {len(dataset[split_name])}')   # type: ignore

train size: 9988
test size: 2672


# Add BERT Embeddings

In [5]:
import torch

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [6]:
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer and the encoder from the same pretrained BERT checkpoint,
# then move the encoder onto the selected device.
BERT_CHECKPOINT = "google-bert/bert-base-uncased"
bert_tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT, clean_up_tokenization_spaces=True)
bert_model = AutoModel.from_pretrained(BERT_CHECKPOINT)
bert_model = bert_model.to(device)

In [7]:
def generate_bert_embeddings(text):
    """
    Embed ``text`` with BERT and return its [CLS] vector.

    The input is tokenized (truncated to at most 512 tokens), passed through
    ``bert_model`` without gradient tracking, and the last-layer hidden state
    at position 0 is returned as a flattened NumPy array.
    """
    encoded = bert_tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512,
    )
    encoded = encoded.to(device)

    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        model_output = bert_model(**encoded)

    # Position 0 of the last hidden layer holds the [CLS] token
    cls_state = model_output.last_hidden_state[:, 0, :]   # (1, 768)

    # Bring the result to host memory and collapse it to one dimension
    return cls_state.cpu().numpy().flatten()   # (768,)

In [8]:
import numpy as np

def generate_bert_embeddings_sliding_window(text, window_size=512, stride=256):
    """
    Generate BERT embeddings using a sliding window approach for longer texts.

    Texts that fit into a single window are delegated to
    ``generate_bert_embeddings``. Longer texts are cut into overlapping
    windows of at most ``window_size - 2`` tokens; every window is embedded
    on its own and the per-window [CLS] embeddings are averaged.

    Args:
        text (str): The input text
        window_size (int): Maximum sequence length for BERT, including the
            [CLS] and [SEP] special tokens
        stride (int): Number of tokens to slide the window by

    Returns:
        numpy.ndarray: Aggregated BERT embeddings (mean over all windows)

    Raises:
        ValueError: If the text needs windowing and ``window_size`` leaves no
            room for content tokens or ``stride`` is not positive.
    """
    # Tokenize the entire text (no truncation) to find out how long it really is.
    # NOTE(review): this untruncated pass is presumably what triggers the
    # "sequence length is longer than the specified maximum" warning; every
    # window sent to the model below is truncated to `window_size`.
    tokens = bert_tokenizer.tokenize(text)

    # Content tokens that fit next to the [CLS] and [SEP] special tokens
    tokens_per_window = window_size - 2

    # If the text fits into one window, just use the standard approach
    if len(tokens) <= tokens_per_window:
        return generate_bert_embeddings(text)

    # Validate only once windowing is actually required, so the short-text
    # path above behaves exactly as before.
    if tokens_per_window < 1:
        raise ValueError("window_size must be at least 3 to hold any content tokens")
    if stride < 1:
        raise ValueError("stride must be a positive integer")

    embeddings_list = []

    # Process the text in overlapping windows
    for start in range(0, len(tokens), stride):
        window_tokens = tokens[start:start + tokens_per_window]

        # Convert tokens back to text (this is an approximation); truncation
        # below guards against the re-tokenized text growing past the window.
        window_text = bert_tokenizer.convert_tokens_to_string(window_tokens)

        inputs = bert_tokenizer(window_text, return_tensors='pt', padding=True, truncation=True,
                                max_length=window_size).to(device)

        with torch.no_grad():
            outputs = bert_model(**inputs)

        # Keep the [CLS] token embedding for this window
        window_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy().flatten()
        embeddings_list.append(window_embedding)

        # This window already reaches the end of the text. Any later window
        # would be wholly contained in it, so embedding it would only repeat
        # the tail tokens and skew the average towards the end of the text.
        if start + tokens_per_window >= len(tokens):
            break

    # Aggregate by averaging all window embeddings.
    # Alternatives: element-wise max pooling (np.max(..., axis=0)) or a
    # weighted average that favours earlier windows (np.average with weights).
    final_embedding = np.mean(embeddings_list, axis=0)

    return final_embedding

In [9]:
# Generate BERT embeddings
def add_bert_embeddings(batch):
    """Add a 'bert_embeddings' entry with one sliding-window BERT embedding per text in the batch."""
    embeddings = []
    for text in batch['text']:
        embeddings.append(generate_bert_embeddings_sliding_window(text))
    batch['bert_embeddings'] = embeddings
    return batch

# Run the embedding step over every split, handing examples over in batches
dataset = dataset.map(add_bert_embeddings, batched=True)

Map:   0%|          | 0/9988 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (664 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 9988/9988 [05:50<00:00, 28.47 examples/s]
Map: 100%|██████████| 2672/2672 [01:28<00:00, 30.26 examples/s]


# Add RoBERTa Embeddings

In [10]:
import torch

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [11]:
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer and the encoder from the same pretrained RoBERTa checkpoint,
# then move the encoder onto the selected device.
ROBERTA_CHECKPOINT = "roberta-base"
roberta_tokenizer = AutoTokenizer.from_pretrained(ROBERTA_CHECKPOINT, clean_up_tokenization_spaces=True)
roberta_model = AutoModel.from_pretrained(ROBERTA_CHECKPOINT)
roberta_model = roberta_model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def generate_roberta_embeddings(text):
    """
    Embed ``text`` with RoBERTa and return the vector of its first token.

    The input is tokenized (truncated to at most 512 tokens), passed through
    ``roberta_model`` without gradient tracking, and the last-layer hidden
    state at position 0 is returned as a flattened NumPy array.
    """
    encoded = roberta_tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512,
    )
    encoded = encoded.to(device)

    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        model_output = roberta_model(**encoded)

    # Position 0 of the last hidden layer plays the role of the CLS token
    cls_state = model_output.last_hidden_state[:, 0, :]   # (1, 768)

    # Bring the result to host memory and collapse it to one dimension
    return cls_state.cpu().numpy().flatten()   # (768,)

In [13]:
import numpy as np

def generate_roberta_embeddings_sliding_window(text, window_size=512, stride=256):
    """
    Generate RoBERTa embeddings using a sliding window approach for longer texts.

    Texts that fit into a single window are delegated to
    ``generate_roberta_embeddings``. Longer texts are cut into overlapping
    windows of at most ``window_size - 2`` tokens; every window is embedded
    on its own and the per-window first-token embeddings are averaged.

    Args:
        text (str): The input text
        window_size (int): Maximum sequence length for RoBERTa, including the
            <s> and </s> special tokens
        stride (int): Number of tokens to slide the window by

    Returns:
        numpy.ndarray: Aggregated RoBERTa embeddings (mean over all windows)

    Raises:
        ValueError: If the text needs windowing and ``window_size`` leaves no
            room for content tokens or ``stride`` is not positive.
    """
    # Tokenize the entire text (no truncation) to find out how long it really is.
    # NOTE(review): this untruncated pass is presumably what triggers the
    # "sequence length is longer than the specified maximum" warning; every
    # window sent to the model below is truncated to `window_size`.
    tokens = roberta_tokenizer.tokenize(text)

    # Content tokens that fit next to the <s> and </s> special tokens
    tokens_per_window = window_size - 2

    # If the text fits into one window, just use the standard approach
    if len(tokens) <= tokens_per_window:
        return generate_roberta_embeddings(text)

    # Validate only once windowing is actually required, so the short-text
    # path above behaves exactly as before.
    if tokens_per_window < 1:
        raise ValueError("window_size must be at least 3 to hold any content tokens")
    if stride < 1:
        raise ValueError("stride must be a positive integer")

    embeddings_list = []

    # Process the text in overlapping windows
    for start in range(0, len(tokens), stride):
        window_tokens = tokens[start:start + tokens_per_window]

        # Convert tokens back to text (this is an approximation); truncation
        # below guards against the re-tokenized text growing past the window.
        window_text = roberta_tokenizer.convert_tokens_to_string(window_tokens)

        inputs = roberta_tokenizer(window_text, return_tensors='pt', padding=True, truncation=True,
                                   max_length=window_size).to(device)

        with torch.no_grad():
            outputs = roberta_model(**inputs)

        # Keep the first-token (CLS-equivalent) embedding for this window
        window_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy().flatten()
        embeddings_list.append(window_embedding)

        # This window already reaches the end of the text. Any later window
        # would be wholly contained in it, so embedding it would only repeat
        # the tail tokens and skew the average towards the end of the text.
        if start + tokens_per_window >= len(tokens):
            break

    # Aggregate by averaging all window embeddings.
    # Alternatives: element-wise max pooling (np.max(..., axis=0)) or a
    # weighted average that favours earlier windows (np.average with weights).
    final_embedding = np.mean(embeddings_list, axis=0)

    return final_embedding

In [14]:
# Generate RoBERTa embeddings
def add_roberta_embeddings(batch):
    """Add a 'roberta_embeddings' entry with one sliding-window RoBERTa embedding per text in the batch."""
    embeddings = []
    for text in batch['text']:
        embeddings.append(generate_roberta_embeddings_sliding_window(text))
    batch['roberta_embeddings'] = embeddings
    return batch

# Run the embedding step over every split, handing examples over in batches
dataset = dataset.map(add_roberta_embeddings, batched=True)

Map:   0%|          | 0/9988 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (721 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 9988/9988 [05:40<00:00, 29.34 examples/s]
Map: 100%|██████████| 2672/2672 [01:27<00:00, 30.42 examples/s]


# Reformat the dataset

In [15]:
# Drop the 'image' and 'split' columns, which are treated as redundant here
REDUNDANT_COLUMNS = ['image', 'split']
dataset = dataset.remove_columns(REDUNDANT_COLUMNS)

# Upload to HuggingFace Hub

In [16]:
# Upload every split of the processed dataset to the Hugging Face Hub
TARGET_REPO_ID = 'LittleFish-Coder/Fake_News_GossipCop'
dataset.push_to_hub(TARGET_REPO_ID)

Creating parquet from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 11.70ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:10<00:00, 10.41s/it]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.59ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.16s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/LittleFish-Coder/Fake_News_GossipCop/commit/3d2c8aa6c40bc4296aa7dd64711be64e45fc6c46', commit_message='Upload dataset', commit_description='', oid='3d2c8aa6c40bc4296aa7dd64711be64e45fc6c46', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LittleFish-Coder/Fake_News_GossipCop', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LittleFish-Coder/Fake_News_GossipCop'), pr_revision=None, pr_num=None)