# Ahren09/MMSoc_PolitiFact

https://huggingface.co/datasets/Ahren09/MMSoc_PolitiFact

Split:
- `train`: 381
- `test`: 102

Columns:
- `image`
- `text`: str
- `label`: int
    - `0`: real
    - `1`: fake

In [13]:
# import package
from datasets import DatasetDict, load_dataset
import numpy as np

In [14]:
# Fetch the dataset from the Hugging Face Hub (cached under ./dataset).
# Alternative source, matching the repository named in the notebook title:
# dataset = load_dataset("Ahren09/MMSoc_PolitiFact", cache_dir="./dataset")

dataset = load_dataset(
    path="LittleFish-Coder/Fake_News_PolitiFact",
    cache_dir="./dataset",
)

Generating train split: 100%|██████████| 381/381 [00:00<00:00, 1166.83 examples/s]
Generating test split: 100%|██████████| 102/102 [00:00<00:00, 1133.88 examples/s]


In [15]:
# Disabled: rename the column interaction_tones to interaction_tones_list.
# Uncomment only if the loaded dataset still uses the old column name.
# dataset = dataset.rename_column("interaction_tones", "interaction_tones_list")

In [16]:
# Inspect the loaded DatasetDict: split names, feature columns, and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'bert_embeddings', 'roberta_embeddings', 'user_interactions', 'interaction_embeddings_list', 'interaction_tones_list', 'combined_embeddings', 'distilbert_embeddings', 'bigbird_embeddings', 'deberta_embeddings'],
        num_rows: 381
    })
    test: Dataset({
        features: ['text', 'label', 'bert_embeddings', 'roberta_embeddings', 'user_interactions', 'interaction_embeddings_list', 'interaction_tones_list', 'combined_embeddings', 'distilbert_embeddings', 'bigbird_embeddings', 'deberta_embeddings'],
        num_rows: 102
    })
})


In [4]:
# Remove the columns that are not needed downstream: ['image', 'split'].
# Each split drops only the listed columns it actually contains, so this cell
# is a clean no-op when they are already gone, and unrelated errors are no
# longer hidden behind a bare `except`.
columns_to_drop = ['image', 'split']

pruned_splits = {}
for split_name, split in dataset.items():
    present = [column for column in columns_to_drop if column in split.column_names]
    # Leave the split untouched when there is nothing to remove.
    pruned_splits[split_name] = split.remove_columns(present) if present else split

dataset = DatasetDict(pruned_splits)

In [5]:
# Report the number of examples in each split.
for split_name in ("train", "test"):
    print(f'{split_name} size: {len(dataset[split_name])}')   # type: ignore

train size: 381
test size: 102


# Add BERT Embeddings

In [5]:
import torch

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [6]:
from transformers import AutoTokenizer, AutoModel

# Load the BERT tokenizer and encoder from the same checkpoint; the encoder is moved to `device`.
bert_checkpoint = "google-bert/bert-base-uncased"
bert_tokenizer = AutoTokenizer.from_pretrained(
    bert_checkpoint,
    clean_up_tokenization_spaces=True,
)
bert_model = AutoModel.from_pretrained(bert_checkpoint).to(device)

In [7]:
def generate_bert_embeddings(text):
    """
    Embed `text` with BERT and return the [CLS] (first-position) hidden state.

    The input is truncated to at most 512 token ids. The resulting vector is
    moved to the CPU and flattened to a 1-D NumPy array.
    """
    encoded = bert_tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512,
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state

    # Position 0 of every sequence is the [CLS] token.
    cls_vector = hidden_states[:, 0, :]
    return cls_vector.cpu().numpy().flatten()

In [8]:
import numpy as np

def generate_bert_embeddings_sliding_window(text, window_size=512, stride=256):
    """
    Generate BERT embeddings using a sliding window approach for longer texts.

    The text is tokenized once. If it fits into a single window
    (``window_size - 2`` tokens, leaving room for [CLS] and [SEP]) it is
    embedded directly with ``generate_bert_embeddings``. Otherwise a window of
    ``window_size - 2`` tokens is taken every ``stride`` tokens, each window is
    embedded on its own, and the per-window [CLS] vectors are averaged.

    Args:
        text (str): The input text
        window_size (int): Maximum sequence length for BERT
        stride (int): Number of tokens to slide the window by

    Returns:
        numpy.ndarray: Aggregated BERT embeddings (element-wise mean of the
        flattened per-window [CLS] vectors)
    """
    # Tokenize the entire text
    tokens = bert_tokenizer.tokenize(text)

    # If text is shorter than window_size, just use standard approach
    if len(tokens) <= window_size - 2:  # -2 for [CLS] and [SEP] tokens
        return generate_bert_embeddings(text)

    # For longer texts, use sliding window
    embeddings_list = []

    # Process text in overlapping windows.
    # Note: the loop steps until the start index passes the end of the text, so
    # the last window(s) may be a subset of the preceding window; they still
    # count equally in the mean below.
    for i in range(0, len(tokens), stride):
        # Extract window of tokens
        window_tokens = tokens[i:i + window_size - 2]

        # Convert tokens back to text (this is an approximation)
        window_text = bert_tokenizer.convert_tokens_to_string(window_tokens)

        # Get embeddings for this window.
        # truncation=True is required here: re-tokenizing the detokenized text
        # is only approximate and may yield more than `window_size` ids, which
        # would not be cut to max_length otherwise. This also matches the
        # RoBERTa / DistilBERT / DeBERTa sliding-window functions.
        inputs = bert_tokenizer(window_text, return_tensors='pt', padding=True, truncation=True,
                                max_length=window_size).to(device)

        with torch.no_grad():
            outputs = bert_model(**inputs)

        # Get the CLS token embedding for this window
        window_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy().flatten()
        embeddings_list.append(window_embedding)

    # Aggregate embeddings from all windows (several options available)
    # Option 1: Average all window embeddings
    final_embedding = np.mean(embeddings_list, axis=0)

    # Option 2: Max pooling across all windows
    # final_embedding = np.max(embeddings_list, axis=0)

    # Option 3: Weighted average, giving more weight to earlier parts of the text
    # weights = np.linspace(1.0, 0.5, len(embeddings_list))
    # final_embedding = np.average(embeddings_list, axis=0, weights=weights)

    return final_embedding

In [9]:
# Generate BERT embeddings
def add_bert_embeddings(batch):
    """Add a 'bert_embeddings' entry to `batch`: one sliding-window BERT vector per entry of batch['text']."""
    vectors = []
    for text in batch['text']:
        vectors.append(generate_bert_embeddings_sliding_window(text))
    batch['bert_embeddings'] = vectors
    return batch

# Run the function over all splits in batched mode.
dataset = dataset.map(add_bert_embeddings, batched=True)

Map:   0%|          | 0/381 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (574 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 381/381 [00:28<00:00, 13.15 examples/s]
Map: 100%|██████████| 102/102 [00:07<00:00, 14.03 examples/s]


# Add RoBERTa Embeddings

In [10]:
import torch

# Same device selection as earlier in the notebook: GPU when CUDA is available, else CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [11]:
from transformers import AutoTokenizer, AutoModel

# Load the RoBERTa tokenizer and encoder from the same checkpoint; the encoder is moved to `device`.
# The embedding functions in this notebook read only `last_hidden_state`, not the pooler output.
roberta_checkpoint = "roberta-base"
roberta_tokenizer = AutoTokenizer.from_pretrained(
    roberta_checkpoint,
    clean_up_tokenization_spaces=True,
)
roberta_model = AutoModel.from_pretrained(roberta_checkpoint).to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def generate_roberta_embeddings(text):
    """
    Embed `text` with RoBERTa and return the first-position (<s>) hidden state.

    The input is truncated to at most 512 token ids. The resulting vector is
    moved to the CPU and flattened to a 1-D NumPy array.
    """
    encoded = roberta_tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512,
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        hidden_states = roberta_model(**encoded).last_hidden_state

    # Position 0 is <s>, which plays the role of the CLS token in RoBERTa.
    first_token_vector = hidden_states[:, 0, :]
    return first_token_vector.cpu().numpy().flatten()

In [13]:
import numpy as np

def generate_roberta_embeddings_sliding_window(text, window_size=512, stride=256):
    """
    Embed a text of arbitrary length with RoBERTa using overlapping windows.

    Texts that fit into one window are delegated to
    ``generate_roberta_embeddings``. Longer texts are cut into windows of
    ``window_size - 2`` tokens starting every ``stride`` tokens; each window is
    embedded separately and the first-position vectors are averaged.

    Args:
        text (str): The input text
        window_size (int): Maximum sequence length for RoBERTa
        stride (int): Number of tokens to slide the window by

    Returns:
        numpy.ndarray: Element-wise mean of the per-window embeddings
    """
    all_tokens = roberta_tokenizer.tokenize(text)
    tokens_per_window = window_size - 2  # room left after <s> and </s>

    # Short input: a single forward pass is enough.
    if len(all_tokens) <= tokens_per_window:
        return generate_roberta_embeddings(text)

    window_vectors = []
    # Note: the loop steps until the start index passes the end of the text, so
    # the last window(s) may be a subset of the preceding window; they still
    # count equally in the mean.
    for start in range(0, len(all_tokens), stride):
        chunk = all_tokens[start:start + tokens_per_window]

        # Detokenize so the tokenizer can re-encode the chunk with special
        # tokens added (the round trip is only approximate).
        chunk_text = roberta_tokenizer.convert_tokens_to_string(chunk)
        encoded = roberta_tokenizer(
            chunk_text,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=window_size,
        ).to(device)

        with torch.no_grad():
            hidden_states = roberta_model(**encoded).last_hidden_state

        # Keep the first-position (<s>) vector of this window.
        window_vectors.append(hidden_states[:, 0, :].cpu().numpy().flatten())

    # Aggregate by plain averaging; max pooling (np.max) or a position-weighted
    # np.average would be drop-in alternatives.
    return np.mean(window_vectors, axis=0)

In [14]:
# Generate RoBERTa embeddings
def add_roberta_embeddings(batch):
    """Add a 'roberta_embeddings' entry to `batch`: one sliding-window RoBERTa vector per entry of batch['text']."""
    vectors = []
    for text in batch['text']:
        vectors.append(generate_roberta_embeddings_sliding_window(text))
    batch['roberta_embeddings'] = vectors
    return batch

# Run the function over all splits in batched mode.
dataset = dataset.map(add_roberta_embeddings, batched=True)

Map:   0%|          | 0/381 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (605 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 381/381 [00:28<00:00, 13.31 examples/s]
Map: 100%|██████████| 102/102 [00:11<00:00,  9.11 examples/s]


# Add DistilBERT Embeddings

In [23]:
import torch

# Same device selection as earlier in the notebook: GPU when CUDA is available, else CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [24]:
from transformers import AutoTokenizer, AutoModel

# Load the DistilBERT tokenizer and encoder from the same checkpoint; the encoder is moved to `device`.
distilbert_checkpoint = "distilbert-base-uncased"
distilbert_tokenizer = AutoTokenizer.from_pretrained(
    distilbert_checkpoint,
    clean_up_tokenization_spaces=True,
)
distilbert_model = AutoModel.from_pretrained(distilbert_checkpoint).to(device)

In [25]:
def generate_distilbert_embeddings(text):
    """
    Embed `text` with DistilBERT and return the first-position (CLS) hidden state.

    The input is truncated to at most 512 token ids. The resulting vector is
    moved to the CPU and flattened to a 1-D NumPy array.
    """
    encoded = distilbert_tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512,
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        hidden_states = distilbert_model(**encoded).last_hidden_state

    # Position 0 of every sequence is the CLS token.
    cls_vector = hidden_states[:, 0, :]
    return cls_vector.cpu().numpy().flatten()

In [26]:
import numpy as np

def generate_distilbert_embeddings_sliding_window(text, window_size=512, stride=256):
    """
    Embed a text of arbitrary length with DistilBERT using overlapping windows.

    Texts that fit into one window are delegated to
    ``generate_distilbert_embeddings``. Longer texts are cut into windows of
    ``window_size - 2`` tokens starting every ``stride`` tokens; each window is
    embedded separately and the first-position vectors are averaged.

    Args:
        text (str): The input text
        window_size (int): Maximum sequence length for DistilBERT
        stride (int): Number of tokens to slide the window by

    Returns:
        numpy.ndarray: Element-wise mean of the per-window embeddings
    """
    all_tokens = distilbert_tokenizer.tokenize(text)
    tokens_per_window = window_size - 2  # room for the two special tokens ([CLS] and [SEP])

    # Short input: a single forward pass is enough.
    if len(all_tokens) <= tokens_per_window:
        return generate_distilbert_embeddings(text)

    window_vectors = []
    # Note: the loop steps until the start index passes the end of the text, so
    # the last window(s) may be a subset of the preceding window; they still
    # count equally in the mean.
    for start in range(0, len(all_tokens), stride):
        chunk = all_tokens[start:start + tokens_per_window]

        # Detokenize so the tokenizer can re-encode the chunk with special
        # tokens added (the round trip is only approximate).
        chunk_text = distilbert_tokenizer.convert_tokens_to_string(chunk)
        encoded = distilbert_tokenizer(
            chunk_text,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=window_size,
        ).to(device)

        with torch.no_grad():
            hidden_states = distilbert_model(**encoded).last_hidden_state

        # Keep the first-position (CLS) vector of this window.
        window_vectors.append(hidden_states[:, 0, :].cpu().numpy().flatten())

    # Aggregate by plain averaging; max pooling (np.max) would be a drop-in alternative.
    return np.mean(window_vectors, axis=0)

In [27]:
def add_distilbert_embeddings(batch):
    """Add a 'distilbert_embeddings' entry to `batch`: one sliding-window DistilBERT vector per entry of batch['text']."""
    vectors = []
    for text in batch['text']:
        vectors.append(generate_distilbert_embeddings_sliding_window(text))
    batch['distilbert_embeddings'] = vectors
    return batch

# Run the function over all splits in batched mode.
dataset = dataset.map(add_distilbert_embeddings, batched=True)

Map:   0%|          | 0/381 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (574 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 381/381 [00:38<00:00,  9.87 examples/s]
Map: 100%|██████████| 102/102 [00:11<00:00,  8.85 examples/s]


# Add BigBird Embeddings

In [28]:
import torch

# Same device selection as earlier in the notebook: GPU when CUDA is available, else CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [29]:
from transformers import AutoTokenizer, AutoModel

# Load the BigBird tokenizer and encoder from the same checkpoint; the encoder is moved to `device`.
bigbird_checkpoint = "google/bigbird-roberta-base"
bigbird_tokenizer = AutoTokenizer.from_pretrained(bigbird_checkpoint)
bigbird_model = AutoModel.from_pretrained(bigbird_checkpoint).to(device)

In [30]:
def generate_bigbird_embeddings(text):
    """
    Embed `text` with BigBird and return the first-position hidden state.

    The input is truncated to at most 4096 token ids, so no sliding window is
    used here. The resulting vector is moved to the CPU and flattened to a
    1-D NumPy array.
    """
    encoded = bigbird_tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=4096,
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        hidden_states = bigbird_model(**encoded).last_hidden_state

    first_token_vector = hidden_states[:, 0, :]
    return first_token_vector.cpu().numpy().flatten()

In [31]:
def add_bigbird_embeddings(batch):
    """Add a 'bigbird_embeddings' entry to `batch`: one BigBird vector per entry of batch['text']."""
    vectors = []
    for text in batch['text']:
        vectors.append(generate_bigbird_embeddings(text))
    batch['bigbird_embeddings'] = vectors
    return batch

# Run the function over all splits in batched mode.
dataset = dataset.map(add_bigbird_embeddings, batched=True)

Map:   0%|          | 0/381 [00:00<?, ? examples/s]Attention type 'block_sparse' is not possible if sequence_length: 580 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...
Map: 100%|██████████| 381/381 [00:34<00:00, 11.17 examples/s]
Map: 100%|██████████| 102/102 [00:08<00:00, 12.01 examples/s]


# Add DeBERTa Embeddings

In [6]:
import torch

# Same device selection as earlier in the notebook: GPU when CUDA is available, else CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [7]:
from transformers import AutoTokenizer, AutoModel

# Load the DeBERTa tokenizer and encoder from the same checkpoint; the encoder is moved to `device`.
deberta_checkpoint = "microsoft/deberta-base"
deberta_tokenizer = AutoTokenizer.from_pretrained(deberta_checkpoint)
deberta_model = AutoModel.from_pretrained(deberta_checkpoint).to(device)

In [9]:
def generate_deberta_embeddings(text):
    """
    Embed `text` with DeBERTa and return the first-position hidden state.

    The input is truncated to at most 512 token ids. The resulting vector is
    moved to the CPU and flattened to a 1-D NumPy array.
    """
    encoded = deberta_tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=512,
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        hidden_states = deberta_model(**encoded).last_hidden_state

    first_token_vector = hidden_states[:, 0, :]
    return first_token_vector.cpu().numpy().flatten()

In [10]:
import numpy as np

def generate_deberta_embeddings_sliding_window(text, window_size=512, stride=256):
    """
    Embed a text of arbitrary length with DeBERTa using overlapping windows.

    Texts that fit into one window are delegated to
    ``generate_deberta_embeddings``. Longer texts are cut into windows of
    ``window_size - 2`` tokens starting every ``stride`` tokens; each window is
    embedded separately and the first-position vectors are averaged.

    Args:
        text (str): The input text
        window_size (int): Maximum sequence length passed to the tokenizer
        stride (int): Number of tokens to slide the window by

    Returns:
        numpy.ndarray: Element-wise mean of the per-window embeddings
    """
    all_tokens = deberta_tokenizer.tokenize(text)
    tokens_per_window = window_size - 2  # leave room for two special tokens

    # Short input: a single forward pass is enough.
    if len(all_tokens) <= tokens_per_window:
        return generate_deberta_embeddings(text)

    window_vectors = []
    # Note: the loop steps until the start index passes the end of the text, so
    # the last window(s) may be a subset of the preceding window; they still
    # count equally in the mean.
    for start in range(0, len(all_tokens), stride):
        chunk = all_tokens[start:start + tokens_per_window]

        # Detokenize so the tokenizer can re-encode the chunk with special
        # tokens added (the round trip is only approximate).
        chunk_text = deberta_tokenizer.convert_tokens_to_string(chunk)
        encoded = deberta_tokenizer(
            chunk_text,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=window_size,
        ).to(device)

        with torch.no_grad():
            hidden_states = deberta_model(**encoded).last_hidden_state

        # Keep the first-position vector of this window.
        window_vectors.append(hidden_states[:, 0, :].cpu().numpy().flatten())

    return np.mean(window_vectors, axis=0)

In [11]:
def add_deberta_embeddings(batch):
    """Add a 'deberta_embeddings' entry to `batch`: one sliding-window DeBERTa vector per entry of batch['text']."""
    vectors = []
    for text in batch['text']:
        vectors.append(generate_deberta_embeddings_sliding_window(text))
    batch['deberta_embeddings'] = vectors
    return batch

# Run the function over all splits in batched mode.
dataset = dataset.map(add_deberta_embeddings, batched=True)

Map: 100%|██████████| 381/381 [01:01<00:00,  6.18 examples/s]
Map: 100%|██████████| 102/102 [00:16<00:00,  6.37 examples/s]


# Upload to HuggingFace Hub

In [17]:
# Publish every split of the dataset to the Hugging Face Hub repository.
# NOTE(review): the commit message describes a column rename; update it if this
# push is meant to publish something else.
dataset.push_to_hub(
    repo_id='LittleFish-Coder/Fake_News_PolitiFact',
    commit_message="rename interaction_tones to interaction_tones_list",
)

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  2.29ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:09<00:00,  9.32s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  7.38ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.15s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/LittleFish-Coder/Fake_News_PolitiFact/commit/5df7768d6aca464147d7f7b98eccf068fd8c5d8e', commit_message='rename interaction_tones to interaction_tones_list', commit_description='', oid='5df7768d6aca464147d7f7b98eccf068fd8c5d8e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LittleFish-Coder/Fake_News_PolitiFact', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LittleFish-Coder/Fake_News_PolitiFact'), pr_revision=None, pr_num=None)