In [12]:
from datasets import load_from_disk, load_dataset, DatasetDict, Features, Sequence, Value

# PolitiFact

## Load User Embeddings from Local Disk

In [6]:
# Load the locally saved PolitiFact DatasetDict that carries the interaction embeddings.
user_dataset = load_from_disk("./politifact_with_interaction_embeddings_simple")

In [7]:
# Show the splits, column names and row counts of the local dataset.
print(user_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'bert_embeddings', 'roberta_embeddings', 'interaction_gemini_embeddings', 'interaction_tones'],
        num_rows: 381
    })
    test: Dataset({
        features: ['text', 'label', 'bert_embeddings', 'roberta_embeddings', 'interaction_gemini_embeddings', 'interaction_tones'],
        num_rows: 102
    })
})


In [8]:
# Per-split summary: number of rows followed by the full feature schema.
print("\nSplit Information:")
for split_key, split_ds in user_dataset.items():
    n_examples = len(split_ds)
    schema = split_ds.features
    print(f"- Split '{split_key}': {n_examples} examples")
    print(f"  Features: {schema}")


Split Information:
- Split 'train': 381 examples
  Features: {'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None), 'bert_embeddings': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), 'roberta_embeddings': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), 'interaction_gemini_embeddings': Sequence(feature=Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), length=-1, id=None), 'interaction_tones': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}
- Split 'test': 102 examples
  Features: {'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None), 'bert_embeddings': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), 'roberta_embeddings': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), 'interaction_gemini_embeddings': Sequence(feature=Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), length=-1, id=N

## Load News Embeddings from Hugging Face

In [3]:
# Load the PolitiFact news dataset from the Hugging Face Hub, using "dataset" as the cache directory.
news_dataset = load_dataset("LittleFish-Coder/fake_news_politifact", cache_dir="dataset")

Generating train split: 100%|██████████| 381/381 [00:00<00:00, 9809.58 examples/s]
Generating test split: 100%|██████████| 102/102 [00:00<00:00, 7645.22 examples/s]


In [4]:
# Show the splits, column names and row counts of the Hub dataset.
print(news_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'bert_embeddings', 'roberta_embeddings', 'user_interactions'],
        num_rows: 381
    })
    test: Dataset({
        features: ['text', 'label', 'bert_embeddings', 'roberta_embeddings', 'user_interactions'],
        num_rows: 102
    })
})


In [9]:
# Per-split summary: number of rows followed by the full feature schema.
print("\nSplit Information:")
for split_key, split_ds in news_dataset.items():
    n_examples = len(split_ds)
    schema = split_ds.features
    print(f"- Split '{split_key}': {n_examples} examples")
    print(f"  Features: {schema}")


Split Information:
- Split 'train': 381 examples
  Features: {'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None), 'bert_embeddings': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), 'roberta_embeddings': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), 'user_interactions': [{'content': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None), 'tone': Value(dtype='string', id=None)}]}
- Split 'test': 102 examples
  Features: {'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None), 'bert_embeddings': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), 'roberta_embeddings': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), 'user_interactions': [{'content': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None), 'tone': Value(dtype='string', id=None)}]}


## Merge User and News Embeddings

In [15]:
from datasets import DatasetDict

print("\nAdding columns from user_dataset to news_dataset...")
# Start from an empty DatasetDict; one merged dataset per split is stored in it later.
merged_dataset_dict = DatasetDict()


Adding columns from user_dataset to news_dataset...


In [16]:
# Columns copied from user_dataset into the merged dataset:
# (column name in user_dataset, column name in the merged dataset).
columns_to_add = [
    ("interaction_gemini_embeddings", "interaction_embeddings_list"),
    ("interaction_tones", "interaction_tones"),
]

for split in news_dataset.keys():  # iterate over 'train', 'test'
    print(f"Processing split: {split}")

    news_split = news_dataset[split]
    user_split = user_dataset[split]

    # add_column pairs rows purely by position, so refuse to merge unless both
    # datasets describe the same articles in the same order.
    for key_column in ("text", "label"):
        if list(news_split[key_column]) != list(user_split[key_column]):
            raise ValueError(
                f"Split '{split}': column '{key_column}' differs between news_dataset "
                "and user_dataset; rows are not aligned, cannot merge by position."
            )

    # The merged result is pushed to the same repo id that news_dataset is loaded
    # from, so on a re-run the target columns may already be present. Drop them
    # first; add_column would otherwise fail on the duplicate column name.
    stale_columns = [
        target for _, target in columns_to_add if target in news_split.column_names
    ]
    temp_ds = news_split.remove_columns(stale_columns) if stale_columns else news_split

    # Create a new dataset with the additional columns
    for source_name, target_name in columns_to_add:
        temp_ds = temp_ds.add_column(target_name, user_split[source_name])

    merged_dataset_dict[split] = temp_ds

print("\nColumns added. Merged dataset structure (intermediate):")
print(merged_dataset_dict)

Processing split: train
Processing split: test

Columns added. Merged dataset structure (intermediate):
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'bert_embeddings', 'roberta_embeddings', 'user_interactions', 'interaction_embeddings_list', 'interaction_tones'],
        num_rows: 381
    })
    test: Dataset({
        features: ['text', 'label', 'bert_embeddings', 'roberta_embeddings', 'user_interactions', 'interaction_embeddings_list', 'interaction_tones'],
        num_rows: 102
    })
})


## Combine User and News Embeddings as new features

In [17]:
import numpy as np

def combine_embeddings(example):
    """Add a ``combined_embeddings`` vector to a single dataset row.

    The combined vector is the element-wise mean of the row's RoBERTa embedding
    and the mean of its usable interaction embeddings.

    Args:
        example: Mapping with the keys ``'roberta_embeddings'`` (1-D sequence of
            numbers) and ``'interaction_embeddings_list'`` (sequence of 1-D
            sequences, or ``None``). Entries of the list that are ``None`` or
            empty are skipped.

    Returns:
        The same ``example`` object, with ``'combined_embeddings'`` set to a plain
        list of floats of the same length as ``'roberta_embeddings'``.

    Note:
        When a row has no usable interaction embedding, a zero vector stands in
        for the interaction mean, so the combined vector is exactly half of the
        RoBERTa embedding (not the RoBERTa embedding itself).
    """
    roberta_emb = np.array(example['roberta_embeddings'])

    # A missing (None) interaction list is handled like an empty one; individual
    # None entries were already tolerated, a None list used to raise TypeError.
    interaction_embs = example['interaction_embeddings_list']
    if interaction_embs is None:
        interaction_embs = []

    # Keep only the entries that actually hold a vector.
    usable_embs = [np.array(emb) for emb in interaction_embs if emb is not None and len(emb) > 0]

    if usable_embs:
        mean_interaction_emb = np.mean(usable_embs, axis=0)
    else:
        # No interactions: fall back to a zero vector shaped like the RoBERTa embedding.
        mean_interaction_emb = np.zeros_like(roberta_emb)

    # Equal-weight average of the news embedding and the aggregated interaction embedding.
    combined_emb = np.mean([roberta_emb, mean_interaction_emb], axis=0)

    # Stored as a plain Python list.
    example['combined_embeddings'] = combined_emb.tolist()
    return example

In [18]:
# Apply combine_embeddings to each split; the result gains a 'combined_embeddings' column.
print("\nCombining embeddings...")
final_dataset_dict = merged_dataset_dict.map(combine_embeddings)


Combining embeddings...


Map: 100%|██████████| 381/381 [00:03<00:00, 98.84 examples/s] 
Map: 100%|██████████| 381/381 [00:03<00:00, 98.84 examples/s] 
Map: 100%|██████████| 102/102 [00:00<00:00, 103.88 examples/s]
Map: 100%|██████████| 102/102 [00:00<00:00, 103.88 examples/s]


In [19]:
print("\nEmbeddings combined. Final dataset structure:")
print(final_dataset_dict)

# Sanity-check the embedding shapes on the first training row (fetched once).
print("\nVerification (first train example):")
first_row = final_dataset_dict['train'][0]
first_row_interactions = first_row['interaction_embeddings_list']
print("Roberta Embedding shape:", np.array(first_row['roberta_embeddings']).shape)
print("Interaction Embeddings List length:", len(first_row_interactions))
if first_row_interactions:
    print("First Interaction Embedding shape:", np.array(first_row_interactions[0]).shape)
print("Combined Embedding shape:", np.array(first_row['combined_embeddings']).shape)


Embeddings combined. Final dataset structure:
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'bert_embeddings', 'roberta_embeddings', 'user_interactions', 'interaction_embeddings_list', 'interaction_tones', 'combined_embeddings'],
        num_rows: 381
    })
    test: Dataset({
        features: ['text', 'label', 'bert_embeddings', 'roberta_embeddings', 'user_interactions', 'interaction_embeddings_list', 'interaction_tones', 'combined_embeddings'],
        num_rows: 102
    })
})

Verification (first train example):
Roberta Embedding shape: (768,)
Interaction Embeddings List length: 20
First Interaction Embedding shape: (768,)
Combined Embedding shape: (768,)


## Upload to Hugging Face

In [20]:
# Push the final PolitiFact DatasetDict (with the added columns) to the Hugging Face Hub.
# NOTE(review): this is the same repo id that news_dataset is loaded from with load_dataset,
# so the upload presumably overwrites the source data and a later re-run would load the
# already-merged columns - confirm this is intended.
final_dataset_dict.push_to_hub('LittleFish-Coder/fake_news_politifact')

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  2.56ba/s]

Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.63s/it]
Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.63s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  9.41ba/s]

Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.16s/it]



CommitInfo(commit_url='https://huggingface.co/datasets/LittleFish-Coder/Fake_News_PolitiFact/commit/433390806a7fab30ffa53a9a4834648f169b5172', commit_message='Upload dataset', commit_description='', oid='433390806a7fab30ffa53a9a4834648f169b5172', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LittleFish-Coder/Fake_News_PolitiFact', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LittleFish-Coder/Fake_News_PolitiFact'), pr_revision=None, pr_num=None)

# GossipCop

## Load User Embeddings from Local Disk (GossipCop)

In [21]:
# Load the locally saved GossipCop DatasetDict and summarise each split
# (row count followed by the full feature schema).
user_dataset_gc = load_from_disk("./gossipcop_with_interaction_embeddings_simple")
print(user_dataset_gc)
print("\nSplit Information:")
for split_key, split_ds in user_dataset_gc.items():
    n_examples = len(split_ds)
    schema = split_ds.features
    print(f"- Split '{split_key}': {n_examples} examples")
    print(f"  Features: {schema}")

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'bert_embeddings', 'roberta_embeddings', 'interaction_gemini_embeddings', 'interaction_tones'],
        num_rows: 9988
    })
    test: Dataset({
        features: ['text', 'label', 'bert_embeddings', 'roberta_embeddings', 'interaction_gemini_embeddings', 'interaction_tones'],
        num_rows: 2672
    })
})

Split Information:
- Split 'train': 9988 examples
  Features: {'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None), 'bert_embeddings': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), 'roberta_embeddings': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), 'interaction_gemini_embeddings': Sequence(feature=Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), length=-1, id=None), 'interaction_tones': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}
- Split 'test': 2672 examples
  Features: {'text': Value(d

## Load News Embeddings from Hugging Face (GossipCop)

In [22]:
# Load the GossipCop news dataset from the Hub (cache directory "dataset") and
# summarise each split (row count followed by the full feature schema).
news_dataset_gc = load_dataset("LittleFish-Coder/fake_news_gossipcop", cache_dir="dataset")
print(news_dataset_gc)
print("\nSplit Information:")
for split_key, split_ds in news_dataset_gc.items():
    n_examples = len(split_ds)
    schema = split_ds.features
    print(f"- Split '{split_key}': {n_examples} examples")
    print(f"  Features: {schema}")

Generating train split: 100%|██████████| 9988/9988 [00:00<00:00, 14639.54 examples/s]
Generating test split: 100%|██████████| 2672/2672 [00:00<00:00, 16123.70 examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'bert_embeddings', 'roberta_embeddings', 'user_interactions'],
        num_rows: 9988
    })
    test: Dataset({
        features: ['text', 'label', 'bert_embeddings', 'roberta_embeddings', 'user_interactions'],
        num_rows: 2672
    })
})

Split Information:
- Split 'train': 9988 examples
  Features: {'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None), 'bert_embeddings': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), 'roberta_embeddings': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), 'user_interactions': [{'content': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None), 'tone': Value(dtype='string', id=None)}]}
- Split 'test': 2672 examples
  Features: {'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None), 'bert_embeddings': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)




## Merge User and News Embeddings (GossipCop)

In [23]:
print("\nAdding columns from user_dataset_gc to news_dataset_gc...")
merged_dataset_dict_gc = DatasetDict()

# Columns copied from user_dataset_gc into the merged dataset:
# (column name in user_dataset_gc, column name in the merged dataset).
columns_to_add_gc = [
    ("interaction_gemini_embeddings", "interaction_embeddings_list"),
    ("interaction_tones", "interaction_tones"),
]

for split in news_dataset_gc.keys():  # Iterate through 'train', 'test'
    print(f"Processing split: {split}")

    news_split_gc = news_dataset_gc[split]
    user_split_gc = user_dataset_gc[split]

    # add_column pairs rows purely by position, so refuse to merge unless both
    # datasets describe the same articles in the same order.
    for key_column in ("text", "label"):
        if list(news_split_gc[key_column]) != list(user_split_gc[key_column]):
            raise ValueError(
                f"Split '{split}': column '{key_column}' differs between news_dataset_gc "
                "and user_dataset_gc; rows are not aligned, cannot merge by position."
            )

    # The merged result is pushed to the same repo id that news_dataset_gc is loaded
    # from, so on a re-run the target columns may already be present. Drop them
    # first; add_column would otherwise fail on the duplicate column name.
    stale_columns_gc = [
        target for _, target in columns_to_add_gc if target in news_split_gc.column_names
    ]
    temp_ds_gc = (
        news_split_gc.remove_columns(stale_columns_gc) if stale_columns_gc else news_split_gc
    )

    # Create a new dataset with the additional columns
    for source_name, target_name in columns_to_add_gc:
        temp_ds_gc = temp_ds_gc.add_column(target_name, user_split_gc[source_name])

    merged_dataset_dict_gc[split] = temp_ds_gc


Adding columns from user_dataset_gc to news_dataset_gc...
Processing split: train
Processing split: test


In [24]:
# Show the merged GossipCop DatasetDict (column names and row counts per split).
print("\nColumns added. Merged GossipCop dataset structure (intermediate):")
print(merged_dataset_dict_gc)


Columns added. Merged GossipCop dataset structure (intermediate):
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'bert_embeddings', 'roberta_embeddings', 'user_interactions', 'interaction_embeddings_list', 'interaction_tones'],
        num_rows: 9988
    })
    test: Dataset({
        features: ['text', 'label', 'bert_embeddings', 'roberta_embeddings', 'user_interactions', 'interaction_embeddings_list', 'interaction_tones'],
        num_rows: 2672
    })
})


## Combine User and News Embeddings as new features (GossipCop)

In [25]:
# Reuse combine_embeddings (defined earlier) on every GossipCop split.
print("\nCombining embeddings for GossipCop...")
final_dataset_dict_gc = merged_dataset_dict_gc.map(combine_embeddings)

print("\nGossipCop embeddings combined. Final dataset structure:")
print(final_dataset_dict_gc)

# Sanity-check the embedding shapes on the first training row (fetched once).
print("\nVerification (first train example - GossipCop):")
first_row_gc = final_dataset_dict_gc['train'][0]
first_row_interactions_gc = first_row_gc['interaction_embeddings_list']
print("Roberta Embedding shape:", np.array(first_row_gc['roberta_embeddings']).shape)
print("Interaction Embeddings List length:", len(first_row_interactions_gc))
if first_row_interactions_gc:
    print("First Interaction Embedding shape:", np.array(first_row_interactions_gc[0]).shape)
print("Combined Embedding shape:", np.array(first_row_gc['combined_embeddings']).shape)


Combining embeddings for GossipCop...


Map: 100%|██████████| 9988/9988 [01:44<00:00, 96.01 examples/s] 
Map: 100%|██████████| 2672/2672 [00:28<00:00, 95.35 examples/s] 


GossipCop embeddings combined. Final dataset structure:
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'bert_embeddings', 'roberta_embeddings', 'user_interactions', 'interaction_embeddings_list', 'interaction_tones', 'combined_embeddings'],
        num_rows: 9988
    })
    test: Dataset({
        features: ['text', 'label', 'bert_embeddings', 'roberta_embeddings', 'user_interactions', 'interaction_embeddings_list', 'interaction_tones', 'combined_embeddings'],
        num_rows: 2672
    })
})

Verification (first train example - GossipCop):
Roberta Embedding shape: (768,)
Interaction Embeddings List length: 20
First Interaction Embedding shape: (768,)
Combined Embedding shape: (768,)





## Upload to Hugging Face (GossipCop)

In [26]:
# Push the final GossipCop DatasetDict (with the added columns) to the Hugging Face Hub.
# NOTE(review): this is the same repo id that news_dataset_gc is loaded from with load_dataset,
# so the upload presumably overwrites the source data and a later re-run would load the
# already-merged columns - confirm this is intended.
final_dataset_dict_gc.push_to_hub('LittleFish-Coder/fake_news_gossipcop')

Creating parquet from Arrow format: 100%|██████████| 4/4 [00:03<00:00,  1.30ba/s]
Creating parquet from Arrow format: 100%|██████████| 4/4 [00:02<00:00,  1.38ba/s]
Creating parquet from Arrow format: 100%|██████████| 4/4 [00:03<00:00,  1.32ba/s]
Uploading the dataset shards: 100%|██████████| 3/3 [01:10<00:00, 23.38s/it]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:02<00:00,  1.19ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:15<00:00, 15.71s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/LittleFish-Coder/Fake_News_GossipCop/commit/e55b0342ad5121b781b8427c3bfc70a930da33cc', commit_message='Upload dataset', commit_description='', oid='e55b0342ad5121b781b8427c3bfc70a930da33cc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LittleFish-Coder/Fake_News_GossipCop', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LittleFish-Coder/Fake_News_GossipCop'), pr_revision=None, pr_num=None)