In [None]:
from datasets import load_from_disk
import numpy as np # For checking embedding shapes

# --- Set the path to your dataset ---
# Relative path: it is resolved against the kernel's current working directory,
# so run the notebook from the directory that contains this folder.
dataset_path = "./politifact_with_interaction_embeddings_simple"

# Echo the path before loading so a failed load can be traced to its location.
print(f"Loading dataset from: {dataset_path}")
# Load the dataset stored at that path; the result is reused by the cells below.
processed_dataset = load_from_disk(dataset_path)

In [2]:
# --- Show the dataset structure ---
# A single print call: the heading and the dataset's text representation are
# joined by a newline, listing each split with its feature names and row count.
print("\nDataset Structure:", processed_dataset, sep="\n")


Dataset Structure:
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'bert_embeddings', 'roberta_embeddings', 'interaction_gemini_embeddings', 'interaction_tones'],
        num_rows: 381
    })
    test: Dataset({
        features: ['text', 'label', 'bert_embeddings', 'roberta_embeddings', 'interaction_gemini_embeddings', 'interaction_tones'],
        num_rows: 102
    })
})


In [4]:
# --- Show information for each split ---
print("\nSplit Information:")
# Walk the split names and look each split up explicitly, then report its
# number of examples followed by its feature schema.
for split_name in processed_dataset:
    split_data = processed_dataset[split_name]
    print(f"- Split '{split_name}': {len(split_data)} examples")
    print(f"  Features: {split_data.features}")


Split Information:
- Split 'train': 381 examples
  Features: {'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None), 'bert_embeddings': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), 'roberta_embeddings': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), 'interaction_gemini_embeddings': Sequence(feature=Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), length=-1, id=None), 'interaction_tones': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}
- Split 'test': 102 examples
  Features: {'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None), 'bert_embeddings': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), 'roberta_embeddings': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), 'interaction_gemini_embeddings': Sequence(feature=Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), length=-1, id=N

In [10]:
# --- Inspect the first training sample ---
# NOTE: there is no emptiness guard here; indexing [0] raises if the train
# split has no rows.
print("\n--- Example: First Training Sample ---")
train_example = processed_dataset["train"][0]

# Show a few fields. `dict.get(key, default)` only falls back when the key is
# missing, so `or ''` additionally covers a present-but-null text value, which
# would otherwise fail on slicing.
text_preview = (train_example.get('text') or '')[:100]
print(f"Label: {train_example.get('label')}")
print(f"Text (first 100 chars): {text_preview}...")

# Check the RoBERTa embedding (column `roberta_embeddings`). It is converted to
# an array only to read its shape.
roberta_emb = np.array(train_example['roberta_embeddings'])
print(f"RoBERTa Embedding shape: {roberta_emb.shape}") # Should be (768,)

# Per-interaction embeddings and their tone labels. A null field may come back
# as None, so both are normalised to empty lists to keep len() and indexing safe.
# The embeddings column is indexed directly so a missing column still fails loudly.
interaction_embs = train_example['interaction_gemini_embeddings'] or []
interaction_tones = train_example.get('interaction_tones') or []
print(f"Number of Interactions found: {len(interaction_embs)}")

if interaction_embs:
    # Show details of the first interaction only.
    first_interaction_emb = np.array(interaction_embs[0])
    # The tone list may be empty even when embeddings exist; fall back to "N/A".
    first_interaction_tone = interaction_tones[0] if interaction_tones else "N/A"
    print("\nFirst Interaction Info:")
    print(f"  Tone: {first_interaction_tone}")
    print(f"  Gemini Embedding shape: {first_interaction_emb.shape}") # Should be (768,) if not empty
else:
    print("No interaction embeddings found for this example.")


--- Example: First Training Sample ---
Label: 1
Text (first 100 chars): Inside a Fake News Sausage Factory: ‘This Is All About Income’ In Tbilisi, the two-room rented apart...
RoBERTa Embedding shape: (768,)
Number of Interactions found: 20

First Interaction Info:
  Tone: affirmative
  Gemini Embedding shape: (768,)
