In [8]:
# Read the raw SMS corpus (super_sms_dataset.csv) into `df`.
import pandas as pd

# NOTE(review): absolute, machine-specific Windows path - consider deriving it
# from a configurable data directory so the notebook runs on other machines.
sms_csv_path = "d:\\Programmation\\FLspam\\data\\super_sms_dataset.csv"

# NOTE(review): latin-1 is presumably used because the file is not valid
# UTF-8 - confirm against the data source.
df = pd.read_csv(sms_csv_path, encoding="latin-1")

In [20]:
# Build the spam frame: rows of `df` whose "Labels" value equals 1.
# Only the "SMSes" column is kept (renamed to "text"); a constant label of 1
# and an empty persona slot are attached so the columns are
# text / label / main_uuid.
is_spam = df["Labels"] == 1
spam_df = (
    df.loc[is_spam, ["SMSes"]]
    .rename(columns={"SMSes": "text"})
    .assign(label=1, main_uuid=None)  # None: spam has no associated persona
)
print(f"Spam messages: {len(spam_df)}")

Spam messages: 26178


In [21]:
# Load persona conversations and keep only the messages RECEIVED by each
# conversation's main persona; these become the ham examples (label=0).
import json

# NOTE(review): absolute, machine-specific Windows path - consider deriving it
# from a configurable data directory.
with open(
    "d:\\Programmation\\FLspam\\data\\conversations.json", "r", encoding="utf-8"
) as f:
    conversations = json.load(f)

# Each conversation is read through its "main_uuid" and "messages" keys, and
# each message through "sender_uuid" and "text". A message counts as received
# when its sender is not the conversation's main persona.
ham_messages = [
    {
        "text": msg["text"],
        "label": 0,
        "main_uuid": conv["main_uuid"],  # Track which persona received it
    }
    for conv in conversations
    for msg in conv["messages"]
    if msg["sender_uuid"] != conv["main_uuid"]
]

# Passing the columns explicitly pins the schema: if no message qualifies the
# frame is empty but still has a "main_uuid" column, so the summary below
# prints 0 instead of raising KeyError.
ham_df = pd.DataFrame(ham_messages, columns=["text", "label", "main_uuid"])
print(f"Ham messages (received by personas): {len(ham_df)}")
print(f"Unique main personas: {ham_df['main_uuid'].nunique()}")

Ham messages (received by personas): 2734
Unique main personas: 20


In [29]:
# Write ham and spam to two separate JSON files.
# Ham keeps its persona assignment; spam is written without one because its
# distribution is handled in task.py for flexibility (IID/non-IID).

# Serialisation settings shared by both exports: one JSON object per row,
# non-ASCII characters written as-is, two-space indentation.
json_export_options = {"orient": "records", "force_ascii": False, "indent": 2}

# Ham messages, including the main_uuid column.
ham_output = "d:\\Programmation\\FLspam\\data\\ham_messages.json"
ham_df.to_json(ham_output, **json_export_options)
print(f"Saved {len(ham_df)} ham messages to {ham_output}")
print(f"Unique personas: {ham_df['main_uuid'].nunique()}")

# Spam messages, restricted to text and label (no persona column).
spam_output = "d:\\Programmation\\FLspam\\data\\spam_messages.json"
spam_without_persona = spam_df[["text", "label"]]
spam_without_persona.to_json(spam_output, **json_export_options)
print(f"\nSaved {len(spam_df)} spam messages to {spam_output}")

Total dataset size: 28912

Label distribution:
label
1    26178
0     2734
Name: count, dtype: int64

Percentages:
label
1    90.5
0     9.5
Name: proportion, dtype: float64

First 20 labels: [1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [None]:
# Summary statistics for the ham/spam frames built above.
n_ham = len(ham_df)
n_spam = len(spam_df)
# Computed once and reused. nunique() ignores missing values, so it is 0 for
# an empty or all-missing column; the averages are guarded for that case.
n_personas = ham_df["main_uuid"].nunique()

print("=== Dataset Summary ===")
print(f"Ham messages: {n_ham}")
print(f"Spam messages: {n_spam}")
print(f"Total: {n_ham + n_spam}")
print(f"\nUnique personas (FL clients): {n_personas}")
if n_personas:
    print(f"Avg ham per persona: {n_ham / n_personas:.1f}")
    print(f"Avg spam per persona (if IID): {n_spam / n_personas:.1f}")
else:
    # No persona to average over: report that instead of dividing by zero.
    print("Avg ham per persona: n/a (no personas)")
    print("Avg spam per persona (if IID): n/a (no personas)")