In [1]:
from huggingface_hub import login
# Alternative for a Jupyter/Colab notebook: from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub; this prompts for an access token.
# NOTE(review): presumably needed for the push_to_hub call later in this notebook — confirm.
login()
# notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
import os
import pandas as pd
from datasets import Dataset, DatasetDict, Features, Value, ClassLabel
# from huggingface_hub import login # Keep if you use programmatic login

def create_hf_dataset_from_concatenated_files(
    base_path=".",
    split_files=None,
    column_names=None,
    delimiter="|"
):
    """Build a Hugging Face ``DatasetDict`` from delimiter-separated text files.

    Each split file is read with pandas (no header row, every column as a
    string), ``speaker_change`` is converted to an integer, and the frame is
    converted to a ``Dataset`` with a fixed schema.

    Args:
        base_path: Directory that contains the split files.
        split_files: Mapping of split name to file name. Defaults to
            ``train.txt`` / ``val.txt`` / ``test.txt``.
        column_names: Column names assigned to the header-less files, in file
            order. Defaults to the five columns of the schema below. The schema
            itself is fixed, so custom names must still match it.
        delimiter: Field separator used in the files.

    Returns:
        A ``DatasetDict`` with one entry per split. A split whose file is
        missing, empty, or fails to load is represented by a dataset with the
        schema's columns and zero rows (a message is printed in each case).
    """
    if split_files is None:
        split_files = {
            "train": "train.txt",
            "validation": "val.txt",
            "test": "test.txt"
        }
    if column_names is None:
        column_names = ['caller', 'text', 'act_tag', 'conversation_no', 'speaker_change']

    dataset_dict = {}

    # Schema shared by every split.
    features = Features({
        'caller': Value('string'),
        'text': Value('string'),
        'act_tag': Value('string'),
        'conversation_no': Value('string'),  # kept as a string on purpose
        'speaker_change': ClassLabel(num_classes=2, names=['no_change', 'change'])
    })

    def _empty_split():
        # Give the placeholder one empty list per schema column so that its
        # columns agree with `features`; a bare `{}` carries no columns at all.
        return Dataset.from_dict({name: [] for name in features}, features=features)

    # Read every column as a string so pandas never re-interprets values
    # (e.g. numeric-looking conversation ids).
    pandas_dtypes = {col_name: str for col_name in column_names}

    print("Creating Hugging Face Dataset...")
    for split_name, filename in split_files.items():
        filepath = os.path.join(base_path, filename)
        print(f"\nProcessing split: '{split_name}' from file: '{filepath}'")

        if not os.path.exists(filepath) or os.path.getsize(filepath) == 0:
            print(f"Warning: File not found or empty for split '{split_name}': {filepath}. Creating an empty dataset.")
            dataset_dict[split_name] = _empty_split()
            continue

        try:
            df = pd.read_csv(
                filepath,
                delimiter=delimiter,
                header=None,
                names=column_names,
                dtype=pandas_dtypes,
                keep_default_na=False,  # keep empty fields as "" instead of NaN
                na_filter=False         # disable NaN detection entirely
            )

            # ClassLabel needs integer ids: parse the string column, mapping
            # unparsable or empty values to 0.
            if 'speaker_change' in df.columns:
                df['speaker_change'] = pd.to_numeric(df['speaker_change'], errors='coerce').fillna(0).astype(int)
            else:
                print(f"Warning: 'speaker_change' column not found in {filepath}. Adding a default column of 0s.")
                df['speaker_change'] = 0

            hf_dataset = Dataset.from_pandas(df, features=features, preserve_index=False)
            dataset_dict[split_name] = hf_dataset
            print(f"Successfully created Dataset for split: '{split_name}' with {len(hf_dataset)} examples.")
            if len(hf_dataset) > 0:
                print(f"First example from '{split_name}': {hf_dataset[0]}")

        except pd.errors.EmptyDataError:
            print(f"Warning: File for split '{split_name}' is empty: {filepath}")
            dataset_dict[split_name] = _empty_split()
        except Exception as e:
            # Deliberate best-effort: report the failure and keep going with an
            # empty split rather than aborting the whole build.
            print(f"Error processing file {filepath} for split '{split_name}': {e}")
            print(f"Creating an empty dataset for split '{split_name}' due to error.")
            dataset_dict[split_name] = _empty_split()

    final_dataset_dict = DatasetDict(dataset_dict)
    print("\n--- Hugging Face DatasetDict created successfully! ---")
    return final_dataset_dict

# Script entry point: load the splits, encode act_tag as a ClassLabel, and push the result to the Hub.

if __name__ == "__main__":
    # --- Configuration ---
    PROJECT_BASE_PATH = "."
    SPLIT_FILENAMES = {
        "train":      "train.txt",
        "validation": "val.txt",
        "test":       "test.txt",
    }
    COLUMN_NAMES_ORDERED = ['caller', 'text', 'act_tag', 'conversation_no', 'speaker_change']
    FILE_DELIMITER = "|"

    # 1) Load each split into pandas (all columns as strings) and collect the act_tags.
    all_tags = set()
    df_splits = {}
    for split_name, filename in SPLIT_FILENAMES.items():
        df = pd.read_csv(
            os.path.join(PROJECT_BASE_PATH, filename),
            delimiter=FILE_DELIMITER,
            header=None,
            names=COLUMN_NAMES_ORDERED,
            dtype={col: str for col in COLUMN_NAMES_ORDERED},
            keep_default_na=False,
            na_filter=False,
        )
        # ClassLabel needs integer ids; unparsable or empty values become 0.
        df['speaker_change'] = pd.to_numeric(df['speaker_change'], errors='coerce').fillna(0).astype(int)

        df_splits[split_name] = df
        # Union the tags of every split: a tag that appears only in
        # validation/test would have no id in a train-only label set.
        all_tags.update(df['act_tag'].unique())

    # Surface tags the model would never see during training.
    if "train" in df_splits:
        tags_missing_from_train = sorted(all_tags - set(df_splits["train"]['act_tag'].unique()))
        if tags_missing_from_train:
            print(f"Warning: act_tags present in other splits but absent from train: {tags_missing_from_train}")

    # 2) Build the ClassLabel feature from the sorted tags (sorted => stable ids).
    unique_tags = sorted(all_tags)
    print(f"Found {len(unique_tags)} unique act_tags. Here’s the mapping:")
    for idx, tag in enumerate(unique_tags):
        print(f"  {idx:3d} → {tag}")

    act_tag_feature = ClassLabel(names=unique_tags)

    # 3) Create Hugging Face Datasets with that schema.
    # The schema is identical for every split, so build it once.
    features = Features({
        'caller':          Value('string'),
        'text':            Value('string'),
        # act_tag is encoded with the ClassLabel built above:
        'act_tag':         act_tag_feature,
        'conversation_no': Value('string'),
        'speaker_change':  ClassLabel(names=['no_change','change']),
    })
    dataset_dict = {}
    for split_name, df in df_splits.items():
        ds = Dataset.from_pandas(df, features=features, preserve_index=False)
        dataset_dict[split_name] = ds
        print(f"{split_name} → {len(ds)} examples; act_tag dtype = {ds.features['act_tag']}")

    final_dataset = DatasetDict(dataset_dict)
    print("\nFinal DatasetDict schema:")
    print(final_dataset)

    # 4) Push to the Hub
    HUB_REPO_ID = "nico8771/mrda_processed"  # ← your repo
    final_dataset.push_to_hub(HUB_REPO_ID)
    print(f"Pushed to https://huggingface.co/datasets/{HUB_REPO_ID}")

Found 5 unique act_tags. Here’s the mapping:
    0 → B
    1 → D
    2 → F
    3 → Q
    4 → S
train → 75067 examples; act_tag dtype = ClassLabel(names=['B', 'D', 'F', 'Q', 'S'], id=None)
validation → 16433 examples; act_tag dtype = ClassLabel(names=['B', 'D', 'F', 'Q', 'S'], id=None)
test → 16702 examples; act_tag dtype = ClassLabel(names=['B', 'D', 'F', 'Q', 'S'], id=None)

Final DatasetDict schema:
DatasetDict({
    train: Dataset({
        features: ['caller', 'text', 'act_tag', 'conversation_no', 'speaker_change'],
        num_rows: 75067
    })
    validation: Dataset({
        features: ['caller', 'text', 'act_tag', 'conversation_no', 'speaker_change'],
        num_rows: 16433
    })
    test: Dataset({
        features: ['caller', 'text', 'act_tag', 'conversation_no', 'speaker_change'],
        num_rows: 16702
    })
})


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/76 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

Pushed to https://huggingface.co/datasets/nico8771/mrda_processed
