In [1]:
import pandas as pd
from datasets import load_dataset

In [2]:
"""
ETL Pipeline Example: Hugging Face Dataset Integration

This script demonstrates how to authenticate with Hugging Face and load
the telecom synthetic call transcript dataset.

Dataset: ozmo-inc/telecom_synthetic_call_transcript_data
"""

import os
from huggingface_hub import login
from datasets import load_dataset


def authenticate_huggingface():
    """
    Log in to Hugging Face, preferring a token taken from the environment.

    The ``HF_TOKEN`` environment variable is consulted first:
    - if it holds a non-empty value, that value is passed to ``login``;
    - otherwise ``login`` is called with no arguments (the interactive path).

    A status line is printed before the login call and a confirmation line
    is printed once ``login`` has returned.
    """
    hf_token = os.environ.get('HF_TOKEN')

    # Missing and empty tokens are treated alike: both fall back to the
    # argument-less login call.
    if not hf_token:
        print("No HF_TOKEN found in environment. Please login interactively:")
        login()
    else:
        print("Authenticating with Hugging Face using token from environment variable...")
        login(token=hf_token)

    print("Successfully authenticated with Hugging Face!")


def load_telecom_dataset(dataset_name="ozmo-inc/telecom_synthetic_call_transcript_data"):
    """
    Load the telecom synthetic call transcript dataset from Hugging Face.

    After a successful load, a short summary is printed: the dataset's repr
    and, when the returned object exposes ``keys()``, each split's row count
    and column names.

    On failure a troubleshooting hint matching the exception type is printed
    and the original exception is re-raised unchanged.

    Args:
        dataset_name (str): The name of the dataset on Hugging Face Hub

    Returns:
        dataset: The object returned by ``load_dataset(dataset_name)``

    Raises:
        PermissionError: Access to the dataset was denied.
        OSError: Any other OS-level failure while accessing the dataset.
        ValueError: The dataset name or configuration was rejected.
        Exception: Any other error raised while loading or summarising.
    """
    print(f"\nLoading dataset: {dataset_name}")

    try:
        # Load the dataset
        dataset = load_dataset(dataset_name)

        print("Dataset loaded successfully!")
        print(f"Dataset structure: {dataset}")

        # Display basic information about the dataset
        if hasattr(dataset, 'keys'):
            print(f"\nDataset splits: {list(dataset.keys())}")

            for split_name in dataset.keys():
                split = dataset[split_name]
                print(f"\n{split_name} split:")
                print(f"  - Number of rows: {len(split)}")
                print(f"  - Column names: {split.column_names}")

        return dataset

    except PermissionError as e:
        # Access denied. PermissionError is a subclass of OSError, so this
        # handler must come before the OSError one or it can never run.
        print(f"Permission denied: {e}")
        print("\nPlease ensure:")
        print("1. You are authenticated with Hugging Face")
        print("2. You have been granted access to the dataset")
        raise
    except OSError as e:
        # Network or file system errors
        print(f"Error accessing dataset: {e}")
        print("\nPlease check your internet connection and try again.")
        raise
    except ValueError as e:
        # Invalid dataset name or configuration
        print(f"Invalid dataset configuration: {e}")
        # f-string so the actual dataset name is shown, not the placeholder.
        print(f"\nPlease verify the dataset name is correct: {dataset_name}")
        raise
    except Exception as e:
        print(f"Unexpected error loading dataset: {e}")
        print("\nPlease ensure:")
        print("1. You are authenticated with Hugging Face")
        print("2. You have access to the dataset")
        print("3. The dataset name is correct")
        raise


def display_sample_data(dataset, num_samples=3):
    """
    Print the first few records of a dataset in a readable layout.

    If ``dataset`` exposes ``keys()``, records are taken from the first key
    it lists; otherwise ``dataset`` itself is indexed directly. String
    values longer than 200 characters are cut to 200 characters plus "...".

    Args:
        dataset: The loaded dataset object
        num_samples (int): Upper bound on the number of records printed
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"Displaying {num_samples} sample records:")
    print(rule)

    # Pick the record container: first listed split, or the object itself.
    records = dataset[list(dataset.keys())[0]] if hasattr(dataset, 'keys') else dataset

    # Never index past the end when fewer records exist than requested.
    shown = min(num_samples, len(records))
    for position in range(shown):
        print(f"\nSample {position + 1}:")
        print("-" * 60)
        for field, content in records[position].items():
            # Keep long text fields from flooding the output.
            needs_cut = isinstance(content, str) and len(content) > 200
            rendered = content[:200] + "..." if needs_cut else content
            print(f"  {field}: {rendered}")


def main():
    """
    Run the pipeline end to end: authenticate, load the dataset, preview it.

    Returns:
        The dataset object returned by ``load_telecom_dataset()``.
    """
    rule = "=" * 60

    # Opening banner
    print(rule, "ETL Pipeline: Hugging Face Dataset Integration", rule, sep="\n")

    # Stage 1: log in to Hugging Face
    authenticate_huggingface()

    # Stage 2: fetch the telecom dataset with its default name
    loaded = load_telecom_dataset()

    # Stage 3: preview a few records
    display_sample_data(loaded)

    # Closing banner
    print("\n" + rule, "ETL Pipeline completed successfully!", rule, sep="\n")

    return loaded


if __name__ == "__main__":
    # Entry point: run the full pipeline (authenticate, load, preview) and
    # keep the returned dataset in the module-level name `dataset`.
    dataset = main()
    
    # Nothing is transformed here; this only prints a status line.
    # Further ETL steps operating on `dataset` can be added after this point.
    print("\nDataset is now ready for further ETL processing...")


ETL Pipeline: Hugging Face Dataset Integration
No HF_TOKEN found in environment. Please login interactively:
Successfully authenticated with Hugging Face!

Loading dataset: ozmo-inc/telecom_synthetic_call_transcript_data
Dataset loaded successfully!
Dataset structure: DatasetDict({
    train: Dataset({
        features: ['Agent_ID', 'Pilot', 'Agent_AHT', 'Call_ID', 'Category', 'Sub_Category', 'Call_Start', 'Call_End', 'Call_Transfer', 'Customer_Callback_7_Day', 'Customer_Callback_IDs_7_Day', 'CXM7', 'Transcript_JSON'],
        num_rows: 19322
    })
})

Dataset splits: ['train']

train split:
  - Number of rows: 19322
  - Column names: ['Agent_ID', 'Pilot', 'Agent_AHT', 'Call_ID', 'Category', 'Sub_Category', 'Call_Start', 'Call_End', 'Call_Transfer', 'Customer_Callback_7_Day', 'Customer_Callback_IDs_7_Day', 'CXM7', 'Transcript_JSON']

Displaying 3 sample records:

Sample 1:
------------------------------------------------------------
  Agent_ID: AG-JIX-7837
  Pilot: False
  Agent_AHT: 

In [3]:

# Load the "train" split by name and convert it to a pandas DataFrame.
# NOTE(review): this calls load_dataset again instead of reusing the `dataset`
# object produced by the previous cell — confirm that is intended.
df = load_dataset("ozmo-inc/telecom_synthetic_call_transcript_data", split="train").to_pandas()

In [4]:
# Write the DataFrame to a CSV file at a relative path, without the index column.
df.to_csv("telecom_synthetic_call_transcript_data.csv", index=False)