In [1]:
from datasets import load_dataset
import pandas as pd
from tabulate import tabulate
from tqdm.auto import tqdm

In [2]:
# Fetch the "all_shuffled" configuration of the training and testing corpora,
# keeping only the split each one is used for.
print("Loading datasets...")
train_dataset = load_dataset(
    "MagedSaeed/tnqeet-training-datasets",
    "all_shuffled",
)["train"]  # type: ignore
test_dataset = load_dataset(
    "MagedSaeed/tnqeet-testing-datasets",
    "all_shuffled",
)["test"]  # type: ignore

Loading datasets...


In [7]:
def count_words(text):
    """Count whitespace-separated words in a text value.

    Args:
        text: The value to count words in. Non-string values are converted
            with ``str()`` before being split on whitespace.

    Returns:
        int: Number of whitespace-delimited tokens. Returns 0 for an empty or
        whitespace-only string and for a missing value (``None`` or a float
        NaN).
    """
    # Without this guard a missing value is stringified to "None" / "nan" and
    # reported as a one-word text, skewing the per-source statistics.
    if text is None or (isinstance(text, float) and text != text):
        return 0
    return len(str(text).split())

def _split_section_rows(title, df):
    """Build the table rows for one dataset split.

    Args:
        title: Text placed in the first column of the section header row.
        df: DataFrame with a 'source' column and a numeric 'word_count' column.

    Returns:
        list[list[str]]: The section header row, one statistics row per source
        (in sorted source order), and a closing "TOTAL" row.
    """
    rows = [[title, "", "", "", "", "", "", ""]]

    total_samples = 0
    total_words = 0
    # A single groupby pass partitions the frame by source and yields the
    # groups in sorted key order, replacing one full boolean-mask scan per
    # source. NOTE(review): groupby skips rows whose source is missing
    # (pandas default dropna=True) — confirm none are expected.
    for source, source_data in df.groupby('source', sort=True):
        word_counts = source_data['word_count']
        domain, language = get_domain_and_language(source)

        samples = len(source_data)
        words = word_counts.sum()
        total_samples += samples
        total_words += words

        rows.append([
            source,
            domain,
            language,
            f"{samples:,}",
            f"{words:,}",
            f"{word_counts.mean():.1f}",
            f"{word_counts.max():,}",
            f"{word_counts.min()}"
        ])

    # Only the additive columns (samples, words) get a total.
    rows.append([
        "TOTAL",
        "",
        "",
        f"{total_samples:,}",
        f"{total_words:,}",
        "",
        "",
        ""
    ])
    return rows


def get_dataset_table():
    """Print a grid table of per-source statistics for both dataset splits.

    Reads the notebook-level ``train_dataset`` and ``test_dataset``, converts
    each to a DataFrame, adds a 'word_count' column computed with
    ``count_words`` on the 'text' column, and prints one section per split.
    Each section lists, per source: domain, language type, sample count,
    total / average / maximum / minimum words, followed by a TOTAL row.

    Returns:
        None. The table is written to stdout.
    """
    headers = [
        "Dataset Name",
        "Domain",
        "Language Type",
        "Samples",
        "Total Words",
        "Avg Words",
        "Max Words",
        "Min Words"
    ]

    sections = (
        ("=== TRAINING DATASET ===", train_dataset),
        ("=== TESTING DATASET ===", test_dataset),
    )

    table_data = []
    for title, dataset in sections:
        # Convert to a dataframe and add word counts
        df = pd.DataFrame(dataset) # type: ignore
        df['word_count'] = df['text'].apply(count_words)
        table_data.extend(_split_section_rows(title, df))

    table = tabulate(table_data, headers=headers, tablefmt="grid")
    print(table)

# Source name -> (domain description, language type). Built once at definition
# time instead of on every lookup.
_SOURCE_INFO = {
    'iwslt': ('Translation/Speech', 'MSA'),
    'arabic_wikipedia': ('Wikipedia', 'MSA'),
    'tashkeela': ('Religious/Classical', 'Classical Arabic'),
    'annotated_aoc': ('Social Media', 'Dialectal'),
    'oscar_small': ('Web Text', 'MSA/Dialectal'),
    'ashaar': ('Poetry/Literature', 'Classical Arabic'),
    'sanad': ('News/Media', 'MSA'),
    'wasm': ('Social Media', 'Dialectal'),
    'LLMs_abstracts': ('Academic', 'MSA'),
    'arabic_english_code_switching': ('Code Switching', 'Dialectal'),
    'arasum': ('News/Media', 'MSA'),
    'kind': ('Social Media', 'Dialectal'),
    'poetry': ('Poetry/Literature', 'Classical Arabic'),
    'quran': ('Religious/Classical', 'Classical Arabic'),
    'social_media': ('Social Media', 'MSA/Dialectal'),
}


def get_domain_and_language(source):
    """Look up the domain description and language type for a source.

    Args:
        source: Source name as it appears in the datasets' 'source' column.

    Returns:
        tuple[str, str]: ``(domain, language_type)``. Sources that are not in
        the lookup table yield ``('Unknown', 'Unknown')``.
    """
    # The mixed-variety label is spelled "MSA/Dialectal" so it agrees with the
    # "Dialectal" label used by the other entries.
    return _SOURCE_INFO.get(source, ('Unknown', 'Unknown'))

In [8]:
# Print the per-source statistics table for the training and testing datasets.
get_dataset_table()

+-------------------------------+---------------------+------------------+-----------+---------------+-------------+-------------+-------------+
| Dataset Name                  | Domain              | Language Type    | Samples   | Total Words   | Avg Words   | Max Words   | Min Words   |
| === TRAINING DATASET ===      |                     |                  |           |               |             |             |             |
+-------------------------------+---------------------+------------------+-----------+---------------+-------------+-------------+-------------+
| annotated_aoc                 | Social Media        | Dialectal        | 215,946   | 7,112,034     | 32.9        | 1,303       | 10          |
+-------------------------------+---------------------+------------------+-----------+---------------+-------------+-------------+-------------+
| arabic_wikipedia              | Wikipedia           | MSA              | 1,087,933 | 258,676,741   | 237.8       | 43,103      |