In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import os

## Set up plotting

In [None]:
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*';
# earlier releases only know the original 'seaborn' name. Pick whichever one
# this installation provides so the setup cell does not raise OSError.
_preferred_style = 'seaborn-v0_8'
plt.style.use(_preferred_style if _preferred_style in plt.style.available else 'seaborn')
sns.set_palette("husl")

print("=== AFRISENTI DATASET EXPLORATION ===")

## Load All Datasets


In [None]:
def load_afrisenti_data(data_dir='../data/raw', languages=('twi', 'hausa'),
                        splits=('train', 'dev', 'test')):
    """Load AfriSenti TSV files into a nested ``{language: {split: DataFrame}}`` dict.

    Each file is looked up at ``{data_dir}/{lang}_{split}.tsv`` and parsed as
    tab-separated text. One status line is printed per file.

    Args:
        data_dir: Directory containing the ``.tsv`` files. Defaults to the
            notebook-relative ``'../data/raw'``.
        languages: Iterable of language names used as file-name prefixes.
        splits: Iterable of split names used as file-name suffixes.

    Returns:
        dict: ``data[lang][split]`` is the DataFrame read from that file.
        Every requested language gets an entry; a split whose file does not
        exist is reported on stdout and left out of that language's dict.
    """
    data = {}

    for lang in languages:
        data[lang] = {}
        for split in splits:
            file_path = f'{data_dir}/{lang}_{split}.tsv'
            # Only the read itself is guarded, so nothing else can be
            # mistaken for a missing file.
            try:
                df = pd.read_csv(file_path, sep='\t')
            except FileNotFoundError:
                print(f"File not found: {file_path}")
                continue
            data[lang][split] = df
            print(f"Loaded {lang} {split}: {df.shape[0]} samples")

    return data

In [None]:
# Load every available language/split into a nested dict: data[lang][split] -> DataFrame.
# Files that are missing are reported by the loader and left out of the dict.
data = load_afrisenti_data()

## Explore Data Structure


In [None]:
print("\n=== DATA STRUCTURE EXPLORATION ===")
for lang in ['twi', 'hausa']:
    # Only languages whose training split was actually loaded are shown.
    if lang not in data or 'train' not in data[lang]:
        continue

    df = data[lang]['train']
    column_names = list(df.columns)
    print(f"\n{lang.upper()} columns: {column_names}")
    print("Sample data:")
    print(df.head(3))

In [None]:
print("\n=== LABEL DISTRIBUTION ANALYSIS ===")
def analyze_labels(data):
    """Print and collect label distributions per language and split.

    Args:
        data: Nested dict ``data[lang][split] -> DataFrame``. Only the
            languages 'twi' and 'hausa' and the splits 'train', 'dev' and
            'test' are inspected; anything absent is skipped.

    Returns:
        dict: ``results[lang][split]`` is the ``value_counts()`` Series of the
        label column for that split. A split is omitted when none of the
        candidate label columns ('label', 'sentiment', 'target') is present.
    """
    results = {}

    for lang in ['twi', 'hausa']:
        if lang not in data:
            continue

        results[lang] = {}
        total_samples = 0

        print(f"\n--- {lang.upper()} ---")

        for split in ['train', 'dev', 'test']:
            if split not in data[lang]:
                continue
            df = data[lang][split]

            # Use the first candidate label column that exists in this frame.
            label_col = next(
                (col for col in ['label', 'sentiment', 'target'] if col in df.columns),
                None,
            )
            if not label_col:
                continue

            label_dist = df[label_col].value_counts()
            results[lang][split] = label_dist
            total_samples += len(df)

            print(f"{split}: {len(df)} samples")
            for label, count in label_dist.items():
                print(f"  {label}: {count} ({count/len(df)*100:.1f}%)")

        print(f"Total {lang} samples: {total_samples}")

    # Callers store this value and index into it; without the return they
    # would receive None.
    return results

In [None]:
# Run the label analysis (it prints the per-split breakdown) and keep its
# return value; the plotting cell below reads label_results.
label_results = analyze_labels(data)

## Visualize label distributions

In [None]:
# 2 x 3 grid: languages fill the rows, splits fill the columns.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('Label Distribution Across Languages and Splits', fontsize=16)

# Absent languages/splits are skipped without leaving a gap, so the panels
# that are drawn are packed towards the top-left of the grid.
langs_present = [lang for lang in ['twi', 'hausa'] if lang in label_results]
for row_idx, lang in enumerate(langs_present):
    splits_present = [split for split in ['train', 'dev', 'test']
                      if split in label_results[lang]]
    for col_idx, split in enumerate(splits_present):
        label_dist = label_results[lang][split]
        ax = axes[row_idx, col_idx]

        bars = ax.bar(label_dist.index, label_dist.values)
        ax.set_title(f'{lang.capitalize()} {split.capitalize()}')
        ax.set_ylabel('Count')

        # Write each bar's count centred just above its top edge.
        for bar in bars:
            height = bar.get_height()
            x_center = bar.get_x() + bar.get_width()/2.
            ax.text(x_center, height, f'{int(height)}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
# Sample text inspection: print one example text for up to three label values
# found in each language's training split.
print("\n=== SAMPLE TEXTS ===")
for lang in ['twi', 'hausa']:
    if lang not in data or 'train' not in data[lang]:
        continue
    df = data[lang]['train']

    # Use the first candidate column name that exists in this frame.
    text_col = next((col for col in ['text', 'tweet', 'content'] if col in df.columns), None)
    label_col = next((col for col in ['label', 'sentiment', 'target'] if col in df.columns), None)
    if not (text_col and label_col):
        continue

    print(f"\n{lang.upper()} samples:")
    # dropna(): unique() keeps NaN, and comparing against NaN matches no rows,
    # so a missing label would make .iloc[0] raise IndexError.
    for sentiment in df[label_col].dropna().unique()[:3]:
        sample = df[df[label_col] == sentiment].iloc[0]
        # str(): a missing text value is NaN (a float) and cannot be sliced.
        print(f"  {sentiment}: {str(sample[text_col])[:100]}...")