In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import os
from transformers import AutoTokenizer
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import torch

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Set the exact path to your data
base_dir = '/content/drive/MyDrive/preprocessed/data'  # Update if needed
print(f"Looking for data in: {base_dir}")

# Check if directory exists
if not os.path.exists(base_dir):
    print(f"Error: Directory {base_dir} does not exist")
    exit()

# List contents of the directory
print("\nContents of the specified directory:")
for item in os.listdir(base_dir):
    print(f"  - {item}")

# Identify language folders (direct language folders, not prefixed)
language_folders = [d for d in os.listdir(base_dir)
                    if os.path.isdir(os.path.join(base_dir, d))
                    and d not in ['.DS_Store', 'README.txt']]  # Exclude non-language items

if not language_folders:
    print("No language folders found.")
    exit()

# Use the folder names as language codes
languages = language_folders
print(f"\nFound {len(languages)} language folders: {', '.join(languages)}")

# Function to load data
def load_data(language):
    lang_dir = os.path.join(base_dir, language)
    train_path = os.path.join(lang_dir, 'train.tsv')
    dev_path = os.path.join(lang_dir, 'dev.tsv')
    test_path = os.path.join(lang_dir, 'test.tsv')

    # Initialize empty dataframes
    train_df = pd.DataFrame()
    dev_df = pd.DataFrame()
    test_df = pd.DataFrame()

    # Check for files
    print(f"  Checking files for {language}:")
    for path, name in [(train_path, 'train'), (dev_path, 'dev'), (test_path, 'test')]:
        if os.path.exists(path):
            print(f"    - {name}.tsv exists")
            # Check file size
            file_size = os.path.getsize(path) / 1024  # Size in KB
            print(f"      Size: {file_size:.2f} KB")

            # Preview file content
            try:
                with open(path, 'r', encoding='utf-8') as f:
                    first_line = f.readline().strip()
                    print(f"      First line: '{first_line}'")

                    # Check for tab separator
                    if '\t' in first_line:
                        print("      Found tab separator in file")
                    else:
                        print("      WARNING: No tab separator found in first line")
            except Exception as e:
                print(f"      Error reading file: {e}")
        else:
            print(f"    - {name}.tsv does not exist")

    # Try to load the data
    try:
        if os.path.exists(train_path):
            train_df = pd.read_csv(train_path, sep='\t', names=['tweet', 'label'], encoding='utf-8')
            train_df['language'] = language
            print(f"    Loaded {len(train_df)} training samples")

        if os.path.exists(dev_path):
            dev_df = pd.read_csv(dev_path, sep='\t', names=['tweet', 'label'], encoding='utf-8')
            dev_df['language'] = language
            print(f"    Loaded {len(dev_df)} development samples")

        if os.path.exists(test_path):
            test_df = pd.read_csv(test_path, sep='\t', names=['tweet', 'label'], encoding='utf-8')
            test_df['language'] = language
            print(f"    Loaded {len(test_df)} test samples")

    except Exception as e:
        print(f"    Error loading data: {e}")

    # Show preview of training data if loaded
    if not train_df.empty:
        print("\n    Preview of loaded training data:")
        print(train_df.head(2))
        print(f"    Columns: {train_df.columns.tolist()}")

    return train_df, dev_df, test_df

# Initialize empty dataframes for all data
all_train = pd.DataFrame()
all_dev = pd.DataFrame()
all_test = pd.DataFrame()

# Load data for each language
for lang in languages:
    print(f"\nLoading {lang} data...")
    train, dev, test = load_data(lang)
    all_train = pd.concat([all_train, train], ignore_index=True)
    all_dev = pd.concat([all_dev, dev], ignore_index=True)
    all_test = pd.concat([all_test, test], ignore_index=True)

# Check the total loaded data
print("\nTotal loaded data:")
print(f"  Training: {len(all_train)} samples")
print(f"  Development: {len(all_dev)} samples")
print(f"  Testing: {len(all_test)} samples")

# Continue with preprocessing only if we have data
if all_train.empty:
    print("\nNo data loaded. Cannot proceed with preprocessing.")
    exit()

# Show some basic statistics
print("\nClass distribution in training data:")
print(all_train['label'].value_counts())

print("\nLanguage distribution in training data:")
print(all_train['language'].value_counts())

# Basic text cleaning function
def clean_text(text):
    if pd.isna(text):
        return ""
    # Convert to string if it's not already
    text = str(text)
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove @mentions
    text = re.sub(r'@\w+', '@user', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply cleaning to all datasets
print("\nCleaning text data...")
all_train['cleaned_tweet'] = all_train['tweet'].apply(clean_text)
all_dev['cleaned_tweet'] = all_dev['tweet'].apply(clean_text)
all_test['cleaned_tweet'] = all_test['tweet'].apply(clean_text)

# Show some statistics
print("\nCleaned data preview:")
print(all_train[['tweet', 'cleaned_tweet', 'label']].head(2))

# Load tokenizers for the different subword methods
print("\nLoading tokenizers...")
# 1. BPE (Byte-Pair Encoding)
xlm_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# 2. WordPiece
mbert_tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# 3. SentencePiece
sentencepiece_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")

# Function to tokenize using different methods
def tokenize_text(text, tokenizer):
    tokens = tokenizer.tokenize(text)
    return tokens

# Sample data for tokenization comparison
print("\nTokenizing sample data with different methods...")
sample_size = min(1000, len(all_train))
sample_data = all_train.sample(sample_size)

# Apply tokenization methods
sample_data['bpe_tokens'] = sample_data['cleaned_tweet'].apply(lambda x: tokenize_text(x, xlm_tokenizer))
sample_data['wordpiece_tokens'] = sample_data['cleaned_tweet'].apply(lambda x: tokenize_text(x, mbert_tokenizer))
sample_data['sentencepiece_tokens'] = sample_data['cleaned_tweet'].apply(lambda x: tokenize_text(x, sentencepiece_tokenizer))

# Calculate token counts
sample_data['bpe_token_count'] = sample_data['bpe_tokens'].apply(len)
sample_data['wordpiece_token_count'] = sample_data['wordpiece_tokens'].apply(len)
sample_data['sentencepiece_token_count'] = sample_data['sentencepiece_tokens'].apply(len)

# Print token count statistics
print("\nToken count statistics:")
for method in ['bpe', 'wordpiece', 'sentencepiece']:
    counts = sample_data[f'{method}_token_count']
    print(f"{method.capitalize()} tokens: min={counts.min()}, max={counts.max()}, avg={counts.mean():.2f}")

# Visualize token count distributions
plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
sns.histplot(sample_data['bpe_token_count'], kde=True)
plt.title('BPE Token Count Distribution')

plt.subplot(1, 3, 2)
sns.histplot(sample_data['wordpiece_token_count'], kde=True)
plt.title('WordPiece Token Count Distribution')

plt.subplot(1, 3, 3)
sns.histplot(sample_data['sentencepiece_token_count'], kde=True)
plt.title('SentencePiece Token Count Distribution')

plt.tight_layout()
plt.savefig('token_distributions.png')
plt.close()

# N-gram analysis (as mentioned in your methodology)
def extract_ngrams(text, n=2):
    """Extract character n-grams from text"""
    text = str(text).lower()
    return [text[i:i+n] for i in range(len(text)-n+1)]

# Add n-gram extraction to a sample of the data
print("\nPerforming n-gram analysis...")
ngram_sample = all_train.sample(1000)
ngram_sample['char_2grams'] = ngram_sample['cleaned_tweet'].apply(lambda x: extract_ngrams(x, 2))
ngram_sample['char_3grams'] = ngram_sample['cleaned_tweet'].apply(lambda x: extract_ngrams(x, 3))

# Count the most common n-grams
all_2grams = [gram for sublist in ngram_sample['char_2grams'] for gram in sublist]
all_3grams = [gram for sublist in ngram_sample['char_3grams'] for gram in sublist]

# Get the 20 most common n-grams
common_2grams = Counter(all_2grams).most_common(20)
common_3grams = Counter(all_3grams).most_common(20)

# Plot the most common n-grams
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.barplot(x=[x[0] for x in common_2grams], y=[x[1] for x in common_2grams])
plt.title('Most Common Character 2-grams')
plt.xticks(rotation=45)

plt.subplot(1, 2, 2)
sns.barplot(x=[x[0] for x in common_3grams], y=[x[1] for x in common_3grams])
plt.title('Most Common Character 3-grams')
plt.xticks(rotation=45)

plt.tight_layout()
plt.savefig('common_ngrams.png')
plt.close()

# Prepare data for transformer models
def prepare_data_for_transformers(df, tokenizer, max_length=128):
    """
    Convert dataframe to tokenized inputs for transformer models
    """
    # Map labels to integers
    unique_labels = df['label'].unique()
    label_map = {label: i for i, label in enumerate(sorted(unique_labels))}
    print(f"Label mapping: {label_map}")

    # Tokenize all texts
    encodings = tokenizer(
        df['cleaned_tweet'].tolist(),
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )

    # Convert labels to integers
    labels = torch.tensor([label_map[label] for label in df['label']])

    return {
        'input_ids': encodings.input_ids,
        'attention_mask': encodings.attention_mask,
        'labels': labels,
        'label_map': label_map  # Include the mapping for reference
    }

# Create datasets for each tokenization method
print("\nPreparing datasets for transformer models...")

print("Processing with BPE tokenization (XLM-RoBERTa)...")
bpe_train = prepare_data_for_transformers(all_train, xlm_tokenizer)
bpe_dev = prepare_data_for_transformers(all_dev, xlm_tokenizer)
bpe_test = prepare_data_for_transformers(all_test, xlm_tokenizer)

print("Processing with WordPiece tokenization (mBERT)...")
wordpiece_train = prepare_data_for_transformers(all_train, mbert_tokenizer)
wordpiece_dev = prepare_data_for_transformers(all_dev, mbert_tokenizer)
wordpiece_test = prepare_data_for_transformers(all_test, mbert_tokenizer)

print("Processing with SentencePiece tokenization (XLM-RoBERTa-Large)...")
sentencepiece_train = prepare_data_for_transformers(all_train, sentencepiece_tokenizer)
sentencepiece_dev = prepare_data_for_transformers(all_dev, sentencepiece_tokenizer)
sentencepiece_test = prepare_data_for_transformers(all_test, sentencepiece_tokenizer)

# Save the preprocessed datasets
print("\nSaving preprocessed datasets...")

# Create directory for output if it doesn't exist
output_dir = "/content/drive/MyDrive/preprocessed_output"
os.makedirs(output_dir, exist_ok=True)

# Save BPE datasets
torch.save(bpe_train, os.path.join(output_dir, 'bpe_train_dataset.pt'))
torch.save(bpe_dev, os.path.join(output_dir, 'bpe_dev_dataset.pt'))
torch.save(bpe_test, os.path.join(output_dir, 'bpe_test_dataset.pt'))

# Save WordPiece datasets
torch.save(wordpiece_train, os.path.join(output_dir, 'wordpiece_train_dataset.pt'))
torch.save(wordpiece_dev, os.path.join(output_dir, 'wordpiece_dev_dataset.pt'))
torch.save(wordpiece_test, os.path.join(output_dir, 'wordpiece_test_dataset.pt'))

# Save SentencePiece datasets
torch.save(sentencepiece_train, os.path.join(output_dir, 'sentencepiece_train_dataset.pt'))
torch.save(sentencepiece_dev, os.path.join(output_dir, 'sentencepiece_dev_dataset.pt'))
torch.save(sentencepiece_test, os.path.join(output_dir, 'sentencepiece_test_dataset.pt'))

# Save original processed DataFrames for reference
all_train.to_csv(os.path.join(output_dir, 'all_train.csv'), index=False)
all_dev.to_csv(os.path.join(output_dir, 'all_dev.csv'), index=False)
all_test.to_csv(os.path.join(output_dir, 'all_test.csv'), index=False)

print(f"\nPreprocessing complete! All datasets saved to {output_dir}")
print("\nYou now have preprocessed datasets using three different subword tokenization methods:")
print("1. Byte-Pair Encoding (BPE) - XLM-RoBERTa Base")
print("2. WordPiece - mBERT")
print("3. SentencePiece - XLM-RoBERTa Large")
print("\nYour teammates can load these datasets directly into transformer models using:")
print("train_dataset = torch.load('/content/drive/MyDrive/preprocessed_output/bpe_train_dataset.pt')")

Mounted at /content/drive
Looking for data in: /content/drive/MyDrive/preprocessed/data

Contents of the specified directory:
  - README.txt
  - .DS_Store
  - amh
  - ary
  - ibo
  - pcm
  - por
  - kin
  - hau
  - swa
  - orm
  - arq
  - twi
  - tso
  - yor
  - tir

Found 14 language folders: amh, ary, ibo, pcm, por, kin, hau, swa, orm, arq, twi, tso, yor, tir

Loading amh data...
  Checking files for amh:
    - train.tsv exists
      Size: 1107.43 KB
      First line: 'tweet	label'
      Found tab separator in file
    - dev.tsv exists
      Size: 275.24 KB
      First line: 'tweet	label'
      Found tab separator in file
    - test.tsv exists
      Size: 396.64 KB
      First line: 'tweet	label'
      Found tab separator in file
    Loaded 5985 training samples
    Loaded 1498 development samples
    Loaded 2000 test samples

    Preview of loaded training data:
                                               tweet     label language
0                                              twe

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]


Tokenizing sample data with different methods...

Token count statistics:
Bpe tokens: min=4, max=217, avg=31.36
Wordpiece tokens: min=2, max=292, avg=30.53
Sentencepiece tokens: min=4, max=217, avg=31.36

Performing n-gram analysis...

Preparing datasets for transformer models...
Processing with BPE tokenization (XLM-RoBERTa)...
Label mapping: {'label': 0, 'negative': 1, 'neutral': 2, 'positive': 3}
Label mapping: {'label': 0, 'negative': 1, 'neutral': 2, 'positive': 3}
Label mapping: {'label': 0, 'negative': 1, 'neutral': 2, 'positive': 3}
Processing with WordPiece tokenization (mBERT)...
Label mapping: {'label': 0, 'negative': 1, 'neutral': 2, 'positive': 3}
Label mapping: {'label': 0, 'negative': 1, 'neutral': 2, 'positive': 3}
Label mapping: {'label': 0, 'negative': 1, 'neutral': 2, 'positive': 3}
Processing with SentencePiece tokenization (XLM-RoBERTa-Large)...
Label mapping: {'label': 0, 'negative': 1, 'neutral': 2, 'positive': 3}
Label mapping: {'label': 0, 'negative': 1, 'neut