<a href="https://colab.research.google.com/github/KamilaSulaimanova/Text-Representations-for-Language-Identification/blob/main/SCC413_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets



In [None]:
!pip install hf_xet



In [None]:
# Standard libraries
import os
import time
import warnings
import numpy as np
import pandas as pd
import re
import gc
import random
from tqdm.auto import tqdm
from itertools import islice
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# ML Imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score
)
import torch
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForSeq2SeqLM,
    pipeline,
    logging as hf_logging
)
from datasets import load_dataset

# --- Environment setup ---
# Memory management: request "expandable segments" from the PyTorch CUDA allocator.
# NOTE(review): torch is already imported above; confirm the allocator still
# reads this variable when it is set after import.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
# Silence all Python warnings for cleaner notebook output
# (this also hides deprecation and convergence warnings).
warnings.filterwarnings("ignore")
# Use seaborn's "whitegrid" theme for every plot produced below
sns.set_theme(style="whitegrid")

# --- Hugging Face setup ---
# Enable faster downloads.
# NOTE(review): this is set after `transformers`/`datasets` were imported and
# only `hf_xet` is pip-installed above; verify the flag takes effect and that
# the `hf_transfer` package it refers to is actually available.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
# Access token read from the environment; None when HF_TOKEN is not set
hf_token = os.environ.get("HF_TOKEN")
# Reduce transformers verbosity to errors only
hf_logging.set_verbosity_error()

# --- Dataset configs ---
# Language codes used both as dataset configuration names and as class labels
LANGUAGES = ["ky", "kk", "ru"]
# Hugging Face dataset identifier
DATASET_NAME = "oscar-corpus/OSCAR-2201"
# Target number of cleaned samples per language
N_SAMPLES = {"ky": 15000, "kk": 15000, "ru": 30000}
# Fraction of the data held out for testing
TEST_SIZE = 0.2
# Seed shared by the train/test splits and the classifier
RANDOM_STATE = 42

# --- Subword tokenizer configs ---
# Common starting vocabulary size
VOCAB_SIZE = 30000
# Minimum number of occurrences a token needs to be kept
MIN_FREQUENCY = 5
# Folder and file-name prefix used to store the tokenizer files
SUBWORD_MODEL_DIR = "subword_tokenizer"
SUBWORD_MODEL_PREFIX = "ky_kk_ru_bpe"

# --- Pre-trained embeddings configs ---
# Checkpoint name passed to AutoTokenizer / AutoModel
EMBEDDING_MODEL_NAME = "xlm-roberta-base"
# Number of sentences embedded per forward pass
EMBEDDING_BATCH_SIZE = 64

# --- NLLB-based code-switching generation configs ---
# Translation checkpoint used to produce the swapped-in words
NLLB_MODEL = "facebook/nllb-200-distilled-600M"
# Mapping from this notebook's language labels to NLLB-specific language codes
NLLB_LANG_LABELS = {"kk": "kaz_Cyrl", "ky": "kir_Cyrl", "ru": "rus_Cyrl"}
# Languages whose test sentences receive swapped-in words
BASE_LANGS = ["ky", "kk"]
# Language the swapped words are translated into
MIX_LANG = "ru"
# Per-word swap probabilities, i.e. the mixing intensities to evaluate
WORD_SWAP_PROBS = [0.05, 0.15, 0.30]
# Number of test sentences to use for code switching (per language)
CS_N_TEST_SENTENCES = 500
# Batch size for the translation step
BATCH_SIZE_MT = 32

# --- Training size configs ---
# Fractions of the training set used in the scaling analysis
TRAINING_SIZE_FRACS = [0.1, 0.5, 1.0]

# Maximum number of misclassified code-switching examples to print
N_ERROR_EXAMPLES = 10

# --- Preprocessing ---
def clean(text):
    """Return *text* stripped of URLs, e-mail addresses and disallowed symbols.

    The surviving characters are Cyrillic letters (including the Kyrgyz/Kazakh
    specific ones), Latin letters, digits, whitespace and the punctuation
    ``. , ? ! -``. Leading/trailing whitespace is removed and every inner run
    of whitespace is collapsed into a single space.
    """
    # Drop links first, then anything that looks like an e-mail address
    without_links = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    without_emails = re.sub(r'\S+@\S+', '', without_links)

    # Negated character class: everything NOT listed here is deleted
    disallowed = r'[^а-яА-ЯёЁәіңғүұқөһӘІҢҒҮҰҚӨҺa-zA-Z0-9\s.,?!-]'
    filtered = re.sub(disallowed, '', without_emails)

    # Trim the edges, then squeeze internal whitespace runs to one space
    return re.sub(r'\s+', ' ', filtered.strip())

def load_data(languages, n_samples, test_size, random_state):
    """Load, clean, label-encode and split the per-language corpora.

    For every language the dataset is streamed from Hugging Face, each document
    is passed through ``clean`` and kept only if at least five
    whitespace-separated words remain. At most ``n_samples[lang] * 1.5`` raw
    documents are inspected per language, so fewer than the target number of
    samples may be collected. A language whose loading fails is reported and
    skipped.

    Args:
        languages: Iterable of language codes (dataset configuration names).
        n_samples: Mapping from language code to target number of samples.
        test_size: Fraction of the data reserved for the test split.
        random_state: Seed for the stratified train/test split.

    Returns:
        ``(X_train, X_test, y_train, y_test, le)`` where the ``y`` values are
        integer-encoded labels and ``le`` is the fitted ``LabelEncoder``.

    Raises:
        ValueError: If no sample could be loaded for any language.
    """
    all_texts = []
    all_labels = []
    print("Loading and preparing data...")

    for lang in languages:
        print(f"Loading '{lang}' dataset...")
        try:
            # Streaming mode avoids downloading the whole corpus
            dataset = load_dataset(DATASET_NAME,
                                   lang,
                                   split='train', streaming=True,
                                   token=hf_token, trust_remote_code=True)
            target_samples = n_samples[lang]
            # Inspect up to 50% more raw documents than needed, since some
            # are rejected by the cleaning / minimum-length filter
            fetch_target = int(target_samples * 1.5)
            lang_samples = []
            # The bar tracks ACCEPTED samples, so it is advanced manually
            # instead of wrapping the iterable (which would count every
            # inspected document as well).
            with tqdm(total=target_samples,
                      desc=f"Fetching & cleaning {lang} samples",
                      unit=" examples") as progress:
                for example in islice(dataset, fetch_target):
                    # Stop once the desired number of samples is reached
                    if len(lang_samples) >= target_samples:
                        break
                    text = example.get('text')
                    if not (text and isinstance(text, str)):
                        continue
                    cleaned = clean(text)
                    # Keep only texts with a minimum number of words
                    if cleaned and len(cleaned.split()) >= 5:
                        lang_samples.append(cleaned)
                        progress.update(1)
            # Report languages that fell short of their target
            if len(lang_samples) < target_samples:
                print(f"Loaded {len(lang_samples)} valid sentences for '{lang}' (target {target_samples}).")
            # Samples are added only after the language loaded without error
            texts = lang_samples[:target_samples]
            all_texts.extend(texts)
            all_labels.extend([lang] * len(texts))
            print(f"Loaded {len(texts)} samples for '{lang}'.")
        except Exception as e:
            # Best effort: report the failure and carry on with the next language
            print(f"Error loading data for language '{lang}': {e}.")
            continue

    print(f"Total samples loaded: {len(all_texts)}")
    print("Label distribution:", Counter(all_labels))

    if not all_texts:
        # Fail early with a clear message instead of an opaque split error
        raise ValueError("No samples were loaded for any language; cannot build train/test splits.")

    # Convert string labels into integers (classes are sorted alphabetically)
    le = LabelEncoder()
    le.fit(sorted(set(all_labels)))
    all_labels_encoded = le.transform(all_labels)

    # Stratified split keeps the language proportions in both subsets
    X_train, X_test, y_train, y_test = train_test_split(
        all_texts, all_labels_encoded,
        test_size=test_size, random_state=random_state, stratify=all_labels_encoded
    )
    print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")
    return X_train, X_test, y_train, y_test, le


def train_tokenizer(texts, vocab_size, min_frequency, output_dir, prefix):
    """Train a ByteLevelBPETokenizer on *texts* and save its model files.

    The texts are written one per line to a temporary file inside
    ``output_dir``, which is removed again afterwards even if training or
    saving raises.

    Args:
        texts: Iterable of training strings.
        vocab_size: Target vocabulary size passed to the trainer.
        min_frequency: Minimum frequency passed to the trainer.
        output_dir: Directory for the tokenizer files (created if missing).
        prefix: File-name prefix passed to ``save_model``.

    Returns:
        ``(vocab, merges)``: the paths ``<output_dir>/<prefix>-vocab.json`` and
        ``<output_dir>/<prefix>-merges.txt`` where the saved model is expected.
    """
    # Create the output directory if needed (no error when it already exists)
    os.makedirs(output_dir, exist_ok=True)
    temp_train_data = os.path.join(output_dir, "temp_train_data.txt")
    try:
        # The trainer reads from files, so dump the corpus to disk first
        with open(temp_train_data, "w", encoding="utf-8") as f:
            for text in tqdm(texts, desc="Writing train data"):
                f.write(text + "\n")

        print("\nTraining Subword Tokenizer...")
        tokenizer = ByteLevelBPETokenizer()
        start_time = time.time()
        tokenizer.train(files=[temp_train_data], vocab_size=vocab_size, min_frequency=min_frequency,
                        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
        train_time = time.time() - start_time
        print(f"Subword Tokenizer training complete ({train_time:.2f} seconds).")
        tokenizer.save_model(output_dir, prefix)
    finally:
        # Always remove the temporary corpus, including on failure
        if os.path.exists(temp_train_data):
            os.remove(temp_train_data)
    # Paths to the saved files
    vocab = os.path.join(output_dir, f"{prefix}-vocab.json")
    merges = os.path.join(output_dir, f"{prefix}-merges.txt")
    return vocab, merges

def load_tokenizer(vocab, merges):
    """Build a ByteLevelBPETokenizer from previously saved vocab/merges files.

    Raises:
        FileNotFoundError: If either of the two files does not exist.
    """
    # Guard clause: both model files must be present on disk
    if not os.path.exists(vocab) or not os.path.exists(merges):
        raise FileNotFoundError(f"Tokenizer files not found: {vocab}, {merges}")

    bpe = ByteLevelBPETokenizer(vocab, merges)

    # Attach BertProcessing with "</s>" as first pair and "<s>" as second pair
    end_pair = ("</s>", bpe.token_to_id("</s>"))
    start_pair = ("<s>", bpe.token_to_id("<s>"))
    bpe.post_processor = BertProcessing(end_pair, start_pair)
    return bpe

def get_subword_features(texts, tokenizer, vectorizer=None, max_features=50000):
    """Compute TF-IDF features over BPE subword tokens.

    Each text is encoded with *tokenizer* and its tokens are joined with single
    spaces. When no *vectorizer* is given, a new ``TfidfVectorizer`` is fitted
    on these token strings; otherwise the given (already fitted) vectorizer is
    only used to transform them.

    Args:
        texts: Iterable of raw strings.
        tokenizer: Object whose ``encode(text).tokens`` yields subword tokens.
        vectorizer: Fitted vectorizer to reuse, or ``None`` to fit a new one.
        max_features: Vocabulary cap for a newly fitted vectorizer.

    Returns:
        ``(features, vectorizer)``: the TF-IDF matrix and the vectorizer used.
    """
    print("Generating subword features using the tokenizer...")
    # Encode each text into subword tokens, space-separated
    tokenized_texts = [" ".join(tokenizer.encode(text).tokens) for text in tqdm(texts, desc="Tokenizing (Subword)")]
    # If no vectorizer is specified, fit a new one
    if vectorizer is None:
        print("Fitting TF-IDF Vectorizer for subwords...")
        # Split on whitespace only and keep case: the default analyzer would
        # lowercase the text and re-tokenize it with a word regex, cutting
        # byte-level BPE tokens at their symbol characters and dropping
        # single-character pieces, so the features would no longer be the
        # subword units produced above.
        vectorizer = TfidfVectorizer(
            tokenizer=str.split,
            token_pattern=None,
            lowercase=False,
            ngram_range=(1, 1),
            max_features=max_features,
        )
        features = vectorizer.fit_transform(tokenized_texts)
        print(f"Subword TF-IDF fitted: {features.shape}")
    else:
        # Already fitted: only transform the new texts
        print("Transforming with existing subword TF-IDF Vectorizer...")
        features = vectorizer.transform(tokenized_texts)
    return features, vectorizer

def get_embedding_features(texts, model_name, batch_size=32):
    """Generate sentence embeddings with a pre-trained transformer model.

    Each sentence vector is the mean of the last hidden states over the
    sentence's real tokens; padding positions are excluded through the
    attention mask so a sentence's embedding does not depend on which other
    sentences share its batch.

    Args:
        texts: Sequence of strings to embed.
        model_name: Checkpoint name for ``AutoTokenizer`` / ``AutoModel``.
        batch_size: Number of texts per forward pass.

    Returns:
        A 2-D float NumPy array with one row per input text. For empty input
        the array has zero rows.
    """
    print(f"\nLoading pre-trained embedding model: {model_name}")
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    # Pick the device, move the model there and switch to evaluation mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device).eval()
    # Materialize so that slicing below always yields a plain list
    texts = list(texts)
    # Remembered for the empty-input case, after the model has been released
    hidden_size = getattr(model.config, "hidden_size", 0)
    embeddings_list = []
    # Process texts in batches without tracking gradients
    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size), desc="Embedding Batches"):
            batch_texts = texts[i:i+batch_size]
            inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
            hidden = model(**inputs).last_hidden_state
            # Masked mean pooling: zero out padding, divide by real token count
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            summed = (hidden * mask).sum(dim=1)
            counts = mask.sum(dim=1).clamp(min=1)
            embeddings_list.append((summed / counts).cpu().float().numpy())
            # Drop per-batch tensors before the next allocation
            del inputs, hidden, mask, summed, counts
    # Release the model and free cached GPU memory
    del model, tokenizer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    if not embeddings_list:
        # np.vstack rejects an empty list; return a well-formed empty matrix
        return np.empty((0, hidden_size), dtype=np.float32)
    # Combine embeddings from all batches
    return np.vstack(embeddings_list)

def generate_mix_text(base_lang_text, lang_base_label, lang_mix_label, n_sentences, swap_prob, mt_model, lang_map, batch_size=16):
    """Generate synthetic code-switched sentences with a translation model.

    A random subset of at most *n_sentences* base sentences is selected. In
    each one, every whitespace-separated token containing a word character is
    chosen with probability *swap_prob*, translated on its own into the mix
    language and substituted in place. A sentence whose translations are
    missing (e.g. the translation call failed for its batch) is kept unchanged.

    Args:
        base_lang_text: List of sentences in the base language.
        lang_base_label: Label of the base language (key of *lang_map*).
        lang_mix_label: Label of the language to mix in (key of *lang_map*).
        n_sentences: Maximum number of base sentences to process.
        swap_prob: Per-token probability of being translated.
        mt_model: Checkpoint name of the translation model.
        lang_map: Mapping from language labels to the model's language codes.
        batch_size: Sentences per batch, also passed to the pipeline call.

    Returns:
        ``(mixed_text, original_lang)``: the generated sentences and, for each
        of them, *lang_base_label*. Both lists are empty if the model or
        pipeline cannot be loaded.
    """
    print(f"\n--- Generating mixed data using NLLB ({swap_prob=:.2f}): {lang_base_label} -> {lang_mix_label} ---")
    mixed_text = []
    original_lang = []
    # Language codes understood by the translation model
    src_lang = lang_map[lang_base_label]
    tgt_lang = lang_map[lang_mix_label]

    translator, model, tokenizer = None, None, None
    try:
        print(f"Loading MT model: {mt_model}")
        device_idx = 0 if torch.cuda.is_available() else -1
        tokenizer = AutoTokenizer.from_pretrained(mt_model, src_lang=src_lang)
        model = AutoModelForSeq2SeqLM.from_pretrained(mt_model).eval()
        translator = pipeline("translation", model=model, tokenizer=tokenizer, src_lang=src_lang, tgt_lang=tgt_lang, device=device_idx)
        print("Model and pipeline loaded.")
    except Exception as e:
        print(f"Error loading model: {e}")
        # Clean up whatever was loaded
        del model, tokenizer, translator
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        # Bail out on every device type, not only when CUDA is available
        return [], []

    generated_count = 0

    num_base_sentences = min(len(base_lang_text), n_sentences)
    print(f"Processing {num_base_sentences} base sentences ({lang_base_label})...")
    # Randomly select the subset of sentences to mix
    sentence_idx = random.sample(range(len(base_lang_text)), num_base_sentences)
    iterator = tqdm(range(0, num_base_sentences, batch_size), desc=f"Generating {lang_base_label}/{lang_mix_label} mixed set (P={swap_prob:.2f})", unit="batch")

    # Process sentences in batches
    for i in iterator:
        batch_sentences = [base_lang_text[idx] for idx in sentence_idx[i:i + batch_size]]

        # --- Select the words to translate ---
        # words_to_translate: flat list over the whole batch
        # sentence_struct:    per sentence, its tokens and {token idx -> position
        #                     of that token among the sentence's selected words}
        # idx_for_flat_list:  per non-empty sentence, (sentence idx, offset into
        #                     the flat list, number of selected words)
        words_to_translate, sentence_struct, idx_for_flat_list = [], [], []
        flat_word_idx_start = 0

        for sent_idx, sentence_base in enumerate(batch_sentences):
            tokens_base = sentence_base.split()
            if not tokens_base:
                sentence_struct.append({"tokens": [], "swap_idx": {}})
                continue

            words_to_swap, swap_idx_map = [], {}
            for idx, token in enumerate(tokens_base):
                # A random draw is made only for tokens that look like a word
                if re.search(r'\w', token) and random.random() < swap_prob:
                    cleaned_token = token.strip()
                    if cleaned_token:
                        swap_idx_map[idx] = len(words_to_swap)
                        words_to_swap.append(cleaned_token)

            words_to_translate.extend(words_to_swap)
            sentence_struct.append({"tokens": tokens_base, "swap_idx": swap_idx_map})
            idx_for_flat_list.append((sent_idx, flat_word_idx_start, len(words_to_swap)))
            flat_word_idx_start += len(words_to_swap)

        # --- Translate the selected words in one pipeline call ---
        translated_words_flat = []
        if words_to_translate:
            try:
                translated_results = translator(words_to_translate, batch_size=batch_size)
                translated_words_flat = [item['translation_text'].strip() for item in translated_results]
            except Exception as e:
                # Best effort: the affected sentences stay unmodified
                print(f"\nWarning: Translation failed for batch: {e}")

        # --- Substitute the translations back into their sentences ---
        for sent_idx, flat_start, n_words in idx_for_flat_list:
            if n_words == 0:
                continue

            # Translations belonging to this sentence
            current_translated_words = translated_words_flat[flat_start:flat_start + n_words]
            if len(current_translated_words) != n_words:
                print(f"\nWarning: Translation count mismatch for sentence {sent_idx} ({len(current_translated_words)} vs {n_words} expected). Using original.")
                continue

            original_tokens = sentence_struct[sent_idx]["tokens"]
            final_tokens = list(original_tokens)  # start from the original
            for orig_idx, word_pos in sentence_struct[sent_idx]["swap_idx"].items():
                translated_token = current_translated_words[word_pos]
                # Replace only if the translation is non-empty and differs
                # from the original token
                if translated_token and translated_token.lower() != original_tokens[orig_idx].strip().lower():
                    final_tokens[orig_idx] = translated_token

            sentence_struct[sent_idx]["tokens"] = final_tokens

        # --- Collect the final sentences of this batch ---
        for structure in sentence_struct:
            final_sentence = " ".join(structure["tokens"])
            if final_sentence:
                mixed_text.append(final_sentence)
                # Label comes from the argument, so any lang_map works
                original_lang.append(lang_base_label)
                generated_count += 1

    print(f"Finished generating {generated_count} mixed sentences for {lang_base_label} -> {lang_mix_label} (P={swap_prob:.2f}).")
    # Clean up memory
    del translator, model, tokenizer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return mixed_text, original_lang

# --- Training & Evaluation ---
def train_and_evaluate(X_train, y_train, X_test, y_test, label_encoder, method, fraction, plot_cm=False):
    """Fit a Logistic Regression classifier and score it on the test set.

    Prints timing information and a classification report. When *plot_cm* is
    true, a confusion matrix over all classes of *label_encoder* is saved as a
    PNG in the working directory; a plotting failure is reported, not raised.

    Returns:
        ``(model, metrics_dict)`` where ``metrics_dict`` holds ``accuracy``,
        ``f1_macro``, ``train_time``, ``eval_time`` and, for each of 'ky' and
        'kk' known to the encoder, ``f1_<lang>``.
    """
    print(f"Training & Evaluating: {method} (Fraction: {fraction:.2f})")

    # --- Training (timer includes model construction) ---
    fit_started = time.time()
    model = LogisticRegression(max_iter=1500, random_state=RANDOM_STATE, C=1.0, solver='liblinear', class_weight='balanced')
    model.fit(X_train, y_train)
    train_time = time.time() - fit_started
    print(f"Training complete ({train_time:.2f} seconds).")

    # --- Prediction on the test set ---
    predict_started = time.time()
    y_pred = model.predict(X_test)
    eval_time = time.time() - predict_started
    print(f"Evaluation complete ({eval_time:.2f} seconds).")

    # --- Standard metrics ---
    metrics_dict = {
        'accuracy': accuracy_score(y_test, y_pred),
        'f1_macro': f1_score(y_test, y_pred, average='macro'),
        'train_time': train_time,
        'eval_time': eval_time,
    }
    # F1 restricted to a single class, for 'ky' and 'kk' only
    known_classes = label_encoder.classes_
    for lang in ('ky', 'kk'):
        if lang not in known_classes:
            continue
        class_id = label_encoder.transform([lang])[0]
        metrics_dict[f'f1_{lang}'] = f1_score(y_test, y_pred, labels=[class_id], average='micro')

    print(f"Evaluation for Fraction: {fraction:.2f}, Method: {method}):")
    # Report only the labels that occur in the truth or in the predictions
    present = np.unique(np.concatenate((y_test, y_pred)))
    present_names = label_encoder.inverse_transform(present)
    print(classification_report(y_test, y_pred, labels=present, target_names=present_names, digits=4))

    # Without a plot request there is nothing more to do
    if not plot_cm:
        return model, metrics_dict

    print("Generating Confusion Matrix...")
    try:
        all_ids = label_encoder.transform(label_encoder.classes_)
        matrix = confusion_matrix(y_test, y_pred, labels=all_ids)
        display_cm = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=label_encoder.classes_)
        fig, ax = plt.subplots(figsize=(8, 6))
        display_cm.plot(cmap=plt.cm.Blues, ax=ax, xticks_rotation='vertical')
        plt.title(f"Confusion Matrix: {method} (Full Dataset)\nStandard Test")
        plt.tight_layout()
        # Spaces in the method name become underscores in the file name
        plt.savefig(f"confusion_matrix_{method.replace(' ', '_')}_standard_test.png")
        plt.close(fig)
    except Exception as e:
        print(f"Could not generate plot: {e}")
    # Hand back the trained model together with its metrics
    return model, metrics_dict

# --- Plotting Function for Scaling ---
def plot_scaling_results(results):
    """Save line plots of each available metric against the training fraction.

    One subplot per metric present in *results* (out of accuracy, f1_macro,
    f1_ky, f1_kk), with one line per method. The figure is written to
    ``performance_scaling.png``; any failure is printed rather than raised.
    """
    print("Plotting Performance Scaling")
    try:
        candidate_cols = ('accuracy', 'f1_macro', 'f1_ky', 'f1_kk')
        metrics = [name for name in candidate_cols if name in results.columns]
        n_rows = len(metrics)

        # squeeze=False always yields a 2-D array, flattened to one axis per row
        fig, grid = plt.subplots(n_rows, 1, figsize=(10, 5 * n_rows), sharex=True, squeeze=False)
        axes = grid.flatten()
        fig.suptitle("Performance vs. Training Data Fraction", y=1.02)

        # One subplot per metric, one line per method
        for ax, metric in zip(axes, metrics):
            sns.lineplot(data=results, x='fraction', y=metric, hue='method', marker='o', ax=ax)
            ax.set_title(f"{metric.replace('_', ' ').title()} Scaling")
            ax.legend(title='Method', bbox_to_anchor=(1.05, 1), loc='upper left')

        # Only the bottom subplot carries the shared x-axis label
        axes[-1].set_xlabel("Training Data Fraction")
        # Leave room on the right for the legends placed outside the axes
        plt.tight_layout(rect=[0, 0, 0.85, 0.98])
        plt.savefig("performance_scaling.png", bbox_inches='tight')
        plt.close(fig)

    except Exception as e:
        print(f"Error plotting scaling results: {e}")

# --- Error Analysis ---
def display_error_examples(X_mixed, y_true_labels, y_pred_labels, n_examples=10):
    """Print misclassified samples in random order.

    Printing stops once *n_examples* errors have been shown. Texts longer than
    500 characters are cut at 500 and suffixed with '...'.
    """
    print("-" * 50)
    print(f"Displaying up to {n_examples} misclassified examples:")

    # Visit the samples in random order so the errors shown are a random pick
    order = list(range(len(X_mixed)))
    random.shuffle(order)

    shown = 0
    for pos in order:
        true_label = y_true_labels[pos]
        pred_label = y_pred_labels[pos]
        # Correct predictions are of no interest here
        if true_label == pred_label:
            continue

        sample = X_mixed[pos]
        # Truncate long text examples
        if len(sample) > 500:
            display_text = sample[:500] + '...'
        else:
            display_text = sample

        print(f"Example {shown + 1}:")
        print(f"Original Base Lang: {true_label}")
        print(f"Predicted Lang: {pred_label}")
        print(f"Mixed Text:\n  '{display_text}'")
        print("-" * 20)

        shown += 1
        if shown >= n_examples:
            break

    if shown == 0:
        print("No examples found where base language was misclassified.")
    print("-" * 50)


In [None]:
# --- Main Execution ---
if __name__ == "__main__":

    # Load Data
    X_train, X_test, y_train, y_test, label_encoder = load_data(
        LANGUAGES, N_SAMPLES, TEST_SIZE, RANDOM_STATE
    )
    LANG_CLASSES = label_encoder.classes_

    # Train Subword Tokenizer
    bpe_tokenizer = None
    try:
      vocab, merges = train_tokenizer(
          X_train, VOCAB_SIZE, MIN_FREQUENCY, SUBWORD_MODEL_DIR, SUBWORD_MODEL_PREFIX
      )
      bpe_tokenizer = load_tokenizer(vocab, merges)
    except Exception as e:
        print(f"Warning: Failed to train or load subword tokenizer: {e}")

    all_results = []
    final_models = {}
    final_vectorizers = {}
    X_test_embed = None

    # Pre-generate Test Embeddings
    print("\nPre-generating embeddings for the test set...")
    X_test_embed = get_embedding_features(X_test, EMBEDDING_MODEL_NAME, batch_size=EMBEDDING_BATCH_SIZE)
    if X_test_embed is not None:
      print(f"Test embeddings generated: {X_test_embed.shape}")
    else:
      print("Warning: Test embedding pre-generation failed.")

    # Scaling Analysis
    for frac in TRAINING_SIZE_FRACS:
        print(f"Processing Training Fraction: {frac:.2f}")

        # Create Training Subset
        if frac < 1.0:
            X_train_sub, _, y_train_sub, _ = train_test_split(
                X_train, y_train,
                train_size=frac,
                random_state=RANDOM_STATE,
                stratify=y_train
            )
        else:
            X_train_sub, y_train_sub = X_train, y_train
        print(f"Using {len(X_train_sub)} training samples.")

        # Plot confusion matrix only for the full dataset
        plot_cm = (frac == 1.0)

        # Train and Evaluate Models on Subset

        # Method 1: Character N-grams
        method_name_char = "Character N-grams"
        try:
            print(f"\nStarting {method_name_char} for fraction {frac:.2f}...")
            vectorizer_char = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), max_features=50000)
            print("Fitting Char TF-IDF...")
            X_train_feat_char = vectorizer_char.fit_transform(X_train_sub)
            X_test_feat_char = vectorizer_char.transform(X_test)
            # Train model, evaluate, get metrics
            model_char, metrics_char = train_and_evaluate(
                X_train_feat_char, y_train_sub, X_test_feat_char, y_test, label_encoder,
                method_name_char, frac, plot_cm=plot_cm
            )
            metrics_char['fraction'] = frac
            metrics_char['method'] = method_name_char
            all_results.append(metrics_char)
            if plot_cm:
                # Save final model and vectorizer
                final_models[method_name_char] = model_char
                final_vectorizers[method_name_char] = vectorizer_char
            del X_train_feat_char, X_test_feat_char, model_char; gc.collect()
        except Exception as e:
            print(f"Error during {method_name_char} processing (Fraction {frac:.2f}): {e}")

        # Method 2: Subword Units (BPE + TF-IDF)
        method_name_subword = "Subword Units"
        if bpe_tokenizer:
            try:
                print(f"\nStarting {method_name_subword} for fraction {frac:.2f}...")
                # Generate subword TF-IDF features
                X_train_feat_sub, vectorizer_sub = get_subword_features(
                    X_train_sub, bpe_tokenizer, vectorizer=None, max_features=50000
                )
                # Transform the test set using the fitted vectorizer
                X_test_feat_sub, _ = get_subword_features(
                    X_test, bpe_tokenizer, vectorizer=vectorizer_sub
                )
                # Train model, evaluate, get metrics
                model_sub, metrics_sub = train_and_evaluate(
                    X_train_feat_sub, y_train_sub, X_test_feat_sub, y_test, label_encoder,
                    method_name_subword, frac, plot_cm=plot_cm
                )
                metrics_sub['fraction'] = frac
                metrics_sub['method'] = method_name_subword
                all_results.append(metrics_sub)
                if plot_cm:
                    final_models[method_name_subword] = model_sub
                    final_vectorizers[method_name_subword] = vectorizer_sub
                del X_train_feat_sub, X_test_feat_sub, model_sub; gc.collect()
            except Exception as e:
                print(f"Error during {method_name_subword} processing (Fraction {frac:.2f}): {e}")


        # Method 3: Embeddings
        method_name_embed = "Embeddings"
        try:
            print(f"\nStarting {method_name_embed} (Fraction {frac:.2f})...")
            # Generate embeddings specifically for the current training subset
            X_train_feat_embed = get_embedding_features(X_train_sub, EMBEDDING_MODEL_NAME, batch_size=EMBEDDING_BATCH_SIZE)
            if X_train_feat_embed is not None and X_test_embed is not None: # Check both train and pre-generated test embeddings
                # Train model, evaluate, get metrics using pre-generated test embeddings
                model_embed, metrics_embed = train_and_evaluate(
                    X_train_feat_embed, y_train_sub, X_test_embed, y_test, label_encoder, method_name_embed, frac, plot_cm=plot_cm)
                if model_embed:
                     metrics_embed['fraction'] = frac
                     metrics_embed['method'] = method_name_embed
                     all_results.append(metrics_embed)
                     if plot_cm:
                      final_models[method_name_embed] = model_embed
                      final_vectorizers[method_name_embed] = None # No vectorizer
            elif X_test_embed is None:
              print("Skipping evaluation as test embeddings failed pre-generation.")
            # Clean train embeddings
            del X_train_feat_embed; gc.collect()
            if torch.cuda.is_available():
              torch.cuda.empty_cache()
        except Exception as e:
            print(f"Error during {method_name_embed}: {e}")
            if torch.cuda.is_available():
              torch.cuda.empty_cache()

    # End of Fraction Loop

    # Plot Scaling Results
    if all_results:
        results = pd.DataFrame(all_results)
        print("\nOverall Performance Metrics Across Fractions")
        display_cols = [col for col in ['method', 'fraction', 'accuracy', 'f1_macro', 'f1_ky', 'f1_kk', 'train_time', 'eval_time'] if col in results.columns]
        print(results[display_cols].round(4))
        results.to_csv("scaling_results.csv", index=False)
        plot_scaling_results(results)
    else:
      print("\nNo scaling results collected.")

    # Code-Switching Evaluation

    print(f"\n Code-Switching Evaluation")
    if not final_models:
      print("No final models trained.")
    else:
        # Prepare a dictionary mapping language labels to their corresponding test sentences
        y_test_labels_str = label_encoder.inverse_transform(y_test)
        test_sentences = {lang: [X_test[i] for i, l in enumerate(y_test_labels_str) if l == lang] for lang in LANG_CLASSES}

        # Iterate through the specified code-switching probabilities
        for swap_prob in WORD_SWAP_PROBS:
            print(f"\nProcessing Swap Probability: {swap_prob:.2f}")
            X_mixed, original_langs_mixed = [], []
            for base_lang in BASE_LANGS:
                if base_lang not in NLLB_LANG_LABELS:
                  continue
                # Generate sentences with specified swap probability
                mixed_subset, original_subset = generate_mix_text(
                    test_sentences.get(base_lang, []),
                    base_lang,
                    MIX_LANG,
                    CS_N_TEST_SENTENCES,
                    swap_prob,
                    NLLB_MODEL,
                    NLLB_LANG_LABELS,
                    BATCH_SIZE_MT)
                X_mixed.extend(mixed_subset)
                original_langs_mixed.extend(original_subset)
                gc.collect();
                torch.cuda.empty_cache() if torch.cuda.is_available() else None

            if not X_mixed:
              print(f"No mixed sentences for P={swap_prob:.2f}.")
              continue

            print(f"\nEvaluating on {len(X_mixed)} Mixed Sentences (P={swap_prob:.2f}) ---")
            y_true_mixed = label_encoder.transform(original_langs_mixed)

            # Evaluate each final model (trained on full dataset) on this mixed data
            for method, model in final_models.items():
                # For each entry of final_models: build features for the mixed
                # sentences, predict, print a classification report, save a
                # confusion-matrix image and show misclassified examples.
                print(f"\nEvaluating {method} (P={swap_prob:.2f})...")
                vectorizer = final_vectorizers.get(method)
                X_mixed_feat = None
                try:
                    # Get Features for the Mixed Data
                    if method == "Character N-grams":
                        if vectorizer:
                            X_mixed_feat = vectorizer.transform(X_mixed)
                    elif method == "Subword Units":
                        if vectorizer and bpe_tokenizer:
                            X_mixed_feat, _ = get_subword_features(
                                X_mixed, bpe_tokenizer, vectorizer=vectorizer
                            )
                    elif method == "Embeddings":
                        X_mixed_feat = get_embedding_features(
                            X_mixed, EMBEDDING_MODEL_NAME, batch_size=EMBEDDING_BATCH_SIZE
                        )

                    # No branch above produced features (unknown method name or a
                    # missing vectorizer/tokenizer): skip this method.
                    if X_mixed_feat is None:
                        print(f"Feature generation failed for {method}.")
                        continue

                    # Predict & Evaluate
                    y_pred_mixed_encoded = model.predict(X_mixed_feat)
                    y_pred_mixed_labels = label_encoder.inverse_transform(y_pred_mixed_encoded)

                    # Report the distribution of predicted labels
                    print(f"Predicted label distribution: {Counter(y_pred_mixed_labels)}")
                    if MIX_LANG not in y_pred_mixed_labels:
                        print(f"Warning: Mixed lang '{MIX_LANG}' was not predicted.")

                    # Print the detailed classification report.
                    # NOTE: labels_n is the union of true and predicted classes, so a
                    # class that is only ever predicted (never a true label) shows up
                    # with support 0 and still counts towards the macro average.
                    print(f"\nClassification Report ({method} Mix P={swap_prob:.2f}):")
                    labels_n = np.unique(np.concatenate((y_true_mixed, y_pred_mixed_encoded)))
                    target_names = label_encoder.inverse_transform(labels_n)
                    print(classification_report(
                        y_true_mixed,
                        y_pred_mixed_encoded,
                        labels=labels_n,
                        target_names=target_names,
                        digits=4,
                        zero_division=0,
                    ))

                    # Generate and save the confusion matrix for mixed results.
                    # Only languages known to the label encoder are kept as axes.
                    print(f"Confusion Matrix ({method} Mix P={swap_prob:.2f}):")
                    cm_str = sorted(list(set(BASE_LANGS + [MIX_LANG])))
                    cm_filtered = [l for l in cm_str if l in label_encoder.classes_]
                    cm_encoded = label_encoder.transform(cm_filtered)
                    cm_mixed = confusion_matrix(y_true_mixed, y_pred_mixed_encoded, labels=cm_encoded)
                    display_mixed = ConfusionMatrixDisplay(confusion_matrix=cm_mixed, display_labels=cm_filtered)
                    fig_mix, ax_mix = plt.subplots(figsize=(8, 6))
                    try:
                        display_mixed.plot(cmap=plt.cm.Blues, ax=ax_mix, xticks_rotation='vertical')
                        plt.title(f"Confusion Matrix: {method} Mix P={swap_prob:.2f}")
                        plt.tight_layout()
                        filename_mix = f"confusion_matrix_{method.replace(' ', '_')}_mix_p{swap_prob:.2f}.png"
                        plt.savefig(filename_mix)
                    finally:
                        # Close the figure even if plotting or saving fails, so a
                        # failed method does not leave an open figure behind.
                        plt.close(fig_mix)

                    # Display Error Examples
                    display_error_examples(X_mixed, original_langs_mixed, y_pred_mixed_labels, n_examples=N_ERROR_EXAMPLES)

                except Exception as e:
                    # Best effort: report the failure and move on to the next method.
                    print(f"\nERROR during {method} NLLB evaluation (P={swap_prob:.2f}): {e}")
                finally:
                    # Clean up on every path (success, early `continue`, or error):
                    # drop the feature matrix and release cached GPU memory.
                    X_mixed_feat = None
                    gc.collect()
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()

Loading and preparing data...
Loading 'ky' dataset...


Fetching & cleaning ky samples:   0%|          | 0/15000 [00:00<?, ? examples/s]

Loaded 15000 samples for 'ky'.
Loading 'kk' dataset...


Fetching & cleaning kk samples:   0%|          | 0/15000 [00:00<?, ? examples/s]

Loaded 15000 samples for 'kk'.
Loading 'ru' dataset...


Fetching & cleaning ru samples:   0%|          | 0/30000 [00:00<?, ? examples/s]

Loaded 30000 samples for 'ru'.
Total samples loaded: 60000
Label distribution: Counter({'ru': 30000, 'ky': 15000, 'kk': 15000})
Train size: 48000, Test size: 12000


Writing train data:   0%|          | 0/48000 [00:00<?, ?it/s]


Training Subword Tokenizer...
Subword Tokenizer training complete (103.56 seconds).

Pre-generating embeddings for the test set...

Loading pre-trained embedding model: xlm-roberta-base


Embedding Batches:   0%|          | 0/188 [00:00<?, ?it/s]

Test embeddings generated: (12000, 768)
Processing Training Fraction: 0.10
Using 4800 training samples.

Starting Character N-grams for fraction 0.10...
Fitting Char TF-IDF...
Training & Evaluating: Character N-grams (Fraction: 0.10)
Training complete (7.19 seconds).
Evaluation complete (0.20 seconds).
Evaluation for Fraction: 0.10, Method: Character N-grams):
              precision    recall  f1-score   support

          kk     1.0000    0.9977    0.9988      3000
          ky     1.0000    0.9993    0.9997      3000
          ru     0.9985    1.0000    0.9993      6000

    accuracy                         0.9992     12000
   macro avg     0.9995    0.9990    0.9992     12000
weighted avg     0.9993    0.9992    0.9992     12000


Starting Subword Units for fraction 0.10...
Generating subword features using the tokenizer...


Tokenizing (Subword):   0%|          | 0/4800 [00:00<?, ?it/s]

Fitting TF-IDF Vectorizer for subwords...
Subword TF-IDF fitted: (4800, 10664)
Generating subword features using the tokenizer...


Tokenizing (Subword):   0%|          | 0/12000 [00:00<?, ?it/s]

Transforming with existing subword TF-IDF Vectorizer...
Training & Evaluating: Subword Units (Fraction: 0.10)
Training complete (0.45 seconds).
Evaluation complete (0.02 seconds).
Evaluation for Fraction: 0.10, Method: Subword Units):
              precision    recall  f1-score   support

          kk     1.0000    0.9973    0.9987      3000
          ky     0.9993    0.9980    0.9987      3000
          ru     0.9978    0.9998    0.9988      6000

    accuracy                         0.9988     12000
   macro avg     0.9991    0.9984    0.9987     12000
weighted avg     0.9988    0.9988    0.9987     12000


Starting Embeddings (Fraction 0.10)...

Loading pre-trained embedding model: xlm-roberta-base


Embedding Batches:   0%|          | 0/75 [00:00<?, ?it/s]

Training & Evaluating: Embeddings (Fraction: 0.10)
Training complete (1.93 seconds).
Evaluation complete (0.02 seconds).
Evaluation for Fraction: 0.10, Method: Embeddings):
              precision    recall  f1-score   support

          kk     0.9993    0.9977    0.9985      3000
          ky     0.9987    0.9977    0.9982      3000
          ru     0.9978    0.9992    0.9985      6000

    accuracy                         0.9984     12000
   macro avg     0.9986    0.9982    0.9984     12000
weighted avg     0.9984    0.9984    0.9984     12000

Processing Training Fraction: 0.50
Using 24000 training samples.

Starting Character N-grams for fraction 0.50...
Fitting Char TF-IDF...
Training & Evaluating: Character N-grams (Fraction: 0.50)
Training complete (35.45 seconds).
Evaluation complete (0.21 seconds).
Evaluation for Fraction: 0.50, Method: Character N-grams):
              precision    recall  f1-score   support

          kk     1.0000    0.9990    0.9995      3000
          ky

Tokenizing (Subword):   0%|          | 0/24000 [00:00<?, ?it/s]

Fitting TF-IDF Vectorizer for subwords...
Subword TF-IDF fitted: (24000, 10706)
Generating subword features using the tokenizer...


Tokenizing (Subword):   0%|          | 0/12000 [00:00<?, ?it/s]

Transforming with existing subword TF-IDF Vectorizer...
Training & Evaluating: Subword Units (Fraction: 0.50)
Training complete (3.00 seconds).
Evaluation complete (0.02 seconds).
Evaluation for Fraction: 0.50, Method: Subword Units):
              precision    recall  f1-score   support

          kk     1.0000    0.9990    0.9995      3000
          ky     0.9997    0.9987    0.9992      3000
          ru     0.9990    1.0000    0.9995      6000

    accuracy                         0.9994     12000
   macro avg     0.9996    0.9992    0.9994     12000
weighted avg     0.9994    0.9994    0.9994     12000


Starting Embeddings (Fraction 0.50)...

Loading pre-trained embedding model: xlm-roberta-base


Embedding Batches:   0%|          | 0/375 [00:00<?, ?it/s]

Training & Evaluating: Embeddings (Fraction: 0.50)
Training complete (13.00 seconds).
Evaluation complete (0.03 seconds).
Evaluation for Fraction: 0.50, Method: Embeddings):
              precision    recall  f1-score   support

          kk     0.9987    0.9983    0.9985      3000
          ky     0.9987    0.9980    0.9983      3000
          ru     0.9983    0.9988    0.9986      6000

    accuracy                         0.9985     12000
   macro avg     0.9986    0.9984    0.9985     12000
weighted avg     0.9985    0.9985    0.9985     12000

Processing Training Fraction: 1.00
Using 48000 training samples.

Starting Character N-grams for fraction 1.00...
Fitting Char TF-IDF...
Training & Evaluating: Character N-grams (Fraction: 1.00)
Training complete (70.18 seconds).
Evaluation complete (0.20 seconds).
Evaluation for Fraction: 1.00, Method: Character N-grams):
              precision    recall  f1-score   support

          kk     1.0000    0.9993    0.9997      3000
          k

Tokenizing (Subword):   0%|          | 0/48000 [00:00<?, ?it/s]

Fitting TF-IDF Vectorizer for subwords...
Subword TF-IDF fitted: (48000, 10716)
Generating subword features using the tokenizer...


Tokenizing (Subword):   0%|          | 0/12000 [00:00<?, ?it/s]

Transforming with existing subword TF-IDF Vectorizer...
Training & Evaluating: Subword Units (Fraction: 1.00)
Training complete (7.69 seconds).
Evaluation complete (0.02 seconds).
Evaluation for Fraction: 1.00, Method: Subword Units):
              precision    recall  f1-score   support

          kk     1.0000    0.9990    0.9995      3000
          ky     0.9997    0.9987    0.9992      3000
          ru     0.9990    1.0000    0.9995      6000

    accuracy                         0.9994     12000
   macro avg     0.9996    0.9992    0.9994     12000
weighted avg     0.9994    0.9994    0.9994     12000

Generating Confusion Matrix...

Starting Embeddings (Fraction 1.00)...

Loading pre-trained embedding model: xlm-roberta-base


Embedding Batches:   0%|          | 0/750 [00:00<?, ?it/s]

Training & Evaluating: Embeddings (Fraction: 1.00)
Training complete (27.99 seconds).
Evaluation complete (0.03 seconds).
Evaluation for Fraction: 1.00, Method: Embeddings):
              precision    recall  f1-score   support

          kk     0.9990    0.9983    0.9987      3000
          ky     0.9987    0.9983    0.9985      3000
          ru     0.9983    0.9988    0.9986      6000

    accuracy                         0.9986     12000
   macro avg     0.9987    0.9985    0.9986     12000
weighted avg     0.9986    0.9986    0.9986     12000

Generating Confusion Matrix...

Overall Performance Metrics Across Fractions
              method  fraction  accuracy  f1_macro   f1_ky   f1_kk  \
0  Character N-grams       0.1    0.9992    0.9992  0.9997  0.9988   
1      Subword Units       0.1    0.9988    0.9987  0.9987  0.9987   
2         Embeddings       0.1    0.9984    0.9984  0.9982  0.9985   
3  Character N-grams       0.5    0.9998    0.9997  1.0000  0.9995   
4      Subword Uni

Generating ky/ru mixed set (P=0.05):   0%|          | 0/16 [00:00<?, ?batch/s]

Finished generating 500 mixed sentences for ky -> ru (P=0.05).

--- Generating mixed data using NLLB (swap_prob=0.05): kk -> ru ---
Loading MT model: facebook/nllb-200-distilled-600M
Model and pipeline loaded.
Processing 500 base sentences (kk)...


Generating kk/ru mixed set (P=0.05):   0%|          | 0/16 [00:00<?, ?batch/s]

Finished generating 500 mixed sentences for kk -> ru (P=0.05).

Evaluating on 1000 Mixed Sentences (P=0.05) ---

Evaluating Character N-grams (P=0.05)...
Predicted label distribution: Counter({np.str_('kk'): 500, np.str_('ky'): 499, np.str_('ru'): 1})

Classification Report (Character N-grams Mix P=0.05):
              precision    recall  f1-score   support

          kk     1.0000    1.0000    1.0000       500
          ky     1.0000    0.9980    0.9990       500
          ru     0.0000    0.0000    0.0000         0

    accuracy                         0.9990      1000
   macro avg     0.6667    0.6660    0.6663      1000
weighted avg     1.0000    0.9990    0.9995      1000

Confusion Matrix (Character N-grams Mix P=0.05):
--------------------------------------------------
Displaying up to 10 misclassified examples:
Example 1:
Original Base Lang: ky
Predicted Lang: ru
Mixed Text:
  'Фотография баян Кыска өмүрүндө источник деятельность показывать алган Төрөкул Айтматов Замана Телебе

Tokenizing (Subword):   0%|          | 0/1000 [00:00<?, ?it/s]

Transforming with existing subword TF-IDF Vectorizer...
Predicted label distribution: Counter({np.str_('kk'): 499, np.str_('ky'): 498, np.str_('ru'): 3})

Classification Report (Subword Units Mix P=0.05):
              precision    recall  f1-score   support

          kk     1.0000    0.9980    0.9990       500
          ky     1.0000    0.9960    0.9980       500
          ru     0.0000    0.0000    0.0000         0

    accuracy                         0.9970      1000
   macro avg     0.6667    0.6647    0.6657      1000
weighted avg     1.0000    0.9970    0.9985      1000

Confusion Matrix (Subword Units Mix P=0.05):
--------------------------------------------------
Displaying up to 10 misclassified examples:
Example 1:
Original Base Lang: kk
Predicted Lang: ru
Mixed Text:
  'vegan highlighter жеке затбелгі макияжы тапсырыс бойынша ерекшеленетін макияж жеке oem жарықтандырғыш палитрасы'
--------------------
Example 2:
Original Base Lang: ky
Predicted Lang: ru
Mixed Text:
  'Это 

Embedding Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Predicted label distribution: Counter({np.str_('ky'): 499, np.str_('kk'): 499, np.str_('ru'): 2})

Classification Report (Embeddings Mix P=0.05):
              precision    recall  f1-score   support

          kk     1.0000    0.9980    0.9990       500
          ky     1.0000    0.9980    0.9990       500
          ru     0.0000    0.0000    0.0000         0

    accuracy                         0.9980      1000
   macro avg     0.6667    0.6653    0.6660      1000
weighted avg     1.0000    0.9980    0.9990      1000

Confusion Matrix (Embeddings Mix P=0.05):
--------------------------------------------------
Displaying up to 10 misclassified examples:
Example 1:
Original Base Lang: ky
Predicted Lang: ru
Mixed Text:
  'Недвижимость в одессе С давних пор недвижимость в одессе относится к наиболее надежным способам сохранности финансов. Подобной гарантии нельзя ожидать ни от денег, ни от золота, ни от драгоценных камней. Ювелирные изделия Купить недвижимость на Северном Кипре 1. Сайт 

Generating ky/ru mixed set (P=0.15):   0%|          | 0/16 [00:00<?, ?batch/s]

Finished generating 500 mixed sentences for ky -> ru (P=0.15).

--- Generating mixed data using NLLB (swap_prob=0.15): kk -> ru ---
Loading MT model: facebook/nllb-200-distilled-600M
Model and pipeline loaded.
Processing 500 base sentences (kk)...


Generating kk/ru mixed set (P=0.15):   0%|          | 0/16 [00:00<?, ?batch/s]

Finished generating 500 mixed sentences for kk -> ru (P=0.15).

Evaluating on 1000 Mixed Sentences (P=0.15) ---

Evaluating Character N-grams (P=0.15)...
Predicted label distribution: Counter({np.str_('kk'): 499, np.str_('ky'): 498, np.str_('ru'): 3})

Classification Report (Character N-grams Mix P=0.15):
              precision    recall  f1-score   support

          kk     1.0000    0.9980    0.9990       500
          ky     1.0000    0.9960    0.9980       500
          ru     0.0000    0.0000    0.0000         0

    accuracy                         0.9970      1000
   macro avg     0.6667    0.6647    0.6657      1000
weighted avg     1.0000    0.9970    0.9985      1000

Confusion Matrix (Character N-grams Mix P=0.15):
--------------------------------------------------
Displaying up to 10 misclassified examples:
Example 1:
Original Base Lang: ky
Predicted Lang: ru
Mixed Text:
  'Created page with Автомат lang-grautomatos сам на работу келүүчү техникада өзү аракеттенүүчү түзүлмө

Tokenizing (Subword):   0%|          | 0/1000 [00:00<?, ?it/s]

Transforming with existing subword TF-IDF Vectorizer...
Predicted label distribution: Counter({np.str_('kk'): 499, np.str_('ky'): 497, np.str_('ru'): 4})

Classification Report (Subword Units Mix P=0.15):
              precision    recall  f1-score   support

          kk     1.0000    0.9980    0.9990       500
          ky     1.0000    0.9940    0.9970       500
          ru     0.0000    0.0000    0.0000         0

    accuracy                         0.9960      1000
   macro avg     0.6667    0.6640    0.6653      1000
weighted avg     1.0000    0.9960    0.9980      1000

Confusion Matrix (Subword Units Mix P=0.15):
--------------------------------------------------
Displaying up to 10 misclassified examples:
Example 1:
Original Base Lang: ky
Predicted Lang: ru
Mixed Text:
  '--ifИзображениеКатегорияУикипедияТасмалар жөнүндө макалалардын постер же обложкасымукабасы жоктор ifeqСүрөтno poster.svgКатегорияУикипедияТасмалар жөнүндө макалалардын постер же обложкасымукабасы жоктор ife

Embedding Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Predicted label distribution: Counter({np.str_('ky'): 499, np.str_('kk'): 499, np.str_('ru'): 2})

Classification Report (Embeddings Mix P=0.15):
              precision    recall  f1-score   support

          kk     1.0000    0.9980    0.9990       500
          ky     1.0000    0.9980    0.9990       500
          ru     0.0000    0.0000    0.0000         0

    accuracy                         0.9980      1000
   macro avg     0.6667    0.6653    0.6660      1000
weighted avg     1.0000    0.9980    0.9990      1000

Confusion Matrix (Embeddings Mix P=0.15):
--------------------------------------------------
Displaying up to 10 misclassified examples:
Example 1:
Original Base Lang: kk
Predicted Lang: ru
Mixed Text:
  'Негізгі бет Автокөліктер мен көлік құралдары Турбо-Трактор против вездехода Секач. Разведка тяжелого маршрута!'
--------------------
Example 2:
Original Base Lang: ky
Predicted Lang: ru
Mixed Text:
  'Недвижимость в одессе С давних пор недвижимость в одессе относится 

Generating ky/ru mixed set (P=0.30):   0%|          | 0/16 [00:00<?, ?batch/s]

Finished generating 500 mixed sentences for ky -> ru (P=0.30).

--- Generating mixed data using NLLB (swap_prob=0.30): kk -> ru ---
Loading MT model: facebook/nllb-200-distilled-600M
Model and pipeline loaded.
Processing 500 base sentences (kk)...


Generating kk/ru mixed set (P=0.30):   0%|          | 0/16 [00:00<?, ?batch/s]

Finished generating 500 mixed sentences for kk -> ru (P=0.30).

Evaluating on 1000 Mixed Sentences (P=0.30) ---

Evaluating Character N-grams (P=0.30)...
Predicted label distribution: Counter({np.str_('ky'): 491, np.str_('kk'): 488, np.str_('ru'): 21})

Classification Report (Character N-grams Mix P=0.30):
              precision    recall  f1-score   support

          kk     1.0000    0.9760    0.9879       500
          ky     1.0000    0.9820    0.9909       500
          ru     0.0000    0.0000    0.0000         0

    accuracy                         0.9790      1000
   macro avg     0.6667    0.6527    0.6596      1000
weighted avg     1.0000    0.9790    0.9894      1000

Confusion Matrix (Character N-grams Mix P=0.30):
--------------------------------------------------
Displaying up to 10 misclassified examples:
Example 1:
Original Base Lang: kk
Predicted Lang: ru
Mixed Text:
  'Я боздасам, боздар едім как бота Мен Если я замерз, лозы Я бы хотел как сенок, Снять едім страны на

Tokenizing (Subword):   0%|          | 0/1000 [00:00<?, ?it/s]

Transforming with existing subword TF-IDF Vectorizer...
Predicted label distribution: Counter({np.str_('kk'): 491, np.str_('ky'): 479, np.str_('ru'): 30})

Classification Report (Subword Units Mix P=0.30):
              precision    recall  f1-score   support

          kk     1.0000    0.9820    0.9909       500
          ky     1.0000    0.9580    0.9785       500
          ru     0.0000    0.0000    0.0000         0

    accuracy                         0.9700      1000
   macro avg     0.6667    0.6467    0.6565      1000
weighted avg     1.0000    0.9700    0.9847      1000

Confusion Matrix (Subword Units Mix P=0.30):
--------------------------------------------------
Displaying up to 10 misclassified examples:
Example 1:
Original Base Lang: ky
Predicted Lang: ru
Mixed Text:
  'ZHEJIANG GOGOGO MECHANICAL ЭЛЕКТРИК CO., LTD, тажрыйбалуу производитель жана кыймылдаткычтарды, суу насосторун жана башка туунду продукты поставщик как ...'
--------------------
Example 2:
Original Base La

Embedding Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Predicted label distribution: Counter({np.str_('kk'): 497, np.str_('ky'): 495, np.str_('ru'): 8})

Classification Report (Embeddings Mix P=0.30):
              precision    recall  f1-score   support

          kk     1.0000    0.9940    0.9970       500
          ky     1.0000    0.9900    0.9950       500
          ru     0.0000    0.0000    0.0000         0

    accuracy                         0.9920      1000
   macro avg     0.6667    0.6613    0.6640      1000
weighted avg     1.0000    0.9920    0.9960      1000

Confusion Matrix (Embeddings Mix P=0.30):
--------------------------------------------------
Displaying up to 10 misclassified examples:
Example 1:
Original Base Lang: ky
Predicted Lang: ru
Mixed Text:
  'Кургак транспорттук тампон производители - Кытай Сухой транспортные тампон фабрика Жабдып сторонники'
--------------------
Example 2:
Original Base Lang: kk
Predicted Lang: ru
Mixed Text:
  'мені мына на адрес жазуға тырысыңыз, затем кейін мен я мои фотографии жіберем