In [4]:
import os
import unicodedata

import numpy as np
from datasets import load_dataset
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

In [None]:
# Locale codes used as the classes of the language-identification task.
# The order of the entries is kept exactly as originally listed.
locales = [
    "af-ZA",
    "da-DK",
    "de-DE",
    "en-US",
    "es-ES",
    "fr-FR",
    "fi-FI",
    "hu-HU",
    "is-IS",
    "it-IT",
    "jv-ID",
    "lv-LV",
    "ms-MY",
    "nb-NO",
    "nl-NL",
    "pl-PL",
    "pt-PT",
    "ro-RO",
    "ru-RU",
    "sl-SL",
    "sv-SE",
    "sq-AL",
    "sw-KE",
    "tl-PH",
    "tr-TR",
    "vi-VN",
    "cy-GB",
]

# Strip accents from a string
def deaccent(text):
    """Return `text` with its combining marks removed.

    The input is first decomposed with Unicode NFD normalization; every code
    point whose Unicode category is 'Mn' is then dropped and the remaining
    code points are concatenated in their original order.

    Args:
        text: The string to process.

    Returns:
        The decomposed string without any category-'Mn' code points.
    """
    decomposed = unicodedata.normalize('NFD', text)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

# Fetch the MASSIVE corpus through the Hugging Face `datasets` loader;
# the result is indexed by split name further down in this cell.
dataset = load_dataset(path="qanastek/MASSIVE")

# Filter the dataset for the relevant locales and partitions
def filter_dataset(dataset, locales, partition):
    """Select one partition's rows for the given locales and build model inputs.

    Args:
        dataset: Object exposing a `filter(fn)` method and column access by
            name (as used below); rows must provide the 'locale',
            'partition' and 'tokens' fields.
        locales: Iterable of locale codes to keep.
        partition: Value a row's 'partition' field must equal to be kept.

    Returns:
        A `(texts, labels)` pair. `texts` holds one string per kept row, made
        by deaccenting each token and joining the tokens with single spaces.
        `labels` is a list with the 'locale' value of each kept row.
    """
    # Materialise the locales once: membership tests become O(1) per row, and a
    # one-shot iterable (e.g. a generator) is not exhausted by the first row.
    wanted_locales = set(locales)
    filtered_data = dataset.filter(
        lambda row: row['locale'] in wanted_locales and row['partition'] == partition
    )
    texts = [' '.join(deaccent(token) for token in utt) for utt in filtered_data['tokens']]
    # Copy into a plain list so callers always receive a list, whatever
    # container type the column access returns.
    labels = list(filtered_data['locale'])
    return texts, labels

# Get the training, validation, and test data
# NOTE(review): this assumes the rows of each split carry 'partition' values
# spelled exactly 'train' / 'validation' / 'test'; if the dev rows are labelled
# differently the validation set comes back empty — confirm against the dataset.
train_texts, train_labels = filter_dataset(dataset['train'], locales, 'train')
val_texts, val_labels = filter_dataset(dataset['validation'], locales, 'validation')
test_texts, test_labels = filter_dataset(dataset['test'], locales, 'test')

# Encode the labels: the encoder learns its classes from the training labels
# and is reused unchanged for the validation and test labels.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
val_labels = label_encoder.transform(val_labels)
test_labels = label_encoder.transform(test_labels)

# Create a pipeline with a CountVectorizer and MultinomialNB
# (MultinomialNB comes from sklearn.naive_bayes and must be imported at the top
# of the notebook.)
model = make_pipeline(CountVectorizer(), MultinomialNB())

# Train the model on the training partition only.
# The pipeline is deliberately NOT fitted again on the validation data: calling
# fit() a second time does not fine-tune, it re-learns the vocabulary and the
# class statistics from scratch, which would throw away the training fit and
# turn the validation scores into in-sample scores.
model.fit(train_texts, train_labels)

# Evaluate the model
def evaluate_model(model, texts, labels, partition_name):
    """Print a classification report for one data partition.

    Args:
        model: Fitted estimator exposing `predict(texts)`.
        texts: Inputs passed to `model.predict`.
        labels: True class ids for `texts`, as produced by the module-level
            `label_encoder`.
        partition_name: Name inserted into the printed header line.

    Returns:
        None. The header and the report are written to stdout.

    Notes:
        Reads the module-level `label_encoder` for the class names.
    """
    class_names = label_encoder.classes_
    # Encoded labels are the positions 0..n-1 in `classes_`. Listing them
    # explicitly keeps the report aligned with `target_names` even when a class
    # is absent from both `labels` and the predictions; without this,
    # classification_report rejects the mismatching number of names.
    class_ids = list(range(len(class_names)))
    predictions = model.predict(texts)
    report = classification_report(
        labels,
        predictions,
        labels=class_ids,
        target_names=class_names,
    )
    print(f"Performance metrics for {partition_name} partition:")
    print(report)

# Print one report per partition, in the order training -> validation -> test
for split_texts, split_labels, split_title in (
    (train_texts, train_labels, "training"),
    (val_texts, val_labels, "validation"),
    (test_texts, test_labels, "test"),
):
    evaluate_model(model, split_texts, split_labels, split_title)

In [None]:
# Mapping from each locale code to the continent label used as the
# classification target in this cell. Entries keep their original order.
locale_to_continent = dict((
    ("af-ZA", "Africa"),
    ("da-DK", "Europe"),
    ("de-DE", "Europe"),
    ("en-US", "North America"),
    ("es-ES", "Europe"),
    ("fr-FR", "Europe"),
    ("fi-FI", "Europe"),
    ("hu-HU", "Europe"),
    ("is-IS", "Europe"),
    ("it-IT", "Europe"),
    ("jv-ID", "Asia"),
    ("lv-LV", "Europe"),
    ("ms-MY", "Asia"),
    ("nb-NO", "Europe"),
    ("nl-NL", "Europe"),
    ("pl-PL", "Europe"),
    ("pt-PT", "Europe"),
    ("ro-RO", "Europe"),
    ("ru-RU", "Europe"),
    ("sl-SL", "Europe"),
    ("sv-SE", "Europe"),
    ("sq-AL", "Europe"),
    ("sw-KE", "Africa"),
    ("tl-PH", "Asia"),
    ("tr-TR", "Asia"),
    ("vi-VN", "Asia"),
    ("cy-GB", "Europe"),
))

# Function to deaccent characters
# (same behaviour as the `deaccent` defined in the earlier cell; this
# definition replaces it when the cell runs)
def deaccent(text):
    """Remove combining marks from `text`.

    Args:
        text: The string to process.

    Returns:
        The NFD-normalized form of `text` with every code point of Unicode
        category 'Mn' left out.
    """
    base_chars = []
    for ch in unicodedata.normalize('NFD', text):
        if unicodedata.category(ch) == 'Mn':
            # combining mark produced by the decomposition: skip it
            continue
        base_chars.append(ch)
    return ''.join(base_chars)

# Load the MASSIVE dataset from Huggingface
# (this repeats the load performed by the earlier cell and rebinds `dataset`)
dataset = load_dataset(path="qanastek/MASSIVE")

# Filter the dataset for the relevant locales and partitions
def filter_dataset(dataset, locales, partition):
    """Select one partition's rows for the given locales and build model inputs.

    Args:
        dataset: Object exposing a `filter(fn)` method and column access by
            name (as used below); rows must provide the 'locale',
            'partition' and 'tokens' fields.
        locales: Iterable of locale codes to keep. Every kept locale must be a
            key of the module-level `locale_to_continent` mapping.
        partition: Value a row's 'partition' field must equal to be kept.

    Returns:
        A `(texts, labels)` pair. `texts` holds one string per kept row, made
        by deaccenting each token and joining the tokens with single spaces.
        `labels` is a list with the continent of each kept row, looked up
        from its locale.

    Raises:
        KeyError: If a kept row's locale is missing from `locale_to_continent`.
    """
    # Materialise the locales once: membership tests become O(1) per row, and a
    # one-shot iterable (e.g. a generator) is not exhausted by the first row.
    wanted_locales = set(locales)
    filtered_data = dataset.filter(
        lambda row: row['locale'] in wanted_locales and row['partition'] == partition
    )
    texts = [' '.join(deaccent(token) for token in utt) for utt in filtered_data['tokens']]
    # Read only the 'locale' column instead of iterating whole rows, which
    # would materialise every field of every row just to read one of them.
    labels = [locale_to_continent[locale] for locale in filtered_data['locale']]
    return texts, labels

# Pull the three partitions, keeping only locales that have a continent mapping.
# NOTE(review): assumes the rows' 'partition' values are spelled exactly like
# the split names used here — confirm against the dataset.
(train_texts, train_labels), (val_texts, val_labels), (test_texts, test_labels) = [
    filter_dataset(dataset[split_name], locale_to_continent.keys(), split_name)
    for split_name in ('train', 'validation', 'test')
]

# Integer-encode the continent names; the encoder learns its classes from the
# training labels and is then applied to the other two partitions.
label_encoder = LabelEncoder()
train_labels, val_labels, test_labels = (
    label_encoder.fit_transform(train_labels),
    label_encoder.transform(val_labels),
    label_encoder.transform(test_labels),
)

# TF-IDF features: the vectorizer is fitted on the training texts only and the
# fitted vectorizer is reused to transform the validation and test texts.
vectorizer = TfidfVectorizer(stop_words='english', min_df=5, max_features=10000)
X_train = vectorizer.fit_transform(train_texts)
X_val, X_test = (vectorizer.transform(texts) for texts in (val_texts, test_texts))

# Implement Regularized Discriminant Analysis (RDA)
class RegularizedDiscriminantAnalysis(BaseEstimator, ClassifierMixin):
    """Classifier that blends the class probabilities of an LDA and a QDA model.

    Both sub-models are fitted on the same data. At prediction time their
    `predict_proba` outputs are mixed as
    `alpha * lda_proba + (1 - alpha) * qda_proba`, and the class with the
    largest blended probability is returned. Note that the blending happens on
    the predicted probabilities; the covariance estimates of the two models are
    not modified.

    Args:
        alpha: Weight given to the LDA probabilities; the QDA probabilities
            receive `1 - alpha`.

    Attributes:
        lda: The underlying `LinearDiscriminantAnalysis` instance.
        qda: The underlying `QuadraticDiscriminantAnalysis` instance.
        classes_: Class labels seen during `fit` (set by `fit`), in the column
            order of the probability matrices.
    """

    def __init__(self, alpha=0.5):
        self.alpha = alpha
        self.lda = LinearDiscriminantAnalysis()
        self.qda = QuadraticDiscriminantAnalysis()

    @staticmethod
    def _to_dense(X):
        """Return `X` as a dense array, accepting sparse matrices or array-likes."""
        # Sparse matrices expose `toarray`; anything else is treated as array-like.
        return X.toarray() if hasattr(X, "toarray") else np.asarray(X)

    def fit(self, X, y):
        """Fit both sub-models on `(X, y)`.

        Args:
            X: Feature matrix, sparse or dense.
            y: Target labels, one per row of `X`.

        Returns:
            self
        """
        # Densify once and share the array between both fits instead of
        # building two separate dense copies.
        X_dense = self._to_dense(X)
        self.lda.fit(X_dense, y)
        self.qda.fit(X_dense, y)
        self.classes_ = self.lda.classes_
        return self

    def predict_proba(self, X):
        """Return the alpha-weighted blend of the LDA and QDA class probabilities.

        Args:
            X: Feature matrix, sparse or dense.

        Returns:
            Array with one row per sample and one column per entry of `classes_`.
        """
        X_dense = self._to_dense(X)
        lda_proba = self.lda.predict_proba(X_dense)
        qda_proba = self.qda.predict_proba(X_dense)
        return self.alpha * lda_proba + (1 - self.alpha) * qda_proba

    def predict(self, X):
        """Predict the class label with the highest blended probability.

        Args:
            X: Feature matrix, sparse or dense.

        Returns:
            Array of labels taken from `classes_`, one per row of `X`.
        """
        best_columns = np.argmax(self.predict_proba(X), axis=1)
        # Map column positions back to the labels seen in fit(); for labels that
        # are already 0..k-1 this equals the column positions themselves.
        return self.classes_[best_columns]

# Build the blended LDA/QDA classifier with equal weights and fit it on the
# training features; fit() returns the estimator itself, so the fitted
# instance is bound directly.
rda_model = RegularizedDiscriminantAnalysis(alpha=0.5).fit(X_train, train_labels)

# Evaluate the model
def evaluate_model(model, X, y, partition_name):
    """Print a classification report for one data partition.

    Args:
        model: Fitted estimator exposing `predict(X)`.
        X: Feature matrix passed to `model.predict`.
        y: True class ids for the rows of `X`, as produced by the module-level
            `label_encoder`.
        partition_name: Name inserted into the printed header line.

    Returns:
        None. The header and the report are written to stdout.

    Notes:
        Reads the module-level `label_encoder` for the class names.
    """
    class_names = label_encoder.classes_
    # Encoded labels are the positions 0..n-1 in `classes_`. Listing them
    # explicitly keeps the report aligned with `target_names` even when a class
    # is absent from both `y` and the predictions; without this,
    # classification_report rejects the mismatching number of names.
    class_ids = list(range(len(class_names)))
    predictions = model.predict(X)
    report = classification_report(
        y,
        predictions,
        labels=class_ids,
        target_names=class_names,
    )
    print(f"Performance metrics for {partition_name} partition:")
    print(report)

# Print one report per partition, in the order training -> validation -> test
for split_features, split_targets, split_title in (
    (X_train, train_labels, "training"),
    (X_val, val_labels, "validation"),
    (X_test, test_labels, "test"),
):
    evaluate_model(rda_model, split_features, split_targets, split_title)