In [7]:
import os
import pandas as pd
import unicodedata
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np
from sklearn.naive_bayes import MultinomialNB, MultinomialNB

In [4]:
from datasets import load_dataset
dataset = load_dataset("qanastek/MASSIVE", "en-US", split='train')
print(dataset)
print(dataset[0])


Dataset({
    features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'tokens', 'ner_tags', 'worker_id', 'slot_method', 'judgments'],
    num_rows: 11514
})
{'id': '1', 'locale': 'en-US', 'partition': 'train', 'scenario': 9, 'intent': 55, 'utt': 'wake me up at nine am on friday', 'annot_utt': 'wake me up at [time : nine am] on [date : friday]', 'tokens': ['wake', 'me', 'up', 'at', 'nine', 'am', 'on', 'friday'], 'ner_tags': [0, 0, 0, 0, 60, 16, 0, 7], 'worker_id': '1', 'slot_method': {'slot': [], 'method': []}, 'judgments': {'worker_id': [], 'intent_score': [], 'slots_score': [], 'grammar_score': [], 'spelling_score': [], 'language_identification': []}}


In [8]:
df = pd.DataFrame({
    'utt': dataset['utt'],
    'tokens': dataset['tokens'],
    'language': dataset['locale']
})

print(df.head()) 

                                   utt  \
0      wake me up at nine am on friday   
1  set an alarm for two hours from now   
2                           olly quiet   
3                                 stop   
4           olly pause for ten seconds   

                                         tokens language  
0      [wake, me, up, at, nine, am, on, friday]    en-US  
1  [set, an, alarm, for, two, hours, from, now]    en-US  
2                                 [olly, quiet]    en-US  
3                                        [stop]    en-US  
4              [olly, pause, for, ten, seconds]    en-US  


In [9]:
dataset = load_dataset("qanastek/MASSIVE", "af-ZA", split='train')
print(dataset)
print(dataset[0])


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'tokens', 'ner_tags', 'worker_id', 'slot_method', 'judgments'],
    num_rows: 11514
})
{'id': '1', 'locale': 'af-ZA', 'partition': 'train', 'scenario': 9, 'intent': 55, 'utt': 'maak my wakker nege-uur v. m. op vrydag', 'annot_utt': 'maak my wakker [time : nege-uur v. m.] op [date : vrydag]', 'tokens': ['maak', 'my', 'wakker', 'nege-uur', 'v.', 'm.', 'op', 'vrydag'], 'ner_tags': [0, 0, 0, 60, 16, 16, 0, 7], 'worker_id': '20', 'slot_method': {'slot': ['time', 'date'], 'method': ['translation', 'translation']}, 'judgments': {'worker_id': ['40', '49', '20'], 'intent_score': [1, 1, 1], 'slots_score': [1, 1, 1], 'grammar_score': [4, 4, 4], 'spelling_score': [2, 2, 2], 'language_identification': ['target', 'target', 'target']}}


In [10]:
df_2 = pd.DataFrame({
    'utt': dataset['utt'],
    'tokens': dataset['tokens'],
    'language': dataset['locale']
})

print(df_2.head()) 

                                         utt  \
0    maak my wakker nege-uur v. m. op vrydag   
1      stel 'n alarm vir twee ure van nou af   
2                            janneman stilte   
3                                       stop   
4  janneman onderbreek dit vir tien sekondes   

                                             tokens language  
0  [maak, my, wakker, nege-uur, v., m., op, vrydag]    af-ZA  
1   [stel, 'n, alarm, vir, twee, ure, van, nou, af]    af-ZA  
2                                [janneman, stilte]    af-ZA  
3                                            [stop]    af-ZA  
4  [janneman, onderbreek, dit, vir, tien, sekondes]    af-ZA  


In [13]:
df = pd.concat([df, df_2], ignore_index=True)
print(df.tail())

                                   utt  \
34537    stuur hi in whatsapp na vicky   
34538                   het ek e-posse   
34539           watter e-posse is nuut   
34540  het ek nuwe eposse vanaf pieter   
34541          kyk na epos van jeff af   

                                       tokens language  
34537    [stuur, hi, in, whatsapp, na, vicky]    af-ZA  
34538                      [het, ek, e-posse]    af-ZA  
34539             [watter, e-posse, is, nuut]    af-ZA  
34540  [het, ek, nuwe, eposse, vanaf, pieter]    af-ZA  
34541          [kyk, na, epos, van, jeff, af]    af-ZA  


In [19]:
# Assuming df is already defined and has a 'tokens' column
# Convert tokens to strings if necessary
df['tokens_str'] = df['tokens'].apply(lambda x: ' '.join(x))


# Vectorize the text data
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['utt'])

# Encode the target labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['language'])

# Train the Multinomial Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X, y)

# Predict and evaluate the classifier
y_pred = clf.predict(X)
print(classification_report(y, y_pred, target_names=label_encoder.classes_))

              precision    recall  f1-score   support

       af-ZA       1.00      1.00      1.00     23028
       en-US       0.99      1.00      1.00     11514

    accuracy                           1.00     34542
   macro avg       1.00      1.00      1.00     34542
weighted avg       1.00      1.00      1.00     34542



In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Sample documents
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "Never jump over the lazy dog quickly.",
    "Bright sun shines over the lazy dog."
]

# Using CountVectorizer
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(documents)

# Using TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Convert the matrices to arrays for better readability
count_array = count_matrix.toarray()
tfidf_array = tfidf_matrix.toarray()

# Get feature names
count_features = count_vectorizer.get_feature_names_out()
tfidf_features = tfidf_vectorizer.get_feature_names_out()

# Print results
print("CountVectorizer results:")
print(count_features)
print(count_array)

print("\nTfidfVectorizer results:")
print(tfidf_features)
print(tfidf_array)

CountVectorizer results:
['bright' 'brown' 'dog' 'fox' 'jump' 'jumps' 'lazy' 'never' 'over' 'quick'
 'quickly' 'shines' 'sun' 'the']
[[0 1 1 1 0 1 1 0 1 1 0 0 0 2]
 [0 0 1 0 1 0 1 1 1 0 1 0 0 1]
 [1 0 1 0 0 0 1 0 1 0 0 1 1 1]]

TfidfVectorizer results:
['bright' 'brown' 'dog' 'fox' 'jump' 'jumps' 'lazy' 'never' 'over' 'quick'
 'quickly' 'shines' 'sun' 'the']
[[0.         0.3940004  0.23270298 0.3940004  0.         0.3940004
  0.23270298 0.         0.23270298 0.3940004  0.         0.
  0.         0.46540596]
 [0.         0.         0.28171538 0.         0.4769856  0.
  0.28171538 0.4769856  0.28171538 0.         0.4769856  0.
  0.         0.28171538]
 [0.4769856  0.         0.28171538 0.         0.         0.
  0.28171538 0.         0.28171538 0.         0.         0.4769856
  0.4769856  0.28171538]]
