In [55]:
import pandas as pd
from datasets import load_dataset
from collections import OrderedDict
from IPython.display import Audio
from sklearn.feature_extraction.text import CountVectorizer
import librosa
import numpy as np
import re
from unidecode import unidecode
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.model_selection import train_test_split
nltk.download('wordnet')
nltk.download('omw')
nltk.download('omw-1.4')
nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gusta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw to
[nltk_data]     C:\Users\gusta\AppData\Roaming\nltk_data...
[nltk_data]   Package omw is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\gusta\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gusta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [56]:
# List of language codes for the desired languages


language_codes = {
    "en": "English",
    "es": "Spanish",
    "ru": "Russian",
    "ja": "Japanese",
    "zh-CN": "Mandarin",
}

transcripts = []

for lang_code, lang_name in language_codes.items():
    try:
        dataset = load_dataset("mozilla-foundation/common_voice_11_0", lang_code, split="test", streaming=True)
        count = 0
        for x in dataset:
            if count >= 5000:
                break
            count += 1
            transcripts.append({"language": lang_name, "sentence": x["sentence"], "audio": x["audio"]})
        print(f"Loaded transcripts for language: {lang_name}")
    except Exception as e:
        print(f"Error loading transcripts for language: {lang_name}. Error: {str(e)}")

df = pd.DataFrame(transcripts)
print(df)

Reading metadata...: 16354it [00:00, 29711.35it/s]


Error loading transcripts for language: English. Error: Unable to allocate 1.61 MiB for an array with shape (210816, 1) and data type float64


Reading metadata...: 15520it [00:00, 24191.62it/s]


Error loading transcripts for language: Spanish. Error: Unable to allocate 3.34 MiB for an array with shape (437184,) and data type float64


Reading metadata...: 9630it [00:00, 18190.02it/s]


Error loading transcripts for language: Russian. Error: Unable to allocate 2.15 MiB for an array with shape (281664,) and data type float64


Reading metadata...: 4604it [00:00, 22022.72it/s]


Error loading transcripts for language: Japanese. Error: Unable to allocate 2.44 MiB for an array with shape (319680, 1) and data type float64


Reading metadata...: 10581it [00:00, 32852.85it/s]


Error loading transcripts for language: Mandarin. Error: Unable to allocate 2.29 MiB for an array with shape (299760,) and data type float64
      language                                           sentence  \
0      English  Joe Keaton disapproved of films, and Buster al...   
1      English                               She'll be all right.   
2      English                                                six   
3      English                         All's well that ends well.   
4      English  It is a busy market town that serves a large s...   
...        ...                                                ...   
2462  Mandarin                                       殿试登进士第二甲第四名。   
2463  Mandarin                                     他主要的工作是研究冷凝气体。   
2464  Mandarin                                            属米兰总教区。   
2465  Mandarin                                               张九皋。   
2466  Mandarin                                       小酒吧老板天民家中休息。   

                              

In [57]:
# Romanize the transcripts

def romanize_text(text):
    romanized_text = unidecode(text)
    return romanized_text.strip()

# Clean transcripts in the DataFrame
df['sentence'] = df['sentence'].apply(romanize_text)

In [58]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "can not", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    tokens = nltk.word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    text = ' '.join(tokens)
    return text

def extract_features(signal, sr):
    # Step 1: Preprocessing
    signal = librosa.effects.trim(signal)[0]
    
    # Step 2: Frame the signal
    hop_length = int(sr / 100) # 10ms hop length
    frames = librosa.util.frame(signal, frame_length=hop_length, hop_length=hop_length)
    
    # Step 3: Apply a window function
    window = np.hamming(hop_length)
    frames_windowed = frames * window[:, np.newaxis]
    
    # Step 4: Compute the spectral features
    stft = np.abs(librosa.stft(signal, n_fft=hop_length*2, hop_length=hop_length, win_length=hop_length*2, window='hamming'))
    mel = librosa.feature.melspectrogram(sr=sr, S=stft**2)
    mfcc = librosa.feature.mfcc(S=librosa.power_to_db(mel))
    
    # Step 5: Aggregate the features
    features = np.concatenate([
        np.mean(mfcc, axis=1),
        np.std(mfcc, axis=1),
        np.median(mfcc, axis=1),
    ])
    
    return features


In [59]:
X_text = []
X_audio = []
y = []

for i, row in df.iterrows():
    preprocessed_sentence = preprocess_text(row['sentence'])
    X_text.append(preprocessed_sentence)
    y.append(row['language'])
    feature = extract_features(row['audio']['array'], row['audio']['sampling_rate'])
    X_audio.append(feature)
    
print(len(X_text))
print(len(X_audio))
print(len(y))
X_textTrain, X_textTest, X_audioTrain, X_audioTest, y_train, y_test = train_test_split(X_text, X_audio, y, test_size=0.3, random_state=40)

2467
2467
2467


In [60]:
count = CountVectorizer()
tfidf = TfidfVectorizer()

X_textTrain = tfidf.fit_transform(X_textTrain)
X_textTest = tfidf.transform(X_textTest)


In [61]:
from sklearn.svm import SVC
from sklearn import svm
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, roc_curve, auc, classification_report
from sklearn.metrics import accuracy_score

audioModel = SVC(kernel='linear', C=1, gamma='scale')
audioModel.fit(X_audioTrain, y_train)

# Test the model on the testing set
y_pred = audioModel.predict(X_audioTest)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
y_svc_predicted = audioModel.predict(X_audioTest)
print(classification_report(y_test, y_svc_predicted))

Accuracy: 0.9176788124156545
              precision    recall  f1-score   support

     English       0.92      1.00      0.96       682
    Japanese       0.00      0.00      0.00         1
    Mandarin       1.00      0.02      0.04        45
     Russian       0.00      0.00      0.00         5
     Spanish       0.00      0.00      0.00         8

    accuracy                           0.92       741
   macro avg       0.38      0.20      0.20       741
weighted avg       0.91      0.92      0.88       741



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [62]:
from sklearn.linear_model import LogisticRegression
textModel = LogisticRegression()
textModel.fit(X_textTrain, y_train)


y_pred = textModel.predict(X_textTest)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
y_lr_predicted = textModel.predict(X_textTest)
print(classification_report(y_test, y_lr_predicted))

Accuracy: 0.9689608636977058
              precision    recall  f1-score   support

     English       0.97      1.00      0.98       682
    Japanese       0.00      0.00      0.00         1
    Mandarin       0.97      0.78      0.86        45
     Russian       0.00      0.00      0.00         5
     Spanish       1.00      0.12      0.22         8

    accuracy                           0.97       741
   macro avg       0.59      0.38      0.41       741
weighted avg       0.96      0.97      0.96       741



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
