In [1]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
!cp  /content/drive/MyDrive/WiLi_2018_wikipedia_dataset.zip .
!unzip WiLi_2018_wikipedia_dataset.zip
!mv dataset.csv WiLi_2018_wikipedia_dataset.csv
!rm WiLi_2018_wikipedia_dataset.zip

Archive:  WiLi_2018_wikipedia_dataset.zip
  inflating: dataset.csv             


In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m23.8 MB/s[0m eta [36m0:00:0

In [3]:
# basic libraries
import os
import re
import pickle
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm

# visualization tools
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# preprocessing tools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# model building tools
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report, accuracy_score

## Flores dataset

In [4]:
# set directories
input_dir = './'
working_dir = './'
# data = pd.read_csv('WiLi_2018_wikipedia_dataset.csv')
data = pd.read_feather('/content/drive/MyDrive/flores_dataset.feather')
data.head()

Unnamed: 0,text,language,macro
0,But shani hta Stanford dakkasu tsi dap kaw na ...,Kachin,Sino-Tibetan
1,Ndai arai hkyep gaw sinda mawng ana hpe sut su...,Kachin,Sino-Tibetan
2,"JAS 39C Gripen mying ai pyenli gaw, buga ginda...",Kachin,Sino-Tibetan
3,Dai nbungli gau ai wa hpe Squadron a ning baw ...,Kachin,Sino-Tibetan
4,"Buga shiga dap kaw na shapoi ai lam gaw, mahta...",Kachin,Sino-Tibetan


In [7]:
data['text'] = data['text'].apply(lambda x: x[:20] if len(x) > 20 else x)

# Print the modified DataFrame
print(data)

                        text language          macro
0       But shani hta Stanfo   Kachin   Sino-Tibetan
1       Ndai arai hkyep gaw    Kachin   Sino-Tibetan
2       JAS 39C Gripen mying   Kachin   Sino-Tibetan
3       Dai nbungli gau ai w   Kachin   Sino-Tibetan
4       Buga shiga dap kaw n   Kachin   Sino-Tibetan
...                      ...      ...            ...
202386  हिल स्टेशन के लेल पर   Magahi  Indo-European
202387  हालांकि, सर्दी के दौ   Magahi  Indo-European
202388  केवल कुछ एयरलाइंस अभ   Magahi  Indo-European
202389  एयरलाइंस जे एगोर पेश   Magahi  Indo-European
202390  सभे मामला में, अहां    Magahi  Indo-European

[202391 rows x 3 columns]


## Preprocessing

In [8]:
x_train, x_test, y_train, y_test = train_test_split(data.text.values, data.language.values, test_size=0.2, random_state=42)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((161912,), (161912,), (40479,), (40479,))

## Completely pretrained but a little finetune

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

x_train_tokens = tokenizer(list(x_train), padding=True, truncation=True, return_tensors='pt', max_length=16)
x_test_tokens = tokenizer(list(x_test), padding=True, truncation=True, return_tensors='pt', max_length=16)

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.long)
y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.long)

train_dataset = TensorDataset(x_train_tokens['input_ids'], x_train_tokens['attention_mask'], y_train_tensor)
test_dataset = TensorDataset(x_test_tokens['input_ids'], x_test_tokens['attention_mask'], y_test_tensor)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(label_encoder.classes_))

optimizer = AdamW(model.parameters(), lr=2e-5)

criterion = nn.CrossEntropyLoss()
num_epochs = 2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")

model.eval()
all_preds = []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.extend(preds)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2, Loss: 1.6350
Epoch 2/2, Loss: 0.6824


In [12]:
num_epochs = 2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")

model.eval()
all_preds = []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.extend(preds)

Epoch 1/2, Loss: 0.4697
Epoch 2/2, Loss: 0.3622


In [16]:
label_encoder.classes_

array(['Achinese', 'Afrikaans', 'Akan', 'Albanian', 'Amharic', 'Arabic',
       'Armenian', 'Assamese', 'Asturian', 'Awadhi', 'Aymara',
       'Azerbaijani', 'Balinese', 'Bambara', 'Bashkir', 'Basque',
       'Belarusian', 'Bemba (Zambia)', 'Bengali', 'Bhojpuri', 'Buginese',
       'Bulgarian', 'Burmese', 'Catalan', 'Cebuano',
       'Central Atlas Tamazight', 'Chhattisgarhi', 'Chinese', 'Chokwe',
       'Crimean Tatar', 'Czech', 'Danish', 'Dinka', 'Dutch', 'Dyula',
       'Dzongkha', 'English', 'Esperanto', 'Estonian', 'Ewe', 'Faroese',
       'Fijian', 'Finnish', 'Fon', 'French', 'Friulian', 'Fulah',
       'Galician', 'Ganda', 'Georgian', 'German', 'Guarani', 'Gujarati',
       'Haitian', 'Hausa', 'Hebrew', 'Hindi', 'Hungarian', 'Icelandic',
       'Igbo', 'Iloko', 'Irish', 'Italian', 'Japanese', 'Javanese',
       'Kabiyè', 'Kabuverdianu', 'Kabyle', 'Kachin', 'Kamba (Kenya)',
       'Kannada', 'Kanuri', 'Kashmiri', 'Kazakh', 'Khmer', 'Kikuyu',
       'Kimbundu', 'Kinyarwanda', 'Kir

In [13]:
torch.save(model, '/content/drive/MyDrive/short_text_bert.pth')

In [14]:
y_pred = label_encoder.inverse_transform(all_preds)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_rep)

Accuracy: 0.8257
                          precision    recall  f1-score   support

                Achinese       0.84      0.75      0.79       401
               Afrikaans       0.90      0.83      0.87       198
                    Akan       0.85      0.89      0.87       421
                Albanian       0.96      0.92      0.94       194
                 Amharic       0.25      0.04      0.06       200
                  Arabic       0.98      0.99      0.99      1773
                Armenian       1.00      1.00      1.00       176
                Assamese       0.98      0.95      0.97       195
                Asturian       0.76      0.66      0.71       183
                  Awadhi       0.44      0.70      0.54       181
                  Aymara       0.96      0.81      0.88       207
             Azerbaijani       0.96      0.86      0.91       420
                Balinese       0.72      0.66      0.69       192
                 Bambara       0.91      0.68      0.78   

In [15]:
# Function to predict the language of a sentence
def predict_language(sentence, model, label_encoder, device):
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=128)
    model.eval()
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class = torch.argmax(logits, dim=1).item()
    predicted_language = label_encoder.inverse_transform([predicted_class])[0]
    return predicted_language

sentence1 = "This is a sample text."
sentence2 = 'मेरा नाम हर्ष हे'
sentence3 = 'すみません、その駅まで案内していただけますか'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

predicted_language = predict_language(sentence1, model, label_encoder, device)
print(f"Predicted Language: {predicted_language}")
predicted_language = predict_language(sentence2, model, label_encoder, device)
print(f"Predicted Language: {predicted_language}")
predicted_language = predict_language(sentence3, model, label_encoder, device)
print(f"Predicted Language: {predicted_language}")

Predicted Language: English
Predicted Language: Awadhi
Predicted Language: Japanese


## Naive Bayes

In [17]:
# function to clean text
def clean_txt(text):
    text=text.lower()
    text=re.sub(r'[^\w\s]',' ',text)
    text=re.sub(r'[_0-9]',' ',text)
    text=re.sub(r'\s\s+',' ',text)
    return text

x_train = [clean_txt(text) for text in tqdm(x_train)]
x_test = [clean_txt(text) for text in tqdm(x_test)]

tfidf = TfidfVectorizer()
tfidf.fit(x_train)
x_train_ready = tfidf.transform(x_train)
x_test_ready = tfidf.transform(x_test)

x_train_ready,x_test_ready

enc = LabelEncoder()
enc.fit(y_train)
y_train_ready = enc.transform(y_train)
y_test_ready = enc.transform(y_test)
labels = enc.classes_
nb = MultinomialNB()

100%|██████████| 161912/161912 [00:01<00:00, 161505.03it/s]
100%|██████████| 40479/40479 [00:00<00:00, 266969.67it/s]


In [18]:
from sklearn.metrics import classification_report
nb.fit(x_train_ready, y_train_ready)
y_pred = nb.predict(x_test_ready)
y_pred_original = enc.inverse_transform(y_pred)
classification_rep = classification_report(y_test, y_pred_original, target_names=labels)
print(classification_rep)

                          precision    recall  f1-score   support

                Achinese       0.95      0.52      0.67       401
               Afrikaans       0.75      0.55      0.64       198
                    Akan       0.64      0.79      0.71       421
                Albanian       0.99      0.47      0.64       194
                 Amharic       1.00      0.22      0.36       200
                  Arabic       0.09      0.99      0.17      1773
                Armenian       1.00      0.15      0.26       176
                Assamese       0.94      0.52      0.67       195
                Asturian       0.56      0.19      0.29       183
                  Awadhi       0.19      0.10      0.14       181
                  Aymara       0.94      0.24      0.38       207
             Azerbaijani       0.75      0.32      0.45       420
                Balinese       0.96      0.23      0.37       192
                 Bambara       0.85      0.30      0.44       196
         

In [19]:
# use pipeline to combine prefitted vectorizer and trained model into one object
model = Pipeline([('vectorizer',tfidf),('nb',nb)])
# function to predict language from text
def predict(text):
    pred = model.predict([clean_txt(text)])
    ans = enc.inverse_transform(pred)
    return ans[0]

predict('my name is harsh'), predict('मेरा नाम हर्ष हे'), predict('mi nombre es harsh'), predict('меня зовут Харш'), predict('mon nom est harsh')

('Arabic', 'Nepali (macrolanguage)', 'Arabic', 'Arabic', 'Arabic')