In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset files from that path.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
# Copy the WiLi-2018 archive from Drive into the working directory, extract it,
# rename the extracted CSV to the name the loading cell reads, and delete the archive.
!cp  /content/drive/MyDrive/WiLi_2018_wikipedia_dataset.zip .
!unzip WiLi_2018_wikipedia_dataset.zip
!mv dataset.csv WiLi_2018_wikipedia_dataset.csv
!rm WiLi_2018_wikipedia_dataset.zip

Archive:  WiLi_2018_wikipedia_dataset.zip
  inflating: dataset.csv             


In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m40.1 MB/s[0m eta [36m0:00:0

In [None]:
# basic libraries
import os
import re
import pickle
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm

# visualization tools
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# preprocessing tools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# model building tools
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report, accuracy_score

## Wiki dataset 22

In [None]:
# set directories
input_dir = './'
working_dir = './'
# Load the WiLi-2018 CSV extracted by the earlier shell cell.
data = pd.read_csv('WiLi_2018_wikipedia_dataset.csv')
# Rename the columns positionally to the names the rest of the notebook uses.
data.columns = ('text','language')
# NOTE(review): only a cell's last expression is displayed, so this head() call shows nothing.
data.head()
len(data)

22000

In [None]:
# Display the loaded frame (pandas elides the middle rows in the rendered output).
data

Unnamed: 0,text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch
...,...,...
21995,hors du terrain les années et sont des année...,French
21996,ใน พศ หลักจากที่เสด็จประพาสแหลมมลายู ชวา อินเ...,Thai
21997,con motivo de la celebración del septuagésimoq...,Spanish
21998,年月，當時還只有歲的她在美國出道，以mai-k名義推出首張英文《baby i like》，由...,Chinese


## All dataset 455

In [None]:
# set directories
input_dir = './'
working_dir = './'
# data = pd.read_csv('WiLi_2018_wikipedia_dataset.csv')
# Alternative dataset: rebinds `data`, so whichever dataset cell ran last feeds the rest of the notebook.
data = pd.read_feather('/content/drive/MyDrive/final_dataset.feather')
# Rename the columns positionally to the names the rest of the notebook uses.
data.columns = ('text','language')
# NOTE(review): only a cell's last expression is displayed, so this head() call shows nothing.
data.head()
len(data)

11123049

## Flores dataset

In [None]:
# set directories
input_dir = './'
working_dir = './'
# data = pd.read_csv('WiLi_2018_wikipedia_dataset.csv')
# Alternative dataset: rebinds `data`, so whichever dataset cell ran last feeds the rest of the notebook.
data = pd.read_feather('/content/drive/MyDrive/flores_dataset.feather')
# Rename the columns positionally to the names the rest of the notebook uses.
data.columns = ('text','language')
# NOTE(review): only a cell's last expression is displayed, so this head() call shows nothing.
data.head()
len(data)

202391

In [None]:
# Calculate the total number of languages
# NOTE(review): `language_groups` is not defined in any cell visible here, so this raises
# NameError on a fresh kernel. The code only requires a mapping whose values have a len();
# presumably group name -> list of languages — confirm and restore the defining cell.
total_languages = sum(len(languages) for languages in language_groups.values())

print(f"Total number of languages: {total_languages}")

Total number of languages: 152


## Preprocessing

In [None]:
# Hold out 20% of the rows as the test split; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(data.text.values, data.language.values, test_size=0.2, random_state=42)
# Display the shapes of the four resulting arrays.
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((161912,), (161912,), (40479,), (40479,))

In [None]:
# How many distinct languages does the currently loaded dataset contain?
print(f"Number of languages: {data.language.nunique()}\n")
# print(f"Number of data points of individual languages:\n{data.language.value_counts()}")

Number of languages: 176



In [None]:
# Sentence length distribution over the entire dataset, where "length" is the number of
# whitespace-separated tokens in each text.
sent_lengths = [len(text.split()) for text in data.text.values]
bins = list(range(0, 401, 25))  # 25-token-wide bins covering 0..400

# Create the two panels once and draw on explicit axes instead of re-creating them with plt.subplot.
fig, (ax_count, ax_density) = plt.subplots(1, 2, figsize=(13, 5))
fig.suptitle('Sentence length distribution')

# Left panel: raw counts per bin.
sns.histplot(sent_lengths, bins=bins, ax=ax_count)
ax_count.set(xlabel='Tokens per text', ylabel='Number of texts')

# Right panel: density histogram with a KDE overlay. sns.distplot is deprecated in seaborn;
# histplot(stat='density', kde=True) is its documented replacement.
sns.histplot(sent_lengths, bins=bins, stat='density', kde=True, ax=ax_density)
ax_density.set(xlabel='Tokens per text', ylabel='Density')
plt.show()

In [None]:
# function to clean text
def clean_txt(text):
    """Lower-case `text` and strip punctuation, underscores and ASCII digits.

    Every removed character is replaced by a space, and runs of two or more
    whitespace characters are collapsed into a single space. Leading and trailing
    whitespace is not stripped.

    Args:
        text: Input string.

    Returns:
        The cleaned string.
    """
    import unicodedata  # local import: only this helper needs it

    def _blank_unless_mark(match):
        # Python's `re` does not treat combining marks (Unicode category M*) as `\w`.
        # Blanking them splits words in scripts that attach vowels/diacritics as marks
        # (Devanagari, Tamil, Thai, Arabic, ...), so they are kept here.
        ch = match.group(0)
        return ch if unicodedata.category(ch).startswith('M') else ' '

    text = text.lower()
    text = re.sub(r'[^\w\s]', _blank_unless_mark, text)
    text = re.sub(r'[_0-9]', ' ', text)
    text = re.sub(r'\s\s+', ' ', text)
    return text

txt = 'my (&*(()))name %$#is harsh_priye'
print(clean_txt(txt))

my name is harsh priye


## Pretrained with freezing weights so only classifier is learned (not used)

Even though this model could potentially produce good results, it requires more computational resources.

In [None]:
# Frozen-encoder setup: tokenize the split, build DataLoaders, load multilingual BERT
# and leave only its classification head trainable.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')


# Tokenize every text up front as PyTorch tensors, padded and truncated to at most 128 tokens.
x_train_tokens = tokenizer(list(x_train), padding=True, truncation=True, return_tensors='pt', max_length=128)
x_test_tokens = tokenizer(list(x_test), padding=True, truncation=True, return_tensors='pt', max_length=128)


# Map language names to integer class ids; the encoder is fitted on the training labels only.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)


y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.long)
y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.long)


# Each dataset item is (input_ids, attention_mask, label).
train_dataset = TensorDataset(x_train_tokens['input_ids'], x_train_tokens['attention_mask'], y_train_tensor)
test_dataset = TensorDataset(x_test_tokens['input_ids'], x_test_tokens['attention_mask'], y_test_tensor)


batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# One output logit per language seen in the training labels.
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(label_encoder.classes_))


# Freeze the BERT encoder; only parameters outside model.bert keep requires_grad=True.
for param in model.bert.parameters():
    param.requires_grad = False


# Only the classifier head's parameters are handed to the optimizer.
# NOTE(review): lr=2e-5 is the same rate the full fine-tuning section uses; the recorded loss
# for this head-only run barely moves (3.05 -> 2.98), so a larger rate may be needed — confirm.
optimizer = AdamW(model.classifier.parameters(), lr=2e-5)

# NOTE(review): `criterion` is not used by the training loop in the next cell, which reads outputs.loss.
criterion = nn.CrossEntropyLoss()

num_epochs = 2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Last expression of the cell, so the model structure is echoed as the cell output.
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [None]:
# Train for num_epochs, printing the mean batch loss of each epoch.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        # Passing labels makes the model return the loss together with the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")


# Collect the arg-max class id for every test example; scoring happens in the next cell.
model.eval()
all_preds = []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        # labels are moved to the device but not used inside this loop.
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.extend(preds)

Epoch 1/2, Loss: 3.0530
Epoch 2/2, Loss: 2.9815


In [None]:

# Map predicted class ids back to language names and score them against the held-out labels.
y_pred = label_encoder.inverse_transform(all_preds)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_rep)

Accuracy: 0.1264
              precision    recall  f1-score   support

      Arabic       0.15      0.35      0.21       113
     Chinese       0.00      0.00      0.00       108
       Dutch       0.00      0.00      0.00       110
     English       0.08      0.75      0.14       102
    Estonian       0.10      0.04      0.06        98
      French       0.09      0.05      0.06       105
       Hindi       1.00      0.01      0.02       112
  Indonesian       0.00      0.00      0.00       102
    Japanese       0.12      0.70      0.20        90
      Korean       0.00      0.00      0.00        87
       Latin       0.18      0.06      0.09       103
     Persian       0.00      0.00      0.00       107
   Portugese       0.00      0.00      0.00        98
      Pushto       0.58      0.41      0.48       101
    Romanian       0.00      0.00      0.00        98
     Russian       0.09      0.09      0.09       110
     Spanish       0.13      0.07      0.09        95
     Swedi

In [None]:
# Function to predict the language of a sentence
def predict_language(sentence, model, label_encoder, device, tokenizer=None):
    """Return the predicted language label for one sentence.

    Args:
        sentence: Text to classify.
        model: Classifier called as `model(**inputs)`; its result must expose `.logits`.
        label_encoder: Fitted encoder whose `inverse_transform` maps class ids to labels.
        device: Torch device the tokenized inputs are moved to (the model must already be there).
        tokenizer: Callable that turns `sentence` into a dict of tensors. When omitted, the
            notebook-level `tokenizer` is looked up at call time. Pass it explicitly to avoid
            depending on which cell last rebound that global (the Keras section rebinds it).

    Returns:
        The label produced by `label_encoder` for the highest-scoring class.
    """
    if tokenizer is None:
        # Backward-compatible fallback for existing four-argument callers.
        tokenizer = globals()['tokenizer']
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=128)
    model.eval()
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class = torch.argmax(logits, dim=1).item()
    predicted_language = label_encoder.inverse_transform([predicted_class])[0]
    return predicted_language

# Smoke test: one English and one Devanagari-script sentence.
sentence1 = "This is a sample text."
sentence2 = 'मेरा नाम हर्ष हे'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

predicted_language = predict_language(sentence1, model, label_encoder, device)
print(f"Predicted Language: {predicted_language}")
predicted_language = predict_language(sentence2, model, label_encoder, device)
print(f"Predicted Language: {predicted_language}")

Predicted Language: English
Predicted Language: English


## Pretrained BERT with a little fine-tuning of all weights

In [None]:
# Full fine-tuning setup: same data pipeline as the frozen variant, but no parameters are frozen.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Tokenize every text up front as PyTorch tensors, padded and truncated to at most 128 tokens.
x_train_tokens = tokenizer(list(x_train), padding=True, truncation=True, return_tensors='pt', max_length=128)
x_test_tokens = tokenizer(list(x_test), padding=True, truncation=True, return_tensors='pt', max_length=128)

# Map language names to integer class ids; the encoder is fitted on the training labels only.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.long)
y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.long)

# Each dataset item is (input_ids, attention_mask, label).
train_dataset = TensorDataset(x_train_tokens['input_ids'], x_train_tokens['attention_mask'], y_train_tensor)
test_dataset = TensorDataset(x_test_tokens['input_ids'], x_test_tokens['attention_mask'], y_test_tensor)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# One output logit per language seen in the training labels.
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(label_encoder.classes_))

# All model parameters (encoder and classifier head) are handed to the optimizer.
optimizer = AdamW(model.parameters(), lr=2e-5)

# NOTE(review): `criterion` is not used by the loop below, which reads outputs.loss.
criterion = nn.CrossEntropyLoss()
num_epochs = 2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Train for num_epochs, printing the mean batch loss of each epoch.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        # Passing labels makes the model return the loss together with the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")

# Collect the arg-max class id for every test example; scoring happens in a later cell.
model.eval()
all_preds = []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        # labels are moved to the device but not used inside this loop.
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.extend(preds)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2, Loss: 0.6472
Epoch 2/2, Loss: 0.0727


In [None]:
# Persist the fine-tuned model object to Drive.
# NOTE(review): the fitted label_encoder is not saved here, so class ids cannot be mapped back
# to language names from this file alone — confirm whether it is stored elsewhere.
torch.save(model, '/content/drive/MyDrive/finetuned_bert.pth')

In [None]:
# Map predicted class ids back to language names and score them against the held-out labels.
y_pred = label_encoder.inverse_transform(all_preds)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_rep)

Accuracy: 0.9792
                          precision    recall  f1-score   support

                Achinese       0.99      0.98      0.99       376
               Afrikaans       0.98      0.99      0.99       192
                    Akan       1.00      1.00      1.00       416
                Albanian       1.00      1.00      1.00       190
                 Amharic       0.41      0.91      0.57       192
                  Arabic       1.00      1.00      1.00      1858
                Armenian       1.00      1.00      1.00       196
                Assamese       1.00      1.00      1.00       206
                Asturian       0.94      1.00      0.97       207
                  Awadhi       0.96      0.99      0.97       218
                  Aymara       1.00      0.99      0.99       185
             Azerbaijani       1.00      0.99      1.00       400
                Balinese       0.97      0.96      0.96       200
                 Bambara       0.96      0.92      0.94   

In [None]:
# Function to predict the language of a sentence
def predict_language(sentence, model, label_encoder, device, tokenizer=None):
    """Return the predicted language label for one sentence.

    Args:
        sentence: Text to classify.
        model: Classifier called as `model(**inputs)`; its result must expose `.logits`.
        label_encoder: Fitted encoder whose `inverse_transform` maps class ids to labels.
        device: Torch device the tokenized inputs are moved to (the model must already be there).
        tokenizer: Callable that turns `sentence` into a dict of tensors. When omitted, the
            notebook-level `tokenizer` is looked up at call time. Pass it explicitly to avoid
            depending on which cell last rebound that global (the Keras section rebinds it).

    Returns:
        The label produced by `label_encoder` for the highest-scoring class.
    """
    if tokenizer is None:
        # Backward-compatible fallback for existing four-argument callers.
        tokenizer = globals()['tokenizer']
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=128)
    model.eval()
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class = torch.argmax(logits, dim=1).item()
    predicted_language = label_encoder.inverse_transform([predicted_class])[0]
    return predicted_language

# Smoke test: English, Devanagari-script and Japanese sentences.
sentence1 = "This is a sample text."
sentence2 = 'मेरा नाम हर्ष हे'
sentence3 = 'すみません、その駅まで案内していただけますか'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

predicted_language = predict_language(sentence1, model, label_encoder, device)
print(f"Predicted Language: {predicted_language}")
predicted_language = predict_language(sentence2, model, label_encoder, device)
print(f"Predicted Language: {predicted_language}")
predicted_language = predict_language(sentence3, model, label_encoder, device)
print(f"Predicted Language: {predicted_language}")

Predicted Language: English
Predicted Language: Chhattisgarhi
Predicted Language: Japanese


## Bi-LSTM (inspired by Apple)

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report, accuracy_score

# The BiLSTM reuses the multilingual BERT tokenizer only to turn text into token ids.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Tokenize every text up front as PyTorch tensors, padded and truncated to at most 128 tokens.
x_train_tokens = tokenizer(list(x_train), padding=True, truncation=True, return_tensors='pt', max_length=128)
x_test_tokens = tokenizer(list(x_test), padding=True, truncation=True, return_tensors='pt', max_length=128)

# Map language names to integer class ids; the encoder is fitted on the training labels only.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.long)
y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.long)

# Each dataset item is (input_ids, attention_mask, label); the BiLSTM below only consumes input_ids.
train_dataset = TensorDataset(x_train_tokens['input_ids'], x_train_tokens['attention_mask'], y_train_tensor)
test_dataset = TensorDataset(x_test_tokens['input_ids'], x_test_tokens['attention_mask'], y_test_tensor)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Define a Bidirectional LSTM model
class BiLSTMModel(nn.Module):
    """Embedding -> bidirectional LSTM -> linear classifier over token-id sequences.

    The classifier reads the final hidden state of each direction, computed over the
    real (non-padding) tokens only, so the logits for a sentence do not depend on how
    much right-padding its batch happened to add.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of the LSTM in each direction.
        num_classes: Number of output logits.
        padding_idx: Integer token id used for right-padding. Defaults to 0, the pad id
            of the BERT tokenizer used in this notebook.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, padding_idx=0):
        super().__init__()
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(2 * hidden_dim, num_classes)  # Multiply by 2 for bidirectional LSTM

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for right-padded token ids `x` (batch, seq)."""
        # Number of real tokens per row; clamp so an all-padding row still has length 1.
        lengths = (x != self.padding_idx).sum(dim=1).clamp(min=1).cpu()
        embedded = self.embedding(x)
        # Packing makes the LSTM stop at each row's true end. Indexing the last padded timestep
        # instead would read a forward state that ran over padding and a backward state that
        # has processed only a single step.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)
        # h_n[-2] is the forward direction's final state, h_n[-1] the backward direction's.
        features = torch.cat((h_n[-2], h_n[-1]), dim=1)
        return self.fc(features)

# Initialize the BiLSTM model
# The embedding table is sized to the tokenizer's vocabulary (len(tokenizer)).
vocab_size = len(tokenizer)
embedding_dim = 128
hidden_dim = 128
num_classes = len(label_encoder.classes_)

model = BiLSTMModel(vocab_size, embedding_dim, hidden_dim, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Train for num_epochs, printing the mean batch loss of each epoch.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        # The model takes token ids only; attention_mask is not passed to it.
        logits = model(input_ids)
        loss = criterion(logits, labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")

# Collect the arg-max class id for every test example.
model.eval()
all_preds = []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        logits = model(input_ids)
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.extend(preds)

# Map predicted class ids back to language names.
y_pred = label_encoder.inverse_transform(all_preds)

# Calculate accuracy and print classification report
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(classification_rep)


Epoch 1/5, Loss: 2.1782
Epoch 2/5, Loss: 1.1910
Epoch 3/5, Loss: 0.5931
Epoch 4/5, Loss: 0.2232
Epoch 5/5, Loss: 0.1031
Accuracy: 0.9682
              precision    recall  f1-score   support

      Arabic       1.00      0.99      1.00       113
     Chinese       0.99      1.00      1.00       108
       Dutch       0.97      0.96      0.97       110
     English       0.85      0.95      0.90       102
    Estonian       0.93      0.87      0.90        98
      French       0.95      0.99      0.97       105
       Hindi       1.00      0.98      0.99       112
  Indonesian       0.98      0.94      0.96       102
    Japanese       1.00      0.97      0.98        90
      Korean       1.00      1.00      1.00        87
       Latin       0.85      0.93      0.89       103
     Persian       1.00      0.99      1.00       107
   Portugese       0.98      1.00      0.99        98
      Pushto       1.00      0.95      0.97       101
    Romanian       0.98      0.97      0.97        9

In [None]:
# Function to predict the language of a sentence
def predict_language(sentence, model, label_encoder, device, tokenizer=None):
    """Return the predicted language label for one sentence using a token-id-only model.

    Args:
        sentence: Text to classify.
        model: Classifier called as `model(input_ids)` that returns a logits tensor.
        label_encoder: Fitted encoder whose `inverse_transform` maps class ids to labels.
        device: Torch device the tokenized inputs are moved to (the model must already be there).
        tokenizer: Callable that turns `sentence` into a dict of tensors containing
            'input_ids'. When omitted, the notebook-level `tokenizer` is looked up at call
            time. Pass it explicitly to avoid depending on which cell last rebound that
            global (the Keras section rebinds it).

    Returns:
        The label produced by `label_encoder` for the highest-scoring class.
    """
    if tokenizer is None:
        # Backward-compatible fallback for existing four-argument callers.
        tokenizer = globals()['tokenizer']
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=128)
    model.eval()
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        logits = model(inputs['input_ids'])
    predicted_class = torch.argmax(logits, dim=1).item()
    predicted_language = label_encoder.inverse_transform([predicted_class])[0]
    return predicted_language

# Smoke test: one English and one Cyrillic-script sentence.
sentence1 = "This is a sample text."
sentence2 = 'Это русский'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

predicted_language = predict_language(sentence1, model, label_encoder, device)
print(f"Predicted Language: {predicted_language}")
predicted_language = predict_language(sentence2, model, label_encoder, device)
print(f"Predicted Language: {predicted_language}")


Predicted Language: Portugese
Predicted Language: Estonian


## Keras Bi-LSTM

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

# Load your dataset, assuming 'data' contains 'text' and 'language' columns
# x_train, x_test, y_train, y_test = train_test_split(data.text.values, data.language.values, test_size=0.1, random_state=42)

# NOTE: this rebinds the notebook-level `tokenizer` (a BertTokenizer in earlier sections)
# to a Keras Tokenizer whose vocabulary is fitted on the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

x_train_sequences = tokenizer.texts_to_sequences(x_train)
x_test_sequences = tokenizer.texts_to_sequences(x_test)

# Pad or truncate every sequence at its end ('post') to a fixed length.
max_sequence_length = 128  # Set your desired sequence length
x_train_padded = pad_sequences(x_train_sequences, maxlen=max_sequence_length, padding='post', truncating='post')
x_test_padded = pad_sequences(x_test_sequences, maxlen=max_sequence_length, padding='post', truncating='post')

# Map language names to integer class ids; the encoder is fitted on the training labels only.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Define the BiLSTM model
# Embedding -> BiLSTM returning every timestep -> max over time -> dense -> softmax over languages.
# input_dim is the fitted vocabulary size plus one.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_sequence_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
    tf.keras.layers.GlobalMaxPool1D(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(len(label_encoder.classes_), activation='softmax')
])


# The sparse loss variant is used because the labels are integer class ids, not one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

num_epochs = 5
batch_size = 32

# NOTE(review): validation_data is the test split, so the val_* metrics printed per epoch are
# measured on the same data as the final report below.
history = model.fit(
    x_train_padded, y_train_encoded,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_data=(x_test_padded, y_test_encoded),
    verbose=2
)


# Class probabilities for the test split, reduced to the arg-max class id per row.
y_pred = model.predict(x_test_padded)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = y_test_encoded


# Both label arrays are integer ids here; target_names supplies the language names for the report.
classification_rep = classification_report(y_test_classes, y_pred_classes, target_names=label_encoder.classes_)
print("Classification Report:\n", classification_rep)


accuracy = accuracy_score(y_test_classes, y_pred_classes)
print(f"Accuracy: {accuracy:.4f}")

def predict_language(sentence, model, tokenizer, label_encoder):
    """Predict the language label of a single sentence with the Keras model.

    Args:
        sentence: Text to classify.
        model: Keras model whose `predict` returns one row of class scores per input.
        tokenizer: Fitted tokenizer exposing `texts_to_sequences`.
        label_encoder: Fitted encoder whose `inverse_transform` maps class ids to labels.

    Returns:
        The label for the highest-scoring class.
    """
    # Encode the sentence exactly like the training data: ids, then post-pad/truncate.
    token_ids = tokenizer.texts_to_sequences([sentence])
    model_input = pad_sequences(
        token_ids,
        maxlen=max_sequence_length,
        padding='post',
        truncating='post',
    )
    # Score the single-row batch and pick its best class.
    class_scores = model.predict(model_input)
    best_class = np.argmax(class_scores, axis=1)[0]
    return label_encoder.inverse_transform([best_class])[0]

# Example usage
# Smoke test: one English and one Cyrillic-script sentence.
sentence1 = "This is a sample text."
sentence2 = 'Это русский'

predicted_language = predict_language(sentence1, model, tokenizer, label_encoder)
print(f"Predicted Language: {predicted_language}")

predicted_language = predict_language(sentence2, model, tokenizer, label_encoder)
print(f"Predicted Language: {predicted_language}")


Epoch 1/5
619/619 - 113s - loss: 0.6715 - accuracy: 0.8062 - val_loss: 0.2290 - val_accuracy: 0.9300 - 113s/epoch - 183ms/step
Epoch 2/5
619/619 - 44s - loss: 0.0744 - accuracy: 0.9802 - val_loss: 0.1893 - val_accuracy: 0.9432 - 44s/epoch - 71ms/step
Epoch 3/5
619/619 - 25s - loss: 0.0121 - accuracy: 0.9969 - val_loss: 0.2172 - val_accuracy: 0.9441 - 25s/epoch - 40ms/step
Epoch 4/5
619/619 - 21s - loss: 0.0025 - accuracy: 0.9994 - val_loss: 0.2181 - val_accuracy: 0.9459 - 21s/epoch - 35ms/step
Epoch 5/5
619/619 - 21s - loss: 0.0020 - accuracy: 0.9995 - val_loss: 0.2060 - val_accuracy: 0.9514 - 21s/epoch - 34ms/step
Classification Report:
               precision    recall  f1-score   support

      Arabic       1.00      0.99      1.00       113
     Chinese       0.91      0.38      0.54       108
       Dutch       0.99      1.00      1.00       110
     English       0.92      0.96      0.94       102
    Estonian       1.00      0.93      0.96        98
      French       0.98     

In [None]:
# Example usage
# Second smoke test: a longer English sentence and a Spanish phrase.
sentence1 = "This is a sample text predic tsomething why is it wrong oh my goodness."
sentence2 = 'mi corazon esta noche'

predicted_language = predict_language(sentence1, model, tokenizer, label_encoder)
print(f"Predicted Language: {predicted_language}")

predicted_language = predict_language(sentence2, model, tokenizer, label_encoder)
print(f"Predicted Language: {predicted_language}")

Predicted Language: Urdu
Predicted Language: Japanese


## Naive Bayes

In [None]:
# function to clean text
def clean_txt(text):
    """Lower-case `text` and strip punctuation, underscores and ASCII digits.

    Every removed character is replaced by a space, and runs of two or more
    whitespace characters are collapsed into a single space. Leading and trailing
    whitespace is not stripped.

    Args:
        text: Input string.

    Returns:
        The cleaned string.
    """
    import unicodedata  # local import: only this helper needs it

    def _blank_unless_mark(match):
        # Python's `re` does not treat combining marks (Unicode category M*) as `\w`.
        # Blanking them splits words in scripts that attach vowels/diacritics as marks
        # (Devanagari, Tamil, Thai, Arabic, ...), so they are kept here.
        ch = match.group(0)
        return ch if unicodedata.category(ch).startswith('M') else ' '

    text = text.lower()
    text = re.sub(r'[^\w\s]', _blank_unless_mark, text)
    text = re.sub(r'[_0-9]', ' ', text)
    text = re.sub(r'\s\s+', ' ', text)
    return text

# Clean into new names rather than overwriting x_train / x_test: the raw split stays available
# to the other model sections, and re-running this cell always starts from the same input.
x_train_clean = [clean_txt(text) for text in tqdm(x_train)]
x_test_clean = [clean_txt(text) for text in tqdm(x_test)]

# Fit the TF-IDF vocabulary on the training texts only, then vectorise both splits.
tfidf = TfidfVectorizer()
x_train_ready = tfidf.fit_transform(x_train_clean)
x_test_ready = tfidf.transform(x_test_clean)

# Map language names to integer class ids; the encoder is fitted on the training labels only.
enc = LabelEncoder()
y_train_ready = enc.fit_transform(y_train)
y_test_ready = enc.transform(y_test)
labels = enc.classes_
nb = MultinomialNB()

100%|██████████| 19800/19800 [00:00<00:00, 28332.87it/s]
100%|██████████| 2200/2200 [00:00<00:00, 24942.54it/s]


In [None]:
from sklearn.metrics import classification_report
# Fit Naive Bayes on the TF-IDF features and predict class ids for the test split.
nb.fit(x_train_ready, y_train_ready)
y_pred = nb.predict(x_test_ready)
# Map predicted ids back to language names so they can be compared with y_test.
y_pred_original = enc.inverse_transform(y_pred)
classification_rep = classification_report(y_test, y_pred_original, target_names=labels)
print(classification_rep)

              precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00       113
     Chinese       0.96      0.44      0.60       108
       Dutch       0.97      0.99      0.98       110
     English       0.73      1.00      0.85       102
    Estonian       0.98      0.96      0.97        98
      French       0.95      0.99      0.97       105
       Hindi       1.00      0.98      0.99       112
  Indonesian       0.99      0.97      0.98       102
    Japanese       1.00      0.64      0.78        90
      Korean       1.00      1.00      1.00        87
       Latin       0.98      0.90      0.94       103
     Persian       1.00      1.00      1.00       107
   Portugese       1.00      0.97      0.98        98
      Pushto       1.00      0.97      0.98       101
    Romanian       0.97      0.99      0.98        98
     Russian       0.97      1.00      0.99       110
     Spanish       0.98      1.00      0.99        95
     Swedish       0.56    

In [None]:
# Bundle the already-fitted vectorizer and the trained classifier into a single estimator.
model = Pipeline(steps=[('vectorizer', tfidf), ('nb', nb)])


# function to predict language from text
def predict(text):
    """Return the language name predicted for `text` by the TF-IDF + Naive Bayes pipeline."""
    cleaned = clean_txt(text)
    encoded = model.predict([cleaned])
    decoded = enc.inverse_transform(encoded)
    return decoded[0]

predict('my name is harsh'), predict('मेरा नाम हर्ष हे'), predict('mi nombre es harsh'), predict('меня зовут Харш'), predict('mon nom est harsh')

('English', 'Hindi', 'Spanish', 'Russian', 'French')