# Imports

In [1]:
!pip3 install fasttext
!pip3 install transformers

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=4296186 sha256=a88cb2e01384f95d99bc8beaa2950fee9c9aff0e1c752880960da6796bccb460
  Stored in directory: /root/.cache/pip/wheels/0d/a2/00/81db54d3e6a8199b829d58e02cec2ddb20ce3e59fad8d3c92a
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Success

In [2]:
import zipfile
import os
import re
import pandas as pd
import fasttext
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Dataset

In [3]:
# Source archive holding the training and validation files.
zip_file_path = '/content/native_script_train_valid_data.zip'
# Absolute target directory: the following cells read from
# '/content/extracted_data/...', so the extraction location must not depend
# on the kernel's current working directory.
extract_to = '/content/extracted_data/'

# Create the target directory up front; a no-op if it already exists.
os.makedirs(extract_to, exist_ok=True)

# Unpack every member of the archive into the target directory.
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)

In [4]:
# Path of the combined training file inside the extracted archive.
data_file_path = os.path.join(
    '/content/extracted_data/Native_script_data',
    'train_combine.txt',
)

# The file has no header row; only the first tab-separated field of each
# line is kept, in a single column named 'labels'.
data = pd.read_csv(
    data_file_path,
    sep='\t',
    header=None,
    usecols=[0],
    names=['labels'],
)

# Show the first five rows as a quick sanity check.
print(data.head(5))

                                              labels
0  __label__Kashmiri_Arab ترو وہن شتَن شعرن منز و...
1  __label__Kannada ವೀರೇಶ , ರೋಜಲಿನ್ ಆಡಂ , ಮುದ್ದೆ ...
2  __label__Nepali उनीहरूले तपाईंलाई स्पष्ट लिखित...
3  __label__Manipuri_Mei ꯍꯤꯕꯤ ꯅꯠꯇ꯭ꯔꯒ ꯍꯦꯕꯦ ꯑꯁꯤ ꯒ꯭ꯔ...
4  __label__Hindi बाबा - ए - उर्दू के मौलवी अब्दु...


In [5]:
# The combined training file is passed to FastText training as-is.
formatted_data_path = '/'.join(
    ['/content/extracted_data/Native_script_data', 'train_combine.txt']
)

print(f"Using formatted data for FastText training from: {formatted_data_path}")

Using formatted data for FastText training from: /content/extracted_data/Native_script_data/train_combine.txt


In [6]:
def count_sentences_per_language(file_path):
    """Count how many lines of a labelled text file belong to each language.

    Every non-blank line is expected to begin with a ``__label__<Language>``
    token that is separated from the sentence by a single space.

    Args:
        file_path: Path of the UTF-8 encoded text file to scan.

    Returns:
        A ``defaultdict(int)`` mapping each language name (the leading token
        with ``__label__`` removed) to the number of lines that carry it.
    """
    language_count = defaultdict(int)

    # Decode explicitly as UTF-8: the data is multilingual, and relying on the
    # platform default encoding can fail or mis-decode on other systems.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines carry no label, so there is nothing to count.
                continue
            # partition() never raises, so a line consisting of a label only
            # is still counted instead of aborting the whole scan.
            language_label, _, _ = stripped.partition(' ')
            language = language_label.replace('__label__', '')
            language_count[language] += 1

    return language_count

# Locations of the training and validation splits.
train_data_path = '/content/extracted_data/Native_script_data/train_combine.txt'
valid_data_path = '/content/extracted_data/Native_script_data/valid_combine.txt'

# Per-language line counts for each split.
train_language_counts = count_sentences_per_language(train_data_path)
valid_language_counts = count_sentences_per_language(valid_data_path)

# Print both tables with the same loop; the second heading starts with a
# newline so the two tables are separated by a blank line.
report_sections = (
    ("Sentence counts in training data:", train_language_counts),
    ("\nSentence counts in validation data:", valid_language_counts),
)
for heading, language_counts in report_sections:
    print(heading)
    for language, count in language_counts.items():
        print(f"{language}: {count}")

Sentence counts in training data:
Kashmiri_Arab: 100000
Kannada: 100000
Nepali: 115554
Manipuri_Mei: 100000
Hindi: 100000
Tamil: 100000
Telugu: 100000
Oriya: 100000
Urdu: 105282
Manipuri_Beng: 100000
Bodo: 108240
Marathi: 100000
Sanskrit: 102315
Assamese: 100000
Maithili: 100277
Punjabi: 100000
Gujarati: 100000
Other: 100000
Bangla: 100000
Malayalam: 100000
Dogri: 100000
Konkani: 100000
Kashmiri_Deva: 100000
Sindhi: 100000
English: 100000
Santali: 100000

Sentence counts in validation data:
Kashmiri_Arab: 1497
Bangla: 5997
Tamil: 5997
Manipuri_Beng: 997
Marathi: 5997
Sindhi: 5996
Gujarati: 5997
Malayalam: 5997
Punjabi: 5997
Telugu: 5997
Hindi: 5997
Assamese: 997
Kannada: 5997
Urdu: 6376
Oriya: 997
Maithili: 1497
Sanskrit: 1497
English: 997
Manipuri_Mei: 500
Nepali: 1497
Konkani: 500
Bodo: 500
Dogri: 120
Kashmiri_Deva: 997
Santali: 345


# Model training

In [7]:
# Settings for the supervised fastText run, gathered in one place.
training_options = {
    'input': formatted_data_path,
    'label_prefix': '__label__',
    'epoch': 50,
    'lr': 0.1,
}
model = fasttext.train_supervised(**training_options)

# Persist the trained model next to the dataset files.
# NOTE(review): the `.ftz` suffix is conventionally used for quantized fastText
# models, and no quantize step is visible here - confirm the naming is intended.
model.save_model('/content/extracted_data/Native_script_data/language_id_model.ftz')

print("model trained")

model trained


# Prediction

In [8]:
# Sample sentence to classify.
new_text = "এইটো এটা উদাহৰণ বিবৃতি"

# Keep the full predict() result; its [0][0] element is the label reported below.
predicted_label = model.predict(new_text)

top_label = predicted_label[0][0]
print("Predicted Language: {}".format(top_label.replace('__label__', '')))

Predicted Language: Assamese


# Evaluation

In [9]:
# Validation split used to score the trained model.
test_data_path = '/content/extracted_data/Native_script_data/valid_combine.txt'

# model.test() yields three values, unpacked here as example count,
# precision and recall; the raw result is kept in test_results as well.
test_results = n_examples, precision, recall = model.test(test_data_path)

print("Number of examples: {}".format(n_examples))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))

Number of examples: 79283
Precision: 0.9316
Recall: 0.9316


In [10]:
# Reload the saved model from disk so the evaluation below uses the persisted file.
model = fasttext.load_model('/content/extracted_data/Native_script_data/language_id_model.ftz')

test_data_path = '/content/extracted_data/Native_script_data/valid_combine.txt'

# Parse "<label> <text>" lines into [label, text] pairs.
# - Decode explicitly as UTF-8 instead of relying on the platform default.
# - Skip blank and label-only lines: they have no text to predict on, and a
#   ragged row would otherwise put None in the 'text' column and make
#   model.predict() fail.
test_data = []
with open(test_data_path, 'r', encoding='utf-8') as f:
    for line in f:
        label, _, text = line.strip().partition(' ')
        if label and text:
            test_data.append([label, text])

test_df = pd.DataFrame(test_data, columns=['language', 'text'])

# Strip the label prefix as a literal string (not a regular expression).
test_df['language'] = test_df['language'].str.replace('__label__', '', regex=False)

# Top predicted label for every sentence, with the prefix removed as well.
test_df['predicted_language'] = test_df['text'].apply(lambda x: model.predict(x)[0][0])
test_df['predicted_language'] = test_df['predicted_language'].str.replace('__label__', '', regex=False)

# Per-language accuracy is the mean of the "prediction is correct" flag within
# each true language. A single groupby replaces re-filtering the whole frame
# twice per language; sort=False keeps languages in order of first appearance.
is_correct = test_df['language'] == test_df['predicted_language']
language_accuracy = is_correct.groupby(test_df['language'], sort=False).mean().to_dict()

for language, accuracy in language_accuracy.items():
    print(f"Accuracy for {language}: {accuracy:.4f}")

Accuracy for Kashmiri_Arab: 0.9846
Accuracy for Bangla: 0.9475
Accuracy for Tamil: 0.9210
Accuracy for Manipuri_Beng: 0.9840
Accuracy for Marathi: 0.8571
Accuracy for Sindhi: 0.9830
Accuracy for Gujarati: 0.9633
Accuracy for Malayalam: 0.9231
Accuracy for Punjabi: 0.9747
Accuracy for Telugu: 0.8889
Accuracy for Hindi: 0.8586
Accuracy for Assamese: 0.9970
Accuracy for Kannada: 0.9540
Accuracy for Urdu: 0.8802
Accuracy for Oriya: 0.9990
Accuracy for Maithili: 0.9345
Accuracy for Sanskrit: 0.9679
Accuracy for English: 0.9930
Accuracy for Manipuri_Mei: 1.0000
Accuracy for Nepali: 0.9893
Accuracy for Konkani: 0.9860
Accuracy for Bodo: 0.9860
Accuracy for Dogri: 0.9917
Accuracy for Kashmiri_Deva: 0.9428
Accuracy for Santali: 1.0000
