In [None]:
!pip install datasets

import pandas as pd
import random
from datasets import load_dataset
import glob
import os

# Step 1: Load the original dataset
url = "https://raw.githubusercontent.com/amankharwal/Website-data/master/dataset.csv"
df_orig = pd.read_csv(url)

# Step 2: Languages to add
langs = {"ta": "Tamil", "hi": "Hindi", "ml": "Malayalam", "te": "Telugu"}
sampler_size = 1000  # number of samples per dataset

new_samples = []

# Step 3: Fetch Samanantar parallel data
for code, lang_name in langs.items():
    ds = load_dataset("ai4bharat/samanantar", code, split="train")
    sampled = ds.shuffle(seed=42).select(range(sampler_size))
    for row in sampled:
        new_samples.append({"Text": row["tgt"], "language": lang_name})

# Step 4: Download IndicNLP corpus automatically (monolingual)
if not os.path.exists("indicnlp_corpus"):
    !git clone https://github.com/AI4Bharat/indicnlp_corpus.git

# Step 5: Sample IndicNLP text
for code, lang_name in langs.items():
    filepath = f"indicnlp_corpus/{code}/*.txt"
    lines = []
    for fname in glob.glob(filepath):
        with open(fname, encoding="utf-8") as f:
            lines.extend(f.readlines())
    if len(lines) >= sampler_size:
        sampled_lines = random.sample(lines, sampler_size)
    else:
        sampled_lines = lines
    new_samples.extend([{"Text": line.strip(), "language": lang_name} for line in sampled_lines])

# Step 6: Merge and save
df_new = pd.DataFrame(new_samples)
df_final = pd.concat([df_orig, df_new], ignore_index=True)
df_final.to_csv("updated_dataset_real_indic.csv", index=False)

print("✅ Done! Saved as updated_dataset_real_indic.csv")




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

ta/train-00000-of-00004.parquet:   0%|          | 0.00/189M [00:00<?, ?B/s]

ta/train-00001-of-00004.parquet:   0%|          | 0.00/189M [00:00<?, ?B/s]

ta/train-00002-of-00004.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

ta/train-00003-of-00004.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5264867 [00:00<?, ? examples/s]

hi/train-00000-of-00008.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

hi/train-00001-of-00008.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

hi/train-00002-of-00008.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

hi/train-00003-of-00008.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

hi/train-00004-of-00008.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

hi/train-00005-of-00008.parquet:   0%|          | 0.00/239M [00:00<?, ?B/s]

hi/train-00006-of-00008.parquet:   0%|          | 0.00/239M [00:00<?, ?B/s]

hi/train-00007-of-00008.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10125706 [00:00<?, ? examples/s]

ml/train-00000-of-00004.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

ml/train-00001-of-00004.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

ml/train-00002-of-00004.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

ml/train-00003-of-00004.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5924426 [00:00<?, ? examples/s]

te/train-00000-of-00003.parquet:   0%|          | 0.00/200M [00:00<?, ?B/s]

te/train-00001-of-00003.parquet:   0%|          | 0.00/200M [00:00<?, ?B/s]

te/train-00002-of-00003.parquet:   0%|          | 0.00/200M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4946035 [00:00<?, ? examples/s]

Cloning into 'indicnlp_corpus'...
remote: Enumerating objects: 182, done.[K
remote: Counting objects: 100% (182/182), done.[K
remote: Compressing objects: 100% (142/142), done.[K
Receiving objects: 100% (182/182), 181.53 KiB | 603.00 KiB/s, done.
remote: Total 182 (delta 52), reused 120 (delta 32), pack-reused 0 (from 0)[K
Resolving deltas: 100% (52/52), done.
✅ Done! Saved as updated_dataset_real_indic.csv


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# Reload the merged corpus written by the build cell and preview its first rows.
data = pd.read_csv("updated_dataset_real_indic.csv")
print(data.head(n=5))

                                                Text  language
0  klement gottwaldi surnukeha palsameeriti ning ...  Estonian
1  sebes joseph pereira thomas  på eng the jesuit...   Swedish
2  ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...      Thai
3  விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...     Tamil
4  de spons behoort tot het geslacht haliclona en...     Dutch


In [None]:
# Count missing values per column (isna is the canonical spelling of isnull).
data.isna().sum()

Unnamed: 0,0
Text,0
language,0


In [None]:
# Number of rows per language label, most frequent first.
data.loc[:, "language"].value_counts()

Unnamed: 0_level_0,count
language,Unnamed: 1_level_1
Tamil,2000
Hindi,2000
Swedish,1000
Estonian,1000
Dutch,1000
Japanese,1000
Turkish,1000
Thai,1000
Latin,1000
Urdu,1000


In [None]:
# Raw texts and their labels as NumPy arrays.
x = data["Text"].to_numpy()
y = data["language"].to_numpy()

# Bag-of-words counts over the whole corpus.
# NOTE(review): the vocabulary is learned before the split, so tokens that only
# occur in the held-out rows are already known to the vectorizer.
cv = CountVectorizer().fit(x)
X = cv.transform(x)

# Hold out 33% of the rows for evaluation; fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Reuse the vectorizer that produced X_train/X_test instead of fitting a second,
# independent CountVectorizer. `vectorizer` is the object pickled further down,
# so it must be the very one whose vocabulary the classifier was trained on;
# two separately fitted instances only agree as long as both happen to see
# exactly the same texts. `X` (built from `cv` in the previous cell) already
# holds the same matrix, so it is not recomputed here.
vectorizer = cv

In [None]:
# Train a multinomial Naive Bayes classifier on the token counts
# (fit() returns the estimator itself), then show held-out accuracy.
model = MultinomialNB().fit(X_train, y_train)
model.score(X_test, y_test)

0.9435897435897436

In [None]:
import pickle

# Persist the trained classifier and its vectorizer side by side so they can be
# reloaded together for inference.
for artifact, target in ((model, "model.pkl"), (vectorizer, "vectorizer.pkl")):
    with open(target, "wb") as sink:
        pickle.dump(artifact, sink)


In [None]:
# Ask for a sentence and print the predicted language label.
user = input("Enter a Text: ")
# Keep the query features under their own name: the original rebound `data`,
# which clobbered the corpus DataFrame and broke re-running any earlier cell
# that reads it. The sparse matrix is passed straight to predict(), so no
# dense copy of a vocabulary-wide row is made.
user_features = cv.transform([user])
output = model.predict(user_features)
print(output)