In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

#### Read the dataset from the provided URL and print the first few rows

In [20]:
# Download the labelled corpus and show its first five rows as a quick sanity check.
dataset_url = "https://raw.githubusercontent.com/KarthikeyaPatnala12/Language-Detection/main/dataset.csv"
data = pd.read_csv(dataset_url)
print(data.head(n=5))

                                                Text  language
0  klement gottwaldi surnukeha palsameeriti ning ...  Estonian
1  sebes joseph pereira thomas  på eng the jesuit...   Swedish
2  ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...      Thai
3  விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...     Tamil
4  de spons behoort tot het geslacht haliclona en...     Dutch


#### Check for missing values in the dataset

In [3]:
# Per-column count of missing entries (isna is the canonical spelling of isnull).
data.isna().sum()

Text        0
language    0
dtype: int64

In [4]:
# Number of samples available for each language label.
data.loc[:, "language"].value_counts()

language
Estonian      1000
Swedish       1000
English       1000
Russian       1000
Romanian      1000
Persian       1000
Pushto        1000
Spanish       1000
Hindi         1000
Korean        1000
Chinese       1000
French        1000
Portugese     1000
Indonesian    1000
Urdu          1000
Latin         1000
Turkish       1000
Japanese      1000
Dutch         1000
Tamil         1000
Thai          1000
Arabic        1000
Name: count, dtype: int64

In [5]:
# Extract the raw texts (x) and their language labels (y) as NumPy arrays.
x, y = (np.array(data[column]) for column in ("Text", "language"))

In [6]:
# TF-IDF featuriser; "word" is the library default analyzer, spelled out for clarity.
# NOTE(review): word-level tokens may suit whitespace-free scripts poorly —
# consider evaluating character n-grams; TODO confirm on this dataset.
tfidf = TfidfVectorizer(analyzer="word")

In [12]:
# Hold out 33% of the samples for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

# Fit the vocabulary and IDF weights on the training texts only, then vectorise them.
X_train_tfidf = tfidf.fit_transform(X_train)

# Train a linear SVM on the TF-IDF features (primal formulation, dual=False).
model = LinearSVC(dual=False)
model.fit(X=X_train_tfidf, y=y_train)

In [15]:
# Vectorise the held-out texts with the already-fitted vectoriser (transform only, no refit).
X_test_tfidf = tfidf.transform(raw_documents=X_test)

In [16]:
# Score the classifier on the held-out split and report the result.
accuracy = model.score(X=X_test_tfidf, y=y_test)
print("Accuracy:", accuracy)

Accuracy: 0.9674931129476584


In [18]:
# Classify a sentence typed by the user with the fitted vectoriser and model.
user = input("Enter a Text: ")
user_tfidf = tfidf.transform([user])

# An all-zero feature row means the input was empty or none of its tokens are in the
# fitted vocabulary. The model still returns a label in that case, but it does not
# depend on the text, so flag it instead of presenting it as a real prediction.
if user_tfidf.nnz == 0:
    print(
        "Warning: the input is empty or none of its tokens were seen during "
        "training; the prediction below is unreliable."
    )

output = model.predict(user_tfidf)
print("Predicted Language:", output)

Enter a Text: 儿勒屁艾 艾艾西艾 艾艾 西吉艾艾伊娜伊
Predicted Language: ['Chinese']
