In [1]:
pip install scikit-learn pandas




In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics

# Sample dataset
data = {
    "text": [
        "Hello, how are you?",
        "Bonjour, comment ça va?",
        "Hola, ¿cómo estás?",
        "Hallo, wie geht's dir?",
        "Ciao, come stai?",
        "Olá, como você está?",
        "Привет, как дела?",
        "こんにちは、お元気ですか？",
        "你好，你怎么样？",
        "안녕하세요, 어떻게 지내세요?"
    ],
    "language": [
        "English", "French", "Spanish", "German", "Italian",
        "Portuguese", "Russian", "Japanese", "Chinese", "Korean"
    ]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['language'], test_size=0.3, random_state=42)

# Create a pipeline with character n-gram vectorizer and Naive Bayes classifier
model = Pipeline([
    ('vectorizer', CountVectorizer(analyzer='char', ngram_range=(1, 3))),
    ('classifier', MultinomialNB())
])

# Train model
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Accuracy
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# Test on new input
def identify_language(text):
    return model.predict([text])[0]

# Example
print(identify_language("안녕하세요, 어떻게 지내세요?"))  # Output should be "  Korean"


Accuracy: 0.0
Korean


In [4]:
pip install langdetect


Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m30.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=fd4d8fd00f753ff904015bd3dc7633cb2df2ae216c8feb59d6fece4e6215b170
  Stored in directory: /root/.cache/pip/wheels/0a/f2/b2/e5ca405801e05eb7c8ed5b3b4bcf1fcabcd6272c167640072e
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [5]:
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# Ensure consistent results
DetectorFactory.seed = 0

def identify_language(text):
    try:
        language = detect(text)
        return language
    except LangDetectException:
        return "Could not detect language"

# Example usage
texts = [
    "Hello, how are you?",
    "Bonjour, comment allez-vous?",
    "¿Dónde está la biblioteca?",
    "Это моя книга.",
    "これはペンです。",
    "안녕하세요, 반갑습니다.",
    "Buongiorno, come va?",
    "مرحبا كيف حالك؟"
]

for t in texts:
    print(f"Text: {t}\nDetected Language: {identify_language(t)}\n")


Text: Hello, how are you?
Detected Language: en

Text: Bonjour, comment allez-vous?
Detected Language: fr

Text: ¿Dónde está la biblioteca?
Detected Language: es

Text: Это моя книга.
Detected Language: ru

Text: これはペンです。
Detected Language: ja

Text: 안녕하세요, 반갑습니다.
Detected Language: ko

Text: Buongiorno, come va?
Detected Language: it

Text: مرحبا كيف حالك؟
Detected Language: ar

