# Language Classification Using Machine Learning

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import RandomOverSampler  
import joblib
import os

In [4]:
# Load the language-detection corpus into a DataFrame and preview the first rows.
# NOTE(review): the path is relative to the notebook's working directory; the
# recorded run below raised FileNotFoundError, so confirm where the CSV lives.
df = pd.read_csv(
    'language-classifier/data/Language Detection.csv',
)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'language-classifier/data/Language Detection.csv'

## About the Dataset

In [None]:
# Summarise the loaded dataset: dimensions, column info, and per-column null counts.
print("dataset shape",df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so
# passing it to print() emitted the report *before* the header and then a
# stray "None". Print the header first and call info() on its own.
print("dataset info")
df.info()
print("missing values\n", df.isnull().sum())

## Distribution of Languages in Dataset

In [None]:
# Count the distinct languages and the number of samples available for each.
unique_languages = df["Language"].nunique()
language_counts = df["Language"].value_counts()

print("Total unique languages are ", unique_languages)
print("Language distribution\n", language_counts)

In [None]:
# Bar chart of samples per language, drawn on a 12x5 inch figure.
plt.figure(figsize=(12, 5))
axes = language_counts.plot.bar(color="skyblue")
axes.set(
    title="Distribution of Languages in Dataset",
    xlabel="Language",
    ylabel="Number of Samples",
)
# Vertical tick labels keep the language names readable.
plt.xticks(rotation=90)
plt.show()

## Data Preprocessing

In [None]:
# Encode the language names as integer class ids and keep the fitted encoder
# (`le`) so predictions can be decoded back to names later.
le = LabelEncoder()
df["Language_Encoded"] = le.fit_transform(df["Language"])

# Language name -> integer id lookup, printed for reference.
label_mapping = {
    name: code
    for name, code in zip(le.classes_, le.transform(le.classes_))
}
print("Label Mapping\n", label_mapping)

In [None]:
# Download the NLTK stopword list and Punkt tokenizer data used below.
nltk.download('stopwords')
nltk.download('punkt')

# NOTE: the stemmer is instantiated but preprocess_text does not apply it.
stemmer = PorterStemmer()
# Only the English stopword list is loaded, although the corpus is multilingual.
stop_words = set(stopwords.words('english'))

# Built once instead of on every call: deletes every character in
# string.punctuation (ASCII punctuation only).
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalise one raw text sample for feature extraction.

    Steps: lowercase, remove digit runs, remove ASCII punctuation, tokenize
    with NLTK ``word_tokenize``, drop English stopwords, and re-join the
    remaining tokens with single spaces.

    Args:
        text: The raw sample. Anything that is not a ``str`` (for example a
            NaN from a missing cell) is treated as empty.

    Returns:
        The cleaned text as a single space-separated string; ``''`` for
        non-string input.
    """
    if not isinstance(text, str):
        # Missing cells arrive as float NaN and would crash on .lower().
        return ''

    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(_PUNCT_TABLE)
    words = [word for word in word_tokenize(text) if word not in stop_words]

    # Join with a space: joining on '' glued every token into one long string
    # and erased the word boundaries before n-gram extraction.
    return ' '.join(words)


df["Processed_Text"] = df["Text"].apply(preprocess_text)


print(df[["Text", "Processed_Text"]].head())

## Feature Extraction

In [None]:
# Character-level TF-IDF over 1- and 2-character n-grams, with the vocabulary
# capped at 10,000 features (max_features) for efficiency.
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), analyzer='char')
# Keep the sparse matrix returned by fit_transform: densifying a
# (n_samples x 10,000) matrix with .toarray() costs a lot of memory, and the
# train/test split, SMOTE and MultinomialNB used below all accept sparse input.
X = vectorizer.fit_transform(df["Processed_Text"])

## Model Training

In [None]:
# Target variable: the integer-encoded language labels.
y = df["Language_Encoded"]

# Stratified 80/20 split done *before* any resampling, so the test set keeps
# the original class proportions and contains no synthetic samples.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Oversample with SMOTE on the training portion only.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Fit a Multinomial Naive Bayes classifier on the resampled training data.
model = MultinomialNB()
model.fit(X_train_resampled, y_train_resampled)

## Model Evaluation

In [None]:
# Predict on the held-out test split.
y_pred = model.predict(X_test)

# Show which class ids were predicted and how often each one occurs.
prediction_summary = np.unique(y_pred, return_counts=True)
print("Unique predictions:", prediction_summary)

# Overall accuracy on the test split.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1 breakdown.
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

## Saving the Model
- Saving the trained model and vectorizer

In [None]:
# Persist the trained classifier and the fitted TF-IDF vectorizer under models/.
os.makedirs("models", exist_ok=True)

for artifact, destination in (
    (model, "models/language_classifier.pkl"),
    (vectorizer, "models/tfidf_vectorizer.pkl"),
):
    joblib.dump(artifact, destination)

print("Model and vectorizer saved successfully!")

## Making Predictions

In [None]:
# Load the saved model and vectorizer from the "models/" directory, which is
# where the saving step writes them (the bare filenames used previously do
# not exist in the working directory).
loaded_model = joblib.load("models/language_classifier.pkl")
loaded_vectorizer = joblib.load("models/tfidf_vectorizer.pkl")

# Sample text input
new_texts = ["Bonjour, comment ça va ?", "Hola, ¿cómo estás?", "Das ist ein Beispiel."]

# Run the same preprocess_text cleaning that was applied to the training
# column; the vectorizer was fitted on cleaned text, so raw strings would
# produce features it was never trained on.
new_texts_processed = [preprocess_text(text) for text in new_texts]

# Convert text into numerical format
new_texts_transformed = loaded_vectorizer.transform(new_texts_processed).toarray()

# Predict the language
predictions = loaded_model.predict(new_texts_transformed)

print("Predicted Language Encodings:", predictions)
# Decode all predictions in one call; `le` is the LabelEncoder fitted earlier
# in this session (it is not part of the saved artifacts).
print("Decoded Languages:", le.inverse_transform(predictions).tolist())
