In [2]:
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, confusion_matrix
from joblib import dump

# Read the labelled corpus from the CSV file into a DataFrame.
# Later statements in this script read its 'Text' and 'Language' columns.
df = pd.read_csv(filepath_or_buffer="Language Detection.csv")

# Preprocessing
# Characters deleted outright: punctuation, symbols and ASCII digits.
# NOTE: the class must not contain a bare ``n`` -- that would strip the
# letter "n" from every word ("banana" -> "baaa").
_NOISE_CHARS = re.compile(r'[!@#$(),"%^*?:;~`0-9]')

# Characters that separate words and are therefore turned into a space
# (square brackets and newlines) so adjacent tokens are not glued together.
_SEPARATOR_CHARS = re.compile(r'[\[\]\n]')


def preprocess_text(text: str) -> str:
    """Normalize a raw text sample before vectorization.

    Steps, in order:
      1. delete punctuation, symbols and ASCII digits,
      2. replace square brackets and newlines with a single space each,
      3. lowercase the result.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, lowercased string. Letters (including "n") are kept.
    """
    text = _NOISE_CHARS.sub('', text)
    text = _SEPARATOR_CHARS.sub(' ', text)
    return text.lower()

# Clean every sample with preprocess_text and keep the result next to the raw text.
df['Processed_Text'] = df['Text'].apply(preprocess_text)

# Encoding target labels
# Language names become integer class ids; `le` is kept so that predictions
# can be mapped back to language names later.
le = LabelEncoder()
y = le.fit_transform(df['Language'])

# Train-test split
# Split the text *before* vectorizing so the vocabulary is learned from the
# training rows only. Fitting the vectorizer on the full corpus would leak
# information from the held-out rows into the features and bias the score.
# test_size and random_state are unchanged from the previous version.
text_train, text_test, y_train, y_test = train_test_split(
    df['Processed_Text'], y, test_size=0.2, random_state=41
)

# Vectorization
cv = CountVectorizer()
X_train = cv.fit_transform(text_train)  # learns the vocabulary from training text only
X_test = cv.transform(text_test)        # reuses that vocabulary; unseen words are ignored

# Full-corpus matrix, kept under its original name for any later code that
# reads `X`. It is built with the already-fitted vectorizer and is not used
# for training or evaluation below.
X = cv.transform(df['Processed_Text'])

# Model training
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train, y_train)

# Model evaluation
# Macro averaging weights every language equally, regardless of sample count.
y_pred = naive_bayes_classifier.predict(X_test)
f1 = f1_score(y_test, y_pred, average='macro')
print(f'F1 Score: {f1:.2f}')

# Saving the model, the fitted CountVectorizer and the LabelEncoder together,
# as a single (classifier, vectorizer, encoder) tuple.
dump((naive_bayes_classifier, cv, le), 'language_detection_model.joblib')


F1 Score: 0.98


['language_detection_model.joblib']