<a href="https://colab.research.google.com/github/Kyamzzz/GISMA/blob/main/NLP_Language_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Read dataset.csv into a DataFrame
data = pd.read_csv(filepath_or_buffer='dataset.csv')

# Preview the top of the frame as a quick sanity check
data.head(n=5)

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


In [None]:
# Text Preprocessing
import string

# ASCII-punctuation deletion table, built once at definition time instead of
# on every call (the function is applied to every row of the dataset).
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


# Function to remove punctuation and convert text to lowercase
def preprocess_text(text: str) -> str:
    """Strip ASCII punctuation from ``text`` and lowercase the result.

    Only the characters listed in ``string.punctuation`` are removed;
    punctuation outside ASCII (for example ``。`` or ``،``) is left untouched.

    Parameters
    ----------
    text : str
        The raw text to normalise.

    Returns
    -------
    str
        ``text`` without ASCII punctuation, converted to lowercase.
    """
    return text.translate(_PUNCTUATION_TABLE).lower()

# Normalise every entry of the 'Text' column with preprocess_text
data['Text'] = data['Text'].map(preprocess_text)

# Show the leading rows to inspect the cleaned text
data.head(n=5)

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Target variable: the 'language' label of each row
y = data['language']

# Character-level TF-IDF features built from 1-, 2- and 3-grams
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    analyzer='char',
)

# Learn the n-gram vocabulary and TF-IDF statistics from the 'Text' column
# and encode that column as a feature matrix in one step
X = vectorizer.fit_transform(data['Text'])

# Report the dimensions of the feature matrix
print('Shape of X:', X.shape)

Shape of X: (22000, 646786)


In [None]:
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Split the raw text (not the pre-computed TF-IDF matrix) into train and test rows.
# `X` from the vectorizer cell was fitted on every document, so its vocabulary and
# IDF weights already contain information from rows that end up in the test set;
# evaluating on it would leak test data into the features.
text_train, text_test, y_train, y_test = train_test_split(
    data['Text'], y, test_size=0.2, random_state=42
)

# Re-fit an unfitted copy of the vectorizer (same hyper-parameters) on the
# training rows only, then encode the test rows with that fitted vocabulary.
# `vectorizer` is rebound so that whatever is saved or used for inference
# afterwards matches the feature space the model is trained on.
vectorizer = clone(vectorizer)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Initialize a MultinomialNB model
model = MultinomialNB()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred = model.predict(X_test)

# Print per-language precision / recall / F1
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00       202
     Chinese       0.99      0.99      0.99       201
       Dutch       0.99      0.97      0.98       230
     English       0.73      1.00      0.85       194
    Estonian       0.99      0.95      0.97       200
      French       0.95      0.99      0.97       188
       Hindi       1.00      0.99      0.99       208
  Indonesian       1.00      0.96      0.98       213
    Japanese       1.00      0.98      0.99       194
      Korean       1.00      0.99      1.00       190
       Latin       0.99      0.90      0.94       210
     Persian       0.99      0.99      0.99       196
   Portugese       0.99      0.95      0.97       194
      Pushto       1.00      0.95      0.98       196
    Romanian       1.00      0.97      0.99       197
     Russian       0.99      1.00      0.99       213
     Spanish       0.98      0.98      0.98       199
     Swedish       1.00    

In [None]:
import joblib

# Persist the trained classifier to disk
joblib.dump(value=model, filename='language_detection_model.pkl')

# Persist the fitted TF-IDF vectorizer alongside it
joblib.dump(value=vectorizer, filename='tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [None]:
import joblib

# Load the model and vectorizer back from disk.
# NOTE: joblib files are pickle-based, so only load files from a trusted
# source (these two were written by this notebook).
model = joblib.load('language_detection_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

# New text to predict
text = "Detta är ett test"

# Apply the same normalisation that was applied to the 'Text' column
text = preprocess_text(text)

# Convert the text to a matrix of TF-IDF features.
# The result gets its own name so that the corpus-level `X` built earlier in
# the notebook is not silently replaced by a 1-row matrix if cells are re-run
# out of order.
X_new = vectorizer.transform([text])

# Use the model to predict the language
prediction = model.predict(X_new)

print('Predicted language:', prediction[0])


Predicted language: Swedish
