# Imports

In [41]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Load Dataset
data received from https://tatoeba.org/

In [42]:
eng_data = pd.read_csv("data/eng_sentences.tsv", sep="\t", header=None)
spa_data = pd.read_csv("data/spa_sentences.tsv", sep="\t", header=None)

# equalize the size of the data
min_size = min(len(eng_data), len(spa_data))

eng_data = eng_data.sample(min_size)
spa_data = spa_data.sample(min_size)

# concatenate the data
data = pd.concat([eng_data, spa_data])

# Data Pre-Processing
- drop column 0
- Name Columns
- Make text data lowercase
- Shuffle Data

In [43]:
data = data.drop(columns=[0])
data = data.rename(columns={1: 'Lang', 2: 'Text'})
data['Text'] = data['Text'].str.lower()

data = data.sample(frac=1).reset_index(drop=True)

data

Unnamed: 0,Lang,Text
0,spa,voy al teléfono y contesto.
1,eng,tom doesn't have to work on monday.
2,spa,a veces uso esta habitación como taller.
3,eng,the singer was at his best in that song.
4,spa,"estaba muy abrigado, así que volví a quitarme ..."
...,...,...
825571,spa,creo que es hora de huir.
825572,spa,las picaduras de los tábanos sí que se sienten...
825573,eng,i've finally gotten used to urban life.
825574,eng,algeria has many areas that are virtually unre...


# Split Training and Test Data
20% to test the accuracy

80% for training the model

In [44]:
X_train, X_test, y_train, y_test = train_test_split(data['Text'], data['Lang'], test_size=0.2)

# Vectorize the text for the classifier
This puts the text into a format that can be used with Naive Bayes classifiers

In [45]:
vectorizedText = CountVectorizer(analyzer='char', ngram_range=(2, 3))  # Use character n-grams
X_train = vectorizedText.fit_transform(X_train)
X_test = vectorizedText.transform(X_test)

# Fit the training Data to the Model

In [46]:
model = MultinomialNB()
model.fit(X_train, y_train)

# Evaluate Accuracy of the Model
this is the Stats for the training data that was fit to the model

In [47]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9987221105162432
Classification Report:
               precision    recall  f1-score   support

         eng       1.00      1.00      1.00     82972
         spa       1.00      1.00      1.00     82144

    accuracy                           1.00    165116
   macro avg       1.00      1.00      1.00    165116
weighted avg       1.00      1.00      1.00    165116



# Use the model with the test data
