## 1. Imports

In [None]:
## ALL
import re
import nltk
import joblib

## AS
import pandas as pd
import numpy as np

## FROM
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

from datasets import load_dataset


## 2. Data

### 2.1. Loading Data

In [None]:
# data = load_dataset('masakhaner', 'yor')
# data = load_dataset('masakhane/masakhaner2', 'yor')

In [None]:
# Path to the dataset CSV that the following cells read.
# NOTE(review): this is a machine-specific absolute path — adjust it before
# running the notebook on another machine.
nigeria_data_file = "/Users/izzymohamed/Documents/WORK/AFRICA AGILE/Hackathon/Language Detection System/Data/Final/masakhane/all_masakhane.csv"

In [None]:
# Load the raw dataset. Later cells use its `text` and `language` columns.
data = pd.read_csv(nigeria_data_file)

### 2.2. Preprocessing Data

In [None]:
# Reuse the frame loaded in section 2.1 instead of reading the CSV from disk a
# second time. Working on a copy keeps the raw `data` frame untouched by the
# column rewrites performed in the cells below.
df = data.copy()

# Preprocess the text data
def preprocess_text(text):
    """Normalize one raw sentence before tokenization and vectorization.

    The text is lower-cased and every character that is neither a word
    character (letter, digit or underscore) nor whitespace is removed.
    Combining diacritical marks are not word characters, so they are
    removed as well.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents an
            empty CSV cell) are treated as empty text; any other non-string
            value is converted with ``str`` first.

    Returns:
        The cleaned, lower-cased string ('' for missing input).
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)
    text = text.lower()
    return re.sub(r'[^\w\s]', '', text)

# Clean every sentence in the 'text' column, element by element.
df['text'] = df['text'].map(preprocess_text)

### 2.3. Tokenization

In [None]:
# Tokenization (using NLTK tokenizer)
# Newer NLTK releases load the tokenizer data from the 'punkt_tab' resource,
# older ones from 'punkt'. Request both so word_tokenize works with either;
# a resource unknown to the installed version is reported and skipped.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)
df['text'] = df['text'].apply(nltk.word_tokenize)

In [None]:
# Collapse each token list back into one space-separated string
df['text'] = df['text'].map(' '.join)

### 2.4. Feature Extraction

In [None]:
# Feature Extraction using TF-IDF: learn the vocabulary and IDF weights, then
# turn every document into a sparse feature row.
# NOTE(review): the vectorizer is fitted on all rows, including those that the
# split in section 2.6 later assigns to the test set, so the IDF statistics
# have seen the test data.
documents = df['text']
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)

### 2.5. Label Encoding

In [None]:
# Label Encoding: give each distinct language an integer ID, numbered in the
# order the languages are returned by unique().
languages = df['language'].unique()
label_to_id = dict(zip(languages, range(len(languages))))
df['label'] = df['language'].map(label_to_id)
y = df['label']

### 2.6. Splitting Data

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 3. Training

In [None]:
# Fit a linear-kernel Support Vector Machine on the training split.
# fit() returns the estimator itself, so construction and training chain.
model = SVC(kernel='linear').fit(X_train, y_train)
model

## 4. Predict

In [None]:
# Predict a label ID for every sample in the held-out test split
y_pred = model.predict(X_test)

In [None]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Pass the label IDs explicitly so each entry of target_names is paired with
# its own ID. Without `labels`, the report infers the classes from
# y_test/y_pred and raises when a language is missing from the test split.
print("Classification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=list(label_to_id.values()),
    target_names=list(label_to_id.keys()),
))

## 5. Save Models

In [None]:
# Output directory (machine-specific absolute path); the folder must already
# exist before anything is written into it.
# NOTE(review): no cell in this notebook reads `directory` — the save cell
# writes to a bare relative filename.

directory = "/Users/izzymohamed/Documents/WORK/AFRICA AGILE/Hackathon/Language Detection System/"  #TODO: CHANGE DIRECTORY

In [None]:
# Save the model to disk
model_filename = "language_detection_model.joblib"
joblib.dump(model, model_filename)

# The classifier alone cannot score new text: inference also needs the fitted
# TF-IDF vectorizer (same vocabulary and IDF weights) and the language -> ID
# mapping to name the prediction. Persist both next to the model so they
# survive a kernel restart.
vectorizer_filename = "language_detection_vectorizer.joblib"
joblib.dump(vectorizer, vectorizer_filename)

labels_filename = "language_detection_labels.joblib"
joblib.dump(label_to_id, labels_filename)

## 6. Detect language of a new text

Now, to detect the language of a new text, you can use the trained model as follows:
1. Preprocess the new text using the same preprocessing steps applied to the training data.
2. Tokenize the preprocessed text using NLTK tokenizer.
3. Convert the list of tokens back to text.
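4. Transform the text using the TF-IDF vectorizer that was fitted on the training data (a newly created vectorizer would not produce matching features).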
5. Use the trained SVM model to predict the language label ID of the new text.
6. Map the predicted label ID back to its language name.

In [None]:
# Load the saved model from the disk.
# joblib.load unpickles the file, which can execute arbitrary code — only load
# model files you created yourself or otherwise trust.
# Only the classifier is restored here; the `vectorizer` and `label_to_id`
# used in the cells below still come from the current kernel session.
model_filename = "language_detection_model.joblib"
model = joblib.load(model_filename)

In [None]:
# Sample sentence to classify; replace it with any text you want to identify
new_text = "Mo ti jade si ile"

In [None]:
# Preprocess the new text with the preprocess_text function defined in
# section 2.2. Reusing that single definition (instead of re-declaring a copy
# here) guarantees inference applies exactly the same cleaning as training.
preprocessed_text = preprocess_text(new_text)

In [None]:
# Tokenize the preprocessed text using NLTK tokenizer
# Newer NLTK releases load the tokenizer data from the 'punkt_tab' resource,
# older ones from 'punkt'. Request both so word_tokenize works with either;
# a resource unknown to the installed version is reported and skipped.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)
tokens = nltk.word_tokenize(preprocessed_text)

In [None]:
# Re-join the tokens with single spaces, mirroring the training pipeline
# (section 2.3), so the vectorizer receives text in the same format
tokenized_text = ' '.join(tokens)

In [None]:
# Vectorize with the TF-IDF vectorizer fitted on the training data.
# transform() takes a collection of documents, hence the one-element list.
new_documents = [tokenized_text]
X_new = vectorizer.transform(new_documents)

In [None]:
# Ask the trained SVM for the label IDs, then keep the first entry, which
# belongs to the new text
predictions = model.predict(X_new)
predicted_label = predictions[0]

In [None]:
# Convert the predicted label back to the original language.
# The inverse mapping gets its own name: rebinding `label_to_id` to its
# inverse would flip the mapping on every re-run of this cell and break any
# earlier cell that still expects language -> ID.
id_to_label = {i: lang for lang, i in label_to_id.items()}
predicted_language = id_to_label[predicted_label]

In [None]:
# Report the language name the model chose for the new text
print(f"Predicted Language: {predicted_language}")