In [1]:
from glob import glob
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_fscore_support
from sklearn.pipeline import Pipeline


In [2]:
files = glob("Alshutairi/*.txt")
dataframes = []

for file in files:
    region = file[-7:-4]
    temp_df = pd.read_csv(file, encoding="utf8", delimiter="\n", names=["Text"])
    temp_df["Region"] = region
    dataframes.append(temp_df)
    
df = pd.concat(dataframes)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(df["Text"], df["Region"], random_state=0)
count_vectorizer = CountVectorizer().fit(X_train)
X_train_vectorized = count_vectorizer.transform(X_train)
X_test_vectorized = count_vectorizer.transform(X_test)

In [4]:
naive_bayes = MultinomialNB(alpha=0.1)
naive_bayes.fit(X_train_vectorized, y_train)
naive_bayes.score(X_test_vectorized, y_test)

0.8263942927574592

In [5]:
precision, recall, fscore, support = precision_recall_fscore_support(naive_bayes.predict(X_test_vectorized), y_test)

In [6]:
for i, region in enumerate(df["Region"].unique()):
    print(f"{region}\nPrecision: {round(precision[i], 4)}, Recall: {round(recall[i], 4)}, FScore: {round(fscore[i], 4)}, Support: {support[i]}")

EGY
Precision: 0.9127, Recall: 0.8967, FScore: 0.9046, Support: 158122
GLF
Precision: 0.7618, Recall: 0.8176, FScore: 0.7887, Support: 41898
IRQ
Precision: 0.7695, Recall: 0.7603, FScore: 0.7649, Support: 38940
LEV
Precision: 0.7114, Recall: 0.6974, FScore: 0.7043, Support: 50145
NOR
Precision: 0.7848, Recall: 0.7994, FScore: 0.792, Support: 63009


In [7]:
model = Pipeline([("Count Vectorizer", count_vectorizer), ("Naive Bayes", naive_bayes)])
joblib.dump(model, "naive_bayes.model")

['naive_bayes.model']

In [13]:
model.predict_proba(["هلا"])

array([[0.14877097, 0.18740504, 0.23845124, 0.37163259, 0.05374017]])

In [9]:
naive_bayes.predict_proba(count_vectorizer.transform(["هلا"]))

array([[0.14877097, 0.18740504, 0.23845124, 0.37163259, 0.05374017]])

In [12]:
naive_bayes.classes_

array(['EGY', 'GLF', 'IRQ', 'LEV', 'NOR'], dtype='<U3')