In [1]:
import pandas as pd
import re
import string
import pickle
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.linear_model import SGDClassifier, RidgeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn import svm
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn import model_selection
import numpy as np
from sklearn.model_selection import cross_val_score,StratifiedKFold
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.metrics import fbeta_score, make_scorer
import seaborn as sns

In [2]:
#pip install imblearn

### Import excel file for gulf countries

In [3]:
df=pd.read_excel("data_gulf.xlsx",index_col=0,
              dtype={'tokens': str, 'dialect': str})

In [4]:
df

Unnamed: 0,tokens,dialect
51996,شفتك حاط صوره ولد النعاشه النجس وانا غاسل يدي ...,QA
51997,محد داس طرف يالذنب وش جابك تغريده لتاج راسك تم...,QA
51998,وين انت رايح يابن حجيلان التزم الصمت لان ملك ا...,QA
51999,ياخي انت ماتتوب انا قايلك اشوفك بتغريده تخص شي...,QA
52000,قلتها يالسلوقي الف مره لاكن الانجاس ماتتوب عدت...,QA
...,...,...
458192,مبسوطين اللي باسطانا,BH
458193,ماينده ابش يختي,BH
458194,شو عملنا حنا تهربي احنا مساكين ليش بتعملي هيك,BH
458195,الله يبارك وبالعافيه,BH


### Checking null values

In [5]:
df.isnull().sum()

tokens     0
dialect    0
dtype: int64

### Building voting classifier (linearsvc, multinomialNB, BernoulliNB)

In [6]:
def build_estimators():
    estimators = []
#     Ridge = RidgeClassifier(alpha=0.00001, max_iter=70) 
#     estimators.append(('Ridge',Ridge))
    svc = LinearSVC(penalty='l1', dual=False,tol=1e-3)
    estimators.append(('svc',svc))
    mnb= MultinomialNB(alpha=.01)
    estimators.append(('mnb',mnb))
    bnb= BernoulliNB(alpha=.01)
    estimators.append(('bnb',bnb))
    ensemble = VotingClassifier(estimators)
    return ensemble

### train, validation,test split

In [7]:
x=df['tokens']
y=df['dialect']
X_train, X_test, y_train, y_test=train_test_split(x,y,test_size=0.1,stratify=y,random_state=42)
X_train1, X_valid, y_train1, y_valid=train_test_split(X_train, y_train,test_size=0.1,stratify=y_train,random_state=42)

In [8]:
target_names=list(y_train1.unique())

In [9]:
# featureunion -> combines several transformer objects into a new transformer that combines their output. 
# A FeatureUnion takes a list of transformer objects. During fitting, each of these is fit to the data independently.
# For transforming data, the transformers are applied in parallel, and 
# the sample vectors they output are concatenated end-to-end into larger vectors.

### Extracting features from the training data using a Tfidf-vectorizer

In [10]:

# featureunion -> concatenate the results of multiple transformer objects,combine several feature extraction mechanisms
#into a single transformer.
max_df = 0.5
union = FeatureUnion([("w_v", TfidfVectorizer(sublinear_tf=True, max_df=max_df,analyzer = 'word', ngram_range=(1,3)
                                 )),

                       ("c_wb", TfidfVectorizer(sublinear_tf=True,max_df=max_df,analyzer = 'char_wb', ngram_range=(2,5)
                                 )),

                    ("c_wb5", TfidfVectorizer(sublinear_tf=True, max_df=max_df,analyzer = 'char', ngram_range=(2,4)
                               )),

                       ],
transformer_weights={
            'w_v': 0.9,

            'c_wb': 0.9,

          "c_wb5":0.9,
           
        }
,
)

X_train_feat = union.fit_transform(X_train1)
X_valid_feat = union.transform(X_valid)

sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train_feat, y_train1)
print("Combined space has", X_res.shape[1], "features")

Combined space has 3121458 features


### Training the classifier

In [11]:
ensemble=build_estimators()
ensemble.fit(X_res, y_res)


VotingClassifier(estimators=[('svc',
                              LinearSVC(dual=False, penalty='l1', tol=0.001)),
                             ('mnb', MultinomialNB(alpha=0.01)),
                             ('bnb', BernoulliNB(alpha=0.01))])

### Testing the classifier on validation data

In [12]:

pred = ensemble.predict(X_valid_feat)

score = metrics.accuracy_score(y_valid, pred)
print("accuracy:   %0.3f" % score)

print("classification report:")
print(metrics.classification_report(y_valid, pred,target_names=target_names))

print("confusion matrix:")
print(metrics.confusion_matrix(y_valid, pred))

accuracy:   0.523
classification report:
              precision    recall  f1-score   support

          SA       0.47      0.53      0.50      2366
          BH       0.37      0.53      0.44      2366
          QA       0.61      0.59      0.60      3789
          AE       0.51      0.45      0.48      1720
          OM       0.61      0.49      0.54      2796
          KW       0.60      0.50      0.54      2415

    accuracy                           0.52     15452
   macro avg       0.53      0.51      0.52     15452
weighted avg       0.54      0.52      0.53     15452

confusion matrix:
[[1261  341  227  191  216  130]
 [ 266 1246  394  149  175  136]
 [ 331  654 2231  124  210  239]
 [ 255  305  177  769   84  130]
 [ 325  442  346  128 1370  185]
 [ 227  358  292  149  179 1210]]


### Testing the classifier on testing data

In [13]:
X_test_feat = union.transform(X_test)
pred_test=ensemble.predict(X_test_feat)
score = metrics.accuracy_score(y_test, pred_test)
print("accuracy:   %0.3f" % score)
print("classification report:")
print(metrics.classification_report(y_test, pred_test,target_names=target_names))

print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred_test))

accuracy:   0.524
classification report:
              precision    recall  f1-score   support

          SA       0.47      0.55      0.51      2629
          BH       0.37      0.50      0.43      2629
          QA       0.61      0.58      0.60      4210
          AE       0.49      0.46      0.47      1911
          OM       0.62      0.51      0.56      3107
          KW       0.60      0.49      0.54      2683

    accuracy                           0.52     17169
   macro avg       0.53      0.52      0.52     17169
weighted avg       0.54      0.52      0.53     17169

confusion matrix:
[[1437  322  255  243  221  151]
 [ 346 1312  420  164  244  143]
 [ 324  719 2455  177  249  286]
 [ 330  337  175  880   77  112]
 [ 336  439  370  174 1585  203]
 [ 269  387  346  165  194 1322]]


### Testing the model on some sentences

In [14]:

ensemble.predict(union.transform(["اشحالك"]))

array(['AE'], dtype=object)

In [15]:
ensemble.predict(union.transform(["بزبوز"]))

array(['BH'], dtype=object)

In [16]:
ensemble.predict(union.transform(["السحله"]))

array(['SA'], dtype=object)

In [17]:
ensemble.predict(union.transform(["مرهلاكن"]))

array(['SA'], dtype=object)

In [18]:
ensemble.predict(union.transform(["السواه"]))

array(['SA'], dtype=object)

In [20]:
pickle.dump(ensemble,open('model_gulf.pkl','wb'))

In [21]:
pickle.dump(union,open('union_gulf.pkl','wb'))