In [1]:
import pandas as pd
import re
import string
import pickle
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.linear_model import SGDClassifier, RidgeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn import svm
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn import model_selection
import numpy as np
import seaborn as sns

### Import excel file for maghreeb countries

In [2]:
df=pd.read_excel("data_maghreb.xlsx",index_col=0,
              dtype={'tokens': str, 'dialect': str})

In [3]:
df

Unnamed: 0,tokens,dialect
15497,قليلين ادب ومنافقين اختهم او قريبتهم تتعاكس تق...,LY
15498,الليبيين متقلبين بس بالنسبه ليا انا ميليشياوي ...,LY
15499,تانيه شاب ليبي بيرتاح لبنت مختلفه ويلاحظ انها ...,LY
15500,رانيا عقليتك متخلفه اولا الانسان يلي يحتاج اهل...,LY
15501,شكلك متعقده علشان الراجل تحبيه ازوج بنت يتيمه ...,LY
...,...,...
244692,المعلومه غلط انا ماشفتك الا ورى,DZ
244693,السلام يا حلوين مين يشرب,DZ
244694,ابكي لصبح مراح تروحي,DZ
244695,ارحموا عزيز قوم ذل,DZ


### Checking null values

In [4]:
df.isnull().sum()

tokens     0
dialect    0
dtype: int64

### Building voting classifier (linearsvc, multinomialNB, BernoulliNB)

In [5]:
def build_estimators():
    estimators = []
    svc = LinearSVC(penalty='l1', dual=False,tol=1e-3)
    estimators.append(('svc',svc))
    mnb= MultinomialNB(alpha=.01)
    estimators.append(('mnb',mnb))
    bnb= BernoulliNB(alpha=.01)
    estimators.append(('bnb',bnb))
    ensemble = VotingClassifier(estimators)
    return ensemble

### train, validation,test split

In [6]:
x=df['tokens']
y=df['dialect']
X_train, X_test, y_train, y_test=train_test_split(x,y,test_size=0.1,stratify=y,random_state=42)
X_train1, X_valid, y_train1, y_valid=train_test_split(X_train, y_train,test_size=0.1,stratify=y_train,random_state=42)

In [7]:
target_names=list(y_train1.unique())

In [8]:
# featureunion -> combines several transformer objects into a new transformer that combines their output. 
# A FeatureUnion takes a list of transformer objects. During fitting, each of these is fit to the data independently.
# For transforming data, the transformers are applied in parallel, and 
# the sample vectors they output are concatenated end-to-end into larger vectors.

### Extracting features from the training data using a Tfidf-vectorizer

In [9]:

# featureunion -> concatenate the results of multiple transformer objects,combine several feature extraction mechanisms
#into a single transformer.
max_df = 0.5
union = FeatureUnion([("w_v", TfidfVectorizer(sublinear_tf=True, max_df=max_df,analyzer = 'word', ngram_range=(1,3)
                                 )),
 
                       ("c_wb", TfidfVectorizer(sublinear_tf=True,max_df=max_df,analyzer = 'char_wb', ngram_range=(2,5)
                                 )),
 
                      ("c_wb5", TfidfVectorizer(sublinear_tf=True, max_df=max_df,analyzer = 'char', ngram_range=(2,4)
                                 )),
 
                       ],
transformer_weights={
            'w_v': 0.9,
            'c_wb': 0.9,
           "c_wb5":0.9,
        
        }
,
)



X_train_feat = union.fit_transform(X_train1)
X_valid_feat = union.transform(X_valid)
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train_feat, y_train1)
print("Combined space has", X_res.shape[1], "features")


Combined space has 1497483 features


### Training the classifier

In [10]:
ensemble = build_estimators()
ensemble.fit(X_res, y_res)

VotingClassifier(estimators=[('svc',
                              LinearSVC(dual=False, penalty='l1', tol=0.001)),
                             ('mnb', MultinomialNB(alpha=0.01)),
                             ('bnb', BernoulliNB(alpha=0.01))])

### Testing the classifier on validation data

In [11]:
pred = ensemble.predict(X_valid_feat)
#for i in range(0,10):
 #   print(data_train.target_names[pred[i]])

score = metrics.accuracy_score(y_valid, pred)
print("accuracy:   %0.3f" % score)

print("classification report:")
print(metrics.classification_report(y_valid, pred,target_names=target_names))

print("confusion matrix:")
print(metrics.confusion_matrix(y_valid, pred))

accuracy:   0.800
classification report:
              precision    recall  f1-score   support

          MA       0.71      0.75      0.73      1456
          LY       0.82      0.92      0.87      3284
          TN       0.87      0.67      0.75      1038
          DZ       0.80      0.57      0.66       832

    accuracy                           0.80      6610
   macro avg       0.80      0.73      0.75      6610
weighted avg       0.80      0.80      0.80      6610

confusion matrix:
[[1095  262   55   44]
 [ 169 3031   33   51]
 [ 160  165  691   22]
 [ 110  235   15  472]]


### Testing the classifier on testing data

In [12]:
X_test_feat = union.transform(X_test)
pred_test=ensemble.predict(X_test_feat)
score = metrics.accuracy_score(y_test, pred_test)
print("accuracy:   %0.3f" % score)
print("classification report:")
print(metrics.classification_report(y_test, pred_test,target_names=target_names))

print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred_test))

accuracy:   0.805
classification report:
              precision    recall  f1-score   support

          MA       0.73      0.73      0.73      1618
          LY       0.82      0.92      0.87      3648
          TN       0.88      0.68      0.77      1154
          DZ       0.78      0.63      0.70       925

    accuracy                           0.80      7345
   macro avg       0.80      0.74      0.77      7345
weighted avg       0.81      0.80      0.80      7345

confusion matrix:
[[1186  317   59   56]
 [ 186 3360   29   73]
 [ 150  188  786   30]
 [  99  231   16  579]]


### Testing the model on some sentences

In [13]:

ensemble.predict(union.transform(["المعلومه غلط انا ماشفتك الا ورى"]))

array(['LY'], dtype=object)

In [15]:
pickle.dump(ensemble,open('model_maghreb.pkl','wb'))

In [16]:
pickle.dump(union,open('union_maghreb.pkl','wb'))