In [1]:
import pandas as pd
import re
import string
import pickle
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.linear_model import SGDClassifier, RidgeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn import svm
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn import model_selection
import numpy as np
from imblearn.over_sampling import SMOTE
import seaborn as sns

### Import excel file for the main data

In [2]:
df=pd.read_excel("data_main.xlsx",index_col=0,
              dtype={'tokens': str, 'dialect': str})

In [3]:
df

Unnamed: 0,tokens,dialect
0,بالنهايه ينتفض يغير,IQ
1,يعني محسوب البشر حيونه ووحشيه وتطلبون الغرب يح...,IQ
2,مبين كلامه خليجي,IQ
3,يسلملي مرورك وروحك الحلوه,IQ
4,وين الغيبه اخ محمد,IQ
...,...,...
458192,مبسوطين اللي باسطانا,GULF
458193,ماينده ابش يختي,GULF
458194,شو عملنا حنا تهربي احنا مساكين ليش بتعملي هيك,GULF
458195,الله يبارك وبالعافيه,GULF


### Checking null values

In [4]:
df.isnull().sum()

tokens     0
dialect    0
dtype: int64

### Building voting classifier (linearsvc, multinomialNB, BernoulliNB)

In [5]:
def build_estimators():
    estimators = []
    svc = LinearSVC(penalty='l1', dual=False,tol=1e-3)
    estimators.append(('svc',svc))
    mnb= MultinomialNB(alpha=.01)
    estimators.append(('mnb',mnb))
    bnb= BernoulliNB(alpha=.01)
    estimators.append(('bnb',bnb))
    ensemble = VotingClassifier(estimators)
    return ensemble

### train, validation,test split

In [6]:
x=df['tokens']
y=df['dialect']
X_train, X_test, y_train, y_test=train_test_split(x,y,test_size=0.1,stratify=y,random_state=42)
X_train1, X_valid, y_train1, y_valid=train_test_split(X_train, y_train,test_size=0.1,stratify=y_train,random_state=42)

In [7]:
target_names=list(y_train1.unique())

In [8]:
# featureunion -> combines several transformer objects into a new transformer that combines their output. 
# A FeatureUnion takes a list of transformer objects. During fitting, each of these is fit to the data independently.
# For transforming data, the transformers are applied in parallel, and 
# the sample vectors they output are concatenated end-to-end into larger vectors.

### Extracting features from the training data using a Tfidf-Vectorizer

In [9]:

# featureunion -> concatenate the results of multiple transformer objects,combine several feature extraction mechanisms
#into a single transformer.
max_df = 0.5
union = FeatureUnion([("w_v", TfidfVectorizer(sublinear_tf=True, max_df=max_df,analyzer = 'word', ngram_range=(1,3)
                                             )),

                      ("c_wb", TfidfVectorizer(sublinear_tf=True,max_df=max_df,analyzer = 'char_wb', ngram_range=(2,5)
                              )),
 
                      ("c_wb5", TfidfVectorizer(sublinear_tf=True, max_df=max_df,analyzer = 'char', ngram_range=(2,5)
                                 )),
                     

                       ],
transformer_weights={
            'w_v': 0.9,
            'c_wb': 0.9,
          "c_wb5":0.9,
           
        }
,
)

X_train_feat = union.fit_transform(X_train1)
X_valid_feat = union.transform(X_valid)
sm = SMOTE(random_state=42)              ## to solve the imbalanced data problem
X_res, y_res = sm.fit_resample(X_train_feat, y_train1)
print("Combined space has", X_res.shape[1], "features")


Combined space has 8231413 features


### Training the classifier

In [10]:

ensemble = build_estimators()
ensemble.fit(X_res, y_res)





VotingClassifier(estimators=[('svc',
                              LinearSVC(dual=False, penalty='l1', tol=0.001)),
                             ('mnb', MultinomialNB(alpha=0.01)),
                             ('bnb', BernoulliNB(alpha=0.01))])

### Testing the classifier on validation data

In [11]:
pred = ensemble.predict(X_valid_feat)
#for i in range(0,10):
 #   print(data_train.target_names[pred[i]])

score = metrics.accuracy_score(y_valid, pred)
print("accuracy:   %0.3f" % score)

print("classification report:")
print(metrics.classification_report(y_valid, pred,target_names=target_names))

print("confusion matrix:")
print(metrics.confusion_matrix(y_valid, pred))

accuracy:   0.794
classification report:
              precision    recall  f1-score   support

      LEVANT       0.78      0.89      0.83     15452
        GULF       0.74      0.47      0.57      1394
  NILE BASIN       0.77      0.80      0.78     10394
     MAGHERB       0.84      0.71      0.77      6610
          YE       0.85      0.80      0.82      6481
          IQ       0.50      0.14      0.22       893

    accuracy                           0.79     41224
   macro avg       0.75      0.63      0.67     41224
weighted avg       0.79      0.79      0.79     41224

confusion matrix:
[[13741   108   967   343   215    78]
 [  529   649   148    48    17     3]
 [ 1368    57  8322   271   358    18]
 [  921    37   647  4666   317    22]
 [  462    18   580   201  5215     5]
 [  525     9   145    49    40   125]]


### Testing the classifier on testing data

In [12]:
X_test_feat = union.transform(X_test)
pred_test=ensemble.predict(X_test_feat)
score = metrics.accuracy_score(y_test, pred_test)
print("accuracy:   %0.3f" % score)
print("classification report:")
print(metrics.classification_report(y_test, pred_test,target_names=target_names))

print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred_test))

accuracy:   0.791
classification report:
              precision    recall  f1-score   support

      LEVANT       0.79      0.89      0.83     17168
        GULF       0.74      0.49      0.59      1549
  NILE BASIN       0.77      0.80      0.78     11549
     MAGHERB       0.82      0.69      0.75      7344
          YE       0.84      0.80      0.82      7202
          IQ       0.52      0.17      0.25       992

    accuracy                           0.79     45804
   macro avg       0.74      0.64      0.67     45804
weighted avg       0.79      0.79      0.78     45804

confusion matrix:
[[15244   125  1083   422   209    85]
 [  567   756   148    52    19     7]
 [ 1432    52  9232   344   461    28]
 [ 1046    52   747  5076   399    24]
 [  515    20   665   245  5744    13]
 [  551    14   166    54    40   167]]


### Testing the model on some sentences

In [13]:

ensemble.predict(union.transform(["المعلومه غلط انا ماشفتك الا ورى"]))

array(['MAGHERB'], dtype=object)

In [14]:
ensemble.predict(union.transform(["معنعن"]))

array(['LEVANT'], dtype=object)

In [15]:
p=ensemble.predict(X_train_feat)
score=metrics.accuracy_score(y_train1, p)
print("accuracy:   %0.3f" % score)
print("classification report:")
print(metrics.classification_report(y_train1, p,target_names=target_names))

print("confusion matrix:")
print(metrics.confusion_matrix(y_train1, p))

accuracy:   0.980
classification report:
              precision    recall  f1-score   support

      LEVANT       0.98      0.98      0.98    139064
        GULF       0.98      1.00      0.99     12547
  NILE BASIN       0.97      0.98      0.98     93543
     MAGHERB       0.99      0.97      0.98     59488
          YE       0.98      0.97      0.98     58335
          IQ       0.98      1.00      0.99      8035

    accuracy                           0.98    371012
   macro avg       0.98      0.98      0.98    371012
weighted avg       0.98      0.98      0.98    371012

confusion matrix:
[[136743    115   1161    396    554     95]
 [     8  12505     14      6      9      5]
 [  1242     50  91538    196    468     49]
 [   605     50    647  57820    340     26]
 [   688     44    571    146  56874     12]
 [     6      4     14      3      4   8004]]


In [16]:
pickle.dump(ensemble,open('model_main.pkl','wb'))

In [17]:
pickle.dump(union,open('union_main.pkl','wb'))