In [1]:
import pandas as pd
import re
import string
import pickle
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.linear_model import SGDClassifier, RidgeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn import svm
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn import model_selection
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score,StratifiedKFold
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.metrics import fbeta_score, make_scorer
import seaborn as sns

### Import excel file for nile basin countries

In [2]:
df=pd.read_excel("data_nile_basin.xlsx",index_col=0,
              dtype={'tokens': str, 'dialect': str})

In [3]:
df

Unnamed: 0,tokens,dialect
244697,الاقيش معاك الف يا عم نجيب قرض حسن ابدا بيهم ح...,EG
244698,انت بتفهم كنت بقيت زملكاوي,EG
244699,ولولوا وسكتوا,EG
244700,واحده عشان بواب ملكش عازه,EG
244701,جنسيتك ايه ده انت صهيوني مش هتقول كده,EG
...,...,...
405604,الناس دي بتنفخ قربه مقدوده بالدارجي كده البلد ...,SD
405605,انت عايش وين بره السودان شنو ماشايف البحصل دا,SD
405606,مااحرم ميسي حريف ولعاب برضو مدريدي وافتخر,SD
405607,ياخي ديل ماخلو للشيطان وإبليس شي يروحو وين ربن...,SD


### Checking null values

In [4]:
df.isnull().sum()

tokens     0
dialect    0
dtype: int64

### Building voting classifier (linearsvc, multinomialNB, BernoulliNB)

In [5]:
def build_estimators():
    estimators = []
    svc = LinearSVC(penalty='l1', dual=False,tol=1e-6)
    estimators.append(('svc',svc))
    mnb= MultinomialNB(alpha=.01)
    estimators.append(('mnb',mnb))
    bnb= BernoulliNB(alpha=.01)
    estimators.append(('bnb',bnb))
    ensemble = VotingClassifier(estimators)
    return ensemble

### train, validation,test split

In [6]:
x=df['tokens']
y=df['dialect']
X_train, X_test, y_train, y_test=train_test_split(x,y,test_size=0.1,stratify=y,random_state=42)
X_train1, X_valid, y_train1, y_valid=train_test_split(X_train, y_train,test_size=0.1,stratify=y_train,random_state=42)

In [7]:
target_names=list(y_train1.unique())

In [8]:
# featureunion -> combines several transformer objects into a new transformer that combines their output. 
# A FeatureUnion takes a list of transformer objects. During fitting, each of these is fit to the data independently.
# For transforming data, the transformers are applied in parallel, and 
# the sample vectors they output are concatenated end-to-end into larger vectors.

### Extracting features from the training data using a Tfidf-vectorizer

In [9]:
print("Extracting features from the training data using a sparse vectorizer")
# featureunion -> concatenate the results of multiple transformer objects,combine several feature extraction mechanisms
#into a single transformer.
max_df = 0.5
union = FeatureUnion([("w_v", TfidfVectorizer(sublinear_tf=True, max_df=max_df,analyzer = 'word', ngram_range=(1,3)
                                 )),
                       ("c_wb", TfidfVectorizer(sublinear_tf=True,max_df=max_df,analyzer = 'char_wb', ngram_range=(2,5)
                                 )),

                      ("c_wb5", TfidfVectorizer(sublinear_tf=True, max_df=max_df,analyzer = 'char', ngram_range=(2,4)
                                 )),
                      

                       ],
transformer_weights={
           'w_v': 0.9,
         
            'c_wb': 0.9,
         
           "c_wb5":0.9,
        }
,
)


X_train_feat = union.fit_transform(X_train1)
X_valid_feat = union.transform(X_valid)
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train_feat, y_train1)
print("Combined space has", X_res.shape[1], "features")




Extracting features from the training data using a sparse vectorizer
Combined space has 1619119 features


### Training the classifier

In [10]:

ensemble = build_estimators()
ensemble.fit(X_res, y_res)



VotingClassifier(estimators=[('svc',
                              LinearSVC(dual=False, penalty='l1', tol=1e-06)),
                             ('mnb', MultinomialNB(alpha=0.01)),
                             ('bnb', BernoulliNB(alpha=0.01))])

### Testing the classifier on validation data

In [11]:
pred = ensemble.predict(X_valid_feat)
score = metrics.accuracy_score(y_valid, pred)
print("accuracy:   %0.3f" % score)

print("classification report:")
print(metrics.classification_report(y_valid, pred,target_names=target_names))

print("confusion matrix:")
print(metrics.confusion_matrix(y_valid, pred))

accuracy:   0.932
classification report:
              precision    recall  f1-score   support

          SD       0.94      0.98      0.96      5186
          EG       0.89      0.75      0.82      1296

    accuracy                           0.93      6482
   macro avg       0.91      0.86      0.89      6482
weighted avg       0.93      0.93      0.93      6482

confusion matrix:
[[5064  122]
 [ 320  976]]


### Testing the classifier on testing data

In [12]:
X_test_feat = union.transform(X_test)
pred_test=ensemble.predict(X_test_feat)
score = metrics.accuracy_score(y_test, pred_test)
print("accuracy:   %0.3f" % score)
print("classification report:")
print(metrics.classification_report(y_test, pred_test,target_names=target_names))

print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred_test))

accuracy:   0.930
classification report:
              precision    recall  f1-score   support

          SD       0.94      0.98      0.96      5762
          EG       0.88      0.75      0.81      1440

    accuracy                           0.93      7202
   macro avg       0.91      0.86      0.88      7202
weighted avg       0.93      0.93      0.93      7202

confusion matrix:
[[5618  144]
 [ 357 1083]]


### Testing the model on some sentences

In [13]:

ensemble.predict(union.transform(["انا كويسة الحمد لله"]))

array(['EG'], dtype=object)

In [14]:
ensemble.predict(union.transform(["اهلا زول"]))

array(['SD'], dtype=object)

In [15]:
p=ensemble.predict(X_train_feat)
score=metrics.accuracy_score(y_train1, p)
print("accuracy:   %0.3f" % score)
print("classification report:")
print(metrics.classification_report(y_train1, p,target_names=target_names))

print("confusion matrix:")
print(metrics.confusion_matrix(y_train1, p))

accuracy:   0.998
classification report:
              precision    recall  f1-score   support

          SD       1.00      1.00      1.00     46673
          EG       1.00      1.00      1.00     11661

    accuracy                           1.00     58334
   macro avg       1.00      1.00      1.00     58334
weighted avg       1.00      1.00      1.00     58334

confusion matrix:
[[46617    56]
 [   48 11613]]


In [16]:
pickle.dump(ensemble,open('model_nile_basin.pkl','wb'))

In [17]:
pickle.dump(union,open('union_nile_basin.pkl','wb'))