In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split , cross_val_score 
from sklearn.feature_extraction.text import TfidfTransformer , CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB 
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score , classification_report , confusion_matrix

from joblib import dump

In [3]:
# Load the pre-processed dataset and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data directory.
DATA_PATH = r"C:\Users\Mohamed Nabil\Documents\AIM Task\Data Pre-processing\clean_dataset.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,dialect,text,clean_text,tokenizer,region_dialect
0,0,IQ,@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغير .,بالنهايه ينتفض يغير,"['بالنهايه', 'ينتفض', 'يغير']",Gulf
1,1,IQ,@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. ح...,يعني محسوب علي البشر حيونه وحشيه وتطلبون الغرب...,"['يعني', 'محسوب', 'علي', 'البشر', 'حيونه', 'وح...",Gulf
2,2,IQ,@KanaanRema مبين من كلامه خليجي,مبين كلامه خليجي,"['مبين', 'كلامه', 'خليجي']",Gulf
3,3,IQ,@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐,يسلملي مرورك وروحك الحلوه,"['يسلملي', 'مرورك', 'وروحك', 'الحلوه']",Gulf
4,4,IQ,@hmo2406 وين هل الغيبه اخ محمد 🌸🌺,وين الغيبه اخ محمد,"['وين', 'الغيبه', 'اخ', 'محمد']",Gulf


In [4]:
# List the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'dialect', 'text', 'clean_text', 'tokenizer',
       'region_dialect'],
      dtype='object')

In [5]:
# Drop index columns left over from earlier CSV round-trips ('Unnamed: 0', 'Unnamed: 0.1', ...).
# Selecting by prefix and re-binding (instead of an in-place drop of one fixed label) keeps this
# cell safe to re-run: a second execution finds nothing to drop instead of raising KeyError.
leftover_index_cols = [col for col in df.columns if str(col).startswith('Unnamed:')]
df = df.drop(columns=leftover_index_cols)

In [6]:
# Display the frame after the leftover index column has been dropped.
df

Unnamed: 0,dialect,text,clean_text,tokenizer,region_dialect
0,IQ,@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغير .,بالنهايه ينتفض يغير,"['بالنهايه', 'ينتفض', 'يغير']",Gulf
1,IQ,@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. ح...,يعني محسوب علي البشر حيونه وحشيه وتطلبون الغرب...,"['يعني', 'محسوب', 'علي', 'البشر', 'حيونه', 'وح...",Gulf
2,IQ,@KanaanRema مبين من كلامه خليجي,مبين كلامه خليجي,"['مبين', 'كلامه', 'خليجي']",Gulf
3,IQ,@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐,يسلملي مرورك وروحك الحلوه,"['يسلملي', 'مرورك', 'وروحك', 'الحلوه']",Gulf
4,IQ,@hmo2406 وين هل الغيبه اخ محمد 🌸🌺,وين الغيبه اخ محمد,"['وين', 'الغيبه', 'اخ', 'محمد']",Gulf
...,...,...,...,...,...
458192,BH,@Al_mhbaa_7 مبسوطين منك اللي باسطانا😅,مبسوطين منك الي باسطانا,"['مبسوطين', 'منك', 'الي', 'باسطانا']",Gulf
458193,BH,@Zzainabali @P_ameerah والله ماينده ابش يختي,واله ماينده ابش يختي,"['واله', 'ماينده', 'ابش', 'يختي']",Gulf
458194,BH,@Al_mhbaa_7 شو عملنا لك حنا تهربي مننا احنا مس...,شو عملنا حنا تهربي منا احنا مساكين ليش بتعملي ...,"['شو', 'عملنا', 'حنا', 'تهربي', 'منا', 'احنا',...",Gulf
458195,BH,@haneenalmwla الله يبارك فيها وبالعافيه 😋😋😋,اله يبارك وبالعافيه,"['اله', 'يبارك', 'وبالعافيه']",Gulf


In [7]:
# Count missing values per column.
df.isna().sum()

dialect             0
text                0
clean_text        244
tokenizer           0
region_dialect      0
dtype: int64

In [8]:
# Remove every row that contains at least one missing value.
df = df.dropna(axis='index', how='any')

### Use the `clean_text` column as the input feature, with `dialect` and `region_dialect` each as the target

#### Using the `dialect` column as the target

In [9]:
# Seed reused as `random_state` for the train/test split and for LinearSVC.
num_seed = 123

In [10]:
def train_test(data, target, frame=None, test_size=0.3, random_state=None):
    """Split one text column and one label column into stratified train/test sets.

    Parameters
    ----------
    data : str
        Name of the column holding the input text; values are cast to ``str``.
    target : str
        Name of the label column; cast to ``category`` and used for stratification.
    frame : pandas.DataFrame, optional
        Frame to read the columns from. Defaults to the notebook-level ``df``.
    test_size : float, optional
        Fraction of rows held out for testing (default 0.3).
    random_state : int, optional
        Seed for the shuffle. Defaults to the notebook-level ``num_seed``.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    source = df if frame is None else frame
    seed = num_seed if random_state is None else random_state

    x = source[data].astype(str)
    y = source[target].astype('category')

    # Stratify on the label so both splits keep the same class proportions.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=seed, stratify=y, shuffle=True
    )
    return x_train, x_test, y_train, y_test

In [11]:
# Hold-out split for the country-level `dialect` target.
X_train, X_test, y_train, y_test = train_test(data='clean_text', target='dialect')

In [None]:
# Feature extraction, step 1: learn the vocabulary and count token occurrences.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

# Step 2: turn the raw counts into term frequencies (IDF weighting disabled).
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_counts)

In [12]:
# Aliases consumed by the cross-validation cell.
features, labels = X_train_tf, y_train

In [13]:
# Candidate classifiers compared with k-fold cross-validation on the training features.
models = [
    LinearSVC(random_state=num_seed, tol=1e-5),
    MultinomialNB(),
]

CV = 5  # number of cross-validation folds

# Collect one (model, fold, accuracy) record per fold, then build the frame once.
# (The earlier empty placeholder frame was dead code: it was overwritten before use.)
entries = []
for estimator in models:
    model_name = estimator.__class__.__name__
    accuracies = cross_val_score(estimator, features, labels, scoring='accuracy', cv=CV)
    entries.extend(
        (model_name, fold_idx, accuracy) for fold_idx, accuracy in enumerate(accuracies)
    )

cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [14]:
# Per-fold accuracy for each candidate model.
cv_df

Unnamed: 0,model_name,fold_idx,accuracy
0,LinearSVC,0,0.514412
1,LinearSVC,1,0.513188
2,LinearSVC,2,0.513281
3,LinearSVC,3,0.512829
4,LinearSVC,4,0.515013
5,MultinomialNB,0,0.364772
6,MultinomialNB,1,0.366805
7,MultinomialNB,2,0.365293
8,MultinomialNB,3,0.364559
9,MultinomialNB,4,0.36718


In [15]:
# Mean cross-validated accuracy per model.
cv_df.groupby('model_name')['accuracy'].mean()

model_name
LinearSVC        0.513744
MultinomialNB    0.365722
Name: accuracy, dtype: float64

#### LinearSVC        

In [16]:
# Bag-of-words pipeline (unigrams + bigrams, TF-IDF weighting) feeding a linear SVM.
pipeline_steps = [
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(random_state=num_seed, tol=1e-5)),
]
model = Pipeline(pipeline_steps)
model.fit(X_train, y_train)

# Keep the held-out predictions: the classification-report cell reads `y_pred`.
y_pred = model.predict(X_test)

train_accuracy = model.score(X_train, y_train)
test_accuracy = accuracy_score(y_test, y_pred)
print('Training Accuracy:', train_accuracy)
print('Testing Accuracy:', test_accuracy)

Training Accuracy: 0.9968773981021068
Testing Accuracy: 0.5412705806996346


In [17]:
# Per-class precision/recall/F1 on the held-out split.
# Take the class list from `y_test` itself and pass it as both `labels` and `target_names`,
# so the row names are guaranteed to line up with the labels being scored (rather than
# relying on a separately recomputed category list from the full frame).
class_names = list(y_test.cat.categories)
print(classification_report(y_test, y_pred, labels=class_names, target_names=class_names))

              precision    recall  f1-score   support

          AE       0.43      0.43      0.43      7886
          BH       0.40      0.30      0.34      7870
          DZ       0.63      0.52      0.57      4769
          EG       0.65      0.88      0.75     17326
          IQ       0.64      0.53      0.58      4638
          JO       0.44      0.33      0.38      8430
          KW       0.47      0.61      0.53     12697
          LB       0.61      0.68      0.64      8277
          LY       0.67      0.69      0.68     11054
          MA       0.76      0.60      0.67      3479
          OM       0.45      0.34      0.39      5706
          PL       0.48      0.54      0.51     13081
          QA       0.47      0.50      0.48      9287
          SA       0.42      0.44      0.43      7922
          SD       0.72      0.55      0.62      4236
          SY       0.50      0.29      0.37      4942
          TN       0.69      0.44      0.54      2768
          YE       0.46    

#### MultinomialNB

In [12]:
# Bag-of-words pipeline (unigrams + bigrams, TF-IDF weighting) feeding multinomial Naive Bayes.
pipeline_steps = [
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('nb', MultinomialNB()),
]
model = Pipeline(pipeline_steps)
model.fit(X_train, y_train)

# Keep the held-out predictions: the classification-report cell reads `y_pred`.
y_pred = model.predict(X_test)

train_accuracy = model.score(X_train, y_train)
test_accuracy = accuracy_score(y_test, y_pred)
print('Training Accuracy:', train_accuracy)
print('Testing Accuracy:', test_accuracy)

Training Accuracy: 0.5711130590484984
Testing Accuracy: 0.36494984932962604


In [13]:
# Per-class precision/recall/F1 on the held-out split.
# Take the class list from `y_test` itself and pass it as both `labels` and `target_names`,
# so the row names are guaranteed to line up with the labels being scored (rather than
# relying on a separately recomputed category list from the full frame).
class_names = list(y_test.cat.categories)
print(classification_report(y_test, y_pred, labels=class_names, target_names=class_names))

              precision    recall  f1-score   support

          AE       0.74      0.09      0.17      7887
          BH       0.80      0.06      0.11      7884
          DZ       0.93      0.14      0.24      4853
          EG       0.28      0.98      0.44     17285
          IQ       0.97      0.06      0.11      4645
          JO       0.75      0.05      0.10      8367
          KW       0.29      0.76      0.42     12630
          LB       0.82      0.38      0.52      8285
          LY       0.71      0.49      0.58     10943
          MA       0.98      0.13      0.23      3461
          OM       0.98      0.02      0.04      5732
          PL       0.34      0.58      0.43     13118
          QA       0.55      0.32      0.40      9317
          SA       0.63      0.11      0.19      8049
          SD       0.99      0.04      0.07      4317
          SY       0.98      0.03      0.06      4869
          TN       0.93      0.01      0.02      2772
          YE       1.00    

In [18]:
# Persist the fitted pipeline to disk.
# NOTE(review): `model` is whichever pipeline was fitted most recently. Read top to bottom,
# that is the MultinomialNB pipeline (test accuracy ~0.36), not the LinearSVC one (~0.54).
# The execution counts suggest the saved artifact came from the LinearSVC run; confirm
# which model is meant to be exported.
dump(model, "ML_dialect.joblib")

['ML_dialect.joblib']

#### Using the `region_dialect` column as the target

In [14]:
# Hold-out split for the coarser `region_dialect` target.
X_train, X_test, y_train, y_test = train_test(data='clean_text', target='region_dialect')

In [20]:
# Feature extraction, step 1: learn the vocabulary and count token occurrences.
# (CountVectorizer and TfidfTransformer are already imported in the notebook's import cell,
# so the redundant mid-notebook re-imports were removed.)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

# Step 2: turn the raw counts into term frequencies (IDF weighting disabled).
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

In [21]:
# Aliases consumed by the cross-validation cell.
features, labels = X_train_tf, y_train

In [22]:
# Candidate classifiers compared with k-fold cross-validation on the training features.
models = [
    LinearSVC(random_state=num_seed, tol=1e-5),
    MultinomialNB(),
]

CV = 5  # number of cross-validation folds

# Collect one (model, fold, accuracy) record per fold, then build the frame once.
# (The earlier empty placeholder frame was dead code: it was overwritten before use.)
entries = []
for estimator in models:
    model_name = estimator.__class__.__name__
    accuracies = cross_val_score(estimator, features, labels, scoring='accuracy', cv=CV)
    entries.extend(
        (model_name, fold_idx, accuracy) for fold_idx, accuracy in enumerate(accuracies)
    )

cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [23]:
# Per-fold accuracy for each candidate model.
cv_df

Unnamed: 0,model_name,fold_idx,accuracy
0,LinearSVC,0,0.790561
1,LinearSVC,1,0.788904
2,LinearSVC,2,0.789247
3,LinearSVC,3,0.787672
4,LinearSVC,4,0.788498
5,MultinomialNB,0,0.702452
6,MultinomialNB,1,0.701449
7,MultinomialNB,2,0.70134
8,MultinomialNB,3,0.702135
9,MultinomialNB,4,0.702026


In [24]:
# Mean cross-validated accuracy per model.
cv_df.groupby('model_name')['accuracy'].mean()

model_name
LinearSVC        0.788976
MultinomialNB    0.701880
Name: accuracy, dtype: float64

#### LinearSVC 

In [25]:
# Bag-of-words pipeline (unigrams + bigrams, TF-IDF weighting) feeding a linear SVM.
pipeline_steps = [
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(random_state=num_seed, tol=1e-5)),
]
model = Pipeline(pipeline_steps)
model.fit(X_train, y_train)

# Keep the held-out predictions: the classification-report cell reads `y_pred`.
y_pred = model.predict(X_test)

train_accuracy = model.score(X_train, y_train)
test_accuracy = accuracy_score(y_test, y_pred)
print('Training Accuracy:', train_accuracy)
print('Testing Accuracy:', test_accuracy)

Training Accuracy: 0.9968929955141843
Testing Accuracy: 0.8023743321735839


In [26]:
# Per-class precision/recall/F1 on the held-out split.
# Take the class list from `y_test` itself and pass it as both `labels` and `target_names`,
# so the row names are guaranteed to line up with the labels being scored (rather than
# relying on a separately recomputed category list from the full frame).
class_names = list(y_test.cat.categories)
print(classification_report(y_test, y_pred, labels=class_names, target_names=class_names))

              precision    recall  f1-score   support

        Gulf       0.80      0.91      0.85     59024
      Levant       0.80      0.73      0.76     34730
     Maghreb       0.83      0.64      0.72     22070
  Nile_Basin       0.78      0.80      0.79     21562

    accuracy                           0.80    137386
   macro avg       0.80      0.77      0.78    137386
weighted avg       0.80      0.80      0.80    137386



#### MultinomialNB    

In [15]:
# Bag-of-words pipeline (unigrams + bigrams, TF-IDF weighting) feeding multinomial Naive Bayes.
pipeline_steps = [
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('NB', MultinomialNB()),
]
model = Pipeline(pipeline_steps)
model.fit(X_train, y_train)

# Keep the held-out predictions: the classification-report cell reads `y_pred`.
y_pred = model.predict(X_test)

train_accuracy = model.score(X_train, y_train)
test_accuracy = accuracy_score(y_test, y_pred)
print('Training Accuracy:', train_accuracy)
print('Testing Accuracy:', test_accuracy)

Training Accuracy: 0.788346897840389
Testing Accuracy: 0.6550303524376574


In [16]:
# Per-class precision/recall/F1 on the held-out split.
# Take the class list from `y_test` itself and pass it as both `labels` and `target_names`,
# so the row names are guaranteed to line up with the labels being scored (rather than
# relying on a separately recomputed category list from the full frame).
class_names = list(y_test.cat.categories)
print(classification_report(y_test, y_pred, labels=class_names, target_names=class_names))

              precision    recall  f1-score   support

        Gulf       0.58      0.99      0.73     59116
      Levant       0.85      0.54      0.66     34639
     Maghreb       0.99      0.22      0.36     22029
  Nile_Basin       0.89      0.38      0.53     21602

    accuracy                           0.66    137386
   macro avg       0.83      0.53      0.57    137386
weighted avg       0.76      0.66      0.62    137386



In [27]:
# Persist the fitted pipeline to disk.
# NOTE(review): `model` is whichever pipeline was fitted most recently. Read top to bottom,
# that is the MultinomialNB pipeline (test accuracy ~0.66), not the LinearSVC one (~0.80).
# The execution counts suggest the saved artifact came from the LinearSVC run; confirm
# which model is meant to be exported.
dump(model, "ML_region_dialect.joblib")

['ML_region_dialect.joblib']