In [6]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix, classification_report

from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC 
from sklearn.svm import LinearSVC 

# from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE
from imblearn.metrics import classification_report_imbalanced

import pickle
import joblib

In [7]:
# Load the pre-tokenized dataset; the pure-Python parser engine is requested explicitly.
df = pd.read_csv(filepath_or_buffer="../input/tokenized-df/tokenized_df.csv", engine="python")

In [8]:
# Remove rows containing any missing value, then rows whose `text` is the literal
# string '[]' (presumably an empty token list -- confirm against the tokenizer).
# `df` is rebound rather than mutated with inplace=True: the resulting rows are the
# same, but each statement now yields a new frame instead of altering the loaded one.
df = df.dropna()
df = df[df.text != '[]']

In [9]:
# Sanity check: count the remaining '[]' texts and missing dialect labels.
int((df.text == '[]').sum()), int(df.dialect.isna().sum())

(0, 0)

In [10]:
# Features are the `text` column, targets are the `dialect` column, both as arrays.
X = df["text"].values
Y = df["dialect"].values

X.shape

(457953,)

In [11]:
# Hold out 30% of the samples for evaluation; the fixed seed makes the shuffled
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    shuffle=True,
    random_state=911,
)

x_train.shape, x_test.shape

((320567,), (137386,))

In [12]:
# Classification pipeline: TF-IDF features -> SMOTE oversampling -> linear SVM.
# The imblearn `make_pipeline` is used (not sklearn's) because it accepts sampler
# steps; samplers are applied while fitting and skipped when predicting.
clf = make_pipeline(
    # 1- to 5-gram features with sublinear (1 + log(tf)) term-frequency scaling.
    TfidfVectorizer(ngram_range=(1, 5), binary=False, sublinear_tf=True),
    # `sampling_strategy` must be given by keyword: SMOTE's parameters are
    # keyword-only in current imbalanced-learn, so SMOTE("minority") raises
    # TypeError there. "minority" resamples only the minority class.
    # NOTE(review): no random_state is set, so the synthetic samples (and the
    # resulting metrics) can differ between runs -- consider seeding it.
    SMOTE(sampling_strategy="minority"),
    LinearSVC(random_state=0, tol=1e-5),
)

clf.fit(x_train, y_train)



Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(ngram_range=(1, 5), sublinear_tf=True)),
                ('smote', SMOTE(sampling_strategy='minority')),
                ('linearsvc', LinearSVC(random_state=0, tol=1e-05))])

In [13]:
# Predict labels for the held-out texts and print the imbalanced-learn report.
# Its columns are precision, recall, specificity, f1, geometric mean,
# index balanced accuracy and support, per class.
y_pred = clf.predict(x_test)
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

                   pre       rec       spe        f1       geo       iba       sup

         AE       0.43      0.42      0.97      0.42      0.63      0.38      8008
         BH       0.43      0.27      0.98      0.33      0.51      0.24      7990
         DZ       0.64      0.49      0.99      0.56      0.70      0.46      4882
         EG       0.60      0.89      0.91      0.72      0.90      0.82     17166
         IQ       0.64      0.51      0.99      0.57      0.71      0.48      4649
         JO       0.45      0.30      0.98      0.36      0.54      0.27      8304
         KW       0.46      0.61      0.93      0.53      0.75      0.55     12721
         LB       0.59      0.70      0.97      0.64      0.82      0.66      8204
         LY       0.66      0.66      0.97      0.66      0.80      0.62     10901
         MA       0.74      0.57      0.99      0.65      0.76      0.55      3465
         OM       0.47      0.32      0.98      0.38      0.56      0.30      5703
   

In [14]:
# Persist the whole fitted pipeline (vectorizer + classifier) to disk.
# joblib files are pickle-based: only load them back from trusted sources.
joblib.dump(value=clf, filename='classification_pipeline.pkl')

['classification_pipeline.pkl']