In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the preprocessed dataset; the file is tab-separated and UTF-8 encoded.
df = pd.read_csv(
    '../PreprocessedData/preprocessed_data.csv',
    sep='\t',
    encoding='utf-8',
)

In [3]:
# Preview five random rows. The seed is pinned (same value as the split and the
# classifier) so the preview is identical on every Restart & Run All.
df.sample(5, random_state=42)

Unnamed: 0,id,text,dialect,preprocessed_text
343951,1176935779500773376,لما يقصون عليه اول مرة نقول طيب وثقته زايدة با...,KW,لما يقصون عليه اول مرة نقول طيب وثقته زايدة با...
270826,915966586175049600,@a_berber07 @MahmoudAttyaAid ولسه انت شوفت حاج...,EG,[مستخدم] [مستخدم] ولسه انت شوفت حاجة . . ترحال...
75680,962999639183065216,@Dhahi_Khalfan نفس اللي سواه ولد سلمان بس بطري...,QA,[مستخدم] نفس اللي سواه ولد سلمان بس بطريقه راق...
104454,872882143965437952,مش بعيد بكرة نشوف اليهود بيقزدروا في شوارع مكة...,PL,مش بعيد بكرة نشوف اليهود بيقزدروا في شوارع مكة...
144674,946128795039420416,@hany_ms 😈 ماهو يبداو بيك الاول هههههههه,TN,[مستخدم] ماهو يبداو بيك الاول هه


In [4]:
# Discard rows whose preprocessed text is missing; they cannot be vectorized.
# The result is rebound to `df` instead of using inplace=True, which avoids
# mutating the loaded frame in place (in-place mode gives no performance gain).
df = df.dropna(subset=['preprocessed_text'])

In [5]:
# Hold out 10% of the rows for validation. The split is stratified on the
# dialect label and seeded so that it is reproducible.
X_tr, X_val = train_test_split(
    df,
    test_size=0.1,
    stratify=df['dialect'],
    random_state=42,
)

In [6]:
# (rows, columns) of the training split followed by the validation split.
tuple(split.shape for split in (X_tr, X_val))

((412342, 4), (45816, 4))

In [7]:
# Identifier of the pretrained checkpoint handed to AutoTokenizer below.
model_name = "aubmindlab/bert-base-arabertv02-twitter"
# Only the tokenizer is loaded in this cell; tokenize() uses it to split text
# for the TF-IDF vectorizer.
arabert_tokenizer = AutoTokenizer.from_pretrained(model_name)

In [8]:
def tokenize(text):
    """Split ``text`` into tokens with the module-level ``arabert_tokenizer``.

    Thin adapter so the tokenizer can be passed as the ``tokenizer`` callable
    of ``TfidfVectorizer``.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``arabert_tokenizer.tokenize(text)`` returns, unchanged.
    """
    return arabert_tokenizer.tokenize(text)

In [9]:
# TF-IDF over unigrams and bigrams of the tokens produced by tokenize().
tfidf = TfidfVectorizer(
    tokenizer=tokenize,    # custom callable replaces the built-in regex tokenizer
    token_pattern=None,    # regex pattern is unused once a tokenizer is supplied
    lowercase=False,       # leave the text's case untouched
    ngram_range=(1, 2),
    min_df=2,              # ignore terms that occur in fewer than two documents
)

In [10]:
# Learn the vocabulary and IDF weights from the training split only, so the
# validation text does not influence the features.
# NOTE(review): the training text is tokenized again by transform() in a later
# cell; fit_transform() would avoid that second pass — consider merging.
tfidf.fit(X_tr['preprocessed_text'])

TfidfVectorizer(lowercase=False, min_df=2, ngram_range=(1, 2),
                token_pattern=None,
                tokenizer=<function tokenize at 0x000001EA81D64708>)

In [11]:
# Number of features kept by the vectorizer. `vocabulary_` maps every learned
# term to its column index, so its size equals the feature count. This replaces
# get_feature_names(), which was deprecated in scikit-learn 1.0 and removed in
# 1.2, and avoids materialising the full list of names just to count them.
len(tfidf.vocabulary_)

816817

In [12]:
# Project both splits (training first, then validation) into the TF-IDF space
# learned from the training text.
X_tr_tf, X_val_tf = (
    tfidf.transform(split['preprocessed_text']) for split in (X_tr, X_val)
)

In [13]:
# Linear SVM on the TF-IDF features with the dialect column as the target.
# verbose=True lets the underlying solver print its progress.
model = LinearSVC(
    C=10,
    dual=True,
    random_state=42,
    verbose=True,
)
model.fit(X_tr_tf, X_tr['dialect'])

[LibLinear]

LinearSVC(C=10, random_state=42, verbose=True)

In [14]:
# Score of the freshly trained model on the held-out validation split.
print(model.score(X=X_val_tf, y=X_val['dialect']))

0.5617033350794483


In [15]:
# Destination paths, relative to the notebook's working directory, for the
# pickled classifier (filename1) and the pickled TF-IDF vectorizer (filename2).
filename1 = '../models/svm_model.sav'
filename2 = '../models/tfidf.pkl'

In [16]:
import pickle

# Persist the trained classifier and the fitted vectorizer. Each file is opened
# in a `with` block so the handle is flushed and closed as soon as the dump
# finishes; the bare open(...) used before left closing to garbage collection,
# which risks a truncated file when it is read back immediately.
with open(filename1, 'wb') as model_file:
    pickle.dump(model, model_file)

with open(filename2, 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

In [17]:
# Reload the classifier from disk to verify the round trip. The `with` block
# closes the file handle deterministically instead of leaking it.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook
# (or another trusted source) produced.
with open(filename1, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [18]:
# Sanity check: score the reloaded model on the same validation features used
# for the in-memory model.
print(loaded_model.score(X=X_val_tf, y=X_val['dialect']))

0.5617033350794483
