In [19]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix , f1_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from xgboost import XGBClassifier
import re
import string
import nltk
from nltk.corpus import stopwords
import gensim.downloader as api
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import scipy.stats as stats
import numpy as np

In [20]:
 nltk.download('stopwords')  # fetch the NLTK stopwords corpus that clean_text reads via stopwords.words('arabic')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Loading data

In [21]:
# Load the training and test CSV files into DataFrames.
# NOTE(review): these are hard-coded absolute paths (they look like a Colab
# working directory); the notebook will not run elsewhere without editing them.
train_data = pd.read_csv('/content/new data/ML_cleaned_train.csv')
test_data = pd.read_csv('/content/new data/ML_cleaned_test.csv')


# Data Validation

In [22]:
train_data.isna().sum()  # number of missing values in each column of the training frame

Text       1
Dialect    0
dtype: int64

In [23]:
train_data.dropna(axis=0 , inplace=True)  # drop every row that has a missing value; mutates train_data in place

In [24]:
test_data.dropna(axis=0 , inplace=True)  # same row-wise NaN removal for the test frame; mutates test_data in place

In [7]:
# Lazily populated cache of the Arabic stop-word list, so the NLTK corpus is
# read once instead of on every call to clean_text.
_ARABIC_STOP_WORDS = None


def _arabic_stop_words():
    """Return the NLTK Arabic stop words as a frozenset, loading them on first use."""
    global _ARABIC_STOP_WORDS
    if _ARABIC_STOP_WORDS is None:
        _ARABIC_STOP_WORDS = frozenset(stopwords.words('arabic'))
    return _ARABIC_STOP_WORDS


def clean_text(text):
    """Strip non-Arabic characters and Arabic stop words from a string.

    Parameters
    ----------
    text : str
        Raw text. Must be a string; a missing value (NaN/None) raises
        TypeError, so drop or fill nulls before applying this function.

    Returns
    -------
    str
        The remaining words joined by single spaces (possibly the empty string).
    """
    # Keep only characters in the ا..ي range plus whitespace. Digits fall
    # outside that range, so no separate digit-removal pass is needed.
    #
    # The previous version passed `re.I|re.A` as the 4th positional argument of
    # re.sub, which is `count`, not `flags`: it silently capped the number of
    # removed characters at 258 and applied no flags at all. No flags are
    # wanted here (re.A would restrict \s to ASCII whitespace), so the stray
    # argument is dropped.
    #
    # NOTE(review): the range starts at ا (U+0627), so the hamza-carrying
    # letters U+0621-U+0626 (ء آ أ ؤ إ ئ) are stripped as well; presumably the
    # input is already normalised - confirm.
    text = re.sub(r'[^ا-ي\s]', '', text)

    # split() already discards leading/trailing/repeated whitespace.
    stop_words = _arabic_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)

In [16]:
# Add a 'cleaned_text' column to both frames, then preview raw vs. cleaned text.
for frame in (train_data, test_data):
    frame['cleaned_text'] = frame['Text'].apply(clean_text)

train_data[['Text', 'cleaned_text']].head(40)

Unnamed: 0,Text,cleaned_text
0,واله بالعكس جو سمح صقع سم لين خلاص,واله بالعكس جو سمح صقع سم لين خلاص
1,ضربتها بالقلايه بكري,ضربتها بالقلايه بكري
2,يا استاذي الفاضل احنا عاصرنا ده محدش حكاهولنا,استاذي الفاضل احنا عاصرنا ده محدش حكاهولنا
3,هوينه هانيبال ولد العقيد متزوج لبنانيه عارضه ا...,هوينه هانيبال ولد العقيد متزوج لبنانيه عارضه ا...
4,السعاده حضرتك المنشن المحترمين,السعاده حضرتك المنشن المحترمين
5,واله فاصوليا بالكرشه الا باهيه الحق,واله فاصوليا بالكرشه الا باهيه الحق
6,رمضان قرب وبدو ايقصو الضي كا العاده,رمضان قرب وبدو ايقصو الضي كا العاده
7,الطلاب لوزاره التعليم:اذا مكنشي النهارده يبقي ...,الطلاب لوزاره التعليماذا مكنشي النهارده يبقي ب...
8,السنه لعبش كويس حتي انا ضد انه يطلع لمسه,السنه لعبش كويس حتي انا ضد انه يطلع لمسه
9,يا انت اكل بلوك يا انا اكيد يلي اكل بلوك مين هيدا,انت اكل بلوك انا اكيد يلي اكل بلوك مين هيدا


**The classes are not balanced, so we will use the macro F1 score for evaluation.**

# Data splitting

In [25]:
# Features are the 'Text' column; the target is the 'Dialect' label.
X, y = train_data['Text'], train_data['Dialect']

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=99,
)

In [26]:
# Show the shape of each split, one per line.
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape, sep='\n')

(94410,)
(23603,)
(94410,)
(23603,)


# Embedding

In [27]:
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader API.
# NOTE(review): going by its name, this model was trained on English news text,
# while the tweets in this dataset are Arabic. Presumably almost no token is in
# its vocabulary, so get_average_vector falls back to all-zero vectors. That
# would explain why the XGBoost accuracy recorded below equals the
# majority-class share of the validation split (9209 / 23603 ≈ 0.390).
# TODO confirm and replace with Arabic embeddings.
word2vec_model = api.load('word2vec-google-news-300')

In [28]:
def get_average_vector(sentence, model, num_features):
    """Average the embedding vectors of the words in `sentence`.

    The sentence is lower-cased and split on whitespace; only tokens for which
    `token in model` is true contribute. If no token is known, a zero vector
    of length `num_features` is returned instead.
    """
    known_vectors = []
    for token in sentence.lower().split():
        if token in model:
            known_vectors.append(model[token])

    # Guard clause: nothing to average.
    if not known_vectors:
        return np.zeros(num_features)

    return np.mean(known_vectors, axis=0)

In [29]:
num_features = word2vec_model.vector_size  # embedding dimensionality; also the length of the zero-vector fallback in get_average_vector


In [30]:
# Embed every training sentence as the mean of its known word vectors.
X_train_vectors = np.array(
    [get_average_vector(sentence, word2vec_model, num_features) for sentence in X_train]
)

# NOTE: despite the "test" in the name, these vectors are built from the validation split X_val.
X_test_vectors = np.array(
    [get_average_vector(sentence, word2vec_model, num_features) for sentence in X_val]
)


In [31]:
# Map the dialect labels to integer class ids. The encoder is fitted on the
# training labels only and then reused to encode the validation labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
# NOTE: named "test", but these are the encoded validation labels (y_val).
y_test_encoded = label_encoder.transform(y_val)

# XGBoost Model Training

In [77]:
# The encoded Dialect target has more than two classes (ids 0-4 in the
# classification report further down), so use the multi-class objective and
# its matching metric instead of the binary 'binary:logistic' / 'logloss' pair.
xgb_model = xgb.XGBClassifier(objective='multi:softprob', eval_metric='mlogloss')
xgb_model.fit(X_train_vectors, y_train_encoded)


In [78]:
# Evaluate the initial model
# Evaluate the initial (untuned) model on the held-out validation vectors.
# NOTE: X_test_vectors / y_test_encoded are derived from X_val / y_val, not from test_data.
y_pred = xgb_model.predict(X_test_vectors)
accuracy = accuracy_score(y_test_encoded, y_pred)
print("Initial Accuracy:", accuracy)

Initial Accuracy: 0.3901622675083676


In [79]:
f1 = f1_score(y_test_encoded, y_pred , average='macro')  # macro average: every class counts equally regardless of its size

In [80]:
print("Initial F1 Score:", f1)  # macro F1 of the untuned model on the validation split

Initial F1 Score: 0.11226722745420743


**Hyperparameter fine-tuning**

In [87]:
# Candidate hyperparameters: 3 x 3 x 3 = 27 combinations.
param_grid = dict(
    max_depth=[2, 4, 6],
    n_estimators=[25, 50, 100],
    learning_rate=[0.01, 0.1, 0.2],
)

# Exhaustive search with 3-fold cross-validation, ranked by macro F1.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=3,
    scoring='f1_macro',
    verbose=1,
)
grid_search.fit(X_train_vectors, y_train_encoded)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [88]:
print("Best Parameters:", grid_search.best_params_)  # combination with the best mean cross-validated macro F1


Best Parameters: {'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 50}


In [89]:
best_xgb_model = grid_search.best_estimator_  # the estimator GridSearchCV kept for the best parameter combination

In [90]:
# Evaluate the fine-tuned model
# Evaluate the fine-tuned model on the same validation vectors and labels
# that were used for the initial model, so the two sets of scores are comparable.
y_pred_tuned = best_xgb_model.predict(X_test_vectors)
accuracy_tuned = accuracy_score(y_test_encoded, y_pred_tuned)
f1_tuned = f1_score(y_test_encoded, y_pred_tuned, average='macro')

In [91]:
# NOTE(review): the recorded values below are identical to the initial model's
# accuracy and macro F1, i.e. tuning changed nothing. Presumably the input
# features carry no signal - check the embedding step.
print("Tuned Accuracy:", accuracy_tuned)
print("Tuned F1 Score:", f1_tuned)

Tuned Accuracy: 0.3901622675083676
Tuned F1 Score: 0.11226722745420743


In [92]:
# `pickle` is otherwise first imported in a later cell, so on a fresh
# "Restart & Run All" this cell raised NameError. Import it here so the cell
# works regardless of what ran before.
import pickle

# Persist the tuned XGBoost model to disk.
with open('xgb2.pkl', 'wb') as f:
    pickle.dump(best_xgb_model, f)


# TF-IDF with Multinomial Naive Bayes



In [93]:
# Learn the TF-IDF vocabulary and weights from the training texts only.
tfidf_vec = TfidfVectorizer()
X_train_tfidf = tfidf_vec.fit_transform(X_train)

# Transform the validation texts (X_val) with the already-fitted vectorizer.
X_test_tfidf = tfidf_vec.transform(X_val)

In [94]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF features, then
# predict the encoded dialect for the validation split.
clf_nb = MultinomialNB()
clf_nb.fit(X_train_tfidf, y_train_encoded)
nb_y_predict = clf_nb.predict(X_test_tfidf)

In [95]:
print(classification_report(y_test_encoded, nb_y_predict))  # per-class precision/recall/F1 on the validation split; class ids are the LabelEncoder codes


              precision    recall  f1-score   support

           0       0.61      0.99      0.76      9209
           1       0.95      0.68      0.79      4439
           2       0.83      0.67      0.74      5828
           3       0.99      0.28      0.44      1883
           4       0.99      0.12      0.22      2244

    accuracy                           0.72     23603
   macro avg       0.88      0.55      0.59     23603
weighted avg       0.80      0.72      0.68     23603



In [96]:
import pickle

# Serialize the fitted Naive Bayes classifier to 'nb_model2.pkl'.
# NOTE(review): tfidf_vec and label_encoder are not saved alongside it;
# presumably both are needed to use this model on new text - confirm.
with open('nb_model2.pkl', 'wb') as model_file:
    pickle.dump(clf_nb, model_file)
