# Introduction

# Imortant Libraries

In [49]:
! pip install contractions
! pip install emoji
! pip install arabic_reshaper python-bidi nltk snowballstemmer



In [50]:
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)
import plotly.express as px

In [51]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.base import BaseEstimator, TransformerMixin
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
import emoji
import arabic_reshaper
from bidi.algorithm import get_display
from snowballstemmer import stemmer
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

In [52]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load Dataset

In [53]:
# read data
dialects_data = pd.read_csv('/kaggle/input/dialects-db/dialects_data.csv', engine='python')

In [54]:
# show the first five rows of the dataframe
dialects_data.head()

Unnamed: 0,id,text,dialect
0,1009754958479151232,@toha_Altomy @gy_yah ŸÇŸÑŸäŸÑŸäŸÜ ÿßÿØÿ® ŸàŸÖŸÜÿßŸÅŸÇŸäŸÜ. ŸÑŸà ÿßÿÆÿ™ŸáŸÖ ÿßŸà ŸÇÿ±Ÿäÿ®ÿ™ŸáŸÖ ÿ™ÿ™ÿπÿßŸÉÿ≥ ÿ™ŸÇŸàŸÑŸä ÿπŸÑŸäŸáŸÖ ŸÖŸÜ ŸÜÿ¥ÿßÿ∑ ÿ≠ŸÇŸàŸÇ ÿßŸÑŸÖÿ±ÿ£ÿ© ŸÖŸÜ ÿ±ÿØÿ© ŸÅÿπŸÑŸáŸÖ.,LY
1,1009794751548313600,@AlmFaisal üòÇüòÇ ÿßŸÑŸÑŸäÿ®ŸäŸäŸÜ ŸÖÿ™ŸÇŸÑÿ®ŸäŸÜ!!!\nÿ®ÿ≥ ÿ®ÿßŸÑŸÜÿ≥ÿ®ÿ© ŸÑŸäÿß ÿßŸÜÿß ŸÖŸäŸÑŸäÿ¥ŸäÿßŸàŸä ÿ≤ŸÖÿßŸÜ Ÿàÿ™Ÿàÿ©,LY
2,1019989115490787200,@smsm071990 @ALMOGRBE ŸÉŸÑ 20 ÿ™ÿßŸÜŸäŸá ÿ¥ÿßÿ® ŸÑŸäÿ®Ÿä ÿ®Ÿäÿ±ÿ™ÿßÿ≠ ŸÑÿ®ŸÜÿ™ ŸÖÿÆÿ™ŸÑŸÅÿ© ŸàŸäŸÑÿßÿ≠ÿ∏ ÿßŸÜŸáÿß ÿ∫Ÿäÿ± ŸÉŸÑ ÿßŸÑÿ®ŸÜÿßÿ™ Ÿàÿ®Ÿäÿ≠ÿ≥ ŸÉÿ£ŸÜŸá Ÿäÿπÿ±ŸÅŸáÿß ŸÖŸÜ ÿ≤ŸÖÿßŸÜ. ÿ®ÿπÿØŸäŸÜ Ÿäÿ™ÿ≤Ÿàÿ¨ Ÿàÿ≠ÿØÿ© ŸÖŸÜŸáŸÜ ŸàŸÖŸÖŸÉŸÜ ÿßÿ´ŸÜŸäŸÜ ŸàŸÑÿßÿ´ŸÑÿßÿ´ÿ© Ÿàÿ™ŸÜŸÇŸÑÿ® ÿßŸÑÿ±ŸàŸÖÿßŸÜÿ≥Ÿäÿ© ŸÑÿπŸäÿßÿ∑ Ÿàÿ¥Ÿäÿßÿ∑ Ÿàÿ™Ÿáÿ≤Ÿäÿ®\nand they live happily ever after\nÿ∞Ÿä ÿßŸÜÿØ,LY
3,1035479791758135168,@AboryPro @lyranoo85 ÿ±ÿßŸÜŸäÿß ÿπŸÇŸÑŸäÿ™ŸÉ ŸÖÿ™ÿÆŸÑŸÅÿ©. ÿßŸàŸÑÿß ÿßŸÑÿßŸÜÿ≥ÿßŸÜ ŸäŸÑŸä Ÿäÿ≠ÿ™ÿßÿ¨ ÿßŸáŸÑ ŸäÿÆÿßŸÅ ŸÖŸÜŸáŸÖ ÿπŸÑÿ¥ÿßŸÜ ŸäŸÉŸàŸÜ ŸÖÿ≠ÿ™ÿ±ŸÖ ŸáŸà ÿßŸÜÿ≥ÿßŸÜ ŸÇŸÑŸäŸÑ ÿßŸÑÿßÿØÿ® ÿßÿµŸÑÿßŸã. ÿ´ÿßŸÜŸäÿßŸã ÿ¥ŸÜ ÿ∞ŸÜÿ® ŸäŸÑŸä ŸÖÿπŸÜÿØŸáÿ¥ ÿßÿ® ŸàŸÑÿß ÿßŸÖ ŸàŸÑÿß ÿÆŸàÿ™ ŸàŸÑÿß ÿÆŸàÿßÿ™ÿü ŸäÿπŸÜŸä ÿßŸÑŸäÿ™ŸäŸÖÿ© ŸÖÿ™ÿ≥ÿ™ÿ≠ŸÇÿ¥ ÿ™ÿ™ÿ≤Ÿàÿ¨ÿü Ÿàÿ´ÿßŸÑÿ´ÿßŸã ŸÑŸäÿ¥ ÿßŸÑÿ®ŸÜÿ™ ŸáŸä ÿ®ÿ≥ ŸÑÿßÿ≤ŸÖ ÿßÿØŸäÿ± ÿßŸÑŸÅ ÿ≠ÿ≥ÿßÿ® ŸÑŸÑÿ±ÿßÿ¨ŸÑÿü ŸáŸä ŸÖÿ™ÿ≥ÿ™ÿ≠ŸÇÿ¥ ŸäŸÜÿØÿßÿ±ŸÑŸáÿß ÿßŸÑŸÅ ÿ≠ÿ≥ÿßÿ® ŸàŸÑÿß ŸáŸä ÿπÿ®ÿØÿ©ÿü,LY
4,1035481122921164800,@lyranoo85 ÿ¥ŸÉŸÑŸÉ ŸÖÿ™ÿπŸÇÿØÿ© ÿπŸÑÿ¥ÿßŸÜ ÿßŸÑÿ±ÿßÿ¨ŸÑ ŸÑŸä ÿ™ÿ≠ÿ®ŸäŸá ÿßÿ≤Ÿàÿ¨ ÿ®ŸÜÿ™ Ÿäÿ™ŸäŸÖÿ© ŸàŸÑÿß ÿ®ŸÜÿ™ ŸÖÿπŸÜÿØŸáÿ¥ ÿÆŸàÿ™. ŸáÿØŸä ÿßÿπÿµÿßÿ®ŸÉ ŸàŸÅŸÉŸäŸÜÿß ŸÖŸÜ ÿßŸÑÿ™ÿÆŸÑŸÅ ÿßŸÖÿ™ÿßÿπŸÉ,LY


# EDA and Preprocessing


In [55]:
dialects_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147734 entries, 0 to 147733
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   id       147734 non-null  object
 1   text     147732 non-null  object
 2   dialect  147718 non-null  object
dtypes: object(3)
memory usage: 3.4+ MB


## check NaNs

In [56]:
dialects_data.isna().sum()

id          0
text        2
dialect    16
dtype: int64

**we will delete rows with missing values**

In [57]:
# drop rows with missing values
dialects_data = dialects_data.dropna()

In [58]:
#check for missing values ag
dialects_data.isna().sum()

id         0
text       0
dialect    0
dtype: int64

## check duplicates
we need drop id column before check duplicates

In [59]:
# Drop the 'id' column
dialects_data = dialects_data.drop(columns=['id'])

In [60]:
dialects_data.head()

Unnamed: 0,text,dialect
0,@toha_Altomy @gy_yah ŸÇŸÑŸäŸÑŸäŸÜ ÿßÿØÿ® ŸàŸÖŸÜÿßŸÅŸÇŸäŸÜ. ŸÑŸà ÿßÿÆÿ™ŸáŸÖ ÿßŸà ŸÇÿ±Ÿäÿ®ÿ™ŸáŸÖ ÿ™ÿ™ÿπÿßŸÉÿ≥ ÿ™ŸÇŸàŸÑŸä ÿπŸÑŸäŸáŸÖ ŸÖŸÜ ŸÜÿ¥ÿßÿ∑ ÿ≠ŸÇŸàŸÇ ÿßŸÑŸÖÿ±ÿ£ÿ© ŸÖŸÜ ÿ±ÿØÿ© ŸÅÿπŸÑŸáŸÖ.,LY
1,@AlmFaisal üòÇüòÇ ÿßŸÑŸÑŸäÿ®ŸäŸäŸÜ ŸÖÿ™ŸÇŸÑÿ®ŸäŸÜ!!!\nÿ®ÿ≥ ÿ®ÿßŸÑŸÜÿ≥ÿ®ÿ© ŸÑŸäÿß ÿßŸÜÿß ŸÖŸäŸÑŸäÿ¥ŸäÿßŸàŸä ÿ≤ŸÖÿßŸÜ Ÿàÿ™Ÿàÿ©,LY
2,@smsm071990 @ALMOGRBE ŸÉŸÑ 20 ÿ™ÿßŸÜŸäŸá ÿ¥ÿßÿ® ŸÑŸäÿ®Ÿä ÿ®Ÿäÿ±ÿ™ÿßÿ≠ ŸÑÿ®ŸÜÿ™ ŸÖÿÆÿ™ŸÑŸÅÿ© ŸàŸäŸÑÿßÿ≠ÿ∏ ÿßŸÜŸáÿß ÿ∫Ÿäÿ± ŸÉŸÑ ÿßŸÑÿ®ŸÜÿßÿ™ Ÿàÿ®Ÿäÿ≠ÿ≥ ŸÉÿ£ŸÜŸá Ÿäÿπÿ±ŸÅŸáÿß ŸÖŸÜ ÿ≤ŸÖÿßŸÜ. ÿ®ÿπÿØŸäŸÜ Ÿäÿ™ÿ≤Ÿàÿ¨ Ÿàÿ≠ÿØÿ© ŸÖŸÜŸáŸÜ ŸàŸÖŸÖŸÉŸÜ ÿßÿ´ŸÜŸäŸÜ ŸàŸÑÿßÿ´ŸÑÿßÿ´ÿ© Ÿàÿ™ŸÜŸÇŸÑÿ® ÿßŸÑÿ±ŸàŸÖÿßŸÜÿ≥Ÿäÿ© ŸÑÿπŸäÿßÿ∑ Ÿàÿ¥Ÿäÿßÿ∑ Ÿàÿ™Ÿáÿ≤Ÿäÿ®\nand they live happily ever after\nÿ∞Ÿä ÿßŸÜÿØ,LY
3,@AboryPro @lyranoo85 ÿ±ÿßŸÜŸäÿß ÿπŸÇŸÑŸäÿ™ŸÉ ŸÖÿ™ÿÆŸÑŸÅÿ©. ÿßŸàŸÑÿß ÿßŸÑÿßŸÜÿ≥ÿßŸÜ ŸäŸÑŸä Ÿäÿ≠ÿ™ÿßÿ¨ ÿßŸáŸÑ ŸäÿÆÿßŸÅ ŸÖŸÜŸáŸÖ ÿπŸÑÿ¥ÿßŸÜ ŸäŸÉŸàŸÜ ŸÖÿ≠ÿ™ÿ±ŸÖ ŸáŸà ÿßŸÜÿ≥ÿßŸÜ ŸÇŸÑŸäŸÑ ÿßŸÑÿßÿØÿ® ÿßÿµŸÑÿßŸã. ÿ´ÿßŸÜŸäÿßŸã ÿ¥ŸÜ ÿ∞ŸÜÿ® ŸäŸÑŸä ŸÖÿπŸÜÿØŸáÿ¥ ÿßÿ® ŸàŸÑÿß ÿßŸÖ ŸàŸÑÿß ÿÆŸàÿ™ ŸàŸÑÿß ÿÆŸàÿßÿ™ÿü ŸäÿπŸÜŸä ÿßŸÑŸäÿ™ŸäŸÖÿ© ŸÖÿ™ÿ≥ÿ™ÿ≠ŸÇÿ¥ ÿ™ÿ™ÿ≤Ÿàÿ¨ÿü Ÿàÿ´ÿßŸÑÿ´ÿßŸã ŸÑŸäÿ¥ ÿßŸÑÿ®ŸÜÿ™ ŸáŸä ÿ®ÿ≥ ŸÑÿßÿ≤ŸÖ ÿßÿØŸäÿ± ÿßŸÑŸÅ ÿ≠ÿ≥ÿßÿ® ŸÑŸÑÿ±ÿßÿ¨ŸÑÿü ŸáŸä ŸÖÿ™ÿ≥ÿ™ÿ≠ŸÇÿ¥ ŸäŸÜÿØÿßÿ±ŸÑŸáÿß ÿßŸÑŸÅ ÿ≠ÿ≥ÿßÿ® ŸàŸÑÿß ŸáŸä ÿπÿ®ÿØÿ©ÿü,LY
4,@lyranoo85 ÿ¥ŸÉŸÑŸÉ ŸÖÿ™ÿπŸÇÿØÿ© ÿπŸÑÿ¥ÿßŸÜ ÿßŸÑÿ±ÿßÿ¨ŸÑ ŸÑŸä ÿ™ÿ≠ÿ®ŸäŸá ÿßÿ≤Ÿàÿ¨ ÿ®ŸÜÿ™ Ÿäÿ™ŸäŸÖÿ© ŸàŸÑÿß ÿ®ŸÜÿ™ ŸÖÿπŸÜÿØŸáÿ¥ ÿÆŸàÿ™. ŸáÿØŸä ÿßÿπÿµÿßÿ®ŸÉ ŸàŸÅŸÉŸäŸÜÿß ŸÖŸÜ ÿßŸÑÿ™ÿÆŸÑŸÅ ÿßŸÖÿ™ÿßÿπŸÉ,LY


In [61]:
# Check for duplicates
duplicates = dialects_data.duplicated()

# Print the number of duplicate rows
print(f"Number of duplicate rows: {duplicates.sum()}")

Number of duplicate rows: 0


## check dataset balancing

In [62]:
# Plot the distribution of 'dialect' column with colorful bars
fig = px.histogram(dialects_data, x='dialect', color='dialect',
                   title='Distribution of Dialects',
                   labels={'dialect': 'Dialect', 'count': 'Frequency'},
                   color_discrete_sequence=px.colors.qualitative.Set1)
fig.update_xaxes(categoryorder='total descending')
fig.update_layout(xaxis_title="Dialect", yaxis_title="Frequency")
fig.show()





## **Ooh there is big imbalance here**
We Can deal with that in difference ways.

Here are a few strategies we can do to handle imbalanced datasets:

*    **Resampling Techniques:**
        Over-sampling: Increase the number of minority class samples by randomly duplicating them or generating synthetic samples (e.g., using SMOTE).
        Under-sampling: Decrease the number of majority class samples by randomly removing them.
        Combined over- and under-sampling: A combination of over-sampling the minority class and under-sampling the majority class.

*    **Algorithmic Techniques:**
        Use algorithms that are robust to class imbalance, such as tree-based algorithms (Random Forest, Gradient Boosting) or ensemble methods.
        Adjust class weights in the model to penalize misclassification of minority classes more.

*    **Evaluation Metrics:**
        Choose appropriate evaluation metrics that are less sensitive to class imbalance, such as precision, recall, F1-score.



## Show a representative sample of data texts to find out required cleaning steps.

In [63]:
sample_data = dialects_data.sample(10)
sample_data

Unnamed: 0,text,dialect
10122,@MohammedSafar93 @arabqoute ŸàŸäÿßÿ±ÿ™ŸÜŸâ ŸÖŸÉŸÜÿ™ÿ¥ ŸÑÿπÿ®ÿ™ ŸÖÿπÿßŸÉÿåÿåÿå ŸäŸÑÿπŸÜ ÿßÿ®Ÿà ÿ¥ŸÉŸÑŸÉ,LY
43981,ŸäŸÑÿßŸá ÿßŸÑŸÑŸä ŸÜÿßÿπÿ≥ ŸäŸÅŸäŸÇ ŸàÿßŸÑŸÑŸä ŸÖÿ≤ÿßŸÑ ŸÅÿßŸäŸÇ ŸäŸÖÿ¥Ÿä ŸäŸÜÿπÿ≥..,MA
20144,@jeedr2004 ŸàŸäŸÜ ÿßŸÖ ÿßŸÑŸÇŸÜÿØŸäŸÑ ÿ¥ÿ±ŸÇ ŸàŸÑÿß ÿ∫ÿ±ÿ® !,LY
7610,ŸÖÿ¥ŸÉŸÑŸá ÿßŸéŸä Ÿàÿ≤Ÿäÿ± ÿπŸÜÿØŸÜÿß ŸÖŸäÿ≠ÿ∑ÿ¥ ÿÆÿ∑Ÿá ÿ≠ÿ™Ÿâ ŸÖÿ™Ÿàÿ≥ÿ∑Ÿá ÿßŸÑÿ£ŸÖÿØ ŸàŸäÿ¥ÿ™ÿ∫ŸÑ ÿπŸÑŸäŸáÿß ÿ≠ÿ™Ÿâ ŸÑŸà ŸáŸàÿß ÿ™ÿ±ŸÉ ÿßŸÑŸàÿ≤ÿßÿ±Ÿá ŸäŸÉŸÖŸÑ ÿßŸÑŸÑŸä ÿ®ÿπÿØŸá \nŸÉŸÑ ÿ≠ŸÑŸàŸÑŸÜÿß ŸàŸÇÿ™ŸäŸá ŸÖŸÜ ÿßÿ¨ŸÑ ÿßŸÜÿ™ÿµÿßÿ±ÿßÿ™ ŸÑŸÑÿØÿπÿßŸäÿ© ÿßŸÑÿ≥Ÿäÿßÿ≥Ÿäÿ© ŸàÿßŸÑŸàÿ≤Ÿäÿ± ÿßŸÑŸÑŸä Ÿäÿ¨Ÿä Ÿäÿ®ÿØÿß ŸÖŸÜ ÿßŸÑÿµŸÅÿ±,LY
9206,@Aliswan16 ÿßŸÜÿ™ ÿ®ÿßŸÑÿ∞ÿßÿ™ ŸÖŸÅÿ±Ÿàÿ∂ Ÿäÿ≠ÿ®ÿ≥ŸàŸÉ ŸÅŸä ŸÑŸäÿ®Ÿäÿß ÿ™ÿ∑ŸÑÿπ ÿßÿØŸäÿ± ŸÑÿπÿßÿ±,LY
87281,ÿßŸÖÿ™Ÿâ ÿ≠ŸÜŸÇÿ™ŸÜÿπ ÿØÿßÿÆŸÑŸäÿß ÿ®ÿ¨ÿØ ÿßŸÜ ÿßŸÑÿ≥Ÿäÿßÿ≥ÿ© ŸÖŸÑŸáÿßÿ¥ ÿØÿπŸàÿ© ÿ®ÿßŸÑÿØŸäŸÜ ŸàÿßŸÇÿ™ŸÜÿßÿπŸÜÿß ŸäŸÉŸàŸÜ ÿ≠ŸÇŸäŸÇŸâ ŸÑÿØÿ±ÿ¨ÿ© ÿßŸÜŸá Ÿäÿ∏Ÿáÿ± ŸÅŸâ ŸÉŸÑÿßŸÖŸÜÿß.\nÿßŸÜÿß ŸÖÿ≥Ÿäÿ≠Ÿäÿ© ŸàŸÅÿÆŸàÿ±ÿ© ÿßŸÜ ÿßŸÑŸÑŸâ ÿ®Ÿäÿ≠ŸÉŸÖŸÜŸâ #ÿßŸÑÿ≥Ÿäÿ≥Ÿä ŸàŸÖÿ¥ ŸÅÿßÿ±ŸÇ ŸÖÿπÿßŸäÿß ÿØŸäŸÜŸá\nŸàŸÑŸà ÿßÿ™ÿ≠ÿ∑Ÿäÿ™ ŸÅŸâ ÿßÿÆÿ™Ÿäÿßÿ± ÿ®ŸäŸÜŸá Ÿàÿ®ŸäŸÜ ŸÖÿ≥Ÿäÿ≠Ÿâ ÿßŸÇŸÑ ŸÅŸâ ÿßŸÑŸÉŸÅÿßÿ°ÿ© ŸàÿßŸÑÿßÿÆŸÑÿßÿµ ŸàÿßŸÑÿßŸÖÿßŸÜÿ© ÿßŸÉŸäÿØ ÿ≠ÿßÿÆÿ™ÿßÿ± #ÿßŸÑÿ≥Ÿäÿ≥Ÿä,EG
55463,@MegaFMeg @omertaher ÿßÿ≠ŸÜÿß ÿπÿßŸäÿ¥ŸäŸÜ ÿπÿµÿ± ÿßÔªªŸàŸÅÿ± ŸàÿßŸÑÿ™ÿ±ÿ®ÿµÿåŸàŸÑŸà ÿØŸá ŸáŸäÿ®ŸÇŸä ÿßŸÑŸÖŸÜÿ∑ŸÇ ŸáŸÜÔªªŸÇŸä ÿßÿ¨ÿßŸÜÿ® ÿßÔªªÿÆÿ± ÿ¥ÿ®ŸáŸàÿß ÿßŸÑÿ±ÿ¶Ÿäÿ≥ ÿ®ŸÖÿ±ÿ≥Ÿä ŸÅ ŸÖÿ¥ ŸáŸÜÿÆŸÑÿµ ÿßÿ™ŸáÿßŸÖÿßÿ™,EG
146938,@alimoha73879480 @OMDz6ktXClBIUos ŸÖÿß ÿ±ÿØŸäÿ™ ÿπŸÑŸâ ÿßŸÑÿ≥ÿ§ÿßŸÑüòÅ,SD
73270,ŸáŸÑ ÿ™ÿµÿ®ÿ≠ ŸÅŸäŸÅŸâ ÿπÿ®ÿØŸá ŸÜÿ¨ŸÖÿ© Ÿáÿ∞ÿß ÿßŸÑÿ¥Ÿáÿ± ÿü https://t.co/hKFIpwuUuo,EG
19234,@AlhoneS ŸÑŸäŸá Ÿáÿßÿ±ÿ® ÿ≥ÿßÿπÿ™ŸäŸÜ Ÿàÿ±ÿ®ÿπ ÿ®ÿ≤ÿ®ÿ∑,LY


## Cleaning steps that we will need are:

- Remove Tatweel
- Remove digits and symbols
- Remove emojis
- Remove URLs
- Remove usernames
- Remove non-Arabic characters
- Remove extra spaces
- Remove hashtags
- Remove un-ASCII characters
- Handling Noise and Garbage Characters
- Handling Bi-directional Text

## Split data into features and target

In [64]:
# Separate features (X) and target variable (y)
X_text = dialects_data['text']  # Text features
y = dialects_data['dialect']  # Target variable

In [65]:
label_encoder = LabelEncoder()
label_encoder.fit(y)


In [66]:
y = label_encoder.transform(y)

In [67]:
y = pd.DataFrame(y, columns=['dialect'])

In [68]:
y.shape

(147718, 1)

In [69]:
y.value_counts()

dialect
0          57635
2          36499
1          27617
4          14433
3          11534
Name: count, dtype: int64

In [70]:
X_text.head()

0                                                                                                                                                                                          @toha_Altomy @gy_yah ŸÇŸÑŸäŸÑŸäŸÜ ÿßÿØÿ® ŸàŸÖŸÜÿßŸÅŸÇŸäŸÜ. ŸÑŸà ÿßÿÆÿ™ŸáŸÖ ÿßŸà ŸÇÿ±Ÿäÿ®ÿ™ŸáŸÖ ÿ™ÿ™ÿπÿßŸÉÿ≥ ÿ™ŸÇŸàŸÑŸä ÿπŸÑŸäŸáŸÖ ŸÖŸÜ ŸÜÿ¥ÿßÿ∑ ÿ≠ŸÇŸàŸÇ ÿßŸÑŸÖÿ±ÿ£ÿ© ŸÖŸÜ ÿ±ÿØÿ© ŸÅÿπŸÑŸáŸÖ.
1                                                                                                                                                                                                                                   @AlmFaisal üòÇüòÇ ÿßŸÑŸÑŸäÿ®ŸäŸäŸÜ ŸÖÿ™ŸÇŸÑÿ®ŸäŸÜ!!!\nÿ®ÿ≥ ÿ®ÿßŸÑŸÜÿ≥ÿ®ÿ© ŸÑŸäÿß ÿßŸÜÿß ŸÖŸäŸÑŸäÿ¥ŸäÿßŸàŸä ÿ≤ŸÖÿßŸÜ Ÿàÿ™Ÿàÿ©
2                                                                @smsm071990 @ALMOGRBE ŸÉŸÑ 20 ÿ™ÿßŸÜŸäŸá ÿ¥ÿßÿ® ŸÑŸäÿ®Ÿä ÿ®Ÿäÿ±ÿ™ÿßÿ≠ ŸÑÿ®ŸÜÿ™ ŸÖÿÆÿ™ŸÑŸÅÿ© ŸàŸäŸÑÿßÿ≠ÿ∏ ÿßŸÜŸáÿß ÿ∫Ÿäÿ± ŸÉŸÑ ÿßŸÑÿ®ŸÜÿßÿ™ Ÿàÿ®Ÿäÿ≠ÿ≥ ŸÉÿ£ŸÜŸá Ÿäÿπÿ±ŸÅŸáÿß ŸÖŸÜ ÿ≤ŸÖÿßŸÜ. ÿ®ÿπÿØŸäŸÜ Ÿäÿ™ÿ

## Split data into train,valid and test sets

In [71]:
X_train_valid, X_test, y_train_valid, y_test = train_test_split(X_text, y, test_size=0.2, random_state=42)


In [72]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size=0.25    , random_state=42)

In [73]:
X_train.shape, X_valid.shape, X_test.shape

((88630,), (29544,), (29544,))

## Define the custom transformer for Arabic text cleaning

In [74]:
class ArabicTextCleaner(BaseEstimator, TransformerMixin):
    def __init__(self, use_stemming=False):
        self.use_stemming = use_stemming
        self.arabic_stopwords = set(stopwords.words('arabic'))
        
        if self.use_stemming:
            self.stemmer = stemmer("arabic")
    
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # If X is a single string, convert it to a list of one element
        if isinstance(X, str):
            X = [X]
        return [self.clean_text(text) for text in X]
    
    def clean_text(self, text):
        print("Original text:", text)

        # Removing Non-Arabic Characters
        text = re.sub(r'[^\u0600-\u06FF\s]', '', text)
        print("After removing non-Arabic characters:", text)

        # Removing Tatweel
        text = text.replace('ŸÄ', '')
        print("After removing Tatweel:", text)

        # Removing HTML Tags
        text = re.sub(r'<.*?>', '', text)
        print("After removing HTML tags:", text)

        # Tokenization
        words = word_tokenize(text)
        print("After tokenization:", words)

        # Removing Stopwords
        words = [word for word in words if word not in self.arabic_stopwords]
        print("After removing stopwords:", words)

        # Removing Digits and Symbols
        text = re.sub(r'[0-9‚Äô!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+', '', ' '.join(words))
        print("After removing digits and symbols:", text)

        # Stemming
        if self.use_stemming:
            words = [self.stemmer.stemWord(word) for word in words]
        print("After stemming:", words)

        text = ' '.join(words)
        print("After joining words:", text)

        # Remove hashtag
        text = re.sub(r'#([^\s]+)', '', text)
        print("After removing hashtags:", text)

        # Removing URLs and Email Addresses and usernames
        text = re.sub(r'http\S+|www\S+|@\S+', '', text)
        print("After removing URLs and usernames:", text)

        # Removing Emojis and Symbols
        text = ''.join(char for char in text if not emoji.is_emoji(char))
        
        print("After removing emojis and symbols:", text)

        # Removing Repeated Characters
        text = re.sub(r'(.)\1{2,}', r'\1\1', text)
        print("After removing repeated characters:", text)

        # Handling Noise and Garbage Characters
        text = re.sub(r'[^\u0600-\u06FF\s]', '', text)  # Removing non-Arabic characters again
        print("After handling noise and garbage characters:", text)

       
        return text


In [75]:
# Try the ArabicTextCleaner class on the sample data
n="@smsm071990 @ALMOGRBE ‚ô•Ô∏èüòÇü•∞üë©üèª‚Äçüíª‚ùé‚úÖ ŸÉŸÑ 20 ÿ™ÿßŸÜŸäŸá ÿ¥ÿßÿ® ŸÑŸäÿ®Ÿä ÿ®Ÿäÿ±ÿ™ÿßÿ≠ ŸÑÿ®ŸÜÿ™ ŸÖÿÆÿ™ŸÑŸÅÿ© ŸàŸäŸÑÿßÿ≠ÿ∏ ÿßŸÜŸáÿß ÿ∫Ÿäÿ± ŸÉŸÑ ÿßŸÑÿ®ŸÜÿßÿ™ Ÿàÿ®Ÿäÿ≠ÿ≥ ŸÉÿ£ŸÜŸá Ÿäÿπÿ±ŸÅŸáÿß ŸÖŸÜ ÿ≤ŸÖÿßŸÜ. ÿ®ÿπÿØŸäŸÜ Ÿäÿ™ÿ≤Ÿàÿ¨ Ÿàÿ≠ÿØÿ© ŸÖŸÜŸáŸÜ ŸàŸÖŸÖŸÉŸÜ ÿßÿ´ŸÜŸäŸÜ ŸàŸÑÿßÿ´ŸÑÿßÿ´ÿ© Ÿàÿ™ŸÜŸÇŸÑÿ® ÿßŸÑÿ±ŸàŸÖÿßŸÜÿ≥Ÿäÿ© ŸÑÿπŸäÿßÿ∑ Ÿàÿ¥Ÿäÿßÿ∑ Ÿàÿ™Ÿáÿ≤Ÿäÿ®\nand they live happily ever after\nÿ∞Ÿä ÿßŸÜÿØ	"
cleaning = ArabicTextCleaner()
cleaned_n = cleaning.transform(n)
print(cleaned_n)

Original text: @smsm071990 @ALMOGRBE ‚ô•Ô∏èüòÇü•∞üë©üèª‚Äçüíª‚ùé‚úÖ ŸÉŸÑ 20 ÿ™ÿßŸÜŸäŸá ÿ¥ÿßÿ® ŸÑŸäÿ®Ÿä ÿ®Ÿäÿ±ÿ™ÿßÿ≠ ŸÑÿ®ŸÜÿ™ ŸÖÿÆÿ™ŸÑŸÅÿ© ŸàŸäŸÑÿßÿ≠ÿ∏ ÿßŸÜŸáÿß ÿ∫Ÿäÿ± ŸÉŸÑ ÿßŸÑÿ®ŸÜÿßÿ™ Ÿàÿ®Ÿäÿ≠ÿ≥ ŸÉÿ£ŸÜŸá Ÿäÿπÿ±ŸÅŸáÿß ŸÖŸÜ ÿ≤ŸÖÿßŸÜ. ÿ®ÿπÿØŸäŸÜ Ÿäÿ™ÿ≤Ÿàÿ¨ Ÿàÿ≠ÿØÿ© ŸÖŸÜŸáŸÜ ŸàŸÖŸÖŸÉŸÜ ÿßÿ´ŸÜŸäŸÜ ŸàŸÑÿßÿ´ŸÑÿßÿ´ÿ© Ÿàÿ™ŸÜŸÇŸÑÿ® ÿßŸÑÿ±ŸàŸÖÿßŸÜÿ≥Ÿäÿ© ŸÑÿπŸäÿßÿ∑ Ÿàÿ¥Ÿäÿßÿ∑ Ÿàÿ™Ÿáÿ≤Ÿäÿ®
and they live happily ever after
ÿ∞Ÿä ÿßŸÜÿØ	
After removing non-Arabic characters:    ŸÉŸÑ  ÿ™ÿßŸÜŸäŸá ÿ¥ÿßÿ® ŸÑŸäÿ®Ÿä ÿ®Ÿäÿ±ÿ™ÿßÿ≠ ŸÑÿ®ŸÜÿ™ ŸÖÿÆÿ™ŸÑŸÅÿ© ŸàŸäŸÑÿßÿ≠ÿ∏ ÿßŸÜŸáÿß ÿ∫Ÿäÿ± ŸÉŸÑ ÿßŸÑÿ®ŸÜÿßÿ™ Ÿàÿ®Ÿäÿ≠ÿ≥ ŸÉÿ£ŸÜŸá Ÿäÿπÿ±ŸÅŸáÿß ŸÖŸÜ ÿ≤ŸÖÿßŸÜ ÿ®ÿπÿØŸäŸÜ Ÿäÿ™ÿ≤Ÿàÿ¨ Ÿàÿ≠ÿØÿ© ŸÖŸÜŸáŸÜ ŸàŸÖŸÖŸÉŸÜ ÿßÿ´ŸÜŸäŸÜ ŸàŸÑÿßÿ´ŸÑÿßÿ´ÿ© Ÿàÿ™ŸÜŸÇŸÑÿ® ÿßŸÑÿ±ŸàŸÖÿßŸÜÿ≥Ÿäÿ© ŸÑÿπŸäÿßÿ∑ Ÿàÿ¥Ÿäÿßÿ∑ Ÿàÿ™Ÿáÿ≤Ÿäÿ®
     
ÿ∞Ÿä ÿßŸÜÿØ	
After removing Tatweel:    ŸÉŸÑ  ÿ™ÿßŸÜŸäŸá ÿ¥ÿßÿ® ŸÑŸäÿ®Ÿä ÿ®Ÿäÿ±ÿ™ÿßÿ≠ ŸÑÿ®ŸÜÿ™ ŸÖÿÆÿ™ŸÑŸÅÿ© ŸàŸäŸÑÿßÿ≠ÿ∏ ÿßŸÜŸáÿß ÿ∫Ÿäÿ± ŸÉŸÑ ÿßŸÑÿ®ŸÜÿßÿ™ Ÿàÿ®Ÿäÿ≠ÿ≥ ŸÉÿ£ŸÜŸá Ÿäÿπÿ±ŸÅŸáÿß ŸÖŸÜ ÿ≤ŸÖÿßŸÜ ÿ®ÿπÿØŸäŸ

In [76]:
class ArabicTextCleaner(BaseEstimator, TransformerMixin):
    def __init__(self, use_stemming=False):
        self.use_stemming = use_stemming
        self.arabic_stopwords = set(stopwords.words('arabic'))
        
        if self.use_stemming:
            self.stemmer = stemmer("arabic")
    
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # If X is a single string, convert it to a list of one element
        if isinstance(X, str):
            X = [X]
        return [self.clean_text(text) for text in X]
    
    def clean_text(self, text):
        
        # Removing Non-Arabic Characters
        text = re.sub(r'[^\u0600-\u06FF\s]', '', text)
        

        # Removing Tatweel
        text = text.replace('ŸÄ', '')

        # Removing HTML Tags
        text = re.sub(r'<.*?>', '', text)

        # Tokenization
        words = word_tokenize(text)

      

        # Removing Digits and Symbols
        text = re.sub(r'[0-9‚Äô!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+', '', ' '.join(words))

        # Stemming
        if self.use_stemming:
            words = [self.stemmer.stemWord(word) for word in words]

        text = ' '.join(words)


        # Remove hashtag
        text = re.sub(r'#([^\s]+)', '', text)

        # Removing URLs and Email Addresses and usernames
        text = re.sub(r'http\S+|www\S+|@\S+', '', text)
    

        # Removing Emojis and Symbols
        text = ''.join(char for char in text if not emoji.is_emoji(char))

        # Removing Repeated Characters
        text = re.sub(r'(.)\1{2,}', r'\1\1', text)
        
        # Handling Noise and Garbage Characters
        text = re.sub(r'[^\u0600-\u06FF\s]', '', text)  # Removing non-Arabic characters again

        return text

In [77]:
cleaning = ArabicTextCleaner()
cleaned_x_train = cleaning.transform(X_train)
cleaned_x_train[0]

'ÿßŸä ÿÆÿ®ÿ± ÿßŸà ÿ™ÿ∑Ÿàÿ± ŸÖŸÜ ÿ¨ŸÑ ÿßŸÑÿØŸäÿ® ÿ≠ÿØÿß ŸäÿØŸÇŸÑŸä ŸÖÿß ÿ®ŸÅÿ™ÿ≠ŸÑŸà ÿÆÿ∑ ÿ≠ ÿßÿ∫ŸÅŸâ ÿ¥ŸàŸä'

In [78]:
cleaned_x_valid = cleaning.transform(X_valid)
cleaned_x_test = cleaning.transform(X_test)

# Save the cleaned data to CSV files to use it later in modeling.

In [79]:
# Combine the transformed data and labels into DataFrames
df_cleaned_x_test = pd.DataFrame({'cleaned_x_test': cleaned_x_test, 'y_test':y_test.squeeze()})
df_cleaned_x_train = pd.DataFrame({'cleaned_x_train': cleaned_x_train, 'y_train':y_train.squeeze()})
df_cleaned_x_valid = pd.DataFrame({'cleaned_x_valid': cleaned_x_valid, 'y_valid':y_valid.squeeze()})

In [80]:
# Save the DataFrames to CSV files
df_cleaned_x_train.to_csv('cleaned_x_train.csv', index=False)
df_cleaned_x_valid.to_csv('cleaned_x_valid.csv', index=False)
df_cleaned_x_test.to_csv('cleaned_x_test.csv', index=False)

# Tokenization

In [81]:
class CustomTokenizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        
        self.tokenizer = Tokenizer()

    def fit(self, X, y=None):
        # Fit the tokenizer on the input text
        self.tokenizer.fit_on_texts(X)
        return self

    def transform(self, X, y=None):
       
        # Tokenize the input text into sequences of tokens
        sequences = self.tokenizer.texts_to_sequences(X)
        return sequences

In [82]:
# Instantiate an object of the CustomTokenizer class
tokenizer = CustomTokenizer()

# Fit the tokenizer on the training data
xtrain_tokens=tokenizer.fit(cleaned_x_train )

x_train_idx = tokenizer.transform(cleaned_x_train)
x_train_idx[0]

[53, 598, 40, 4565, 1, 9495, 7404, 111, 43667, 3, 68138, 1099, 325, 43668, 293]

In [83]:
x_valid_idx = tokenizer.transform(cleaned_x_valid)
x_test_idx = tokenizer.transform(cleaned_x_test)

# Calculate max lenth for sentence and determine VOCAB SIZE

In [84]:
max_sequence_len = 0
for sentence in X_train:
    max_sequence_len = max(len(sentence), max_sequence_len)
print(max_sequence_len)

698


In [85]:
import plotly.express as px

# Calculate lengths of sentences in x_train
sentence_lengths = [len(sentence) for sentence in x_train_idx]

# Create histogram figure using Plotly Express
fig = px.histogram(x=sentence_lengths, title='Distribution of Sentence Lengths in x_train_idx',
                   labels={'x': 'Sentence Length', 'y': 'Frequency'})
                  
fig.show()

According to to the Distribution of Sentence Lengths in x_train we will
set **max_length to 50**.

In [86]:
max_sequence_len = 50

In [87]:
# Extract unique tokens from the tokenized sequences
unique_tokens = set(token for sequence in x_train_idx for token in sequence)

In [88]:
# Calculate the vocabulary size based on the number of unique tokens
VOCAB_SIZE = len(unique_tokens)
print("Vocabulary Size:", VOCAB_SIZE)

Vocabulary Size: 187676


# Padding
**Pad sequences to ensure uniform input size for the RNN. This step is crucial because RNNs require input sequences of the same length.**

In [89]:
class padding(BaseEstimator, TransformerMixin):
    def __init__(self, max_sequence_len):

        self.max_sequence_len = max_sequence_len

    def fit(self, X, y=None):

        return self

    def transform(self, X):

        # Initialize the padded dataset with zeros
        X_padded = np.zeros((len(X), max_sequence_len))

        # Pad each sequence
        for i, sent in enumerate(X):
            X_padded[i, :min(len(sent), self.max_sequence_len)] = sent[:self.max_sequence_len]

        return X_padded

In [90]:
pt=padding(max_sequence_len)
x_train_padded =pt.fit_transform(x_train_idx)
x_valid_padded =pt.fit_transform(x_valid_idx)
x_test_padded =pt.fit_transform(x_test_idx)

In [91]:
x_train_padded[0]

array([5.3000e+01, 5.9800e+02, 4.0000e+01, 4.5650e+03, 1.0000e+00,
       9.4950e+03, 7.4040e+03, 1.1100e+02, 4.3667e+04, 3.0000e+00,
       6.8138e+04, 1.0990e+03, 3.2500e+02, 4.3668e+04, 2.9300e+02,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00])

# Save the padded data to CSV files to use it later in modeling.

In [92]:
# Create DataFrames from the padded sequences
df_x_train_padded = pd.DataFrame(x_train_padded)
df_x_valid_padded = pd.DataFrame(x_valid_padded)
df_x_test_padded = pd.DataFrame(x_test_padded)



In [93]:
df_x_test_padded.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,20.0,21.0,1572.0,617.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,955.0,163.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,32.0,6521.0,644.0,2.0,89283.0,78.0,469.0,282.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,749.0,33380.0,578.0,1857.0,190.0,73.0,18.0,19.0,7815.0,341.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,30.0,6.0,674.0,13006.0,692.0,875.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,20150.0,590.0,72.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,7.0,4764.0,351.0,1023.0,133.0,37812.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,25.0,19.0,825.0,19994.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,3383.0,14147.0,2597.0,531.0,31491.0,263.0,7277.0,2.0,626.0,2278.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,447.0,21.0,121181.0,4.0,25311.0,22444.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [94]:
# Add the labels to the DataFrames
df_x_train_padded['y_train'] = y_train.values
df_x_valid_padded['y_valid'] = y_valid.values
df_x_test_padded['y_test'] = y_test.values


In [95]:
df_x_train_padded.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,y_train
0,53.0,598.0,40.0,4565.0,1.0,9495.0,7404.0,111.0,43667.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,564.0,644.0,26071.0,6.0,1895.0,3113.0,12139.0,8884.0,1058.0,125.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,5775.0,413.0,5775.0,5775.0,5775.0,5775.0,5775.0,5775.0,101.0,5775.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,99.0,3.0,10234.0,8318.0,1390.0,2.0,804.0,68139.0,1592.0,66.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,10.0,1694.0,5513.0,29.0,2140.0,456.0,1.0,63.0,3.0,68140.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
5,15.0,59.0,280.0,68144.0,7.0,68145.0,6658.0,1.0,21836.0,14.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
6,19.0,16.0,9496.0,68152.0,8885.0,1172.0,1059.0,26.0,16508.0,43670.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
7,68153.0,102.0,211.0,797.0,3114.0,12141.0,265.0,298.0,6047.0,12142.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
8,569.0,43672.0,136.0,3285.0,68154.0,1839.0,3286.0,68155.0,2213.0,143.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
9,43673.0,752.0,13278.0,3759.0,278.0,1630.0,172.0,34.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [97]:
# Save the DataFrames to CSV files
df_x_train_padded.to_csv('x_train_padded.csv', index=False)
df_x_valid_padded.to_csv('x_valid_padded.csv', index=False)
df_x_test_padded.to_csv('x_test_padded.csv', index=False)


----