# <p style="padding:50px;background-color:#DA8359;margin:0;color:#fafefe;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:100">Setup</p>

## 1. Imports

In [20]:
import re
import os
import typing
import random
import emoji

import torch
import numpy as np
import pandas as pd

from collections import Counter
from textblob import TextBlob

import warnings

from __future__ import unicode_literals

warnings.simplefilter("ignore")

## 2. Helper Methods

In [13]:
def extract_sentences(file_path: str) -> list:
    '''
    process youtube/podcast documents

    @param file_path: a string represent file path
    '''

    with open(file_path, 'r', errors="ignore", encoding="utf-8") as f:
        text = f.read()

    # Remove numbers followed by ':'
    text = re.sub(r'\d+.*\d*\s*:', '', text)

    # Define sentence delimiters for Arabic
    sentence_endings = r'(?<=[.!؟؛،])\s+'

    # Split sentences while preserving dependencies
    sentences = re.split(sentence_endings, text)

    return [s.strip() for s in sentences if len(s.strip()) > 1] # Remove empty strings
    

In [8]:
def sentences_to_df(sentences: list) -> pd.DataFrame:
    '''
    convert a list of sentences to a dataframe

    @param sentences: a list of sentences
    '''

    return pd.DataFrame(sentences)

## 3. Load Dataset

In [None]:
def load_dataset(file_path: str = "./data") -> pd.DataFrame:
    '''
    Load dataset from a directory

    :params: **file_path**: a string representing file path to dataset.
    '''
    output_df = []

    data_folder = "./data"

    for file_name in os.listdir(data_folder):
        if file_name.endswith(".txt"):
            file_path = os.path.join(data_folder, file_name)
            tmp_df = sentences_to_df(extract_sentences(file_path))
            output_df.append(tmp_df)

    output_df = pd.concat(output_df, ignore_index=True)

    return output_df

In [15]:
file_path = "./data/البطاطس  الدحيح.txt"
df = load_dataset(file_path)
df.head

<bound method NDFrame.head of                                      0
0                لكل شخص في آخر الشهر،
1               ما معاهوش غير 50 جنيه،
2           ومحتاج ياكل أكلة تشبّعه...
3                    لكل واحد "فورمة"،
4              محتاج أكلة سريعة الهضم،
..                                 ...
733          لازم السندوتش معاه بطاطس.
734              وأقوم مطلّع بطاطساية،
735          وأقوم حاططها في السندوتش!
736  دا أنا أحيانًا بجيب سندوتش بطاطس،
737    من "الحرمين" اللي في "الحُصري".

[738 rows x 1 columns]>

---

# <p style="padding:50px;background-color:#DA8359;margin:0;color:#fafefe;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:100">Libraries & Models</p>

## 1. Ghalatawi

**Ghalatawi:** an Arabic Autocorrect library مكتبة للتصحيح التلقائي للغة العربية

Source: https://github.com/linuxscout/ghalatawi

In [16]:
from ghalatawi.autocorrector import AutoCorrector

autoco = AutoCorrector()

autoco.show_config()

{'regex': True, 'wordlist': True, 'punct': True, 'typo': True}

> <span style="color: yellow">**_Note:_**</span> The library allow for fixing spelling, adjusting punctuations, typos.

## 2. PyArabic

**PyArabic:** a Arabic language library for Python, provides basic functions to manipulate Arabic letters and text, like detecting Arabic letters, Arabic letters groups and characteristics, remove diacritics etc.

Source: https://github.com/linuxscout/pyarabic

In [17]:
import pyarabic.araby as araby
import pyarabic.number as number

autoco = AutoCorrector()

## 3. CAMeL Bert

**CAMeL Bert:** a collection of BERT models pre-trained on Arabic texts with different sizes and variants.

Source: https://huggingface.co/CAMeL-Lab/bert-base-arabic-camelbert-msa

In [19]:
# Load Arabic SBERT Model
from sentence_transformers import SentenceTransformer

sbert_model = SentenceTransformer("CAMeL-Lab/bert-base-arabic-camelbert-msa")

No sentence-transformers model found with name CAMeL-Lab/bert-base-arabic-camelbert-msa. Creating a new one with mean pooling.


## 4. DSAraby

**DSAraby:** is a library that aims to transliterate text which is to write a word using the closest corresponding letters of a different alphabet or language.

Source: https://github.com/saobou/DSAraby/tree/master

In [29]:
from dsaraby import DSAraby

ds = DSAraby()

## 5. Tashaphyne

**Tashaphyne:** is an Arabic light stemmer and segmentor. It mainly supports light stemming (removng prefixes and suffixes) and gives all possible segmentations. it uses a modified finite state automation, which allows it to generate all segmentations.

Source: https://github.com/linuxscout/tashaphyne

In [28]:
from tashaphyne.stemming import ArabicLightStemmer
from tashaphyne.arabicstopwords import STOPWORDS as TASHAPHYNE_STOPWORDS

tashaphyne_stemmer = ArabicLightStemmer()

## 6. CaMeL Tools

**CaMeL Tools:** is suite of Arabic natural language processing tools developed by the CAMeL Lab at New York University Abu Dhabi.

Source: https://github.com/CAMeL-Lab/camel_tools

In [36]:
import camel_tools

from camel_tools.data import downloader
from camel_tools.morphology import analyzer
from camel_tools.utils.dediac import dediac_ar
# from camel_tools.disambig import CamelDisambiguator
# from camel_tools.dialectid import DialectIdentifier
from camel_tools.morphology.analyzer import Analyzer
from camel_tools.tagger.default import DefaultTagger
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.morphology.database import MorphologyDB
from camel_tools.utils.normalize import normalize_unicode
from camel_tools.tokenizers.word import simple_word_tokenize
# from camel_tools.segmenters.word import MaxLikelihoodProbabilityModel
# from camel_tools.ner import NERecognizer, STOPWORDS as CAMEL_STOPWORDS
from camel_tools.tokenizers.morphological import MorphologicalTokenizer

camel_data_path = os.path.join(os.path.dirname(camel_tools.__file__), 'cli', 'camel_data.py')
print(camel_data_path)

downloader.DownloaderError("calima-msa-r13")

morph_db = MorphologyDB.builtin_db(flags = 'r')
analyzer = Analyzer(morph_db)

c:\Users\mazen\AppData\Local\Programs\Python\Python312\Lib\site-packages\camel_tools\cli\camel_data.py


## 7. Farasa

**Farasa:** is the state-of-the-art library for dealing with Arabic Language Processing. It has been developed by Arabic Language Technologies Group at Qatar Computing Research Institute (QCRI).

Source: https://github.com/MagedSaeed/farasapy

In [26]:
from farasa.stemmer import FarasaStemmer

farasa_stemmer = FarasaStemmer()

## 8. Tnkeeh

**Tnkeeh:** is an Arabic preprocessing library for python. it was designe dusing `re` for creating quick replacement expressions for several examples such as Quick cleaning, Segmentation, Normalization and Data splitting.

Source: https://github.com/ARBML/tnkeeh

In [25]:
import tnkeeh as tn

## 9. NLTK

**NLTK:** a leading platform for building Python programs to work with human language data.

Source: https://www.nltk.org/

In [24]:
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer

NLTK_STOPWORDS = set(stopwords.words('arabic'))

nltk_stemmer = ISRIStemmer()

## 10. SinaTools

**SinaTools:** an Open-Source Toolkit for Arabic NLP and NLU developed by SinaLab at Birzeit University.

Models:
- morph: https://sina.birzeit.edu/lemmas_dic.pickle,
- ner: https://sina.birzeit.edu/Wj27012000.tar.gz,
- wsd_model: https://sina.birzeit.edu/bert-base-arabertv02_22_May_2021_00h_allglosses_unused01.zip,
- wsd_tokenizer: https://sina.birzeit.edu/bert-base-arabertv02.zip,
- one_gram: https://sina.birzeit.edu/one_gram.pickle,
- five_grams: https://sina.birzeit.edu/five_grams.pickle,
- four_grams: https://sina.birzeit.edu/four_grams.pickle,
- three_grams: https://sina.birzeit.edu/three_grams.pickle,
- two_grams: https://sina.birzeit.edu/two_grams.pickle,
- graph_l2: https://sina.birzeit.edu/graph_l2.pkl,
- graph_l3: https://sina.birzeit.edu/graph_l3.pkl,
- relation: https://sina.birzeit.edu/relation_model.zip


Source: https://github.com/SinaLab/SinaTools

In [23]:
from sinatools.morphology import morph_analyzer

## 10. Transformers

**🤗 Transformers:** a library that provides thousands of pretrained models to perform tasks on different modalities such as text, vision, and audio.

Source: https://github.com/huggingface/transformers

In [22]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

## 11. Scikit-Learn


**Scikit-Learn**: a Python module for machine learning built on top of SciPy and is distributed under the 3-Clause BSD license.

Source: https://github.com/scikit-learn/scikit-learn

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

-----

# <p style="padding:50px;background-color:#DA8359;margin:0;color:#fafefe;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:100">Cleaning</p>

## 1. Tidying Up Text

### 1.1 Orthographic mistakes

### 1.2 Spelling inconsistencies (Text Correction)

In [None]:
def auto_correct(dataset: list) -> list:
    '''
    A method that that fixes typos, punctuation and spelling mistakes.
    '''

    output = []

    for text in dataset:
        output = autoco.spell(text)

    return output

### 1.3 Unknown characters

### 1.4 Repeated letters and with spaces in the words



### 1.5 Reshape Text

https://pypi.org/project/arabic-reshaper/

## 2. Text Processing

### 2.1 Sentence Segmentation

One of the problems in text collected from youtube/podcast is that their is no true sentence structure is made that we split text upon.

In [None]:
def arabic_sentence_segmentation(paragraph: str) -> str:
    '''
    A method that segmente arabic pargaraphs to meaningful sentences

    @param paragraph: a bunch of sentences that are segmented to meaningful sentences.
    '''

    # Compute sentence embeddings
    embeddings = sbert_model.encode(paragraph)

    # Compute cosine similarity
    sim_matrix = cosine_similarity(embeddings)

    # Find semantic breakpoints (low similarity)
    threshold = 0.5  # Adjust this based on experimentation
    split_points = [i for i in range(len(paragraph) - 1) if sim_matrix[i, i+1] < threshold]

    # Generate semantic splits
    segments = []
    start = 0
    for split in split_points:
        segments.append(" ".join(paragraph[start:split+1]))
        start = split + 1

    segments.append(" ".join(paragraph[start:]))

    return segments

> <span style="color: red">**_TODO:_**</span> handle text that contains both english and arabic.

### 2.2  Arabizi to Arabic

In [5]:
def arabizi_to_arabic(text: str) -> str:
    '''
    A method that gives the possible words in Arabic based on a given word in Latin by mapping
    the Latin letters to Arabic ones, then takes the most frequent word existing in a corpus.
    
    @param text: a sentence containing english (عمي) that need to be converted to arabic.
    '''

    return ds.transliterate(text)

Example

In [None]:
# Example usage
arabizi_text = "mar7aba, kayf 7alak?"
arabic_text = arabizi_to_arabic(arabizi_text)
print("Arabizi:", arabizi_text)
print("Arabic:", arabic_text)

NameError: name 'ds' is not defined

### 2.3 Stemming

> <span style="color: green">**_Stemming:_**</span> is the process of reducing a word to its root/lemma.

In [None]:
def arabic_stemming(text: str, tool: str) -> str:
    '''
    A method that perform arabic text stemming

    @param text: A sentence that requires stemming
    '''
    zen = TextBlob(text) # check for alternatives
    words = zen.words
    
    if tool == 'camel':
        return ' '.join([analyzer.analyze(word)['stem'] for word in words])
    elif tool == 'farasa':
        return farasa_stemmer.stem(text)
    elif tool == "light":
        return ' '.join([tashaphyne_stemmer.light_stem(word) for word in words])
    else:
        return ' '.join([nltk_stemmer.stem(word) for word in words])
        

> <span style="color: yellow">**_Note:_**</span> ISRI Stemmer is a stemming process that is based on algorithm (Arabic Stemming without a root dictionary).

### 2.4 Lemmatization

> <span style="color: green">**_Lemmatization:_**</span>  

In [2]:
def arabic_lemmatization(text: str) -> str:
    '''
    A method that perform arabic text lemmatization

    @param text: A sentence that requires lemmatization
    '''
    zen = TextBlob(text) # check for alternatives
    words = zen.words

    analyzed_text = morph_analyzer.analyze(text)

    return ' '.join([word["lemma"] for word in analyzed_text])    

### 2.5 Stopwords

> <span style="color: green">**_Stopwords:_**</span> are most common terms in an Arabic language such as حروف الجر.

In [54]:
def remove_arabic_stopwords(text: str, custom_stopwords: bool=None, use_nltk: bool=True, use_camel: bool=True, use_tashaphyne: bool=True) -> str:
    '''
    A method that remove stopwords in text

    @param text: a sentence that requires removing stopwords.
    '''

    # Get Arabic stopwords
    stopwords = set()

    if use_nltk:
        stopwords.update(NLTK_STOPWORDS)
    if use_camel:
        stopwords.update(CAMEL_STOPWORDS)
    if use_tashaphyne:
        stopwords.update(TASHAPHYNE_STOPWORDS)
    if custom_stopwords:
        stopwords.update(custom_stopwords)

    stopwords_comp = {"،","آض","آمينَ","آه","آهاً","آي","أ","أب","أجل","أجمع","أخ",
                    "أخذ","أصبح","أضحى","أقبل","أقل","أكثر","ألا","أم","أما",
                    "أمامك","أمامكَ","أمسى","أمّا","أن","أنا","أنت","أنتم",
                    "أنتما","أنتن","أنتِ","أنشأ","أنّى","أو","أوشك","أولئك",
                    "أولئكم","أولاء","أولالك","أوّهْ","أي","أيا","أين","أينما",
                    "أيّ","أَنَّ","أََيُّ","أُفٍّ","إذ","إذا","إذاً","إذما","إذن","إلى",
                    "إليكم","إليكما","إليكنّ","إليكَ","إلَيْكَ","إلّا","إمّا","إن",
                    "إنّما","إي","إياك","إياكم","إياكما","إياكن","إيانا","إياه",
                    "إياها","إياهم","إياهما","إياهن","إياي","إيهٍ","إِنَّ","ا",
                    "ابتدأ","اثر","اجل","احد","اخرى","اخلولق","اذا","اربعة",
                    "ارتدّ","استحال","اطار","اعادة","اعلنت","اف","اكثر","اكد",
                    "الألاء","الألى","الا","الاخيرة","الان","الاول","الاولى","التى",
                    "التي","الثاني","الثانية","الذاتي","الذى","الذي","الذين",
                    "السابق","الف","اللائي","اللاتي","اللتان","اللتيا","اللتين",
                    "اللذان","اللذين","اللواتي","الماضي","المقبل","الوقت",
                    "الى","اليوم","اما","امام","امس","ان","انبرى","انقلب",
                    "انه","انها","او","اول","اي","ايار","ايام","ايضا","ب",
                    "بات","باسم","بان","بخٍ","برس","بسبب","بسّ","بشكل","بضع",
                    "بطآن","بعد","بعض","بك","بكم","بكما","بكن","بل","بلى",
                    "بما","بماذا","بمن","بن","بنا","به","بها","بي","بيد",
                    "بين","بَسْ","بَلْهَ","بِئْسَ","تانِ","تانِك","تبدّل","تجاه","تحوّل",
                    "تلقاء","تلك","تلكم","تلكما","تم","تينك","تَيْنِ","تِه","تِي",
                    "ثلاثة","ثم","ثمّ","ثمّة","ثُمَّ","جعل","جلل","جميع","جير","حار",
                    "حاشا","حاليا","حاي","حتى","حرى","حسب","حم","حوالى","حول",
                    "حيث","حيثما","حين","حيَّ","حَبَّذَا","حَتَّى","حَذارِ","خلا","خلال",
                    "دون","دونك","ذا","ذات","ذاك","ذانك","ذانِ","ذلك","ذلكم",
                    "ذلكما","ذلكن","ذو","ذوا","ذواتا","ذواتي","ذيت","ذينك",
                    "ذَيْنِ","ذِه","ذِي","راح","رجع","رويدك","ريث","رُبَّ","زيارة",
                    "سبحان","سرعان","سنة","سنوات","سوف","سوى","سَاءَ","سَاءَمَا",
                    "شبه","شخصا","شرع","شَتَّانَ","صار","صباح","صفر","صهٍ","صهْ",
                    "ضد","ضمن","طاق","طالما","طفق","طَق","ظلّ","عاد","عام",
                    "عاما","عامة","عدا","عدة","عدد","عدم","عسى","عشر","عشرة",
                    "علق","على","عليك","عليه","عليها","علًّ","عن","عند","عندما",
                    "عوض","عين","عَدَسْ","عَمَّا","غدا","غير","ـ","ف","فان","فلان",
                    "فو","فى","في","فيم","فيما","فيه","فيها","قال","قام","قبل",
                    "قد","قطّ","قلما","قوة","كأنّما","كأين","كأيّ","كأيّن","كاد",
                    "كان","كانت","كذا","كذلك","كرب","كل","كلا","كلاهما","كلتا",
                    "كلم","كليكما","كليهما","كلّما","كلَّا","كم","كما","كي","كيت",
                    "كيف","كيفما","كَأَنَّ","كِخ","لئن","لا","لات","لاسيما","لدن","لدى",
                    "لعمر","لقاء","لك","لكم","لكما","لكن","لكنَّما","لكي","لكيلا",
                    "للامم","لم","لما","لمّا","لن","لنا","له","لها","لو","لوكالة",
                    "لولا","لوما","لي","لَسْتَ","لَسْتُ","لَسْتُم","لَسْتُمَا","لَسْتُنَّ","لَسْتِ",
                    "لَسْنَ","لَعَلَّ","لَكِنَّ","لَيْتَ","لَيْسَ","لَيْسَا","لَيْسَتَا","لَيْسَتْ","لَيْسُوا",
                    "لَِسْنَا","ما","ماانفك","مابرح","مادام","ماذا","مازال","مافتئ",
                    "مايو","متى","مثل","مذ","مساء","مع","معاذ","مقابل","مكانكم",
                    "مكانكما","مكانكنّ","مكانَك","مليار","مليون","مما","ممن","من",
                    "منذ","منها","مه","مهما","مَنْ","مِن","نحن","نحو","نعم","نفس",
                    "نفسه","نهاية","نَخْ","نِعِمّا","نِعْمَ","ها","هاؤم","هاكَ","هاهنا",
                    "هبّ","هذا","هذه","هكذا","هل","هلمَّ","هلّا","هم","هما","هن",
                    "هنا","هناك","هنالك","هو","هي","هيا","هيت","هيّا","هَؤلاء",
                    "هَاتانِ","هَاتَيْنِ","هَاتِه","هَاتِي","هَجْ","هَذا","هَذانِ","هَذَيْنِ",
                    "هَذِه","هَذِي","هَيْهَاتَ","و","وا","واحد","واضاف","واضافت","واكد",
                    "وان","واهاً","واوضح","وراءَك","وفي","وقال","وقالت","وقد",
                    "وقف","وكان","وكانت","ولا","ولم","ومن","مَن","وهو","وهي",
                    "ويكأنّ","وَيْ","وُشْكَانََ","يكون","يمكن","يوم","ّأيّان"}

    zen = TextBlob(text) # TODO: check for alternatives
    words = zen.words

    return " ".join([w for w in words if not w in stopwords and not w in stopwords_comp and len(w) >= 2])

Example

In [55]:
# Example usage
tokens = ["هذا", "مثال", "على", "إزالة", "كلمات", "التوقف", "بشكل", "متقدم"]
custom_stopwords = ["متقدم"]
filtered_tokens = remove_arabic_stopwords(tokens, custom_stopwords=custom_stopwords)
print(filtered_tokens)

NameError: name 'CAMEL_STOPWORDS' is not defined

### 2.6 Handling Hashtags

*Purpose:* The idea is that arabic text can sometimes contains hashtags as for example "مبارك_عليكم_الشهر ربي اجعل شهر رمضان فاتحة خير لنا وبداية أجمل أقدارنا وحقق لنا ما نتمنى يا كريم#" which need to be converted to " مبارك عليكم الشهر ربي اجعل شهر رمضان فاتحةخير لنا وبداية أجمل أقدارنا
وحقق لنا ما نتمنى يا كريم"

In [None]:
def start_with_hashtag(word: str) -> bool:
    '''
    A method that checks whether a word starts with a hashtag

    @param word: a single word
    '''

    if word.startswith("#"):
        return True
    else:
        return False

def split_hashtag_to_words(tag: str) -> list:
    '''
    A method that convert a hashtag to list of words

    @param tag: a hashtag
    '''

    tag = tag.replace('#', '')
    tags = tag.split('_')

    if len(tags) > 1:
        return tags
    
    pattern = re.compile(r"[A-Z][a-z]+|\d+|[A-Z]+(?![a-z])")
    return pattern.findall(tag)

def extract_hashtag(text: str) -> list:
    '''
    A method that removes hashtags from tags in a sentence

    @param text: a sentence that contains a hashtag
    '''
    
    hash_list = ([re.sub(r"(\W+)$", "", i) for i in text.split() if i.startswith("#")])
    word_list = []

    for word in hash_list:
        word_list.extend(split_hashtag_to_words(word))

    return word_list


def clean_arabic_hashtag(text: str) -> str:
    '''
    A method that replace each tag within a text with it equivalent text separated format

    @param text: a sentence that contains a hashtag
    '''
    
    words = text.split()
    text = list()

    for word in words:
        if start_with_hashtag(word):
            text.extend(extract_hashtag(word))
        else:
            text.append(word)
    
    return " ".join(text)


### 2.7 Handling Emojis 🤪

In [None]:

def handle_emojis(text: str, mode: str = 'remove') -> str:
    '''
    A method that handles emojis.
    '''
    if mode == 'remove':
        return emoji.replace_emoji(text, '')
    elif mode == 'description':
        return emoji.demojize(text, language='ar')
    return text

In [None]:
# Example usage
text_with_emoji = "أنا أحب القراءة 📚 وأستمتع بها كثيراً 😊"
text_without_emoji = handle_emojis(text_with_emoji, 'remove')
text_with_descriptions = handle_emojis(text_with_emoji, 'description')

print("Original:", text_with_emoji)
print("Without emojis:", text_without_emoji)
print("With emoji descriptions:", text_with_descriptions)

> <span style="color: red">**_TODO:_**</span> Search on how to extract meaning from emoji

### 2.8 Normalization

> <span style="color: green">**_Normalization:_**</span> match digits that have the same writing but different encodings.

In [None]:
def normalize_arabic(text: str, tool: str) -> str:
    '''
    A method that match digits that have same writing but different encodings

    @param text: a sentence that requires normalizing its text.
    @param tool: determining which library name to utilize in normalizing text.
    '''
    
    if tool == "tnkeeh":
        normalizer = tn.Tnkeeh(normalize=True)
        output = normalizer.clean_raw_text(text)
        return output
    elif tool == "camel":
        return normalize_unicode(text)
    else:
        text = text.strip()
        text = re.sub("[إأٱآا]", "ا", text)
        text = re.sub("ى", "ي", text)
        text = re.sub("ؤ", "ء", text)
        text = re.sub("ئ", "ء", text)
        text = re.sub("ة", "ه", text)
        text = re.sub("گ", "ك", text)
        text = re.sub("ڤ", "ف", text)
        text = re.sub("چ", "ج", text)
        text = re.sub("پ", "ب", text)
        text = re.sub("ڜ", "ش", text)
        text = re.sub("ڪ", "ك", text)
        text = re.sub("ڧ", "ق", text)
        text = re.sub("ٱ", "ا", text)
        noise = re.compile(""" ّ    | # Tashdid
                                َ    | # Fatha
                                ً    | # Tanwin Fath
                                ُ    | # Damma
                                ٌ    | # Tanwin Damm
                                ِ    | # Kasra
                                ٍ    | # Tanwin Kasr
                                ْ    | # Sukun
                                ـ     # Tatwil/Kashida
                            """, re.VERBOSE)
        text = re.sub(noise, '', text)
        text = re.sub(r'(.)\1+', r"\1\1", text) # Convert repeated characters to single occurrence
        return araby.strip_tashkeel(text)

Example

In [25]:
# Example usage
raw_text = "هذا نص تجريبي يحتوي على أحرف مختلفة مثل إ و أ و آ و ى و ڤ و چ"
normalized_text = normalize_arabic(raw_text)
print(normalized_text)

هذا نص تجريبي يحتوي علي احرف مختلفه مثل ا و ا و ا و ي و ف و ج


### 2.9 Specific Noise Removal

> <span style="color: green">**_Noise Removal:_**</span> extend noise removal to handle more cases.

In [None]:
def remove_arabic_noise(text: str) -> str:
    '''
    A method that remove specific noise in text such as tatweel, html tags, etc.

    :params: **text**: a sentence to be processed
    '''

    # Remove tatweel
    text = re.sub(r'\u0640', '', text)

    # Remove non-Arabic characters
    text = re.sub(r'[^\u0600-\u06FF\s]', '', text)

    # Remove HTML tags
    text = re.sub('<.*?>', '', text)

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

Example

In [28]:
# Example usage
noisy_text = "هَـــذا نَـــصّ <b>تَجْــرِيــبـِـيّ</b> مع   مسافات  زائدة"
clean_text = remove_arabic_noise(noisy_text)
print(clean_text)

هذا نص تجريبي مع مسافات زائدة


### 2.10 Tokenization

> <span style="color: green">**_Tokenization:_**</span> is the process of breaking a sequence of text into smaller units called tokens, such as words, phrases, symbols, and other elements. For the Arabic language, tokenization is a complex task due to the differences between the written and spoken forms of the language.

In [None]:
def tokenize_arabic(text: str, method='simple'):
    '''
    A method that tokenize a sentence.

    :params: **text**: a sentence to be tokenized
    :params: **method**: either `simple` or `morphological`
    '''

    if method == 'simple':
        return simple_word_tokenize(text)
    elif method == 'morphological':
        disambiguator = MLEDisambiguator.pretrained() # Load a pre-trained disambiguator
        tokenizer = MorphologicalTokenizer(disambiguator) # Create a tokenize
        words = tokenizer.tokenize(text) # Tokenize text
        return words

Example

In [14]:
text = "هذا مثال على تقطيع النص العربي بطريقة متقدمة."
simple_tokens = tokenize_arabic(text, 'simple')
morphological_tokens = tokenize_arabic(text, 'morphological')

print("Simple tokenization:", simple_tokens)
print("Morphological tokenization:", morphological_tokens)

TypeError: MorphologicalTokenizer.__init__() missing 1 required positional argument: 'scheme'

### 2.11 Dediacritization

> <span style="color: green">**_Dediacritization:_**</span> Dediacritization is the process of removing Arabic diacritical marks. Diacritics increase data sparsity and so most Arabic NLP techniques ignore them.

In [None]:
def arabic_dediacrition(text: str, method='remove', tool='pyarabic') -> str:
    '''
    A method that remove arabic diacritical marks.

    @param text: a sentence that requires dediacritizating.
    @param method: options - 'remove', 'normalize' and 'keep'
    @param tool: options - 'pyarabic' or 'camel'
    '''

    if method == 'remove':
        if tool == 'pyarabic':
            return araby.strip_diacritics(text)
        elif tool == 'camel':
            return dediac_ar(text)
    elif method == 'normalize':
        return araby.normalize_hamza(araby.strip_shadda(text))
    else:
        return text

Example

In [None]:
# Example usage
text_with_diacritics = "اللُّغَةُ العَرَبِيَّةُ جَمِيلَةٌ"
removed_diacritics = arabic_dediacrition(text_with_diacritics, 'remove')
normalized_diacritics = arabic_dediacrition(text_with_diacritics, 'normalize')

print("Original:", text_with_diacritics)
print("Removed diacritics:", removed_diacritics)
print("Normalized diacritics:", normalized_diacritics)

### 2.12 Dialect Identification

> <span style="color: green">**_Dialects:_**</span> 

In [None]:
def identify_dialect(text: str, target: str) -> list:
    '''
    A method that identifies which city, country or region does a text comes from.

    :params: **target**: a sentences that requires identifying its dialect.
    '''

    did = DialectIdentifier.pretrained() # a pretrained dialect identification system that can distinguish between 25 city dialects as well as Modern Standard Arabic.
    # In addition to city dialects, the model provides the results aggregated by region and by country. 
    # While these agregated results are less fine-grained, they tend to be more accurate.

    if target == "city":
        return did.predict(text, "city")
    elif target == "country":
        return did.predict(text, "country")
    elif target == "region":
        return did.predit(text, "region")
    else:
        dialect = did.predict(text)
        return dialect

def normalize_dialect(text, target_dialect='MSA'):
    # This is a placeholder function. In practice, you would use more sophisticated
    # methods to normalize dialects, which is an active area of research.
    return text

Example

In [None]:
# Example usage
text = "شلونك حبيبي؟ شخبارك اليوم؟"
dialect = identify_dialect(text)
normalized_text = normalize_dialect(text)

print("Original text:", text)
print("Identified dialect:", dialect)
print("Normalized to MSA:", normalized_text)

### 2.13 Punctuation

> <span style="color: green">**_Punctuation:_**</span> 

In [None]:
def remove_arabic_punctuations(text: str) -> str:
    '''
    A method that removes punctuations

    :params: text: a sentence to processed.
    '''

    return re.sub('[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~"""), ' ', text)

### 2.14 Named entity recognition (NER)

> <span style="color: green">**_Named entity recognition:_**</span> find and label named entities like proper nouns, organisations, places, etc.

For each token in an input sentence, `NERecognizer` outputs a label that indicates the type of named-entity.The system outputs one of the following labels for each token: `'B-LOC'`, `'B-ORG'`, `'B-PERS'`, `'B-MISC'`, `'I-LOC'`, `'I-ORG'`, `'I-PERS'`, `'I-MISC'`, `'O'`.
Named-entites can either be a `LOC` (location), `ORG` (organization), `PERS` (person), or `MISC` (miscallaneous).

Labels beginning with `B` indicate that their corresponding tokens are the begininging of a multi-word named-entity or is a single-token named-entity'. Those begining with `I` indicate that their corresponding tokens are continuations of a multi-word named-entity. Words that aren't named-entities are given the `'O'` label.

The example below illustrates how `NERecognizer` can be used to label named-entities in a given sentence.

In [None]:
def recognize_arabic_entities(text: str):
    '''
    A method that provides ner for a text such that for each token in an input sentence, the method outputs a label indicates the type of named-entity.
    The system outputs one of the following lables for each token `B-LOC`, `B-ORG`, `B-PERS`, `B-MISC`, `I-LOC`, `I-ORG`, `I-PERS`, `I-MISC`, `O`.

    Named-entities can either be a `LOC` (location), `ORG` (organization), `PERS` (person), or `MISC` (miscallaneous)

    Labels beginning with `B` indicate that their corresponding tokens are the beginning of a multi-word name-entity or is a single token named-entity.
    Those begining with `I` indicate that their corresponding tokens are continuations of a multi-word name-entity. Words that aren't named-entities are given the label `O` label.

    :params:
    text: a sentence that requires named-entity recognition
    '''

    ner = NERecognizer.pretrained()
    labels = ner.predict_sentence(text)
    words = simple_word_tokenize(text)
    entities = []
    current_entity = []
    current_label = None
    
    for word, label in zip(words, labels):
        if label.startswith('B-'):
            if current_entity:
                entities.append((' '.join(current_entity), current_label))
                current_entity = []
            current_entity.append(word)
            current_label = label[2:]
        elif label.startswith('I-') and current_entity:
            current_entity.append(word)
        else:
            if current_entity:
                entities.append((' '.join(current_entity), current_label))
                current_entity = []
                current_label = None
    
    if current_entity:
        entities.append((' '.join(current_entity), current_label))
    
    return entities

Example

In [None]:
# Example usage
text = "يعيش محمد في القاهرة ويعمل في شركة جوجل."
entities = recognize_arabic_entities(text)
print("Text:", text)
print("Recognized entities:", entities)

### 2.15 Morphological Analysis

> <span style="color: green">**_Morphological Analysis:_**</span> is the process of generating all possible readings (analyses) of a given word out of context. All analyses are generated from the undiacritized form of the input word. Each of these analyses is defined by a set lexical and morphological features. 

In [None]:
def arabic_morph_analysis(text: str):
    '''
    A method that generates all possible readings of a given word out of context.

    :params: text: a sentence to be processed.
    '''
    # First, we need to load a morphological database.
    # Here, we load the default database which is used for analyzing
    # Modern Standard Arabic. 
    db = MorphologyDB.builtin_db()

    analyzer = Analyzer(db)

    analyses = analyzer.analyze(text)

    return analyses

### 2.16 Word Segmentation

> <span style="color: green">**_Word Segmentation:_**</span> 

In [None]:
def arabic_word_segmentation(text: str) -> str:
    '''
    A method that segment connected words into a sentence with proper spaces between words.

    @param text: a sentence requires segmentation
    '''
    mlp_model = MaxLikelihoodProbabilityModel.pretrained()
    segmented = mlp_model.segment(text)
    return ' '.join(segmented)

Example

In [4]:
# Example usage
text = "وقالمصدرإنهناكتحسنافيالوضع"
segmented_text = arabic_segmentation(text)
print("Original:", text)
print("Segmented:", segmented_text)

NameError: name 'MaxLikelihoodProbabilityModel' is not defined

### 2.17 Part-of-speech tagging (POS tagging)

> <span style="color: green">**_Part-of-speech tagging:_**</span> is the process of determining 

In [None]:
def arabic_pos_tagging(text: str) -> list:
    '''
    A method that provides part-of-speech (pos) tagging for arabic text

    :params: text: a sentence to be processed
    '''

    mle = MLEDisambiguator.pretrained()
    tagger = DefaultTagger(mle, 'pos')

    # The tagger expects pre-tokenized text
    sentence = simple_word_tokenize(text)

    pos_tags = tagger.tag(sentence)

    return pos_tags

### 2.18 Disambiguation

> <span style="color: green">**_Disambiguation:_**</span> is the process of determining what is the most likely analysis of a word in a given context. Disambiguation is the backbone for many Arabic NLP tasks such as diacritization, POS tagging and morphological tokenization.

In [None]:
def arabic_disambiguation(text: str, model: str="calima"):
    '''
    A method that determines what is the most likely analysis of a word.

    :params: **text**: a sentence to be processed
    :params: **model**: the name of the model either `calima` or default (Maximum Likelihood Estimation Model)
    '''

    if model == "calima":
        disambiguator = MLEDisambiguator.pretrained('calima-msa-r13')
        disambiguated = disambiguator.disambiguate(text.split())
        return [d.analyses[0].analysis['lex'] for d in disambiguated]
    else:
        mle = MLEDisambiguator.pretrained()
        disambig = mle.disambiguate(text)

        # For each disambiguated word d in disambig, d.analyses is a list of analyses
        # sorted from most likely to least likely. Therefore, d.analyses[0] would
        # be the most likely analysis for a given word. Below we extract different
        # features from the top analysis of each disambiguated word into seperate lists.
        diacritized = [d.analyses[0].analysis['diac'] for d in disambig]
        pos_tags = [d.analyses[0].analysis['pos'] for d in disambig]
        lemmas = [d.analyses[0].analysis['lex'] for d in disambig]

        return diacritized, pos_tags, lemmas


Example

In [None]:
# Example usage
text = "ذهب الرجل إلى البنك"
diacritized, pos_tags, lemmas = arabic_disambiguation(text)
print("Original:", text)
print("Diacritized:", diacritized)
print("POS tags:", pos_tags)
print("Lemmas:", lemmas)

### 2.19 Elongated Words

> <span style="color: green">**_Elongated Words:_**</span> 

In [None]:
def normalize_elongated_words(text: str) -> str:
    '''
    A method that removes word elongation.

    :params: **text**: a sentence to be processed
    '''

    text = re.sub(r'(.)\1+', r'\1\1', text)
    return text

Example

In [None]:
# Example usage
elongated_text = "يااااا سلاااام على هذا البرنااامج الراااائع"
normalized_text = normalize_elongated_words(elongated_text)
print("Elongated:", elongated_text)
print("Normalized:", normalized_text)

### 2.20 Data Augmentation

> <span style="color: green">**_Data Augmentation:_**</span> 

In [None]:
def augment_arabic_data(text: str, num_augmentations: int=1) -> str:
    '''
    A method that changes the form of each word in text.

    :params: **text**: a string to be process
    :params: **num_augmentations**: number of times to perform augmentations
    '''
    morph = analyzer.pretrained_analyzer()
    words = text.split()
    augmented_texts = []

    for _ in range(num_augmentations):
        new_words = []
        for word in words:
            analysis = morph.analyze(word)
            if analysis:
                # Randomly choose a different form of the word
                new_word = random.choice(analysis).inflected
                new_words.append(new_word)
            else:
                new_words.append(word)
        augmented_texts.append(' '.join(new_words))

    return augmented_texts

Example

In [None]:
# Example usage
original_text = "الكتاب مفيد للقراءة"
augmented_data = augment_arabic_data(original_text, num_augmentations=3)

print("Original:", original_text)
print("Augmented data:")
for i, text in enumerate(augmented_data, 1):
    print(f"{i}. {text}")

### 2.21 Generation

> <span style="color: green">**_Generation:_**</span> is the process of inflecting a lemma for a set of morphological features.

In [None]:
def arabic_word_generation(word: str, pos: str = 'noun', gen: str = 'm', num: str = 'p'):
    '''
    A method that inflects the lemmma of a word for a set of morphological features
    '''

    # We need to indicate that the database we are loading will be
    # used for generation.
    db = MorphologyDB.builtin_db(flags='g')

    generator = Generator(db)

    # get lemma of a word
    lemma = word
    features = {
        'pos': pos,
        'gen': gen,
        'num': num
    }

    analyses = generator.generate(lemma, features)

    # Extract and print unique diacritizations from generated analyses
    return set([a['diac'] for a in analyses])

> <span style="color: yellow">**_Note:_**</span> `'pos'` is the only *required* feature that needs to be specified.

### 2.22 Reinflection

> <span style="color: green">**_Reinflection:_**</span> is the process of converting a given word in any form to a different form (i.e. tense, gender, etc). The CAMeL Tools reinflector works similar to the generator except that the word doesn't have to be a lemma and it is not have to be restricted to a specific `'pos'`.

In [None]:
def arabic_reinflection(word: str, num: str = 'd', prc1: str = 'bi_prep') -> set: 
    '''
    A method that converts a given word in any different form (i.e. tense, gender, etc).
    '''

    # We need to indicate that the database we are loading will be
    # used for reinflection.
    db = MorphologyDB.builtin_db(flags='r')

    reinflector = Reinflector(db)

    features = {
        'num': num,
        'prc1': prc1
    }

    analyses = reinflector.reinflect(word, features)

    # Extract and print unique diacritizations from reinflected analyses
    return set(a['diac'] for a in analyses)

## 3. Handling Outliers

### 3.1 Handling Very Common Word Removal

In [None]:
def handling_common_words(df: pd.DataFrame, mode='remove'):
    pass

Example

### 3.2 Handling Very Rare Word Removal

In [None]:
def handling_rare_words(df: pd.DataFrame, mode='remove'):
    pass

Example

### 3.3 Handling Numbers and Special Characters in Arabic Text

In [1]:
def handle_numbers_and_special_chars(text, mode='remove'):
    '''
    A method that either replace arabic numbers to hindi numbers or remove them.
    '''
    if mode == 'remove':
        # Remove numbers and special characters
        return re.sub(r'[^\u0600-\u06FF\s]', '', text)
    elif mode == 'normalize':
        # Normalize Arabic numbers to Hindi numbers
        number_map = {
            '٠': '0', '١': '1', '٢': '2', '٣': '3', '٤': '4',
            '٥': '5', '٦': '6', '٧': '7', '٨': '8', '٩': '9'
        }
        
        for arabic, hindi in number_map.items():
            text = text.replace(arabic, hindi)
        
        return text

Example

In [None]:
# Example usage
text = "يوجد ٣ تفاحات و٥ برتقالات في السلة!"
removed_numbers = handle_numbers_and_special_chars(text, 'remove')
normalized_numbers = handle_numbers_and_special_chars(text, 'normalize')

print("Original:", text)
print("Removed numbers and special chars:", removed_numbers)
print("Normalized numbers:", normalized_numbers)

# <p style="padding:50px;background-color:#DA8359;margin:0;color:#fafefe;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:100">Feature Engineering</p>

# <p style="padding:50px;background-color:#DA8359;margin:0;color:#fafefe;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:100">Preporcessing</p>

## 1. Text Classification

In [None]:
def classify_arabic_text(text: str, model_name: str ="aubmindlab/bert-base-arabertv2"):
    '''
    A method that classify text

    :params: **text**:
    :params: **model_name**:
    '''
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    
    return predictions.tolist()[0]

Example

In [4]:
# Example usage
text = "هذا النص رائع ومفيد جداً"
classification = classify_arabic_text(text)
print(f"Text: {text}")
print(f"Classification probabilities: {classification}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NameError: name 'torch' is not defined

## 2. Sentiment Analysis

In [None]:
def analyze_arabic_sentiment(text):
    sentiment_pipeline = pipeline("sentiment-analysis", model="CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
    result = sentiment_pipeline(text)[0]
    return result['label'], result['score']

Example

In [49]:
# Example usage
text = "أنا سعيد جداً بهذا المنتج!"
sentiment, score = analyze_arabic_sentiment(text)
print(f"Text: {text}")
print(f"Sentiment: {sentiment}, Score: {score}")

NameError: name 'analyze_arabic_sentiment' is not defined

## 3. Word Embedding

> <span style="color: green">**_Word Embedding:_**</span> 

In [None]:
def arabic_word_embedding(text: str):
    pass

## 4. Multi-Label Labelling

## 5. Topic Modeling

> <span style="color: green">**_Topic Modeling:_**</span> is an unsupervised machine learning technique for finding abstract topics in a large collection of documents. It helps in organizing, understanding and summarizing large collections of textual information and discovering the latent topics that vary among documents in a given corpus.

Latent Dirichlet allocation (LDA) and Non-Negative Matrix Fatorization (NMF) are two of the most popular topic modeling techniques. LDA uses a probabilistic approach whereas NMF uses matrix factorization approach, however, new techniques that are based on BERT for topic modeling do exist.

In [None]:
import pandas as pd
from bertopic import BERTopic
from flair.embeddings import TransformerDocumentEmbeddings
from gensim.models.coherencemodel import CoherenceModel
import gensim.corpora as corpora
from gensim.models import LdaMulticore
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF


In [None]:
data = 

## 6. Translate to English

> <span style="color: red">**_TODO:_**</span> Translate Arabic to English and perform natural language processing.

# <p style="padding:50px;background-color:#DA8359;margin:0;color:#fafefe;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:100">Visualize Data</p>

In [None]:
def clean_text(text):
    ## Remove punctuations
    text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~"""), ' ', text)  # remove punctuation
    
    ## Remove extra whitespace
    text = re.sub('\s+', ' ', text)

    ## Remove Emojis
    text = remove_emoji(text)

    ## Convert text to lowercases
    text = text.lower()

    ## Arabisy the text
    text = to_arabic(text)

    ## Remove stop words
    text = remove_stop_words(text)

    ## Remove numbers
    text = re.sub("\d+", " ", text)

    ## Remove Tashkeel
    text = normalizeArabic(text)

    #text = re.sub('\W+', ' ', text)
    text = re.sub('[A-Za-z]+',' ',text)
    text = re.sub(r'\\u[A-Za-z0-9\\]+',' ',text)
    ## remove extra whitespace
    text = re.sub('\s+', ' ', text)  
    #Stemming
    #text = stem(text)
    return text

# <p style="padding:50px;background-color:#DA8359;margin:0;color:#fafefe;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:100">Resources</p>

## Research

1. BERT for Arabic Topic Modeling: An Experimental Study on BERTopic Technique (https://github.com/iwan-rg/Arabic-Topic-Modeling?tab=readme-ov-file)
