In [178]:
import re
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords

In [179]:
def replace_pattern(text, pattern, replace=''):
    """Replace every match of ``pattern`` in ``text`` with ``replace``.

    This is a generic regex substitution helper; it is not specific to
    diacritics and is reused for every cleaning step in this notebook.

    Args:
        text: The string to clean.
        pattern: Either a compiled regular expression (``re.compile(...)``)
            or a raw pattern string.
        replace: Replacement for each match. Defaults to the empty string,
            which deletes the matches.

    Returns:
        A new string with all matches substituted.
    """
    # re.sub accepts both compiled patterns and pattern strings, so callers
    # are not forced to pre-compile.
    return re.sub(pattern, replace, text)

In [180]:
# Sample input for the cells below: a diacritized Arabic sentence containing a
# colon, parentheses and a trailing numeric "( 14 / 123 )" reference.
arabic_text = "قَوْلُهُ : ( أَوْ قَطَعَ الْأَوَّلُ يَدَهُ إلَخْ ) قَالَ الزَّرْكَشِيُّ( 14 / 123 )"

In [181]:
# Patterns are compiled once at definition time instead of on every call.
_OPENING_BRACKETS_RE = re.compile(r'[\[\{]')
_CLOSING_BRACKETS_RE = re.compile(r'[\]\}]')
# A parenthesised group holding only numbers (optionally slash-separated,
# e.g. "( 14 / 123 )" or "(7)"), or any remaining bare run of digits.
_NUMERIC_RE = re.compile(r'\(\s*\d+(?:\s*/\s*\d+)*\s*\)|\d+')
_UNWANTED_CHARS_RE = re.compile(r'[/!\-؛،؟:]')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess(text):
    """Clean raw text before tokenization.

    Steps, in order:
      1. Normalize ``[``/``{`` to ``(`` and ``]``/``}`` to ``)``.
      2. Remove bracketed groups that contain only numbers, then any
         remaining digits.
      3. Remove slash, exclamation mark, hyphen, colon and the Arabic
         semicolon, comma and question mark.
      4. Collapse every run of whitespace into a single space.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string. Leading/trailing whitespace is collapsed to a
        single space but not stripped.
    """
    # Normalize bracket types first so that numeric references written with
    # square or curly brackets are removed as a unit in the next step
    # instead of leaving an empty "( )" behind.
    text = _OPENING_BRACKETS_RE.sub('(', text)
    text = _CLOSING_BRACKETS_RE.sub(')', text)
    # Drop number-only brackets together with their parentheses, then any
    # other digits.
    text = _NUMERIC_RE.sub('', text)
    # Remove unwanted punctuation.
    text = _UNWANTED_CHARS_RE.sub('', text)
    # Collapse the gaps left by the removals above.
    text = _WHITESPACE_RE.sub(' ', text)
    return text

In [182]:
def tokenize(text):
    """Clean ``text``, split it into tokens and drop Arabic stop words.

    Args:
        text: The raw input string; it is passed through ``preprocess``
            before tokenization.

    Returns:
        A list of token strings, in their original order, excluding any
        token that appears in NLTK's Arabic stop-word list.
    """
    tokenizer = TreebankWordTokenizer()
    tokens = tokenizer.tokenize(preprocess(text))
    if not tokens:
        # Nothing to filter; avoid touching the stop-word corpus at all.
        return []
    # Load the stop-word list once per call and keep it in a set, instead of
    # calling stopwords.words() and scanning a list for every single token.
    arabic_stopwords = set(stopwords.words('arabic'))
    # NOTE(review): tokens keep their diacritics at this point; if the
    # stop-word list is stored without diacritics, diacritized stop words
    # will not be filtered -- confirm whether that is intended.
    return [token for token in tokens if token not in arabic_stopwords]

In [183]:
# Run the whole pipeline (preprocess -> tokenize -> stop-word filter) on the sample text.
print(tokenize(arabic_text))

['قَوْلُهُ', '(', 'أَوْ', 'قَطَعَ', 'الْأَوَّلُ', 'يَدَهُ', 'إلَخْ', ')', 'قَالَ', 'الزَّرْكَشِيُّ']


In [184]:
# Show the cleaning step on its own, without tokenization.
preprocessed_text = preprocess(arabic_text)
print(preprocessed_text)

قَوْلُهُ ( أَوْ قَطَعَ الْأَوَّلُ يَدَهُ إلَخْ ) قَالَ الزَّرْكَشِيُّ


In [185]:
# Remove diacritics.
# The character class covers the Arabic combining marks U+064B-U+065F,
# the superscript alef U+0670, and the presentation-form code points
# U+FE70-U+FE7F.
reg = r'[\u064B-\u065F\u0670\uFE70-\uFE7F]'
# NOTE: this rebinds `arabic_text` to the stripped version, so re-running an
# earlier cell after this one will operate on text without diacritics.
arabic_text = replace_pattern(arabic_text, re.compile(reg))
print(arabic_text)

قوله : ( أو قطع الأول يده إلخ ) قال الزركشي( 14 / 123 )
