#**Step 1: Preprocessing:**

###**1- Tokenization**

word-based tokenization

In [9]:
import spacy

# Load language models for English and Arabic
nlp_en = spacy.load("en_core_web_sm")
nlp_ar = spacy.blank("ar")  # spaCy supports Arabic tokenization, but tagging requires more tools

# Tokenization function
def tokenize_text(text, nlp_model):
    doc = nlp_model(text)
    tokens = [token.text for token in doc]
    return tokens

# Example texts
text_en =  "Artificial intelligence is the future. Artificial intelligence improves human life. The future of Artificial intelligence is bright."
text_ar =  "الذكاء الاصطناعي هو المستقبل. الذكاء الاصطناعي يعزز حياة الإنسان. المستقبل الذكاء."

tokens_en = tokenize_text(text_en, nlp_en)
tokens_ar = tokenize_text(text_ar, nlp_ar)

print("English Tokens:", tokens_en)
print("Arabic Tokens:", tokens_ar)

English Tokens: ['Artificial', 'intelligence', 'is', 'the', 'future', '.', 'Artificial', 'intelligence', 'improves', 'human', 'life', '.', 'The', 'future', 'of', 'Artificial', 'intelligence', 'is', 'bright', '.']
Arabic Tokens: ['الذكاء', 'الاصطناعي', 'هو', 'المستقبل', '.', 'الذكاء', 'الاصطناعي', 'يعزز', 'حياة', 'الإنسان', '.', 'المستقبل', 'الذكاء', '.']


character-based tokenization

In [10]:
C_tokens_arabic  = [char for char in text_ar]
C_tokens_english = [char for char in text_en]

# Display tokens
print("Tokens for Arabic Text:", C_tokens_arabic)
print("Tokens for English Text:", C_tokens_english)

Tokens for Arabic Text: ['ا', 'ل', 'ذ', 'ك', 'ا', 'ء', ' ', 'ا', 'ل', 'ا', 'ص', 'ط', 'ن', 'ا', 'ع', 'ي', ' ', 'ه', 'و', ' ', 'ا', 'ل', 'م', 'س', 'ت', 'ق', 'ب', 'ل', '.', ' ', 'ا', 'ل', 'ذ', 'ك', 'ا', 'ء', ' ', 'ا', 'ل', 'ا', 'ص', 'ط', 'ن', 'ا', 'ع', 'ي', ' ', 'ي', 'ع', 'ز', 'ز', ' ', 'ح', 'ي', 'ا', 'ة', ' ', 'ا', 'ل', 'إ', 'ن', 'س', 'ا', 'ن', '.', ' ', 'ا', 'ل', 'م', 'س', 'ت', 'ق', 'ب', 'ل', ' ', 'ا', 'ل', 'ذ', 'ك', 'ا', 'ء', '.']
Tokens for English Text: ['A', 'r', 't', 'i', 'f', 'i', 'c', 'i', 'a', 'l', ' ', 'i', 'n', 't', 'e', 'l', 'l', 'i', 'g', 'e', 'n', 'c', 'e', ' ', 'i', 's', ' ', 't', 'h', 'e', ' ', 'f', 'u', 't', 'u', 'r', 'e', '.', ' ', 'A', 'r', 't', 'i', 'f', 'i', 'c', 'i', 'a', 'l', ' ', 'i', 'n', 't', 'e', 'l', 'l', 'i', 'g', 'e', 'n', 'c', 'e', ' ', 'i', 'm', 'p', 'r', 'o', 'v', 'e', 's', ' ', 'h', 'u', 'm', 'a', 'n', ' ', 'l', 'i', 'f', 'e', '.', ' ', 'T', 'h', 'e', ' ', 'f', 'u', 't', 'u', 'r', 'e', ' ', 'o', 'f', ' ', 'A', 'r', 't', 'i', 'f', 'i', 'c', 'i', 'a', 'l',

###**2- Stopword Removal**

In [11]:
# English and Arabic stopword removal
from spacy.lang.en.stop_words import STOP_WORDS as EN_STOPWORDS
from spacy.lang.ar.stop_words import STOP_WORDS as AR_STOPWORDS

def remove_stopwords(tokens, stopwords):
    return [token for token in tokens if token.lower() not in stopwords]

filtered_en = remove_stopwords(tokens_en, EN_STOPWORDS)
filtered_ar = remove_stopwords(tokens_ar, AR_STOPWORDS)

print("Filtered English Tokens:", filtered_en)
print("Filtered Arabic Tokens:", filtered_ar)


Filtered English Tokens: ['Artificial', 'intelligence', 'future', '.', 'Artificial', 'intelligence', 'improves', 'human', 'life', '.', 'future', 'Artificial', 'intelligence', 'bright', '.']
Filtered Arabic Tokens: ['الذكاء', 'الاصطناعي', 'المستقبل', '.', 'الذكاء', 'الاصطناعي', 'يعزز', 'حياة', 'الإنسان', '.', 'المستقبل', 'الذكاء', '.']


###**3- Noice Removal**

In [14]:
import string

punctuation = string.punctuation

cleaned_ar = [word for word in filtered_ar if word not in punctuation]
cleaned_en = [word for word in filtered_en if word not in punctuation]

# showing The Punctuation
print("Punctuation Elements :",punctuation)

# Display tokens after removing punctuation
print("\n\nFiltered Arabic Tokens without Punctuation:", cleaned_ar)
print("\nFiltered English Tokens without Punctuation:", cleaned_en)

Punctuation Elements : !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


Filtered Arabic Tokens without Punctuation: ['الذكاء', 'الاصطناعي', 'المستقبل', 'الذكاء', 'الاصطناعي', 'يعزز', 'حياة', 'الإنسان', 'المستقبل', 'الذكاء']

Filtered English Tokens without Punctuation: ['Artificial', 'intelligence', 'future', 'Artificial', 'intelligence', 'improves', 'human', 'life', 'future', 'Artificial', 'intelligence', 'bright']


###**4- Normalization**

In [15]:
# Normalization function
def normalize_text(tokens, is_arabic=False):
    if is_arabic:
        # Normalize Arabic letters
        normalized = [token.replace("أ", "ا").replace("إ", "ا").replace("آ", "ا") for token in tokens]
    else:
        normalized = tokens
    # Convert to lowercase
    normalized = [token.lower() for token in normalized]
    return normalized

# Normalization
normalized_en = normalize_text(cleaned_en)
normalized_ar = normalize_text(cleaned_ar, is_arabic=True)

print("Normalized English Tokens:", normalized_en)
print("Normalized Arabic Tokens:", normalized_ar)


Normalized English Tokens: ['artificial', 'intelligence', 'future', 'artificial', 'intelligence', 'improves', 'human', 'life', 'future', 'artificial', 'intelligence', 'bright']
Normalized Arabic Tokens: ['الذكاء', 'الاصطناعي', 'المستقبل', 'الذكاء', 'الاصطناعي', 'يعزز', 'حياة', 'الانسان', 'المستقبل', 'الذكاء']


###**5- POS Tagging**

In [31]:
!pip install camel-tools --upgrade



In [33]:
!python -m camel_tools.download.downloader --url https://download.camel-tools.org/calima-msa-r13.zip --output /tmp/calima-msa-r13.zip
!unzip /tmp/calima-msa-r13.zip -d /tmp/calima-msa-r13
!python -m camel_tools register_builtin_db --db-path /tmp/calima-msa-r13/calima-msa-r13-db --db-name calima-msa-r13 --force
!camel_data -i morphology-db-msa-s31 # Install the required morphological database
!camel_data -i disambig-bert-unfactored-msa

/usr/bin/python3: Error while finding module specification for 'camel_tools.download.downloader' (ModuleNotFoundError: No module named 'camel_tools.download')
unzip:  cannot find or open /tmp/calima-msa-r13.zip, /tmp/calima-msa-r13.zip.zip or /tmp/calima-msa-r13.zip.ZIP.
/usr/bin/python3: No module named camel_tools.__main__; 'camel_tools' is a package and cannot be directly executed
No new packages will be installed.
No new packages will be installed.


In [36]:
# English POS Tagging
def pos_tagging_english(tokens, nlp_model):
    doc = nlp_model(" ".join(tokens))
    return [(token.text, token.pos_) for token in doc]

pos_tags_en = pos_tagging_english(normalized_en, nlp_en)

# Arabic POS Tagging using Camel Tools (requires installation)
# pip install camel-tools
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.morphology.database import MorphologyDB

# Initialize MorphologyDB with the desired database
db = MorphologyDB.builtin_db('calima-msa-r13')
# Initialize Analyzer with the MorphologyDB instance
analyzer = Analyzer(db=db)
def pos_tag_arabic(tokens):
    pos_tags = []
    for token in tokens:
        analyses = analyzer.analyze(token)
        if analyses:
            pos_tags.append((token, analyses[0]['pos']))
        else:
            pos_tags.append((token, 'unknown'))
    return pos_tags

pos_tags_ar = pos_tag_arabic(normalized_ar)

print("English POS Tags:", pos_tags_en)
print("Arabic POS Tags:", pos_tags_ar)


English POS Tags: [('artificial', 'ADJ'), ('intelligence', 'NOUN'), ('future', 'ADJ'), ('artificial', 'ADJ'), ('intelligence', 'NOUN'), ('improves', 'VERB'), ('human', 'ADJ'), ('life', 'NOUN'), ('future', 'ADJ'), ('artificial', 'ADJ'), ('intelligence', 'NOUN'), ('bright', 'ADJ')]
Arabic POS Tags: [('الذكاء', 'noun'), ('الاصطناعي', 'adj'), ('المستقبل', 'noun'), ('الذكاء', 'noun'), ('الاصطناعي', 'adj'), ('يعزز', 'verb'), ('حياة', 'verb'), ('الانسان', 'noun'), ('المستقبل', 'noun'), ('الذكاء', 'noun')]


#**Step 2: Analysis and Interpretation**

After applying the preprocessing steps, the text undergoes significant transformations that improve its quality and readiness for further analysis or modeling. Here's a breakdown of how each step contributes to this improvement:

### **1. Tokenization**
- **Changes in Text**: The input text is divided into smaller units, typically words or sentences. For example:  
  - **English**: `"The weather is lovely today, isn't it?"` → `['The', 'weather', 'is', 'lovely', 'today', ',', "isn't", 'it', '?']`  
  - **Arabic**: `"الطقس جميل اليوم، أليس كذلك؟"` → `['الطقس', 'جميل', 'اليوم', '،', 'أليس', 'كذلك', '؟']`
- **Benefits**:
  - Converts the text into manageable pieces for analysis.
  - Essential for downstream tasks like stopword removal, tagging, and feature extraction.

---

### **2. Stopword Removal**
- **Changes in Text**: Common, non-informative words are removed.  
  - **English**: `['The', 'weather', 'is', 'lovely', 'today']` → `['weather', 'lovely', 'today']`  
  - **Arabic**: `['الطقس', 'جميل', 'اليوم']` → `['جميل', 'اليوم']`
- **Benefits**:
  - Reduces text dimensionality by eliminating filler words.
  - Helps focus on content-rich words that carry meaningful information.

---

### **3. Noise Removal**
- **Changes in Text**: Symbols, numbers, and unnecessary punctuation are stripped away.  
  - **English**: `['weather', 'lovely', 'today', ',']` → `['weather', 'lovely', 'today']`  
  - **Arabic**: `['جميل', 'اليوم', '؟']` → `['جميل', 'اليوم']`
- **Benefits**:
  - Cleans up irrelevant elements that could interfere with analysis.
  - Makes the text more uniform and interpretable.

---

### **4. Normalization**
- **Changes in Text**: Text is standardized:  
  - **English**: Converts all characters to lowercase for uniformity.  
    Example: `['Lovely', 'Today']` → `['lovely', 'today']`  
  - **Arabic**: Standardizes letter forms (e.g., "أ" → "ا") and removes extra spaces.  
    Example: `['أليس', 'كذلك']` → `['اليس', 'كذلك']`
- **Benefits**:
  - Ensures consistency in the data.
  - Reduces variability caused by different cases or spelling differences, especially in Arabic.

---

### **5. POS Tagging**
- **Changes in Text**: Adds grammatical labels to each token:  
  - **English**: `['weather', 'lovely', 'today']` → `[('weather', 'NOUN'), ('lovely', 'ADJ'), ('today', 'NOUN')]`  
  - **Arabic**: `['جميل', 'اليوم']` → `[('جميل', 'ADJ'), ('اليوم', 'NOUN')]`
- **Benefits**:
  - Provides grammatical context, enabling tasks like dependency parsing and syntactic analysis.
  - Helps differentiate between word meanings (e.g., "play" as a noun vs. verb).

---

### **Summary of Benefits**
The preprocessing steps collectively enhance the text by:
- **Simplifying the structure**: Tokenization and stopword removal reduce complexity and highlight relevant content.
- **Cleaning and standardizing**: Noise removal and normalization produce cleaner, uniform text.
- **Providing linguistic insights**: POS tagging adds syntactic and semantic value, which is essential for feature engineering in NLP tasks.

This processed text is now more suitable for tasks like text classification, sentiment analysis, or predictive modeling, as irrelevant details are minimized and the focus is shifted to meaningful patterns.