<a href="https://colab.research.google.com/github/MariamHawwari/MariamHawwari/blob/main/TF_IDF_Mariam_Hawwari_(Midterm_Project).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Midterm Project:** Arabic Text Tokenization, Stemming/Lemmatization & TF-IDF Analysis



*   **Student:** Mariam Hawwari <br>
*   **Course:** Indexation <br>
*   **Instructor:** Dr. Alaf Makke <br>
*   **Objective:** Preprocess Arabic text by removing stopwords, applying stemming or lemmatization, and calculating TF-IDF scores for a set of sentences.<br>
*   **Steps:** 08 steps
<br>

# **01. Install & Import Libraries**

In [None]:
!pip install nltk
!pip install qalsadi

import nltk  # Natural Language Toolkit: A library for natural language processing.
nltk.download('stopwords')  # Downloads a list of common stopwords for various languages.
nltk.download('punkt_tab') ## Download the Punkt tokenizer model for sentence tokenization.
nltk.download('punkt')  # Downloads a tokenizer model that splits text into sentences and words.
import pandas as pd  # Import pandas to create a DataFrame.
import string # Import string module for punctuation

from nltk.corpus import stopwords  # Provides access to the stopwords for filtering common words.
from nltk.tokenize import word_tokenize  # Tokenizer to split text into words.
from sklearn.feature_extraction.text import TfidfVectorizer  # Import TfidfVectorizer for TF-IDF calculation.
from qalsadi.lemmatizer import Lemmatizer  # Arabic lemmatizer from qalsadi



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# **02. Define Arabic Sentences**

In [None]:
# The three Arabic example sentences that make up the corpus.
sentence1, sentence2, sentence3 = (
    "أنا أذهب إلى المدرسة كل يوم.",  # "I go to school every day."
    "المدرسة مكان رائع للتعلم.",  # "School is a great place for learning."
    "التعلم يساعدنا في تحقيق النجاح.",  # "Learning helps us achieve success."
)

# **03. Load Arabic Stopwords**

In [None]:
# Load NLTK's built-in Arabic stopword list (the 'stopwords' corpus must
# already have been fetched with nltk.download).
arabic_stopwords = stopwords.words('arabic')
# Print the list to inspect which words will be filtered out of the sentences.
print(arabic_stopwords)

['إذ', 'إذا', 'إذما', 'إذن', 'أف', 'أقل', 'أكثر', 'ألا', 'إلا', 'التي', 'الذي', 'الذين', 'اللاتي', 'اللائي', 'اللتان', 'اللتيا', 'اللتين', 'اللذان', 'اللذين', 'اللواتي', 'إلى', 'إليك', 'إليكم', 'إليكما', 'إليكن', 'أم', 'أما', 'أما', 'إما', 'أن', 'إن', 'إنا', 'أنا', 'أنت', 'أنتم', 'أنتما', 'أنتن', 'إنما', 'إنه', 'أنى', 'أنى', 'آه', 'آها', 'أو', 'أولاء', 'أولئك', 'أوه', 'آي', 'أي', 'أيها', 'إي', 'أين', 'أين', 'أينما', 'إيه', 'بخ', 'بس', 'بعد', 'بعض', 'بك', 'بكم', 'بكم', 'بكما', 'بكن', 'بل', 'بلى', 'بما', 'بماذا', 'بمن', 'بنا', 'به', 'بها', 'بهم', 'بهما', 'بهن', 'بي', 'بين', 'بيد', 'تلك', 'تلكم', 'تلكما', 'ته', 'تي', 'تين', 'تينك', 'ثم', 'ثمة', 'حاشا', 'حبذا', 'حتى', 'حيث', 'حيثما', 'حين', 'خلا', 'دون', 'ذا', 'ذات', 'ذاك', 'ذان', 'ذانك', 'ذلك', 'ذلكم', 'ذلكما', 'ذلكن', 'ذه', 'ذو', 'ذوا', 'ذواتا', 'ذواتي', 'ذي', 'ذين', 'ذينك', 'ريث', 'سوف', 'سوى', 'شتان', 'عدا', 'عسى', 'عل', 'على', 'عليك', 'عليه', 'عما', 'عن', 'عند', 'غير', 'فإذا', 'فإن', 'فلا', 'فمن', 'في', 'فيم', 'فيما', 'فيه', 'فيها', '

# **04. Preprocess & Tokenize Text**

In [None]:
# Arabic-script punctuation that string.punctuation (ASCII only) does not cover:
# comma, semicolon, question mark, percent sign, decimal separator,
# thousands separator, five-pointed star and full stop.
ARABIC_PUNCTUATION = "\u060C\u061B\u061F\u066A\u066B\u066C\u066D\u06D4"


def preprocess_text(text):
    """Strip punctuation, tokenize, and drop Arabic stopwords.

    Both ASCII punctuation and the common Arabic punctuation marks are
    deleted before tokenization, so marks such as the Arabic comma or
    question mark do not survive as tokens or stay glued to a word.

    Args:
        text: The raw sentence to preprocess.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    # One translation table removes every punctuation character in a single pass.
    removal_table = str.maketrans('', '', string.punctuation + ARABIC_PUNCTUATION)
    text = text.translate(removal_table)

    # A set gives constant-time membership tests; the stopword list is long.
    stopword_set = set(arabic_stopwords)
    return [word for word in word_tokenize(text) if word not in stopword_set]

# **05. Lemmatize Text**

* *I’m using lemmatization because my aim is to compare sentences/documents to see if they share similar **themes or topics** rather than just the overlap in word forms. Lemmatization ensures that all forms of a word are counted as the same term, reducing the noise created by variations. In this context, using stemming would produce non-words and group related words under a common root, which can obscure individual word meanings and decrease accuracy.*

In [None]:
# Single qalsadi lemmatizer instance, reused for every token.
lemmatizer = Lemmatizer()


def lemmatize_tokens(tokens):
    """Return the lemma of each token, in the same order as the input."""
    return [lemmatizer.lemmatize(token) for token in tokens]

# **06. Process Results for Each Sentence**

In [None]:
# Stage 1: remove punctuation and stopwords, and tokenize each sentence.
processed_sentence1, processed_sentence2, processed_sentence3 = (
    preprocess_text(raw) for raw in (sentence1, sentence2, sentence3)
)

# Stage 2: reduce every remaining token to its lemma.
lemmatized_sentence1, lemmatized_sentence2, lemmatized_sentence3 = (
    lemmatize_tokens(tokens)
    for tokens in (processed_sentence1, processed_sentence2, processed_sentence3)
)

# Stage 3: glue the lemmas back into space-separated strings, the input
# format the TF-IDF vectorizer expects.
lemmatized_sentence1_str, lemmatized_sentence2_str, lemmatized_sentence3_str = (
    " ".join(lemmas)
    for lemmas in (lemmatized_sentence1, lemmatized_sentence2, lemmatized_sentence3)
)

# Show the fully processed form of each sentence.
print("Lemmatized Sentence 1:", lemmatized_sentence1_str)
print("Lemmatized Sentence 2:", lemmatized_sentence2_str)
print("Lemmatized Sentence 3:", lemmatized_sentence3_str)

Lemmatized Sentence 1: ذهب مدرس يوم
Lemmatized Sentence 2: مدرس مكان رائع تعلم
Lemmatized Sentence 3: تعلم ساعد تحقيق نجاح


# **07. Calculate & Display TF-IDF Scores**



*   **First Method:** Display scores in a structured tabular format. <br>
*   **Note:** The table appears flipped when displayed in Colab.

In [None]:
# The corpus: one lemmatized, space-joined string per sentence.
corpus = [
    lemmatized_sentence1_str,
    lemmatized_sentence2_str,
    lemmatized_sentence3_str,
]

# Fit the vectorizer on the corpus and compute the TF-IDF matrix in one step
# (fit_transform expects a list of strings).
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# The terms (vocabulary) that label the matrix columns.
feature_names = tfidf_vectorizer.get_feature_names_out()

# Dense table: one row per sentence, one column per term.
df_tfidf = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    index=["Sentence1", "Sentence2", "Sentence3"],
    columns=feature_names,
)

# Show the table.
print("\nTF-IDF Scores in Columns:")
print(df_tfidf)


TF-IDF Scores in Columns:
              تحقيق      تعلم       ذهب      رائع      ساعد      مدرس  \
Sentence1  0.000000  0.000000  0.622766  0.000000  0.000000  0.473630   
Sentence2  0.000000  0.428046  0.000000  0.562829  0.000000  0.428046   
Sentence3  0.528635  0.402040  0.000000  0.000000  0.528635  0.000000   

               مكان      نجاح       يوم  
Sentence1  0.000000  0.000000  0.622766  
Sentence2  0.562829  0.000000  0.000000  
Sentence3  0.000000  0.528635  0.000000  


*   **Second Method:** Display scores in a text format.

In [None]:
# Rebuild the corpus so this cell can run independently of the table cell.
corpus = [
    lemmatized_sentence1_str,
    lemmatized_sentence2_str,
    lemmatized_sentence3_str,
]

# Fit the vectorizer, compute the TF-IDF matrix, and fetch the term labels.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
feature_names = tfidf_vectorizer.get_feature_names_out()

# For every sentence, list the score of every term, one "term: score" per line.
for i, sentence in enumerate(corpus):
    print(f"\nTF-IDF for Sentence {i+1} ({sentence}):")
    for j, feature in enumerate(feature_names):
        # Row i is the sentence, column j is the term.
        print(f"{feature}: {tfidf_matrix[i, j]:.4f}")


TF-IDF for Sentence 1 (ذهب مدرس يوم):
تحقيق: 0.0000
تعلم: 0.0000
ذهب: 0.6228
رائع: 0.0000
ساعد: 0.0000
مدرس: 0.4736
مكان: 0.0000
نجاح: 0.0000
يوم: 0.6228

TF-IDF for Sentence 2 (مدرس مكان رائع تعلم):
تحقيق: 0.0000
تعلم: 0.4280
ذهب: 0.0000
رائع: 0.5628
ساعد: 0.0000
مدرس: 0.4280
مكان: 0.5628
نجاح: 0.0000
يوم: 0.0000

TF-IDF for Sentence 3 (تعلم ساعد تحقيق نجاح):
تحقيق: 0.5286
تعلم: 0.4020
ذهب: 0.0000
رائع: 0.0000
ساعد: 0.5286
مدرس: 0.0000
مكان: 0.0000
نجاح: 0.5286
يوم: 0.0000


Exception ignored in: <function ArabicDictionary.__del__ at 0x7d768ed22ca0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/arramooz/arabicdictionary.py", line 109, in __del__
    self.db_connect.close()
sqlite3.ProgrammingError: SQLite objects created in a thread can only be used in that same thread. The object was created in thread id 137949572218880 and this is thread id 137948871763520.


# **08. Analyse TF-IDF Scores**

In [None]:
def _print_max_tfidf(words):
    """Print each word's highest TF-IDF score across all sentences.

    Words that are not a column of df_tfidf (presumably tokens the vectorizer
    did not keep), or whose score is 0 in every sentence, are skipped.
    """
    for word in words:
        if word not in df_tfidf.columns:
            continue
        max_score = df_tfidf[word].values.max()
        if max_score > 0:
            print(f"- {word}: {max_score:.2f}")


# Document frequency: the number of sentences each word appears in.
# dict.fromkeys() de-duplicates the words of one sentence while keeping their
# order, so a word repeated inside a single sentence is counted once and is
# not mistaken for a word shared between sentences.
word_counts = {}
for sentence in corpus:
    for word in dict.fromkeys(sentence.split()):
        word_counts[word] = word_counts.get(word, 0) + 1

# Words found in more than one sentence versus words found in exactly one.
common_words = [word for word, count in word_counts.items() if count > 1]
unique_words = [word for word, count in word_counts.items() if count <= 1]

print("\nCommon Words: Low TF-IDF words are generally less important for distinguishing the content/topic of a sentence or document from others:\n")
_print_max_tfidf(common_words)

print("\n\nUnique Words: High TF-IDF words are more important for highlighting unique or relevant terms that help distinguish a sentence from others.\n")
_print_max_tfidf(unique_words)

Exception ignored in: <function ArabicDictionary.__del__ at 0x7d768ed22ca0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/arramooz/arabicdictionary.py", line 109, in __del__
    self.db_connect.close()
sqlite3.ProgrammingError: SQLite objects created in a thread can only be used in that same thread. The object was created in thread id 137949572218880 and this is thread id 137948871763520.



Common Words: Low TF-IDF words are generally less important for distinguishing the content/topic of a sentence or document from others:

- مدرس: 0.47
- تعلم: 0.43


Unique Words: High TF-IDF words are more important for highlighting unique or relevant terms that help distinguish a sentence from others.

- ذهب: 0.62
- يوم: 0.62
- مكان: 0.56
- رائع: 0.56
- ساعد: 0.53
- تحقيق: 0.53
- نجاح: 0.53
