In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF


In [2]:
# Sample Arabic text data
text_data = [
    "أنا سعيد لأنني أستطيع التحدث باللغة العربية.",  # I am happy that I can speak Arabic.
    "أحب قراءة الكتب العربية.",  # I love reading Arabic books.
    "أعتقد أن اللغة العربية جميلة جدا.",  # I think the Arabic language is very beautiful.
    "آمل أن أتمكن من تحسين لغتي العربية يوما ما.",  # I hope I can improve my Arabic someday.
    "شكرا جزيلا على وقتك.",  # Thank you for your time.
]

In [3]:
stop_words = ["أنا", "لأن", "اللغة", "أحب", "أعتقد", "أن", "جدا", "آمل", "أن", "أتمكن", "من", "تحسين", "يوما", "شكرا", "جزيلا", "على", "وقتك"]
text_data_processed = [
    " ".join([word for word in text.split() if word not in stop_words])
    for text in text_data
]


In [4]:
# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(max_features=100)
tfidf_matrix = vectorizer.fit_transform(text_data_processed)

# Perform NMF with 3 topics
nmf = NMF(n_components=3, random_state=0)
nmf_model = nmf.fit(tfidf_matrix)

# Get the topic words
topic_words = []
for topic_idx in range(nmf.n_components):
    topic_words.append(
        [
            vectorizer.get_feature_names_out()[i]
            for i, score in enumerate(nmf_model.components_[topic_idx])
            if score > 0.01
        ]
    )

# Print the topics
for topic_idx, words in enumerate(topic_words):
    print(f"Topic {topic_idx + 1}: {', '.join(words)}")


Topic 1: العربية, الكتب, جدا, جميلة, قراءة, لغتي, ما
Topic 2: وقتك
Topic 3: أستطيع, التحدث, العربية, باللغة, سعيد, لأنني


In [5]:
def predict_dominant_topic(text):
  """
  Predicts the dominant topic for a new piece of text.

  Args:
      text: The text to predict the dominant topic for.

  Returns:
      The index of the dominant topic.
  """
  # Preprocess the text
  text_processed = " ".join([word for word in text.split() if word not in stop_words])
  # Vectorize the text
  new_tfidf_vec = vectorizer.transform([text_processed])
  # Get the document-topic distribution
  doc_topic_dist = nmf_model.transform(new_tfidf_vec)
  # Find the dominant topic
  dominant_topic = np.argmax(doc_topic_dist, axis=1)
  return dominant_topic[0]


In [6]:
# Example usage
new_text = "أنا أستمتع بتعلم اللغة العربية."  # I enjoy learning Arabic.
dominant_topic_index = predict_dominant_topic(new_text)
print(f"The dominant topic for the new text is: {topic_words[dominant_topic_index]}")

The dominant topic for the new text is: ['العربية', 'الكتب', 'جدا', 'جميلة', 'قراءة', 'لغتي', 'ما']
