In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# Step 1: Load your Arabic data into a pandas DataFrame
data = pd.read_csv("data/s_data.csv")

In [6]:
data.shape

(4969, 2)

In [4]:
data.head()

Unnamed: 0,text,class
0,﻿ اعتقل اندونيسي من قبل الشرطة بعد ان اثار جدل...,religon
1,﻿ عمان - الراي - اكد المهندس نضال الحديد امين ...,env
2,(2) امه هي امنه بنت وهب بن عبد مناف بن زهره بن...,religon
3,(ناسا) لتركيب نظام قياسي ضوئي شمسي طيفي لرصد ا...,ST
4,@ 3drees: حماقي حلف بالله انه ما يستمتع الا مع...,art


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4969 entries, 0 to 4968
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    4969 non-null   object
 1   class   4969 non-null   object
dtypes: object(2)
memory usage: 77.8+ KB


In [8]:
# Step 2: Preprocess the text data to remove likes, emojis, non-text characters, English letters, and links
def preprocess_text(text):
    # Remove likes (assuming likes start with @ and end with space)
    text = re.sub(r'@\S+\s', '', text)
    
    # Remove links (matches common URL patterns)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    
    # Remove emojis and other non-text characters
    text = re.sub(r'[^\w\sء-ي]', '', text)
    
    # Remove English letters
    text = re.sub(r'[a-zA-Z]', '', text)
    
    
    return text

data['text'] = data['text'].apply(preprocess_text)

In [9]:
# Step 3: Feature Extraction - TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data['text'])
y = data['class']

In [10]:
# Step 4: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Step 5: Train a Support Vector Machine (SVM) classifier
classifier = SVC(kernel='linear', C=1.0, random_state=42)
classifier.fit(X_train, y_train)

In [19]:
# Step 6: Predict the class of new text data
new_text = ["الفنان اعالي محمد حماقي"]
new_text = [preprocess_text(text) for text in new_text]
new_text_features = tfidf_vectorizer.transform(new_text)
predicted_class = classifier.predict(new_text_features)[0]

print("Predicted Class:", predicted_class)


Predicted Class: art


In [13]:
# Step 7: Evaluate the classifier's accuracy on the test set
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9698189134808853


In [14]:
# Gradio Interface
import gradio as gr

def classify_text(text):
    text = preprocess_text(text)
    text_feature = tfidf_vectorizer.transform([text])
    predicted_class = classifier.predict(text_feature)[0]
    return predicted_class

iface = gr.Interface(
    fn=classify_text,
    inputs=gr.inputs.Textbox(),
    outputs=gr.outputs.Label(),
    live=True,
    layout="vertical",
    title="Arabic Text Classifier",
    description="Enter your Arabic text, and the model will predict its class.",
    examples=[["أهلاً وسهلاً، كيف يمكنني مساعدتك؟"]],
)

iface.launch()

  from .autonotebook import tqdm as notebook_tqdm
  inputs=gr.inputs.Textbox(),
  inputs=gr.inputs.Textbox(),
  inputs=gr.inputs.Textbox(),
  outputs=gr.outputs.Label(),
  outputs=gr.outputs.Label(),
  iface = gr.Interface(


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


