In [None]:
!pip -q install transformers datasets accelerate torch scikit-learn gensim tashaphyne arabic-reshaper

import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt

import nltk
from nltk.stem.isri import ISRIStemmer

from tashaphyne.stemming import ArabicLightStemmer

from gensim.models import FastText
from nltk.stem.snowball import SnowballStemmer

import torch
import torch.nn as nn



In [None]:
# Read the dataset, trying each candidate encoding in turn: UTF-8 first, then
# the two Arabic single-byte code pages. Only UnicodeDecodeError triggers the
# next attempt; any other error propagates immediately, and a decode failure
# on the last candidate is re-raised.
_csv_path = "/content/arabic_stemming.csv"
_candidate_encodings = ("utf-8", "windows-1256", "ISO-8859-6")

for _attempt, _codec in enumerate(_candidate_encodings, start=1):
    try:
        df = pd.read_csv(_csv_path, encoding=_codec)
        break
    except UnicodeDecodeError:
        if _attempt == len(_candidate_encodings):
            raise
df.head()

NameError: name 'pd' is not defined

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pickle
# Build one dense TF-IDF matrix per stemmed text column.
#
# For every stemmer name, the matching `text_<stem>` column is vectorized with
# its own TfidfVectorizer (vocabulary capped at 5000 terms). The dense matrix
# is kept in `embeddings["tfidf"][stem]`, and its rows are also stored per
# document in a new `tfidf_<stem>` column of `df`.
STEMMERS = ["isri", "light", "snowball"]

embeddings = {
    "tfidf": {}
}

for stem in STEMMERS:
    vectorizer = TfidfVectorizer(max_features=5000)

    # Missing texts are treated as empty documents instead of NaN.
    processed_text_column = df[f"text_{stem}"].fillna('')

    tfidf_matrix = vectorizer.fit_transform(processed_text_column)
    # NOTE: densifying costs n_docs * n_features floats per stemmer.
    X = tfidf_matrix.toarray()

    # One 1-D array per row. This is the only place the column is written;
    # the original repeated the same assignment after the loop.
    df[f"tfidf_{stem}"] = list(X)

    embeddings["tfidf"][stem] = X

    print(f"TF-IDF ({stem}) shape:", X.shape)

# Labels bundled with the matrices so both can be handed around together.
output_tfidf = {
    "labels": df["targe"].values,
    "embeddings": embeddings
}

print("TF-IDF columns added")




In [None]:
# Preview the first rows of df (displayed as the cell's output).
df.head()

In [None]:
# NOTE(review): the next code cell imports `fasttext`, but the visible
# `pip install` line at the top of the notebook does not list it. On a fresh
# runtime, uncomment the line below (and run it) before continuing.
#!pip install fasttext

In [None]:
import pandas as pd
import numpy as np
import fasttext
import fasttext.util
import pickle


# Sentence embeddings from pretrained Arabic fastText vectors.
#
# Each stemmed text is split on whitespace, every token is looked up in the
# fastText model, and the token vectors are averaged into one vector per
# document. Results go to `embeddings[<stem>]` (one 2-D array per stemmer)
# and to the `fasttext_<stem>` columns of `df`.

labels = df["targe"].values

# fillna("") before astype(str): a bare astype(str) turns missing values into
# the literal token "nan", which would then be embedded as if it were a word.
# This also matches how the TF-IDF cell treats missing texts.
texts = {
    "isri": df["text_isri"].fillna("").astype(str),
    "light": df["text_light"].fillna("").astype(str),
    "snowball": df["text_snowball"].fillna("").astype(str)
}

# Downloads cc.ar.300.bin into the working directory unless it already exists.
fasttext.util.download_model("ar", if_exists="ignore")
model = fasttext.load_model("cc.ar.300.bin")


def mean_word_vector(text, ft_model):
    """Return the average fastText vector of the whitespace tokens in `text`.

    Parameters
    ----------
    text : str
        Document to embed.
    ft_model
        Loaded fastText model providing `get_word_vector` and `get_dimension`.

    Returns
    -------
    numpy.ndarray
        1-D vector of length `ft_model.get_dimension()`. A text without any
        tokens yields an all-zero float32 vector, so empty documents do not
        force the stacked matrix to a wider dtype.
    """
    tokens = text.split()
    if not tokens:
        return np.zeros(ft_model.get_dimension(), dtype=np.float32)
    return np.mean([ft_model.get_word_vector(tok) for tok in tokens], axis=0)


embeddings = {}

for name, series in texts.items():
    X = np.stack([mean_word_vector(text, model) for text in series])
    embeddings[name] = X
    print(f"{name} embedding shape:", X.shape)

output = {
    "labels": labels,
    "fasttext": embeddings
}

# One 1-D vector per row, aligned with df's row order.
for name, matrix in embeddings.items():
    df[f"fasttext_{name}"] = list(matrix)

print("FastText columns added")



Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.bin.gz

isri embedding shape: (1333, 300)
light embedding shape: (1333, 300)
snowball embedding shape: (1333, 300)
FastText columns added


In [None]:
# Preview the first rows of df; the recorded output shows the text_*,
# tfidf_* and fasttext_* columns side by side.
df.head()

Unnamed: 0,text,targe,text_isri,text_light,text_snowball,tfidf_isri,tfidf_light,tfidf_snowball,fasttext_isri,fasttext_light,fasttext_snowball
0,تحيي الفنانه زهره هندي حفلا فنيا يوم مارس المق...,0,تحي فنن زهر هند حفل فنا يوم ارس قبل سرح تكل ار...,حي فنانه زهر هند حفل يا وم مارس مقبل مسرح اتاك...,تح فنانه زهر هند حفل فن يوم مارس مقبل مسرح اتا...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.012732509, 0.0020617926, -0.004555963, 0.0...","[-0.013797628, 0.004341846, -0.0012340471, 0.0...","[-0.015050052, 0.0022576062, -0.000284112, 0.0..."
1,بلغ عدد المتفرجين الذين حضروا مهرجان البولفار ...,0,بلغ عدد تفرج الذين حضر هرج لفر في دور ال ماب ش...,لغ عدد متفرج الذين حضر مهرج بولفار في دور ال م...,بلغ عدد متفرج ذين حضرو مهرج بولفار في دور ال م...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.0066916524, -0.0048177545, -0.007236052, 0...","[-0.015389659, 0.0003111106, -0.0044296472, 0....","[-0.0108956555, -0.0035783497, -0.008187279, 0..."
2,اخبارنا المغربيه متابعه اطلق نشطاء مغاربه حمله...,0,خبر غرب تبع طلق شطء غرب حمل علي سبو ضمن مع خرج...,خبار مغربيه متابع طلق شطاء مغارب حمل على فايسب...,اخبار مغربيه متابع اطلق نشطاء مغارب حمل عل فاي...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.0070065507, -0.0065683094, -0.0039173136, ...","[0.00573556, -0.010408119, 0.0144616, 0.045182...","[-0.0027987205, -0.007865474, 0.013234409, 0.0..."
3,اكد المشاركون في ندوه دوليه انعقدت بمدينه الجد...,0,اكد شرك في نده دول عقد بمد جدد حول وضع حصن ارخ...,كد مشارك في دو دول انعقد مدين جديده حول موضوع ...,اكد مشارك في ندو دول انعقد بمد جديده حول موضوع...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.044068214237375586, 0.0, 0.0, 0.0, 0.0...","[-0.005036245, 0.00054471643, -0.005070172, 0....","[-0.006615632, -0.0038718744, -0.0058937846, 0...","[-0.010778804, -0.0026696026, -0.0039697182, 0..."
4,تدشن الفنانه الشعبيه المغربيه نجاه اعتابو ابتد...,0,تدش فنن شعب غرب نجه عتبو بدء من ثلاثاء قبل جول...,دش فنانه شعبيه مغربيه جا عتابو بتداء من ثلاثاء...,تدش فنانه شعبيه مغربيه نجا اعتاب ابتداء من ثلا...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.045495314777301704, 0.0, 0.0...","[-0.003823867, 0.008242876, -0.001444543, 0.05...","[-0.005567756, -0.0026367286, 0.005419133, 0.0...","[-0.012710737, 0.0085825045, 0.0034522333, 0.0..."


In [22]:
import numpy as np
import pickle
import torch
from transformers import AutoTokenizer, AutoModel

# Token-level BERT embeddings for each stemmed text column.
#
# Every document is tokenized to exactly MAX_LEN positions (padded or
# truncated) and passed through the encoder. The last hidden state, one vector
# per token position, is kept, so each stemmer yields an array of shape
# (n_documents, MAX_LEN, hidden_size).
MODEL_NAME = "asafaya/bert-base-arabic"
MAX_LEN = 50
BATCH_SIZE = 32  # documents per forward pass; lower this if memory is tight
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()  # inference mode: disables dropout

labels = df["targe"].values

# fillna("") before astype(str): a bare astype(str) turns missing values into
# the literal text "nan", which would be tokenized and embedded as content.
texts = {
    "isri": df["text_isri"].fillna("").astype(str).tolist(),
    "light": df["text_light"].fillna("").astype(str).tolist(),
    "snowball": df["text_snowball"].fillna("").astype(str).tolist()
}

bert_embeddings = {}

with torch.no_grad():
    for stem, sentences in texts.items():
        chunks = []

        # Every input is padded to the same fixed length, so documents can be
        # encoded in batches rather than one forward pass per sentence.
        for start in range(0, len(sentences), BATCH_SIZE):
            batch = sentences[start:start + BATCH_SIZE]
            encoding = tokenizer(
                batch,
                padding="max_length",
                truncation=True,
                max_length=MAX_LEN,
                return_tensors="pt"
            )

            encoding = {k: v.to(DEVICE) for k, v in encoding.items()}
            outputs = model(**encoding)

            # (batch, MAX_LEN, hidden) moved to host memory as a numpy array.
            chunks.append(outputs.last_hidden_state.cpu().numpy())

        X = np.concatenate(chunks, axis=0)
        bert_embeddings[stem] = X

        print(f"BERT ({stem}) shape:", X.shape)

output = {
    "labels": labels,
    "embeddings": {
        "bert": bert_embeddings
    }
}

# Actually write the file the message below refers to; the original printed
# the message without ever saving anything.
with open("bert_embeddings.pkl", "wb") as fh:
    pickle.dump(output, fh)

print("BERT embeddings (3 stemmers) saved to bert_embeddings.pkl")

# Nested Python lists (MAX_LEN x hidden) per row. This is memory-heavy; prefer
# the arrays in `bert_embeddings` for any further computation.
for stem, matrix in bert_embeddings.items():
    df[f"bert_{stem}"] = [x.tolist() for x in matrix]


KeyboardInterrupt: 

In [None]:
# Preview the first rows of df. The bert_* columns are assigned only at the
# very end of the BERT cell, so they are missing if that cell was interrupted.
df.head()

In [19]:
# Export the frame with all embedding columns to CSV.
#
# CSV cells are written as text. A numpy array stored in an object column is
# rendered with str(), and numpy elides the middle of arrays longer than its
# print threshold (1000 elements by default) with "...". The 5000-wide TF-IDF
# rows would therefore be written truncated and could not be recovered.
# Converting array cells to plain lists first writes every value, in the same
# list-literal form the bert_* columns already use.
export_df = df.copy()
for col in export_df.select_dtypes(include="object").columns:
    export_df[col] = export_df[col].map(
        lambda cell: cell.tolist() if hasattr(cell, "tolist") else cell
    )

export_df.to_csv("arabic_dataset_all_embeddings_3d.csv", index=False)
