In [10]:
!pip install transformers
!pip install pyspark
!pip install sparkdl


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [35]:
!pip install pyspark transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [36]:
import os

import tensorflow as tf
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.linalg import VectorUDT, Vectors
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from transformers import AutoTokenizer
from transformers import BertModel

# Create the Spark session used by the notebook, or reuse one that is
# already running under the same builder configuration.
spark = (
    SparkSession.builder
    .appName("Text Classification")
    .getOrCreate()
)


In [37]:
# Folders holding the raw articles, one text file per article.
fake_news_folder = "/content/drive/MyDrive/dataset/fake"
real_news_folder = "/content/drive/MyDrive/dataset/real"


def _read_folder_texts(folder, encoding="cp1256"):
    """Return the decoded contents of every regular file directly in ``folder``.

    Args:
        folder: Path of the directory to scan (not recursive).
        encoding: Text encoding used to decode each file.

    Returns:
        A list of strings, one per file, ordered by file name.

    File names are visited in sorted order because ``os.listdir`` returns
    entries in arbitrary order; without sorting, the seeded shuffle/split
    applied later would not yield the same partition on every run.
    Entries that are not regular files (e.g. sub-directories) are skipped
    instead of making ``open`` fail.
    """
    contents = []
    for filename in sorted(os.listdir(folder)):
        file_path = os.path.join(folder, filename)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, "r", encoding=encoding) as file:
            contents.append(file.read())
    return contents


# Read both classes with the same helper so they cannot drift apart.
fake_texts = _read_folder_texts(fake_news_folder)
real_texts = _read_folder_texts(real_news_folder)

# Fake articles come first with label 0, real articles follow with label 1.
texts = fake_texts + real_texts
labels = [0] * len(fake_texts) + [1] * len(real_texts)


In [38]:
# Token sequence length per article: shorter inputs are padded and longer
# inputs are truncated to exactly this many ids.
MAX_SEQ_LEN = 128
# Seed shared by the shuffle and the train/test split so the partition is
# repeatable.
SEED = 42

# Load the tokenizer that maps raw text to vocabulary ids.
tokenizer = AutoTokenizer.from_pretrained('aubmindlab/bert-base-arabertv2')

if not texts:
    raise ValueError("No input texts were loaded; cannot build features.")

# Tokenize the whole corpus in one batched call instead of one
# `encode_plus` call per article; with `padding='max_length'` every row
# already has MAX_SEQ_LEN ids, so no concat/reshape step is needed.
# NOTE(review): the features used below are the tokenizer's integer
# vocabulary ids, not embeddings produced by a BERT model. If BERT
# representations are intended, the ids must be passed through a model
# that matches this tokenizer.
encoding_batch = tokenizer(
    texts,
    add_special_tokens=True,
    max_length=MAX_SEQ_LEN,
    padding='max_length',
    truncation=True,
    return_tensors='tf'
)
token_ids = encoding_batch['input_ids'].numpy()

# Keep the original `labels` list untouched: overwriting it with the
# shuffled array would pair freshly tokenized `texts` with already-shuffled
# labels if this cell were executed a second time.
label_array = tf.convert_to_tensor(labels).numpy()

# Shuffle features and labels together so each row keeps its own label.
features, shuffled_labels = shuffle(token_ids, label_array, random_state=SEED)

# Hold out 20% of the rows for testing.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, shuffled_labels, test_size=0.2, random_state=SEED)

# UDF converting an array column into a dense Spark ML vector.
to_vector = udf(lambda a: Vectors.dense(a), VectorUDT())


def _to_vector_df(feature_rows, label_values):
    """Build a Spark DataFrame with a vector ``text`` column and a ``label`` column.

    Args:
        feature_rows: 2-D NumPy array, one row of token ids per example.
        label_values: 1-D NumPy array of labels aligned with ``feature_rows``.

    Returns:
        A DataFrame whose ``text`` column holds dense vectors and whose
        ``label`` column holds the corresponding labels.
    """
    rows = list(zip(feature_rows.tolist(), label_values.tolist()))
    frame = spark.createDataFrame(rows, ["text", "label"])
    return frame.withColumn("text", to_vector("text"))


train_df = _to_vector_df(train_features, train_labels)
test_df = _to_vector_df(test_features, test_labels)

# Load a pretrained Arabic BERT checkpoint.
# NOTE(review): `model` is not used anywhere in the visible notebook, and it
# comes from a different checkpoint than `tokenizer`, so their vocabularies
# may not match. Confirm whether it is still needed.
model_name = "asafaya/bert-base-arabic"
model = BertModel.from_pretrained(model_name)



Some weights of the model checkpoint at asafaya/bert-base-arabic were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Accuracy: 0.6318407960199005


In [41]:

# Configure a logistic-regression classifier that reads the vector column
# "text" and the target column "label", wrapped in a one-stage pipeline.
lr = LogisticRegression(featuresCol="text", labelCol="label")
pipeline = Pipeline(stages=[lr])

# Fit the pipeline on the training split.
classification_model = pipeline.fit(train_df)

# Score the held-out split. The result is cached because it is evaluated
# several times below; without caching each metric would recompute the
# whole transform.
predictions = classification_model.transform(test_df).cache()

# Ranking metrics computed from the raw prediction scores.
binary_evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction")
area_under_roc = binary_evaluator.evaluate(predictions, {binary_evaluator.metricName: "areaUnderROC"})
area_under_pr = binary_evaluator.evaluate(predictions, {binary_evaluator.metricName: "areaUnderPR"})

# Accuracy is computed once, with the evaluator configured for it.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

# Remaining label-based metrics. Precision and recall are the *weighted*
# variants (averaged over classes by support), as the metric names show.
multiclass_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
precision = multiclass_evaluator.evaluate(predictions, {multiclass_evaluator.metricName: "weightedPrecision"})
recall = multiclass_evaluator.evaluate(predictions, {multiclass_evaluator.metricName: "weightedRecall"})
f1_score = multiclass_evaluator.evaluate(predictions, {multiclass_evaluator.metricName: "f1"})

# Report the results.
print("Area under ROC:", area_under_roc)
print("Area under PR:", area_under_pr)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

Area under ROC: 0.7142254220456803
Area under PR: 0.6908207758346883
Accuracy: 0.6318407960199005
Precision: 0.6316525841660656
Recall: 0.6318407960199004
F1 Score: 0.6317310533553263


In [47]:
# Persist the fitted pipeline under "LR". A plain `save` raises when the
# target path already exists, which breaks re-running the notebook; going
# through the writer with `overwrite()` replaces any previous copy.
classification_model.write().overwrite().save("LR")