In [None]:
# 1. تحميل المكتبات
import csv
import re

import emoji
import language_tool_python
import numpy as np
import pandas as pd
import torch
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback


In [None]:
# 2. Load the data
file_path = 'germeval.training.txt.txt'  # make sure this points to the actual file location

# The file is read as tab-separated with no header row. Quote handling is
# switched off so that quotation marks inside a tweet are treated as ordinary
# characters instead of opening a quoted field that can swallow the following
# tabs and lines.
df = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=['text', 'label1', 'label2'],
    quoting=csv.QUOTE_NONE,
)

# Keep only the rows carrying one of the two coarse labels and encode them as
# integers. `.copy()` detaches the filtered frame so that adding the `label`
# column does not write into a view of the original frame.
label_map = {'OTHER': 0, 'OFFENSE': 1}
df = df.loc[df['label1'].isin(list(label_map)), ['text', 'label1']].copy()
df['label'] = df['label1'].map(label_map)
df = df[['text', 'label']]


In [None]:
# Show the first rows; `head` has to be called, otherwise the cell only
# displays the bound method object.
df.head()

In [None]:
# 3. Clean the texts
def clean_text(text):
    """Strip tweet-specific noise from `text` and return the trimmed result.

    The steps run in a fixed order: regex removals, emoji removal, squashing
    of repeated characters, then trimming of leading/trailing whitespace.
    """
    # (pattern, replacement) pairs, applied one after another
    removals = (
        (r'\|LBR\|', ' '),  # the |LBR| token becomes a single space
        (r'@\w+', ''),      # @mentions
        (r'#\w+', ''),      # hashtags
        (r'http\S+', ''),   # links
    )
    cleaned = text
    for pattern, replacement in removals:
        cleaned = re.sub(pattern, replacement, cleaned)

    cleaned = emoji.replace_emoji(cleaned, replace='')

    # A run of one repeated character is cut down to two occurrences. The `.`
    # matches any character except a newline, so digits and spaces are
    # affected as well.
    cleaned = re.sub(r'(.)\1+', r'\1\1', cleaned)
    return cleaned.strip()

cleaned_column = df['text'].apply(clean_text)
df['text'] = cleaned_column

In [None]:
# 4. Correct spelling mistakes with LanguageTool
# A regional variant ('de-DE') is requested instead of the bare 'de' code:
# LanguageTool only enables its spell-checking rules when a language variant
# is specified, so plain 'de' would leave misspellings untouched.
language_tool = language_tool_python.LanguageTool('de-DE')

def correct_with_languagetool(text):
    """Return `text` after passing it through `language_tool.correct`."""
    return language_tool.correct(text)

df['text'] = df['text'].apply(correct_with_languagetool)

In [None]:
# 5. Drop empty texts
# A row is kept when its text still contains something after trimming whitespace.
has_content = df['text'].str.strip().ne('')
df = df.loc[has_content]

In [None]:
# 6. Split the data
# Stratify on the label so the train and test parts keep the same
# OTHER/OFFENSE ratio as the full data set; with imbalanced classes a purely
# random split can skew the share of the minority class in the test part.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

In [None]:
# 7. Handle class imbalance by random oversampling of the training texts
# SMOTE synthesises new samples by interpolating between numeric feature
# vectors, so it cannot be fitted on a column of raw strings. Instead, rows of
# every under-represented class are re-drawn with replacement until each class
# has as many rows as the largest one. Only the training part is resampled.
train_frame = pd.DataFrame({'text': list(train_texts), 'label': list(train_labels)})
target_count = int(train_frame['label'].value_counts().max())

extra_rows = [
    group.sample(n=target_count - len(group), replace=True, random_state=42)
    for _, group in train_frame.groupby('label')
    if len(group) < target_count
]
train_frame_resampled = pd.concat([train_frame, *extra_rows], ignore_index=True)

train_texts_resampled = train_frame_resampled['text'].to_numpy()
train_labels_resampled = train_frame_resampled['label'].to_numpy()

# Texts and labels are replaced together so they stay aligned row by row.
train_texts = train_texts_resampled
train_labels = train_labels_resampled


In [None]:
# 8. Prepare the model
# Checkpoint identifier shared by the tokenizer and the classifier
model_name = 'bert-base-multilingual-cased'

# Sequence classifier with a two-way output head, plus the tokenizer that
# belongs to the same checkpoint
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [None]:
# 9. Configure training
# Settings are collected in a dict first and then unpacked into TrainingArguments.
training_config = {
    'output_dir': './results',
    # evaluate and checkpoint on the same schedule (once per epoch)
    'evaluation_strategy': 'epoch',
    'save_strategy': 'epoch',
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 8,
    'num_train_epochs': 5,
    'weight_decay': 0.01,
    'logging_dir': './logs',
    # reload the best checkpoint at the end, ranked by the 'f1' metric
    'load_best_model_at_end': True,
    'metric_for_best_model': 'f1',
}
training_args = TrainingArguments(**training_config)

In [None]:
def compute_metrics(eval_pred):
    """Compute the F1 score for one evaluation pass.

    Args:
        eval_pred: a pair that unpacks into (logits, labels).

    Returns:
        dict with the single key 'f1', holding `f1_score(labels, preds)`
        where `preds` is the arg-max of the logits over the last axis.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit.
    preds = np.argmax(logits, axis=-1)
    return {'f1': f1_score(labels, preds)}

class TweetDataset(Dataset):
    """Torch dataset pairing tokenized texts with integer class labels.

    Trainer needs items that are dicts of model inputs; raw (text, label)
    tuples cannot be collated into a batch. Texts are tokenized once up
    front without padding; padding is left to the data collator at batch time.

    Args:
        texts: iterable of texts.
        labels: iterable of integer class labels, same length as `texts`.
        tokenizer: tokenizer used to encode the texts.
        max_length: texts are truncated to at most this many tokens.

    Raises:
        ValueError: if `texts` and `labels` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        text_list = [str(text) for text in texts]
        self.labels = [int(label) for label in labels]
        if len(text_list) != len(self.labels):
            raise ValueError(
                f'texts and labels must have the same length, '
                f'got {len(text_list)} and {len(self.labels)}'
            )
        self.encodings = tokenizer(text_list, truncation=True, max_length=max_length)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: values[idx] for key, values in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item


train_dataset = TweetDataset(train_texts, train_labels, tokenizer)
eval_dataset = TweetDataset(test_texts, test_labels, tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # Passing the tokenizer makes Trainer pad each batch dynamically.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()


In [None]:
# 10. Evaluate the model
# Trainer.predict needs a dataset of model inputs, so prediction runs on the
# dataset object the trainer evaluates with rather than on raw
# (text, label) tuples. The true labels are taken from the prediction output
# so they are guaranteed to be in the same order as the predictions.
# NOTE(review): this split is also the one passed as `eval_dataset` for
# best-model selection during training, so these scores are likely optimistic;
# consider holding out a separate validation split.
preds_output = trainer.predict(trainer.eval_dataset)
preds = np.argmax(preds_output.predictions, axis=-1)
true_labels = preds_output.label_ids

# Label order is fixed so the rows/columns always read OTHER (0), OFFENSE (1).
print(classification_report(true_labels, preds, labels=[0, 1], target_names=['OTHER', 'OFFENSE']))
print(confusion_matrix(true_labels, preds, labels=[0, 1]))
