In [None]:
import os
import pickle

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Embedding, LSTM, Dense, Dropout,
                                   BatchNormalization, GlobalMaxPooling1D)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

In [None]:
# Configure GPU memory behaviour before any model is built.
print("🔧 إعداد GPU...")
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        # Memory growth has to be configured the same way on every visible
        # GPU, so enable it on all of them rather than only on the first one.
        for _gpu in physical_devices:
            tf.config.experimental.set_memory_growth(_gpu, True)
    except RuntimeError as _err:
        # Raised when the GPUs were already initialised (e.g. this cell is
        # re-run in a live kernel); report it instead of aborting the notebook.
        print(f"⚠️ {_err}")
    print(f"✅ GPU متاح: {physical_devices[0]}")
else:
    print("💻 تشغيل على CPU")


In [None]:
# Make sure the output directory for saved artifacts exists.
os.makedirs('models', exist_ok=True)

print("📁 قراءة البيانات...")
# Read the two CSV files from the working directory: train first, then test.
data_train, data_test = (pd.read_csv(_csv) for _csv in ('train.csv', 'test.csv'))

# Report (rows, columns) for each frame.
print(f"📊 حجم التدريب: {data_train.shape}")
print(f"📊 حجم الاختبار: {data_test.shape}")

In [None]:
# Show how many training rows carry each 'Class Index' value, ordered by value.
print("📈 توزيع الفئات:")
_label_frequency = data_train['Class Index'].value_counts()
class_counts = _label_frequency.sort_index()
print(class_counts)


In [None]:
print("🧹 تنظيف البيانات...")
# In both frames, replace missing values in the two text columns with an empty
# string and cast the column to str. The frames are modified in place.
for _frame in (data_train, data_test):
    for _column in ('Title', 'Description'):
        _frame[_column] = _frame[_column].fillna('').astype(str)


In [None]:
# Merge the text columns
def _combine_text(frame):
    """Return a list with 'Title' and 'Description' joined by a single space, one entry per row."""
    return (frame['Title'] + ' ' + frame['Description']).tolist()


texts_train = _combine_text(data_train)
texts_test = _combine_text(data_test)

# Preview the first 100 characters of the first training text.
print(f"📝 عينة من النصوص: {texts_train[0][:100]}...")

In [None]:
# Parameters tuned for speed and accuracy
max_words = 20000  # smaller vocabulary for speed
max_len = 100      # shorter sequences for speed
batch_size = 128   # larger batch size for speed

print("🔤 بناء القاموس...")
# Collect the tokenizer settings first, then build it and fit it on the
# training texts only.
_tokenizer_options = dict(
    num_words=max_words,
    oov_token="<OOV>",
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
)
tokenizer = Tokenizer(**_tokenizer_options)
tokenizer.fit_on_texts(texts_train)

# The reported size is capped at max_words even if more distinct words were seen.
_vocab_size = min(len(tokenizer.word_index), max_words)
print(f"📖 حجم القاموس: {_vocab_size}")


In [None]:
# Convert the texts to padded integer sequences
def _to_padded(texts):
    """Tokenize `texts` and return (raw sequences, array padded/truncated at the end to max_len)."""
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')
    return sequences, padded


X_train_seq, X_train_pad = _to_padded(texts_train)
X_test_seq, X_test_pad = _to_padded(texts_test)

print(f"✂️ شكل البيانات: {X_train_pad.shape}")


In [None]:
# Label processing: map 'Class Index' values to consecutive integers, then one-hot encode them.
label_encoder = LabelEncoder()
label_encoder.fit(data_train['Class Index'])
y_train = label_encoder.transform(data_train['Class Index'])

y_train_cat = to_categorical(y_train)
# The one-hot matrix is (samples, classes); its second dimension is the class count.
_, num_classes = y_train_cat.shape

print(f"🏷️ عدد الفئات: {num_classes}")
print(f"🏷️ أسماء الفئات: {label_encoder.classes_}")


In [None]:
# Split the data into training and validation parts
_split_parts = train_test_split(
    X_train_pad,
    y_train_cat,
    test_size=0.15,        # smaller validation set
    random_state=42,       # fixed seed so the split is repeatable
    stratify=y_train_cat,  # stratify on the one-hot label rows
)
X_train_split, X_val, y_train_split, y_val = _split_parts

print(f"🔄 التدريب: {X_train_split.shape}, التحقق: {X_val.shape}")

print("🏗️ بناء النموذج المبسط...")


In [None]:
# Simple, fast model.
# The whole Sequential definition is kept in ONE cell: it was previously split
# across two cells in the middle of the layer list, so neither cell was valid
# Python on its own and "Restart & Run All" failed at this point.
model = Sequential([
    # Simple embedding layer
    Embedding(
        input_dim=max_words,
        output_dim=128,  # smaller embedding dimension for speed
        input_length=max_len,
        mask_zero=True  # ignore padding
    ),

    # A single LSTM layer only
    # NOTE(review): per the Keras LSTM documentation a non-zero
    # recurrent_dropout rules out the fused cuDNN kernel — confirm this
    # trade-off is intended given the emphasis on speed.
    LSTM(64, dropout=0.3, recurrent_dropout=0.3),

    # Simple dense layers
    BatchNormalization(),
    Dropout(0.5),

    Dense(32, activation='relu'),
    Dropout(0.3),

    # Output layer: one softmax unit per class
    Dense(num_classes, activation='softmax')
])


In [None]:
# Tuned optimizer
optimizer = Adam(
    learning_rate=0.01,  # higher learning rate for a fast start
    clipnorm=1.0
)

# Labels are one-hot encoded, hence categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy']
)

# summary() prints the table itself and returns None; wrapping it in print()
# emitted an extra "None" line after the table.
model.summary()

In [None]:
# Tuned callbacks
# Stop once validation accuracy has not improved for 3 epochs and roll back to
# the best weights seen.
_early_stopping = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    patience=3,  # lower patience
    restore_best_weights=True,
)

# Multiply the learning rate by 0.2 after 2 epochs without improvement,
# never going below 1e-6.
_lr_scheduler = ReduceLROnPlateau(
    monitor='val_accuracy',  # monitor accuracy instead of loss
    factor=0.2,
    patience=2,
    min_lr=1e-6,
    verbose=1,
)

callbacks = [_early_stopping, _lr_scheduler]

print("🚀 بدء التدريب...")

In [None]:
# Fast training run; the returned history holds the per-epoch metrics.
history = model.fit(
    x=X_train_split,
    y=y_train_split,
    validation_data=(X_val, y_val),
    batch_size=batch_size,
    epochs=15,  # fewer epochs
    callbacks=callbacks,
    verbose=1,
)


In [None]:
# Evaluation
print("📊 تقييم النموذج...")
# evaluate() returns [loss, accuracy] for the metrics compiled into the model;
# index 1 is the accuracy.
_train_metrics = model.evaluate(X_train_split, y_train_split, verbose=0)
_val_metrics = model.evaluate(X_val, y_val, verbose=0)
train_acc = _train_metrics[1]
val_acc = _val_metrics[1]

print(f"🎯 دقة التدريب: {train_acc:.4f}")
print(f"🎯 دقة التحقق: {val_acc:.4f}")


In [None]:
# Prediction
print("🔮 إجراء التنبؤات...")
y_pred = model.predict(X_test_pad, batch_size=batch_size, verbose=1)
# Index of the highest-probability class for every test row. These are
# LabelEncoder indices, not the original 'Class Index' values.
y_pred_classes = np.argmax(y_pred, axis=1)

# Map the encoded indices back to the original label values so they can be
# compared with (or reported alongside) the raw data.
y_pred_labels = label_encoder.inverse_transform(y_pred_classes)

# Score the predictions when the test file carries ground-truth labels.
# Previously the predictions were computed but never evaluated.
if 'Class Index' in data_test.columns:
    y_test_labels = data_test['Class Index'].to_numpy()
    test_acc = accuracy_score(y_test_labels, y_pred_labels)
    print(f"🎯 دقة الاختبار: {test_acc:.4f}")
    print(classification_report(y_test_labels, y_pred_labels, digits=4))


In [None]:
# Save the model and the preprocessing objects
print("💾 حفظ النماذج...")
model.save('models/lstm_simple.h5')

# Persist the fitted tokenizer and label encoder next to the model so the same
# preprocessing can be reapplied later. `pickle` is imported in the notebook's
# top import cell rather than here.
for _path, _artifact in (
    ('models/tokenizer.pkl', tokenizer),
    ('models/label_encoder.pkl', label_encoder),
):
    with open(_path, 'wb') as f:
        pickle.dump(_artifact, f)

In [None]:
# Plot the results
def _draw_history_panel(slot, train_key, val_key, title, ylabel):
    """Draw the training and validation curves for one metric into panel `slot` of a 1x2 grid."""
    plt.subplot(1, 2, slot)
    plt.plot(history.history[train_key], label='Training', linewidth=2)
    plt.plot(history.history[val_key], label='Validation', linewidth=2)
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(ylabel)
    plt.legend()
    plt.grid(True)


plt.figure(figsize=(12, 4))
_draw_history_panel(1, 'accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy')
_draw_history_panel(2, 'loss', 'val_loss', 'Model Loss', 'Loss')

plt.tight_layout()
# Write the figure to disk before showing it.
plt.savefig('models/results.png', dpi=150)
plt.show()

print("✅ اكتمل التدريب!")