In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.layers import (
    Embedding, LSTM, Dense, Dropout, BatchNormalization, 
    Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D,
    Concatenate, SpatialDropout1D
)
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import re
import nltk
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# GPU optimization
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if len(physical_devices) > 0:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

arabic_stopwords = set(stopwords.words('arabic'))

# ==================== 
# 1. Enhanced Data Loading and Validation
# ==================== 
def load_and_validate_data(file_path):
    """Load data with comprehensive validation"""
    try:
        df = pd.read_csv(file_path, encoding='utf-8')
        print(f"Dataset loaded successfully with shape: {df.shape}")
        
        # Check required columns
        required_columns = ['text', 'new_label']  # Adjust based on your CSV structure
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            print(f"Warning: Missing columns {missing_columns}")
            print(f"Available columns: {df.columns.tolist()}")
        
        # Data quality checks
        print(f"Missing values: {df.isnull().sum().sum()}")
        print(f"Duplicate rows: {df.duplicated().sum()}")
        
        # Remove duplicates and missing values
        df = df.drop_duplicates()
        df = df.dropna()
        
        # Class distribution
        print(f"Class distribution:\n{df['new_label'].value_counts()}")
        
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

# ==================== 
# 2. Enhanced Arabic Text Preprocessing
# ==================== 
def advanced_preprocess_arabic(text):
    """Enhanced Arabic text preprocessing"""
    if not isinstance(text, str):
        return ""
    
    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    
    # Remove emails
    text = re.sub(r'\S+@\S+', '', text)
    
    # Remove diacritics (tashkeel) - more comprehensive
    text = re.sub(r'[\u0617-\u061A\u064B-\u0652\u0670\u0640]', '', text)
    
    # Remove tatweel (kashida)
    text = re.sub(r'\u0640+', '', text)
    
    # Remove punctuations, numbers, and special characters
    text = re.sub(r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s]', ' ', text)
    
    # Normalize Arabic letters
    text = re.sub(r'[إأآا]', 'ا', text)
    text = re.sub(r'ى', 'ي', text)
    text = re.sub(r'ؤ', 'و', text)
    text = re.sub(r'ئ', 'ي', text)
    text = re.sub(r'ة', 'ه', text)
    text = re.sub(r'گ', 'ك', text)
    
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove stopwords and short words
    words = text.split()
    words = [w for w in words if w not in arabic_stopwords and len(w) > 2]
    
    return ' '.join(words)

# ==================== 
# 3. Data Analysis and Statistics
# ==================== 
def analyze_data(df):
    """Analyze dataset statistics"""
    print("=" * 50)
    print("DATASET ANALYSIS")
    print("=" * 50)
    
    # Text length analysis
    df['text_length'] = df['text'].str.len()
    df['word_count'] = df['text'].str.split().str.len()
    
    print(f"Average text length: {df['text_length'].mean():.2f}")
    print(f"Average word count: {df['word_count'].mean():.2f}")
    print(f"Max text length: {df['text_length'].max()}")
    print(f"Min text length: {df['text_length'].min()}")
    
    # Class balance
    class_counts = df['new_label'].value_counts()
    print(f"\nClass distribution:")
    for label, count in class_counts.items():
        print(f"{label}: {count} ({count/len(df)*100:.2f}%)")
    
    return df

# ==================== 
# 4. Advanced Model Architecture
# ==================== 
def create_advanced_model(vocab_size, max_len, num_classes, embedding_dim=200):
    """Create an advanced model with multiple techniques"""
    
    # Input layer
    input_layer = tf.keras.layers.Input(shape=(max_len,))
    
    # Embedding layer with dropout
    embedding = Embedding(
        vocab_size, 
        embedding_dim, 
        input_length=max_len,
        mask_zero=True,
        embeddings_regularizer=l2(0.001)
    )(input_layer)
    
    # Spatial dropout for embedding
    embedding = SpatialDropout1D(0.2)(embedding)
    
    # Bidirectional LSTM layers
    lstm1 = Bidirectional(LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3))(embedding)
    lstm1 = BatchNormalization()(lstm1)
    
    lstm2 = Bidirectional(LSTM(64, return_sequences=True, dropout=0.3, recurrent_dropout=0.3))(lstm1)
    lstm2 = BatchNormalization()(lstm2)
    
    # Global pooling layers
    max_pool = GlobalMaxPooling1D()(lstm2)
    avg_pool = GlobalAveragePooling1D()(lstm2)
    
    # Concatenate pooling outputs
    concat = Concatenate()([max_pool, avg_pool])
    
    # Dense layers with regularization
    dense1 = Dense(256, activation='relu', kernel_regularizer=l2(0.001))(concat)
    dense1 = BatchNormalization()(dense1)
    dense1 = Dropout(0.5)(dense1)
    
    dense2 = Dense(128, activation='relu', kernel_regularizer=l2(0.001))(dense1)
    dense2 = BatchNormalization()(dense2)
    dense2 = Dropout(0.3)(dense2)
    
    # Output layer
    if num_classes == 2:
        output = Dense(1, activation='sigmoid', name='output')(dense2)
        loss = 'binary_crossentropy'
    else:
        output = Dense(num_classes, activation='softmax', name='output')(dense2)
        loss = 'sparse_categorical_crossentropy'
    
    model = tf.keras.Model(inputs=input_layer, outputs=output)
    
    # Advanced optimizer with learning rate scheduling
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=0.001,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-07,
        clipnorm=1.0
    )
    
    if num_classes == 2:
        metrics = ['accuracy', 'precision', 'recall']
    else:
        metrics = ['accuracy']
    model.compile(
        optimizer=optimizer,
        loss=loss,
        metrics=metrics
    )
    
    return model

# ==================== 
# 5. Advanced Training Configuration
# ==================== 
def create_callbacks(model_name="best_model.keras"):
    """Create advanced callbacks for training"""
    callbacks = [
        EarlyStopping(
            monitor='val_accuracy',
            patience=7,
            restore_best_weights=True,
            verbose=1,
            mode='max'
        ),
        ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=3,
            min_lr=1e-7,
            verbose=1,
            mode='min'
        ),
        ModelCheckpoint(
            model_name,
            monitor='val_accuracy',
            save_best_only=True,
            save_weights_only=False,
            mode='max',
            verbose=1
        )
    ]
    return callbacks

# ==================== 
# 6. Main Training Pipeline
# ==================== 
# Load and validate data
df = load_and_validate_data("Arabic_dataset.csv")

# Analyze data
df = analyze_data(df)
    
# Preprocess text data
print("Preprocessing text data...")
df["text"] = df["text"].astype(str).apply(advanced_preprocess_arabic)
    
# Remove empty texts after preprocessing
df = df[df['text'].str.len() > 0]
    
texts = df["text"].tolist()
labels = df["new_label"].tolist()
    
# ==================== 
# Tokenization with optimization
# ==================== 
vocab_size = 15000  # Increased vocabulary
max_len = 150       # Optimized sequence length
    
tokenizer = Tokenizer(
        num_words=vocab_size, 
        oov_token="<OOV>",
        filters='',  # We already preprocessed
        lower=False  # Arabic doesn't have upper/lower case
    )
    
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded = pad_sequences(sequences, maxlen=max_len, padding="post", truncating="post")
    
# Save tokenizer
with open("tokenizer_optimized.pickle", "wb") as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
print(f"Vocabulary size: {len(tokenizer.word_index)}")
print(f"Average sequence length: {np.mean([len(seq) for seq in sequences]):.2f}")
    
# ==================== 
# Label encoding
# ==================== 
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)
num_classes = len(encoder.classes_)
    
# Save label encoder
with open("label_encoder_optimized.pickle", "wb") as f:
    pickle.dump(encoder, f)
    
print(f"Number of classes: {num_classes}")
print(f"Classes: {encoder.classes_}")
    
# ==================== 
# Advanced data split with stratification
# ==================== 
X_train, X_test, y_train, y_test = train_test_split(
        padded, encoded_labels, 
        test_size=0.2, 
        random_state=42,
        stratify=encoded_labels
    )
    
X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train,
        test_size=0.2,
        random_state=42,
        stratify=y_train
    )
    
print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Test set: {X_test.shape}")
    
# ==================== 
# Handle class imbalance
# ==================== 
class_weights = compute_class_weight(
        'balanced',
        classes=np.unique(y_train),
        y=y_train
    )
class_weight_dict = dict(enumerate(class_weights))
print(f"Class weights: {class_weight_dict}")
    
# ==================== 
# Create and train model
# ==================== 
model = create_advanced_model(vocab_size, max_len, num_classes)
print(model.summary())
    
# Training configuration
callbacks = create_callbacks("arabic_nlp_optimized.keras")
    
# Train model
print("Starting training...")
history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=100,
        batch_size=64,  # Optimized batch size
        callbacks=callbacks,
        class_weight=class_weight_dict,
        verbose=1
    )
    


Dataset loaded successfully with shape: (18787, 2)
Missing values: 0
Duplicate rows: 194
Class distribution:
new_label
Normal        6751
Depression    4369
Anxiety       4175
Stress        2698
Suicide        600
Name: count, dtype: int64
DATASET ANALYSIS
Average text length: 283.33
Average word count: 55.71
Max text length: 19822
Min text length: 3

Class distribution:
Normal: 6751 (36.31%)
Depression: 4369 (23.50%)
Anxiety: 4175 (22.45%)
Stress: 2698 (14.51%)
Suicide: 600 (3.23%)
Preprocessing text data...
Vocabulary size: 54908
Average sequence length: 36.05
Number of classes: 5
Classes: ['Anxiety' 'Depression' 'Normal' 'Stress' 'Suicide']
Training set: (11750, 150)
Validation set: (2938, 150)
Test set: (3673, 150)
Class weights: {0: np.float64(0.8831266441187523), 1: np.float64(0.8639705882352942), 2: np.float64(0.5504802061372687), 3: np.float64(1.3694638694638694), 4: np.float64(6.119791666666667)}


None
Starting training...
Epoch 1/100
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.3311 - loss: 3.5224
Epoch 1: val_accuracy improved from -inf to 0.36351, saving model to arabic_nlp_optimized.keras
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m250s[0m 1s/step - accuracy: 0.3316 - loss: 3.5188 - val_accuracy: 0.3635 - val_loss: 3.5329 - learning_rate: 0.0010
Epoch 2/100
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.6451 - loss: 1.9983
Epoch 2: val_accuracy improved from 0.36351 to 0.46154, saving model to arabic_nlp_optimized.keras
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m228s[0m 1s/step - accuracy: 0.6453 - loss: 1.9975 - val_accuracy: 0.4615 - val_loss: 2.3464 - learning_rate: 0.0010
Epoch 3/100
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.7633 - loss: 1.4914
Epoch 3: val_accuracy improved from 0.46154 to 0.71069, saving mode

In [5]:
# ==================== 
# Evaluation
# ==================== 
print("\nEvaluating model...")
    
# Load best model
best_model = tf.keras.models.load_model(
        "arabic_nlp_optimized.keras"
    )
    
# Evaluate on test set
eval_results = best_model.evaluate(X_test, y_test, verbose=0)
if num_classes == 2:
        test_accuracy, test_precision, test_recall = eval_results
        f1_score = 2 * (test_precision * test_recall) / (test_precision + test_recall)
        print(f"Test Accuracy: {test_accuracy:.4f}")
        print(f"Test Precision: {test_precision:.4f}")
        print(f"Test Recall: {test_recall:.4f}")
        print(f"Test F1-Score: {f1_score:.4f}")
else:
        test_accuracy = eval_results[0]
        print(f"Test Accuracy: {test_accuracy:.4f}")
    
# Predictions and classification report
y_pred = best_model.predict(X_test)
if num_classes == 2:
        y_pred_classes = (y_pred > 0.5).astype(int).flatten()
else:
        y_pred_classes = np.argmax(y_pred, axis=1)
    
print("\nClassification Report:")
print(classification_report(y_test, y_pred_classes, target_names=encoder.classes_))
 
# ==================== 
# Save training history
# ==================== 
with open("training_history.pickle", "wb") as f:
        pickle.dump(history.history, f)
    
print("\nTraining completed successfully!")
print("Saved files:")
print("- arabic_nlp_optimized.keras (model)")
print("- tokenizer_optimized.pickle")
print("- label_encoder_optimized.pickle")
print("- training_history.pickle")


Evaluating model...
Test Accuracy: 1.4772
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 155ms/step

Classification Report:
              precision    recall  f1-score   support

     Anxiety       0.73      0.77      0.75       832
  Depression       0.76      0.78      0.77       850
      Normal       0.77      0.73      0.75      1335
      Stress       0.57      0.56      0.57       536
     Suicide       0.74      0.79      0.76       120

    accuracy                           0.73      3673
   macro avg       0.71      0.73      0.72      3673
weighted avg       0.73      0.73      0.73      3673


Training completed successfully!
Saved files:
- arabic_nlp_optimized.keras (model)
- tokenizer_optimized.pickle
- label_encoder_optimized.pickle
- training_history.pickle
