In [1]:
# Required Libraries
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
import nltk
import numpy as np
import pandas as pd

nltk.download('punkt')
nltk.download('stopwords')

# Load Dataset
dataset = pd.read_csv('/kaggle/input/datasetdlarabic/aljazeera_data.csv')  # Update path if necessary
dataset = dataset.dropna()  # Remove any null values

# Preprocessing Pipeline
class TextPreprocessor:
    def __init__(self):
        self.stop_words = set(stopwords.words('arabic'))
        self.stemmer = ISRIStemmer()  # Arabic-specific stemmer

    def preprocess(self, text):
        # Tokenization
        tokens = word_tokenize(text)
        # Remove stop words
        tokens = [t for t in tokens if t not in self.stop_words]
        # Stemming
        tokens = [self.stemmer.stem(t) for t in tokens]
        return tokens

preprocessor = TextPreprocessor()
dataset['Processed_Text'] = dataset['Text'].apply(preprocessor.preprocess)

# Encoding Labels
label_encoder = LabelEncoder()
dataset['Encoded_Score'] = label_encoder.fit_transform(dataset['Score'])

# Split Data
X_train, X_test, y_train, y_test = train_test_split(
    dataset['Processed_Text'], dataset['Encoded_Score'], test_size=0.2, random_state=42
)

# Dataset Class
class NLPDataset(Dataset):
    def __init__(self, texts, labels, vocab=None):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab or self.build_vocab()

    def build_vocab(self):
        vocab = set(token for text in self.texts for token in text)
        return {word: idx for idx, word in enumerate(vocab, start=1)}

    def encode_text(self, text):
        return [self.vocab[token] for token in text if token in self.vocab]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded_text = self.encode_text(self.texts.iloc[idx])
        label = self.labels.iloc[idx]
        return torch.tensor(encoded_text, dtype=torch.long), torch.tensor(label, dtype=torch.long)

train_dataset = NLPDataset(X_train, y_train)
test_dataset = NLPDataset(X_test, y_test, vocab=train_dataset.vocab)

def collate_fn(batch):
    texts, labels = zip(*batch)
    max_len = max(len(text) for text in texts)
    padded_texts = [torch.cat([text, torch.zeros(max_len - len(text))]) for text in texts]
    return torch.stack(padded_texts), torch.tensor(labels)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

# GRU Model
class BiGRUModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(BiGRUModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Multiply by 2 for bidirectional

    def forward(self, x):
        x = self.embedding(x.long())
        _, hidden = self.gru(x)
        # Combine forward and backward hidden states
        hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
        output = self.fc(hidden)
        return output

# Model Parameters
vocab_size = len(train_dataset.vocab) + 1
embed_dim = 128
hidden_dim = 256
output_dim = len(label_encoder.classes_)

# Initialize the GRU Model
model = BiGRUModel(vocab_size, embed_dim, hidden_dim, output_dim)

# Check if CUDA is available and move the model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training Settings
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 10

# Training Loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for texts, labels in train_loader:
        # Move data to GPU
        texts, labels = texts.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

# Evaluation Function
def evaluate_model(model, data_loader):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for texts, labels in data_loader:
            # Move data to GPU
            texts, labels = texts.to(device), labels.to(device)

            outputs = model(texts)
            predicted = torch.argmax(outputs, dim=1)
            y_true.extend(labels.tolist())
            y_pred.extend(predicted.tolist())
    return y_true, y_pred


# Calculate Metrics
y_train_true, y_train_pred = evaluate_model(model, train_loader)
y_test_true, y_test_pred = evaluate_model(model, test_loader)


# Training Metrics
train_mse = mean_squared_error(y_train_true, y_train_pred)
train_mae = mean_absolute_error(y_train_true, y_train_pred)
train_r2 = r2_score(y_train_true, y_train_pred)
train_acc = accuracy_score(y_train_true, y_train_pred)

# Testing Metrics
test_mse = mean_squared_error(y_test_true, y_test_pred)
test_mae = mean_absolute_error(y_test_true, y_test_pred)
test_r2 = r2_score(y_test_true, y_test_pred)
test_acc = accuracy_score(y_test_true, y_test_pred)

# Print Metrics
print("Training Metrics:")
print(f"MSE: {train_mse:.4f}, MAE: {train_mae:.4f}, R2: {train_r2:.4f}, Accuracy: {train_acc:.4f}")
print("Testing Metrics:")
print(f"MSE: {test_mse:.4f}, MAE: {test_mae:.4f}, R2: {test_r2:.4f}, Accuracy: {test_acc:.4f}")


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Epoch 1, Loss: 1.0261
Epoch 2, Loss: 0.5891
Epoch 3, Loss: 0.4420
Epoch 4, Loss: 0.3942
Epoch 5, Loss: 0.2912
Epoch 6, Loss: 0.2121
Epoch 7, Loss: 0.1533
Epoch 8, Loss: 0.1043
Epoch 9, Loss: 0.0789
Epoch 10, Loss: 0.0622
Training Metrics:
MSE: 0.0548, MAE: 0.0106, R2: 0.9978, Accuracy: 0.9973
Testing Metrics:
MSE: 5.3958, MAE: 0.5901, R2: 0.6454, Accuracy: 0.9046
