In [22]:
import ssl
import nltk
import numpy as np
import pandas as pd
ssl._create_default_https_context = ssl._create_unverified_context
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load Dataset
dataset = pd.read_csv('/kaggle/input/datasetdlarabic/aljazeera_data.csv')  # Update path if necessary
dataset = dataset.dropna()  # Remove any null values



[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [23]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer

# Preprocessing Pipeline
class TextPreprocessor:
    def __init__(self):
        self.stop_words = set(stopwords.words('arabic'))
        self.stemmer = ISRIStemmer()  # Use ISRIStemmer for Arabic

    def preprocess(self, text):
        # Tokenization
        tokens = word_tokenize(text)
        # Remove stop words
        tokens = [t for t in tokens if t not in self.stop_words]
        # Stemming
        tokens = [self.stemmer.stem(t) for t in tokens]
        return tokens




In [24]:
preprocessor = TextPreprocessor()
dataset['Processed_Text'] = dataset['Text'].apply(preprocessor.preprocess)

# Encoding Labels
label_encoder = LabelEncoder()
dataset['Encoded_Score'] = label_encoder.fit_transform(dataset['Score'])

# Split Data
X_train, X_test, y_train, y_test = train_test_split(
    dataset['Processed_Text'], dataset['Encoded_Score'], test_size=0.2, random_state=42
)


In [25]:
# Dataset Class with Padding
class NLPDataset(Dataset):
    def __init__(self, texts, labels, vocab=None, max_len=50):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab or self.build_vocab()
        self.max_len = max_len

    def build_vocab(self):
        vocab = set(token for text in self.texts for token in text)
        return {word: idx for idx, word in enumerate(vocab, start=1)}

    def encode_text(self, text):
        # Encode text and pad to `max_len`
        encoded = [self.vocab[token] for token in text if token in self.vocab]
        if len(encoded) < self.max_len:
            encoded += [0] * (self.max_len - len(encoded))  # Pad with 0s
        else:
            encoded = encoded[:self.max_len]  # Truncate to max_len
        return encoded

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded_text = self.encode_text(self.texts.iloc[idx])
        label = self.labels.iloc[idx]
        return torch.tensor(encoded_text, dtype=torch.long), torch.tensor(label, dtype=torch.long)


In [26]:
train_dataset = NLPDataset(X_train, y_train)
test_dataset = NLPDataset(X_test, y_test, vocab=train_dataset.vocab)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [27]:
# RNN Model
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        _, hidden = self.rnn(x)
        output = self.fc(hidden.squeeze(0))
        return output


In [30]:
# Model Parameters
vocab_size = len(train_dataset.vocab) + 1
embed_dim = 128
hidden_dim = 256
output_dim = len(label_encoder.classes_)

model = RNNModel(vocab_size, embed_dim, hidden_dim, output_dim)

# Training Settings
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 10


In [31]:
# Training Loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for texts, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")


Epoch 1, Loss: 1.3796
Epoch 2, Loss: 1.0687
Epoch 3, Loss: 1.1039
Epoch 4, Loss: 1.1509
Epoch 5, Loss: 1.0052
Epoch 6, Loss: 0.9054
Epoch 7, Loss: 0.8388
Epoch 8, Loss: 0.8316
Epoch 9, Loss: 0.8853
Epoch 10, Loss: 0.9201


In [32]:
# Evaluation Metrics Function
def evaluate_model(model, data_loader):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for texts, labels in data_loader:
            outputs = model(texts)
            predicted = torch.argmax(outputs, dim=1)
            y_true.extend(labels.tolist())
            y_pred.extend(predicted.tolist())
    return y_true, y_pred

In [33]:
# Calculate Metrics
y_train_true, y_train_pred = evaluate_model(model, train_loader)
y_test_true, y_test_pred = evaluate_model(model, test_loader)




In [34]:
# Training Metrics
train_mse = mean_squared_error(y_train_true, y_train_pred)
train_mae = mean_absolute_error(y_train_true, y_train_pred)
train_r2 = r2_score(y_train_true, y_train_pred)
train_acc = accuracy_score(y_train_true, y_train_pred)



In [35]:
# Testing Metrics
test_mse = mean_squared_error(y_test_true, y_test_pred)
test_mae = mean_absolute_error(y_test_true, y_test_pred)
test_r2 = r2_score(y_test_true, y_test_pred)
test_acc = accuracy_score(y_test_true, y_test_pred)

# Print Metrics
print("Training Metrics:")
print(f"MSE: {train_mse:.4f}, MAE: {train_mae:.4f}, R2: {train_r2:.4f}, Accuracy: {train_acc:.4f}")


Training Metrics:
MSE: 37.9629, MAE: 2.0371, R2: -0.5011, Accuracy: 0.8594


In [36]:
print("Testing Metrics:")
print(f"MSE: {test_mse:.4f}, MAE: {test_mae:.4f}, R2: {test_r2:.4f}, Accuracy: {test_acc:.4f}")

Testing Metrics:
MSE: 26.2898, MAE: 1.5194, R2: -0.7279, Accuracy: 0.8799
