In [1]:
# 📦 Install and import required packages
!pip install nltk scikit-learn torch

import pandas as pd
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem.isri import ISRIStemmer
from torch.utils.data import TensorDataset, DataLoader

# 🔁 Download NLTK resources
nltk.download('stopwords')

# ✅ Load your dataset
df = pd.read_csv("data_semantically_scored.csv")  # ← Replace with your filename

# ✅ Step 2: Preprocessing (tokenize, remove stopwords, stem, discretize)
tokenizer = ToktokTokenizer()
stemmer = ISRIStemmer()
stop_words = set(nltk.corpus.stopwords.words('arabic'))

def preprocess(text):
    tokens = tokenizer.tokenize(str(text))
    tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
    stemmed = [stemmer.stem(t) for t in tokens]
    return ' '.join(stemmed)

df['processed'] = df['title'].apply(preprocess)

# Discretize score into 5 bins for classification
df['score_bin'] = pd.cut(df['score'], bins=[0,2,4,6,8,10], labels=[0,1,2,3,4]).astype(int)

# ✅ Step 3: Vectorization
vectorizer = TfidfVectorizer(max_features=500)
X = vectorizer.fit_transform(df['processed']).toarray()
y = df['score_bin'].values

# ✅ Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ✅ Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_data = TensorDataset(X_train_tensor, y_train_tensor)
test_data = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
test_loader = DataLoader(test_data, batch_size=16)

# ✅ Step 3: Model Definition
class TextClassifier(nn.Module):
    def __init__(self, model_type='rnn', input_size=500, hidden_size=64, output_size=5):
        super().__init__()
        self.rnn_type = model_type
        self.hidden = hidden_size

        if model_type == 'gru':
            self.rnn = nn.GRU(input_size, hidden_size, batch_first=True)
        elif model_type == 'lstm':
            self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        elif model_type == 'birnn':
            self.rnn = nn.RNN(input_size, hidden_size, batch_first=True, bidirectional=True)
        else:
            self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)

        self.fc = nn.Linear(hidden_size * (2 if model_type == 'birnn' else 1), output_size)

    def forward(self, x):
        x = x.unsqueeze(1)  # Add sequence dimension
        out, _ = self.rnn(x)
        out = self.fc(out[:, -1, :])
        return out

# ✅ Training function
def train_model(model_type):
    model = TextClassifier(model_type)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(5):
        model.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
    return model

# ✅ Step 4: Evaluation
def evaluate_model(model):
    model.eval()
    all_preds, all_true = [], []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.numpy())
            all_true.extend(labels.numpy())
    print("Accuracy:", accuracy_score(all_true, all_preds))
    print(classification_report(all_true, all_preds))

# ✅ Train & Evaluate All Models
for model_type in ['rnn', 'birnn', 'gru', 'lstm']:
    print(f"===== {model_type.upper()} Evaluation =====")
    model = train_model(model_type)
    evaluate_model(model)




[nltk_data] Downloading package stopwords to /home/med/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ValueError: Cannot convert float NaN to integer