# PART 1: CLASSIFICATION TASK

In [None]:
# STEP 1: Web Scraping Arabic Text Data
!pip install requests beautifulsoup4

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Example Arabic news page
url = 'https://www.aljazeera.net/news/'

headers = {'User-Agent': 'Mozilla/5.0'}
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Example: Extract article titles (text of the first 10 non-empty <h3> elements)
texts = [
    title
    for title in (a.get_text(strip=True) for a in soup.find_all('h3'))
    if title
][:10]
if not texts:
    # Fail here with a clear message rather than later, when an empty frame
    # reaches the train/test split.
    raise RuntimeError(f"No <h3> headlines found at {url}")

# Assign random relevance scores (0 to 10)
# The scores are placeholders drawn at random, not real labels. A dedicated,
# seeded generator makes them identical on every run without touching the
# global `random` state.
import random
score_rng = random.Random(42)
data = {'Text': texts, 'Score': [round(score_rng.uniform(0, 10), 2) for _ in texts]}
df = pd.DataFrame(data)

df.head()




Unnamed: 0,Text,Score
0,جماعات إسرائيلية متطرفة تنشر فيديو تحريضيا لتف...,9.46
1,البابا يندد بالوضع الإنساني المأساوي في قطاع غزة,8.69
2,"""أتمنى الموت"" صرخة بثينة من قلب خيمة في غزة",9.48
3,بيانات ملاحية تكشف رسو سفينة متجهة لإسرائيل بم...,6.09
4,بالصور.. مدن عربية وعالمية تشهد مظاهرات حاشدة ...,9.81


In [None]:
# STEP 2: NLP Preprocessing Pipeline
!pip install nltk
import nltk
# Download the NLTK resources requested by this notebook, in the same order.
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

# Stored as a set so the membership test in preprocess() is a hash lookup.
arabic_stopwords = {*stopwords.words('arabic')}

# Matches every character that should not end up inside a token:
#   * anything that is neither whitespace nor in the Arabic block U+0600-U+06FF
#   * punctuation marks that live inside that block: comma (U+060C),
#     semicolon (U+061B), question mark (U+061F), percent / decimal /
#     thousands signs and star (U+066A-U+066D), full stop (U+06D4)
_NON_WORD_CHARS = re.compile(
    r'[^\u0600-\u06FF\s]|[\u060C\u061B\u061F\u066A-\u066D\u06D4]'
)

def preprocess(text):
    """Clean one text and return its tokens with Arabic stopwords removed.

    Args:
        text: Raw input string.

    Returns:
        list of str: Tokens produced by ``word_tokenize`` on the cleaned
        text, excluding every token found in ``arabic_stopwords``.
    """
    # Replace with a space (not the empty string) so that words joined only by
    # punctuation, e.g. "a-b", stay two tokens instead of being glued together.
    text = _NON_WORD_CHARS.sub(' ', text)
    tokens = word_tokenize(text)
    return [t for t in tokens if t not in arabic_stopwords]

# Run preprocess() on every value of the 'Text' column; each row of the new
# 'Tokens' column holds the token list returned for that row.
df['Tokens'] = df['Text'].apply(preprocess)
df.head()




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Unnamed: 0,Text,Score,Tokens
0,جماعات إسرائيلية متطرفة تنشر فيديو تحريضيا لتف...,9.46,"[جماعات, إسرائيلية, متطرفة, تنشر, فيديو, تحريض..."
1,البابا يندد بالوضع الإنساني المأساوي في قطاع غزة,8.69,"[البابا, يندد, بالوضع, الإنساني, المأساوي, قطا..."
2,"""أتمنى الموت"" صرخة بثينة من قلب خيمة في غزة",9.48,"[أتمنى, الموت, صرخة, بثينة, قلب, خيمة, غزة]"
3,بيانات ملاحية تكشف رسو سفينة متجهة لإسرائيل بم...,6.09,"[بيانات, ملاحية, تكشف, رسو, سفينة, متجهة, لإسر..."
4,بالصور.. مدن عربية وعالمية تشهد مظاهرات حاشدة ...,9.81,"[بالصور, مدن, عربية, وعالمية, تشهد, مظاهرات, ح..."


In [None]:
# STEP 3: Train Models (RNN, Bi-RNN, GRU, LSTM)
!pip install torch torchvision

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Vocabulary preparation
from collections import Counter
# Flatten the per-row token lists into a single list, preserving row order.
all_tokens = []
for row_tokens in df['Tokens']:
    all_tokens.extend(row_tokens)

# Ids are handed out from 1 upward in most_common() order;
# id 0 is reserved for the padding entry added last.
token_counts = Counter(all_tokens)
vocab = dict(
    (word, rank)
    for rank, (word, _) in enumerate(token_counts.most_common(), start=1)
)
vocab['<PAD>'] = 0

def encode(tokens, max_len=20):
    """Map tokens to vocabulary ids and force the result to ``max_len`` entries.

    Tokens missing from ``vocab`` are encoded as 0. Sequences longer than
    ``max_len`` are cut off at the end; shorter ones are right-padded with 0.

    Args:
        tokens: Iterable of token strings.
        max_len: Length of the returned list.

    Returns:
        list of int: The encoded, truncated or padded id sequence.
    """
    ids = []
    for token in tokens:
        ids.append(vocab.get(token, 0))
    kept = ids[:max_len]
    # A negative count yields an empty list, so long inputs get no padding.
    padding = [0] * (max_len - len(ids))
    return kept + padding

# Add an 'Encoded' column: each row's tokens as a list of exactly 20 ids.
df['Encoded'] = df['Tokens'].apply(lambda x: encode(x, max_len=20))

# PyTorch Dataset
class TextDataset(Dataset):
    """Dataset of (id sequence, target) pairs held as two tensors.

    Args:
        X: Nested sequence of integer ids, converted to a ``torch.long`` tensor.
        y: Sequence of numeric targets, converted to a ``torch.float32`` tensor.
    """

    def __init__(self, X, y):
        # Both inputs are converted once here, so indexing later is a plain
        # tensor lookup.
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples (length of the feature tensor)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target

# Hold out 20% of the rows for evaluation. random_state pins which rows land in
# the test set, so the metrics printed in the evaluation cell are comparable
# between runs instead of changing with every split.
X_train, X_test, y_train, y_test = train_test_split(
    df['Encoded'].tolist(),
    df['Score'].tolist(),
    test_size=0.2,
    random_state=42,
)
train_loader = DataLoader(TextDataset(X_train, y_train), batch_size=4, shuffle=True)




In [None]:
# We can now define and train RNN, Bi-RNN, GRU, and LSTM models (example for LSTM):
class LSTMModel(nn.Module):
    """Embedding -> LSTM -> linear head producing one scalar per sequence.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        padding_idx: Id whose embedding is kept at zero and excluded from
            gradient updates. Defaults to 0, the id that ``encode`` uses for
            padding. Pass ``None`` to train every row of the table.
    """

    def __init__(self, vocab_size, embedding_dim=64, hidden_dim=64, padding_idx=0):
        super().__init__()
        # Without padding_idx the padding row would be learned like a real
        # token and padded positions would feed a non-zero vector to the LSTM.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return one prediction per input sequence.

        Args:
            x: Long tensor of token ids, batch dimension first.

        Returns:
            Tensor with one value per batch element.
        """
        x = self.embedding(x)
        # Only the final hidden state is used; per-step outputs are discarded.
        _, (hn, _) = self.lstm(x)
        # hn[-1] is the last layer's final hidden state; squeeze drops the
        # size-1 output dimension of the linear head.
        return self.fc(hn[-1]).squeeze(1)

# One embedding row per vocabulary entry (the padding entry is part of vocab).
model = LSTMModel(len(vocab))
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(10):
    batch_losses = []
    for X_batch, y_batch in train_loader:
        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = loss_fn(outputs, y_batch)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    # The reported value is the sum of the per-batch losses, not their mean.
    total_loss = sum(batch_losses)
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


Epoch 1, Loss: 100.1483
Epoch 2, Loss: 92.1151
Epoch 3, Loss: 83.2016
Epoch 4, Loss: 75.9205
Epoch 5, Loss: 70.5259
Epoch 6, Loss: 65.2730
Epoch 7, Loss: 59.6681
Epoch 8, Loss: 54.3961
Epoch 9, Loss: 51.1547
Epoch 10, Loss: 47.7497


In [None]:
# STEP 4: Evaluate Models
from sklearn.metrics import mean_squared_error, r2_score

# Switch to inference mode before predicting, so any train-only layer behaviour
# is disabled during evaluation.
model.eval()

# no_grad() skips building the autograd graph, which is not needed here.
with torch.no_grad():
    X_test_tensor = torch.tensor(X_test, dtype=torch.long)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32)
    predictions = model(X_test_tensor).numpy()

# The targets are the random placeholder scores assigned in Step 1, so these
# numbers only show that the pipeline runs end to end.
print("MSE:", mean_squared_error(y_test, predictions))
print("R² Score:", r2_score(y_test, predictions))


MSE: 9.15314769744873
R² Score: -1429.173583984375


# PART 2: TRANSFORMER - GPT-2 FOR TEXT GENERATION


In [1]:
# 1. Install and Load Pretrained GPT-2
!pip install transformers

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
import os, logging, warnings

# Raise the root logger's threshold to CRITICAL and ignore all Python warnings.
logging.getLogger().setLevel(logging.CRITICAL)
warnings.filterwarnings('ignore')

# Use CUDA when PyTorch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load the pretrained 'gpt2-medium' tokenizer and language model, then move
# the model to the selected device.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
model = model.to(device)




In [2]:
# 2. Custom Dataset (Prepare your data)
from torch.utils.data import Dataset, DataLoader
import csv

class CustomTextDataset(Dataset):
    """Text samples read from one column of a CSV file, each ending in EOT.

    Every non-blank cell of the chosen column becomes one sample of the form
    ``"<cell text> <|endoftext|>"``.

    Args:
        path: Path of the UTF-8 encoded CSV file.
        has_header: When True (default), the first row is skipped.
        text_column: Index of the column holding the text (default 0).

    Attributes:
        samples: List of sample strings.
        eot: The end-of-text marker appended to every sample.
    """

    def __init__(self, path, has_header=True, text_column=0):
        super().__init__()
        self.samples = []
        self.eot = "<|endoftext|>"

        # newline='' is what the csv module requires so that line breaks inside
        # quoted fields are parsed correctly.
        with open(path, encoding='utf-8', newline='') as f:
            reader = csv.reader(f)
            if has_header:
                # The default stops an empty file from raising StopIteration.
                next(reader, None)
            for row in reader:
                # Skip blank lines and rows too short to contain the column.
                if len(row) <= text_column:
                    continue
                text = row[text_column]
                # A blank cell would otherwise become a sample holding only EOT.
                if not text.strip():
                    continue
                self.samples.append(f"{text} {self.eot}")

    def __len__(self):
        """Return the number of samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the sample string at position ``idx``."""
        return self.samples[idx]

# NOTE(review): hard-coded absolute path — presumably a Colab upload location;
# adjust when running elsewhere.
dataset = CustomTextDataset('/content/arabic_dataset.csv')
# batch_size=1: each item from the loader is a one-element batch holding a
# single sample string; the order is reshuffled on every pass.
loader = DataLoader(dataset, batch_size=1, shuffle=True)


In [3]:
# 3. Hyperparameters
from transformers import AdamW, get_linear_schedule_with_warmup

# Number of forward/backward passes accumulated before each optimizer step in
# the training cell (the DataLoader itself uses batch_size=1).
BATCH_SIZE = 8
# Number of full passes over the loader.
EPOCHS = 5
# Learning rate handed to the optimizer.
LEARNING_RATE = 3e-5
# Passed to the scheduler as num_warmup_steps.
WARMUP_STEPS = 500
# Maximum token count of one training sequence: longer samples are skipped,
# and shorter ones are concatenated up to this length.
MAX_SEQ_LEN = 400


In [4]:
# 4. Training the Model
# NOTE(review): transformers.AdamW is deprecated upstream in favour of
# torch.optim.AdamW; switching also means changing the import cell.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# num_training_steps must be a positive step count: with a non-positive value
# the linear-decay factor is clamped to 0 as soon as warmup ends, which freezes
# training. The exact count is not known in advance because samples are packed
# together, so use an upper bound: at most one backward pass per sample and one
# optimizer step per BATCH_SIZE backward passes.
total_steps = max(1, (len(loader) * EPOCHS) // BATCH_SIZE)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=total_steps
)

model.train()
proc_seq_count = 0   # backward passes since the last optimizer step
sum_loss = 0.0       # loss accumulated since the last progress print
batch_count = 0      # optimizer steps since the last progress print
tmp_tensor = None    # packed token ids still waiting to be trained on
models_dir = "trained_models"
os.makedirs(models_dir, exist_ok=True)

for epoch in range(EPOCHS):
    print(f"\nEPOCH {epoch} {'='*30}")

    for i, item in enumerate(loader):
        # The loader yields one-element batches, so item[0] is the sample text.
        encoded = tokenizer.encode(item[0], return_tensors='pt').to(device)

        # Samples that do not fit in one sequence on their own are skipped.
        if encoded.size()[1] > MAX_SEQ_LEN:
            continue

        # The first usable sample just starts the packing buffer.
        if not torch.is_tensor(tmp_tensor):
            tmp_tensor = encoded
            continue

        if tmp_tensor.size()[1] + encoded.size()[1] > MAX_SEQ_LEN:
            # The buffer is full: train on it and start a new one.
            batch_input = tmp_tensor
            tmp_tensor = encoded
        else:
            # Append the whole sample. tokenizer.encode() adds no leading
            # special token here, so slicing off position 0 would discard the
            # first real token of every packed sample.
            tmp_tensor = torch.cat([tmp_tensor, encoded], dim=1)
            continue

        # Causal LM objective: the inputs double as the labels.
        outputs = model(batch_input, labels=batch_input)
        loss = outputs.loss
        loss.backward()
        sum_loss += loss.item()
        proc_seq_count += 1

        # Gradient accumulation: step once every BATCH_SIZE sequences.
        if proc_seq_count == BATCH_SIZE:
            proc_seq_count = 0
            batch_count += 1
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        # Progress report every 100 optimizer steps.
        if batch_count == 100:
            print(f"sum loss {sum_loss}")
            batch_count = 0
            sum_loss = 0.0

    # One checkpoint per epoch.
    torch.save(model.state_dict(), os.path.join(models_dir, f"gpt2_custom_epoch{epoch}.pt"))


Token indices sequence length is longer than the specified maximum sequence length for this model (4826 > 1024). Running this sequence through the model will result in indexing errors









In [12]:
# 5. Generate Text from Trained Model

def choose_from_top(probs, n=5):
    """Sample one index from the ``n`` most probable entries of ``probs``.

    The ``n`` largest probabilities are renormalised to sum to 1 and one of
    them is drawn with ``np.random.choice``.

    Args:
        probs: 1-D array-like of non-negative probabilities.
        n: Number of top entries to sample from. Values larger than
            ``len(probs)`` are clamped to ``len(probs)``.

    Returns:
        int: Index into ``probs`` of the sampled entry.

    Raises:
        ValueError: If ``probs`` is empty or ``n`` is less than 1.
    """
    probs = np.asarray(probs)
    if probs.size == 0:
        raise ValueError("probs must not be empty")
    if n < 1:
        raise ValueError("n must be at least 1")
    # argpartition rejects a kth position outside the array, so never ask for
    # more candidates than there are entries.
    n = min(n, probs.size)

    ind = np.argpartition(probs, -n)[-n:]
    # Renormalise in float64 so the sum-to-one check in np.random.choice is not
    # affected by low-precision input.
    top_probs = probs[ind].astype(np.float64)
    top_probs = top_probs / np.sum(top_probs)
    choice = np.random.choice(n, 1, p=top_probs)
    return int(ind[choice][0])

MODEL_EPOCH = 4
# map_location remaps the saved tensors onto the current device, so a
# checkpoint written on a GPU also loads on a CPU-only machine.
model.load_state_dict(
    torch.load(f"trained_models/gpt2_custom_epoch{MODEL_EPOCH}.pt", map_location=device)
)
model.eval()

# Start from a clean output file; samples are appended one by one below.
output_file = f"generated_text_epoch{MODEL_EPOCH}.txt"
if os.path.exists(output_file):
    os.remove(output_file)

# Token ids of the end-of-text marker, computed once instead of on every step.
eot_ids = set(tokenizer.encode("<|endoftext|>"))

# Defined up front so the cell that prints output_text still works when no
# sample reaches the end-of-text marker.
output_text = ""

with torch.no_grad():
    for _ in range(10):  # number of samples to attempt
        cur_ids = tokenizer.encode("SENTENCE:", return_tensors='pt').to(device)
        finished = False

        for i in range(10):  # maximum number of generated tokens per sample
            # Only the logits are needed; passing no labels avoids computing an
            # unused loss on every step.
            outputs = model(cur_ids)
            logits = outputs.logits
            # Distribution over the vocabulary for the next position.
            softmax_logits = torch.softmax(logits[0, -1], dim=0)
            next_token_id = choose_from_top(softmax_logits.cpu().numpy(), n=10)
            cur_ids = torch.cat([cur_ids, torch.tensor([[next_token_id]]).to(device)], dim=1)

            if next_token_id in eot_ids:
                finished = True
                break

        # Only samples that ended with the end-of-text marker are kept.
        if finished:
            output_text = tokenizer.decode(cur_ids.squeeze())
            with open(output_file, 'a', encoding='utf-8') as f:
                f.write(output_text + "\n\n")



In [15]:
# Show output_text, the most recent completed sample decoded by the generation cell.
print(output_text)

أنكر المتهم التهم المنسوبة إليه. ولن يُحكم عليه بأي عقوبة سوى سنة واحدة تحت المراقبة.
