# Import Necessary Libraries

In [1]:
import re
import nltk
import pymongo
import pandas as pd
import numpy as np
from pymongo.errors import ConnectionFailure
import string
import qalsadi.lemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import torch.optim as optim
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

2024-05-25 22:44:17.861297: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-25 22:44:17.900367: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-25 22:44:17.900404: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-25 22:44:17.901425: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-25 22:44:17.907707: I tensorflow/core/platform/cpu_feature_guar

# Connect with MongoDB to fetch Data from database : Lab4 , Collection: Sports

In [2]:
try:
    client = pymongo.MongoClient("mongodb://localhost:27017")
    database = client["Lab4"]
    collection = database['Sports']

    print('Connected to MongoDB successfully !')
    documents = collection.find()

except ConnectionFailure as e:
    print('Could not Connect to MongoDB:', e)
except Exception as e:
    print('An error occured:', e)

Connected to MongoDB successfully !


In [3]:
document = []
for doc in documents:
    Content = doc['Content']
    document.append(Content)  

In [4]:
df = pd.DataFrame(document, columns=['Text'])

In [5]:
df

Unnamed: 0,Text
0,\nلا شك أن كأس العالم في نسخته الأخيرة والتي...
1,احمد اوسار\nهناك سؤال وحديث تداولته بعض الجما...
2,\nخسارة مؤلمة تجرعها المنتخب الاسباني بطل أو...
3,\nلا شك أن انتقال الدولي المغربي المهدي بنعط...
4,\n1 - على الرغم من بعض مظاهر الضعف و علامات...
...,...
165,\nتحدثت الصحف البريطانية عن احتمالية عودة ال...
166,خسر برشلونة في ملعبه بنتيجة قوية وكبيرة ضد ري...
167,أيام و يرحل مورينيو عن قلعة السانتياغو بيرناب...
168,انتهت مواجهة بايرن ميونخ وبرشلونة بمجزرة كامل...


## Prepare our Dataset that we Add Score Column (Target) that Presents the relevance of each text

In [6]:
keywords = ['ميسي','فريق ','كأس', 'المغرب','مباراة', 'برشلونة'
            ,'البطولة','الوطنية','الفوز','اللعب','الهجومي'
            ,'الركلات','الأداء','الهجوم', 'والدفاع','الانتقال','المنتخب','الخسارة'
            ,'الأسود','الأطلس', 'ملعب','التغييرات','الجماهير','المغربية',
            'إسبانيا','المدرب', 'الوطني', 'كرة','القدم','مدريد','ريال']

## Function to calculate relevalance score Based on TF-IDF

In [7]:
def calculate_relevance(texts, keywords):
    vectorizer = TfidfVectorizer(vocabulary=keywords)
    tfidf_matrix = vectorizer.fit_transform(texts)
    
    # Sum the TF-IDF scores for each document
    relevance_scores = tfidf_matrix.sum(axis=1)
    # Normalize the scores between 0 and 10
    max_score = relevance_scores.max()
    normalized_scores = (relevance_scores / max_score) * 10 if max_score > 0 else relevance_scores
    
    return normalized_scores

In [8]:
scores = calculate_relevance(df['Text'], keywords)

In [9]:
scores = np.asarray(scores).flatten()
df['Score'] = ["{:.1f}".format(score) for score in scores]

In [10]:
df['Score'].value_counts()

Score
3.3     12
4.7      8
0.0      8
6.5      8
6.6      8
6.3      8
7.1      6
5.7      6
6.8      6
7.8      6
5.9      4
5.8      4
7.6      4
9.0      4
7.7      4
8.0      4
5.6      4
5.3      4
5.4      4
8.1      4
8.6      4
7.5      4
5.2      4
7.0      4
8.2      4
9.1      2
5.1      2
4.9      2
8.8      2
4.6      2
7.2      2
7.4      2
7.3      2
4.5      2
4.3      2
10.0     2
6.7      2
8.4      2
7.9      2
3.8      2
5.0      2
6.4      2
Name: count, dtype: int64

In [11]:
df

Unnamed: 0,Text,Score
0,\nلا شك أن كأس العالم في نسخته الأخيرة والتي...,8.0
1,احمد اوسار\nهناك سؤال وحديث تداولته بعض الجما...,4.7
2,\nخسارة مؤلمة تجرعها المنتخب الاسباني بطل أو...,7.7
3,\nلا شك أن انتقال الدولي المغربي المهدي بنعط...,7.6
4,\n1 - على الرغم من بعض مظاهر الضعف و علامات...,9.0
...,...,...
165,\nتحدثت الصحف البريطانية عن احتمالية عودة ال...,6.4
166,خسر برشلونة في ملعبه بنتيجة قوية وكبيرة ضد ري...,8.4
167,أيام و يرحل مورينيو عن قلعة السانتياغو بيرناب...,6.7
168,انتهت مواجهة بايرن ميونخ وبرشلونة بمجزرة كامل...,6.3


## Function contain techniques to clean text and Normalisation and Discretization

In [12]:
def CleanNormalize(text):

    #RegEx Patterns
    p_mention = r'\@[\_0-9a-zA-Z]+\:?'
    p_url = r'https?://[A-Za-z0-9./]+'
    p_others = r'[a-zA-Z0-9]+'

    
    if(not isinstance(text, str)):
        return ''
    
    #Cleaning
    text = re.sub(p_mention, 'تنويهحساب', text)  #check mentions
    text = re.sub(p_url, 'وجودرابط', text)   #check URLs
        
    text = re.sub('['+string.punctuation+']', ' ', text)  # Remove punctuation    
    text = re.sub(p_others, '', text)  #remove english chars and numbers
    
    #Normalization
    text = re.sub("[إأٱآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    
    noise = re.compile(""" ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    text = re.sub(noise, '', text)
    
    text = re.sub(r'(.)\1+', r'\1\1', text) #repeated chars
    
    #Removing linebreaks and extra whitespaces
    text = text.replace('\n', ' ').replace('\r', ' ')  
    text = re.sub('\W', ' ', text)
    text = re.sub(' +', ' ', text)
    
    return text.strip()

In [13]:
df['Text'] = df['Text'].apply(lambda x: CleanNormalize(x))

In [14]:
df

Unnamed: 0,Text,Score
0,لا شك ان كاس العالم في نسخته الاخيره والتي اقي...,8.0
1,احمد اوسار هناك سءال وحديث تداولته بعض الجماهي...,4.7
2,خساره مءلمه تجرعها المنتخب الاسباني بطل اوروبا...,7.7
3,لا شك ان انتقال الدولي المغربي المهدي بنعطيه ا...,7.6
4,علي الرغم من بعض مظاهر الضعف و علامات الخلل ال...,9.0
...,...,...
165,تحدثت الصحف البريطانيه عن احتماليه عوده المدير...,6.4
166,خسر برشلونه في ملعبه بنتيجه قويه وكبيره ضد ريا...,8.4
167,ايام و يرحل مورينيو عن قلعه السانتياغو بيرنابي...,6.7
168,انتهت مواجهه بايرن ميونخ وبرشلونه بمجزره كامله...,6.3


# preprocessing NLP pipeline

In [15]:
ar_stop_list = open("arabic_stopwords.txt", encoding="utf-8")
stop_words = ar_stop_list.read().split('\n')
lemmatizer = qalsadi.lemmatizer.Lemmatizer()
def preprocessing_data(data):
    # Tokenization
    data = word_tokenize(data)
    # Remove Stop words
    data = [kalima for kalima in data if kalima not in stop_words]
    # Lemmatization
    data = [lemmatizer.lemmatize(kalima) for kalima in data]
    return data

In [16]:
df['Text'] = df['Text'].apply(lambda x: preprocessing_data(x))

In [17]:
df

Unnamed: 0,Text,Score
0,"[شك, كاسي, عالم, نسخة, الاخيره, اقيمت, الاراضي...",8.0
1,"[حمد, اوسار, سءال, حديث, تداول, جماهير, القنيط...",4.7
2,"[خسار, مءلمه, تجرع, منتخب, الاسباني, بطل, اورو...",7.7
3,"[شك, انتقال, دولي, مغرب, مهد, بنعطيه, صفوف, نا...",7.6
4,"[رغم, مظاهر, ضعف, علام, خلل, اعترى, الانجاز, ك...",9.0
...,...,...
165,"[تحدث, صحف, البريطانيه, احتمال, عود, مدير, فن,...",6.4
166,"[خسر, برشلونه, ملعب, بنتيجه, قوة, كبير, مدريد,...",8.4
167,"[رحل, مورينيو, قلع, سانتياغو, بيرنابيو, سدل, س...",6.7
168,"[انتهى, مواجه, بايرن, ميونخ, وبرشلونه, مجزر, ك...",6.3


# Feature Engineering

### Tokenization:
Tokenization is the process of breaking down a piece of text into smaller units, such as words or subwords. Each token typically represents a meaningful unit of the text.

#### Example:
- "hello world" becomes [1, 2]
- "how are you" becomes [3, 4, 5]
- "I am fine" becomes [6, 7, 8]

### Padding:
Padding ensures that all sequences have the same length, which is necessary for batch processing in neural networks. 

#### Example:
- After tokenization, we find the maximum length of the sequences. In this case, it's 3.
- We pad the sequences to have the same length by adding zeros to the end:
  - [1, 2, 0]
  - [3, 4, 5]
  - [6, 7, 8]


In [18]:
texts_flat = [' '.join(tokens) for tokens in df['Text']]
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts_flat)
sequences = tokenizer.texts_to_sequences(texts_flat)
print('Tokenized sequences:' ,sequences)

Tokenized sequences: [[263, 15, 12, 3326, 202, 2191, 2192, 484, 2193, 14, 3327, 126, 35, 2194, 882, 116, 27, 2195, 264, 1277, 561, 18, 14, 99, 281, 3328, 126, 36, 1278, 883, 7, 883, 2196, 235, 60, 61, 100, 2197, 392, 203, 34, 3329, 1046, 309, 1601, 24, 265, 648, 3330, 393, 53, 3331, 649, 310, 2198, 27, 3332, 758, 149, 1602, 3333, 3334, 14, 485, 759, 884, 7, 2199, 15, 12, 2200, 3335, 343, 12, 884, 2201, 1279, 3336, 10, 1603, 760, 117, 1280, 3337, 4, 3338, 2202, 761, 1603, 22, 3339, 2203, 166, 885, 1278, 7, 1604, 311, 2, 486, 1281, 344, 3340, 2204, 2205, 650, 3341, 886, 651, 3342, 101, 266, 267, 2, 219, 149, 3343, 1047, 394, 1048, 3344, 4, 1605, 487, 395, 13, 3345, 887, 150, 2206, 562, 3346, 7, 1606, 3347, 167, 2207, 652, 888, 345, 6, 204, 345, 24, 486, 653, 1607, 220, 168, 7, 486, 3348, 183, 36, 1608, 24, 3349, 346, 889, 268, 2208, 1609, 3350, 563, 36, 2209, 564, 1610, 1611, 1612, 93, 1613, 2210, 1614, 347, 1615, 265, 3351, 883, 565, 4, 36, 14, 66, 2211, 654, 135, 46, 3352, 3353, 102, 3

In [19]:
# Padding
max_len = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
print('Padded sequences:', padded_sequences)

Padded sequences: [[ 263   15   12 ...    0    0    0]
 [ 348 3369  763 ...    0    0    0]
 [ 240 3393  770 ...    0    0    0]
 ...
 [ 294   42 1849 ...    0    0    0]
 [ 210  238  170 ...    0    0    0]
 [  46  150    8 ...    0    0    0]]


In [20]:
target = np.array(df['Score'], dtype=np.float32)

# Splitting the dataset


We need to split the dataset into training and testing sets. We will use the first 80% of the dataset as the training set, another 20% as the testing

In [21]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, target, test_size = 0.2, random_state=42)

## Stock The Dataset

In [22]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
class ArabicDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = torch.tensor(self.sequences[idx], dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sequence, label

## Create Loader

In [23]:
train_dataset = ArabicDataset(X_train, y_train)
test_dataset = ArabicDataset(X_test, y_test)

In [24]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Define the Models Architecture

## Define RNN

In [25]:
class TextRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super(TextRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=n_layers, 
                          bidirectional=bidirectional, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.embedding(text)
        output, hidden = self.rnn(embedded)
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        else:
            hidden = self.dropout(hidden[-1, :, :])
        return self.fc(hidden)

## Define GRU

In [26]:
class TextGRU(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super(TextGRU, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers, 
                          bidirectional=bidirectional, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.embedding(text)
        output, hidden = self.rnn(embedded)
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        else:
            hidden = self.dropout(hidden[-1, :, :])
        return self.fc(hidden)

## Define LSTM

In [27]:
class TextLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super(TextLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, 
                           bidirectional=bidirectional, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, cell) = self.rnn(embedded)
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        else:
            hidden = self.dropout(hidden[-1, :, :])
        return self.fc(hidden)

# tuning hyper-parameters

In [28]:
VOCAB_SIZE = len(tokenizer.word_index) + 1
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

In [29]:
model = TextLSTM(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

# Train the model

In [30]:
def train(model, iterator, optimizer, criterion, device):
    model.train()
    epoch_loss = 0
    predictions = []
    labels = []
    for text, label in iterator:
        optimizer.zero_grad()
        text, label = text.to(device), label.to(device)
        output = model(text).squeeze(1)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        predictions.extend(output.detach().cpu().numpy())
        labels.extend(label.detach().cpu().numpy())
    return epoch_loss / len(iterator), predictions, labels

def evaluate(model, iterator, criterion, device):
    model.eval()
    epoch_loss = 0
    predictions = []
    labels = []
    with torch.no_grad():
        for text, label in iterator:
            text, label = text.to(device), label.to(device)
            output = model(text).squeeze(1)
            loss = criterion(output, label)
            epoch_loss += loss.item()
            predictions.extend(output.detach().cpu().numpy())
            labels.extend(label.detach().cpu().numpy())
    return epoch_loss / len(iterator), predictions, labels

In [31]:
def calculate_regression_metrics(predictions, labels):
    mse = mean_squared_error(labels, predictions)
    mae = mean_absolute_error(labels, predictions)
    r2 = r2_score(labels, predictions)
    return mse, mae, r2

In [32]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training loop

In [33]:
N_EPOCHS = 35

In [34]:
def train_and_evaluate(models, loaders, optimizer, criterion, device, epochs):
    for model_class in models:
        model = model_class(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT).to(device)
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        print('----------------------------------------------------')
        print('----------------------------------------------------')
        print(f"\nTraining {model_class.__name__}...")
        for epoch in range(epochs):
            train_loss, train_predictions, train_labels = train(model, loaders['train'], optimizer, criterion, device)
            test_loss, test_predictions, test_labels = evaluate(model, loaders['test'], criterion, device)
            train_mse, train_mae, train_r2 = calculate_regression_metrics(train_predictions, train_labels)
            test_mse, test_mae, test_r2 = calculate_regression_metrics(test_predictions, test_labels)
            print(f'------------------------------   Epoch: {epoch+1:02} --------------------------------')
            print(f'Train Loss: {train_loss:.3f}, Test Loss: {test_loss:.3f}')
            print(f'Train MSE: {train_mse:.3f}, Test MSE: {test_mse:.3f}')
            print(f'Train MAE: {train_mae:.3f}, Test MAE: {test_mae:.3f}')
            print(f'Train R²: {train_r2:.3f}, Test R²: {test_r2:.3f}')
            print('----------------------------------------------------')

In [35]:
models = [TextRNN, TextGRU, TextLSTM]
loaders = {'train': train_loader, 'test': test_loader}

train_and_evaluate(models, loaders, optimizer, criterion, device, N_EPOCHS)

----------------------------------------------------
----------------------------------------------------

Training TextRNN...
------------------------------   Epoch: 01 --------------------------------
Train Loss: 14.234, Test Loss: 5.773
Train MSE: 14.344, Test MSE: 5.786
Train MAE: 3.041, Test MAE: 1.821
Train R²: -2.346, Test R²: -0.350
----------------------------------------------------
------------------------------   Epoch: 02 --------------------------------
Train Loss: 4.216, Test Loss: 2.064
Train MSE: 4.253, Test MSE: 2.164
Train MAE: 1.524, Test MAE: 1.151
Train R²: 0.008, Test R²: 0.495
----------------------------------------------------
------------------------------   Epoch: 03 --------------------------------
Train Loss: 2.393, Test Loss: 1.084
Train MSE: 2.470, Test MSE: 1.035
Train MAE: 1.253, Test MAE: 0.769
Train R²: 0.424, Test R²: 0.759
----------------------------------------------------
------------------------------   Epoch: 04 -------------------------------

#### Among the three architectures, TextLSTM consistently showed the best performance, achieving the lowest losses and highest R² scores across epochs. TextGRU followed closely behind, while TextRNN generally lagged behind the other two in terms of performance.