# **Installing the required Packages**

In [None]:
!pip install arabert
!pip install nltk
!pip install arabic_reshaper
!pip install python-bidi
!pip install transformers[torch]
!pip install accelerate -U
!pip install datasets
!pip install transformers


# **Importing the required Libraries and Tools**

In [None]:
# Importing the necessary libraries
import os
import glob
import subprocess
import shutil
import nltk
from nltk import word_tokenize
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import logging
import transformers
import arabic_reshaper

from transformers import EarlyStoppingCallback, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, DataCollatorForLanguageModeling, AutoModelForMaskedLM
from datasets import Dataset
from sklearn.model_selection import train_test_split
from google.colab import drive
from wordcloud import WordCloud, STOPWORDS
from bidi.algorithm import get_display
from collections import Counter

# **Data Understanding**

###**Reading the CSV Files**

In [None]:
# Mounting Google Drive
drive.mount("/content/drive", force_remount=True)

# Defining the path to the directory containing all the CSV files for the Dataset
drive_dataset_folder_path = "/content/drive/MyDrive/########"

# Changing the current working directory so the relative glob below resolves inside it
os.chdir(drive_dataset_folder_path)

# Getting a list of all CSV files in the folder.
# glob returns paths in arbitrary order; sorting keeps the row order of the
# merged DataFrame stable from one run to the next.
csv_files = sorted(glob.glob("*.csv"))

# Failing early with an explicit message instead of letting pd.concat raise a
# generic "No objects to concatenate" error on an empty list
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {drive_dataset_folder_path}")

# Reading each CSV file into its own DataFrame
dfs = [pd.read_csv(file) for file in csv_files]

# Concatenating all DataFrames into one for easier analysis
merged_df = pd.concat(dfs, ignore_index=True)

# Shuffling the rows of the combined DataFrame.
# A fixed random_state makes the shuffle repeatable across kernel restarts.
PGN_df = merged_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Displaying the first few rows of the combined DataFrame
print(PGN_df.head())



### **Data Inspection**

***Investigating the Shape of the merged PGN Dataset (Number of Rows and Columns)***

In [None]:
# getting number of rows and columns for the dataset
# (fixes the "Datafram" typo in the printed message)

print(f"Number of Rows and Columns in the DataFrame: {PGN_df.shape}")

In [None]:
# getting the info of dataframes
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.

PGN_df.info()


**Comment:**
It was obvious that there were no Null values in any of the dataframes

***Getting a detailed summary of the DataFrame, including the categorical columns***

In [None]:
# Summary statistics for every column (include='all' covers non-numeric columns too)
summary_stats = PGN_df.describe(include='all')
print(summary_stats)


***Checking for any null values***

In [None]:
# Count the null entries in each column and report them
missing_per_column = PGN_df.isnull().sum()
print("Missing values:")
print(missing_per_column)

***Checking for any duplicates***

In [None]:
# Boolean mask over the merged DataFrame: True for each row that exactly
# repeats a row seen earlier
duplicates = merged_df.duplicated()

# Summing the mask gives the number of repeated rows
num_duplicates = duplicates.sum()

# Report the result
print(f"Number of duplicate rows: {num_duplicates}")

In [None]:
# Collect every row whose 'Content' value occurs more than once
# (keep=False marks all occurrences, including the first)
content_is_repeated = PGN_df.duplicated(subset=['Content'], keep=False)
duplicates = PGN_df[content_is_repeated]
print("\nDuplicate entries in 'Content' column:")
print(duplicates)

# Count only the repeats beyond the first occurrence of each 'Content' value
duplicate_count = PGN_df.duplicated(subset=['Content']).sum()
print(f"\nNumber of duplicate entries in 'Content' column: {duplicate_count}")


In [None]:
# Report how many distinct sectors exist, followed by their names
sector_column = PGN_df['Sector']

print(sector_column.nunique())
print(sector_column.unique())

###**Exploratory Data Analysis (EDA)**

In [None]:
# Dictionary for Arabic to English translations
translations = {
    'التموين': 'Supply',
    'التعليم': 'Education',
    'الصحة': 'Healthcare',
    'البنوك': 'Banking',
    'الاتصالات': 'Communication',
    'القضاء': 'Judiciary',
    'المياه والصرف الصحي': 'Water and Sanitary',
    'الكهرباء': 'Electricity',
    'البيئة':'Environment',
    'الزراعة': 'Agriculture'

}

# Counting the number of rows per sector
sector_counts = PGN_df['Sector'].value_counts().reset_index()
sector_counts.columns = ['Sector', 'Count']

# Reshaping each Arabic sector name and converting it to display order so that
# matplotlib renders it readably
sector_counts['Reversed_Sector'] = sector_counts['Sector'].apply(lambda x: get_display(arabic_reshaper.reshape(x)))

# Building the "Arabic - English" tick label. A sector that has no entry in
# `translations` falls back to its Arabic name alone, so one unexpected sector
# value no longer aborts the whole plot with a KeyError.
sector_counts['Label'] = sector_counts.apply(
    lambda x: (
        f"{x['Reversed_Sector']} - {translations[x['Sector']]}"
        if x['Sector'] in translations
        else x['Reversed_Sector']
    ),
    axis=1,
)


# Creating the bar plot with Seaborn
plt.figure(figsize=(12, 8))
# Using a colorful palette
barplot = sns.barplot(x='Label', y='Count', data=sector_counts, palette='tab10')

# Adding the exact number of Articles on each bar.
# The row index doubles as the bar's x position because reset_index() above
# numbered the rows 0..n-1 in plotting order.
for index, row in sector_counts.iterrows():
    barplot.text(index, row['Count'], row['Count'], color='black', ha="center")

# Set plot labels and title
plt.xlabel('Sector (Arabic - English)')
plt.ylabel('Number of Articles')
plt.title('Distribution of Articles Among Sectors')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Show the plot
plt.show()

## **Following light preprocessing as recommended by scholars, the Data Preprocessing Steps we need to apply for the integrated dataframe are the following:**

1. Replacing User Mentions with relevant tokens ['مستخدم']
2. Replacing URLs with relevant tokens ['رابط']
3. Replacing Email Addresses with relevant tokens ['بريد']
4. Removing HTML Tags
5. Removing Emojis
6. Normalization (Stripping Arabic Tashkeel (Diacritics), Stripping Arabic Tatweel, Inserting White Spaces before and after all non Arabic digits or English digits or Arabic and English Alphabet or the 2 brackets, then inserting whitespace between words and numbers or numbers and words, Removing non Digit Repetition, Replacing Slash with Dash, Mapping Hindi numbers to Arabic ones)
7. Handling Noisy Characters


#### **Selecting the needed columns**

In [None]:
# Keep only the columns used by the preprocessing steps that follow
columns_to_keep = ['Sector', 'Keyword', 'Title', 'Content']
merged_df_summary = merged_df[columns_to_keep]
merged_df_summary.head()

#### **Merging the Title and the Content Columns**

It was chosen to merge both the **Title** and **Content** Columns to enhance the model performance

In [None]:
# Work on a copy so that adding a column does not touch merged_df_summary
merged_df_title_content = merged_df_summary.copy()

# Join title and body with a single space. A missing Title or Content would turn
# the whole concatenation into NaN (str + NaN), silently discarding the half that
# is present, so missing parts are treated as empty strings instead.
merged_df_title_content['Title_Content'] = (
    merged_df_summary['Title'].fillna('') + " " + merged_df_summary['Content'].fillna('')
)
merged_df_title_content

In [None]:
# Remove the separate Title/Content columns now that 'Title_Content' holds both.
# Reassigning with errors='ignore' (rather than an inplace drop) lets this cell be
# re-run without a KeyError once the columns are already gone.
merged_df_title_content = merged_df_title_content.drop(columns=['Title', 'Content'], errors='ignore')
merged_df_title_content.head()

***Some Checks for Noisy Characters***

In [None]:
# Investigating the English Words in the dataset

def contains_english_words(text):
    r"""Return True if `text` contains at least one run of ASCII Latin letters.

    The pattern deliberately has no ``\b`` word boundaries. Python's ``re``
    treats Arabic letters and digits as word characters, so no boundary exists
    between an English word and an Arabic word (or number) attached directly to
    it; with ``\b`` such attached words would go undetected.

    Args:
        text: The string to inspect.

    Returns:
        bool: True when any character in A-Z or a-z is present, else False.
    """
    return bool(re.search(r'[A-Za-z]+', text))

# List the merged title+content texts that contain English words
has_english = merged_df_title_content['Title_Content'].astype(str).apply(contains_english_words)
articles_with_english_words = merged_df_title_content.loc[has_english, 'Title_Content'].tolist()
articles_with_english_words

**Comment:**
Some JavaScript functions were found in the content; these are considered noise. Also, English and Arabic words need to be separated by spaces.

### **AraBERT Preprocessing Function**

***AraBERT's preprocessing function is chosen for the following reasons:***


1. It is optimized for Arabic text, handling tasks specific to the Arabic language as well as to newspaper articles, such as normalization (including removing diacritics and tatweel) and special character handling (including HTML tags, URLs, emails, mentions). Leveraging these optimizations can help ensure that the input data is appropriately formatted for further processing by other Arabic BERT models.

2. Arabic BERT models typically use similar tokenization schemes, especially that they are based on the same underlying architecture (BERT architecture). By using AraBERT's preprocessing function, the data is prepared in a tokenized format that is compatible with other Arabic BERT models, including ARBERT, facilitating seamless integration into the pretraining and fine-tuning.


In [None]:
# Bring in the preprocessor class shipped with the arabert package
from arabert.preprocess import ArabertPreprocessor

# Build the preprocessor for the model named below, with whitespace insertion
# explicitly switched on
model_name = "aubmindlab/bert-base-arabertv02"
arabert = ArabertPreprocessor(
    model_name=model_name,
    insert_white_spaces=True,
)


***AraBERT Preprocess Function***

Using **aubmindlab/bert-base-arabertv02** will ensure the following upon adopting the AraBERT Preprocessor function:


*   keep_emojis(bool, optional, defaults to False): don't remove emojis while preprocessing --> will default to **False** for **AraBERTv02**

*   remove_html_markup(bool, optional, defaults to True): Whether to remove html artifacts --> will default to **True** for **AraBERTv02**

*   replace_urls_emails_mentions(bool, optional, defaults to True): Whether to replace emails, URLs and mentions by special tokens --> will default to **True** for **AraBERTv02**

*   strip_tashkeel(bool, optional, defaults to True): remove diacritics (FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SUKUN, SHADDA) --> will default to **True** for **AraBERTv02**

*   strip_tatweel(bool, optional, defaults to True): remove tatweel '\u0640' --> will default to **True** for **AraBERTv02**

*   insert_white_spaces(bool, optional, defaults to True): insert whitespace before and after all non Arabic digits or English digits or Arabic and English Alphabet or the 2 brackets, then inserts whitespace between words and numbers or numbers and words --> will default to **True** for **AraBERTv02**

*   remove_non_digit_repetition(bool, optional, defaults to True): replace repetition of more than 2 non-digit character with 2 of this character --> will default to **True** for **AraBERTv02**

*   replace_slash_with_dash(bool, optional, defaults to None): --> will be automatically set to **True** for **AraBERTv02**

*   map_hindi_numbers_to_arabic(bool, optional, defaults to None): Replaces hindi numbers with the corresponding Arabic one. ex: "١٩٩٥" --> "1995" --> will be automatically set to **True** for **AraBERTv02**

*   apply_farasa_segmentation(bool, optional, defaults to None): --> will be automatically set to **None** for **AraBERTv02**

In [None]:
# Thin wrapper around the AraBERT preprocessor
def preprocess_text(text):
    """Run `text` through the module-level `arabert` preprocessor and return the result."""
    cleaned_text = arabert.preprocess(text)
    return cleaned_text



In [None]:
# Run the AraBERT preprocessing over every merged text and store the result
# in a new column called Processed_Content

processed_texts = merged_df_title_content['Title_Content'].apply(preprocess_text)
merged_df_title_content.loc[:, 'Processed_Content'] = processed_texts
merged_df_title_content

***Checking English Words again after applying AraBERT Preprocessor Function***

In [None]:
# List the preprocessed texts that still contain English words
has_english = merged_df_title_content['Processed_Content'].astype(str).apply(contains_english_words)
articles_with_english_words = merged_df_title_content.loc[has_english, 'Processed_Content'].tolist()
articles_with_english_words

**Comment:**

- Not all English words are correctly separated from the Arabic ones
- Emails are not correctly replaced by the [بريد] token since they are represented in a protected format
- One JavaScript function is still not handled (mbInitialization ( ) ; mbCallAd ( 99577 , 663572 , 40222 ) ;)

***Replacing emails with "[بريد]" since they have a special format in the dataset which cannot be recognized by AraBERT***

In [None]:
def replace_special_tokens(text):
    """Replace the "[email protected]" placeholder with the '[بريد]' token.

    The dataset carries e-mail addresses as the literal placeholder
    "[email protected]" rather than as real addresses. The match tolerates any
    whitespace (including a non-breaking space) between the two words and just
    inside the brackets, so spacing variants of the placeholder are replaced
    too, not only the exact single-space form.

    Args:
        text: The string to clean.

    Returns:
        str: `text` with every placeholder occurrence replaced by '[بريد]'.
    """
    return re.sub(r'\[\s*email\s+protected\s*\]', '[بريد]', text)

# Swap the e-mail placeholder in every processed article, then list the results
processed_as_text = merged_df_title_content['Processed_Content'].astype(str)
merged_df_title_content['Processed_Content'] = processed_as_text.apply(replace_special_tokens)
merged_df_title_content['Processed_Content'].tolist()

***English words were not correctly separated from the Arabic ones. It was decided to separate English and Arabic words that are not correctly separated by spaces.***

In [None]:
def separate_english_arabic(text):
    """Insert a space wherever a Latin letter directly touches an Arabic character.

    Two passes are made over `text`: the first splits Latin-then-Arabic
    junctions, the second splits Arabic-then-Latin junctions. The Arabic class
    covers the code points U+0621-U+064A and U+0660-U+0669.

    Args:
        text: The string to process.

    Returns:
        str: `text` with a single space inserted at each such junction.
    """
    junction_patterns = (
        r'([a-zA-Z])([\u0621-\u064A\u0660-\u0669])',  # Latin letter followed by Arabic
        r'([\u0621-\u064A\u0660-\u0669])([a-zA-Z])',  # Arabic followed by Latin letter
    )
    for pattern in junction_patterns:
        text = re.sub(pattern, r'\1 \2', text)
    return text




# Apply the English/Arabic separation to every processed article
processed_as_text = merged_df_title_content['Processed_Content'].astype(str)
merged_df_title_content['Processed_Content'] = processed_as_text.apply(separate_english_arabic)



***Removing the JavaScript code representing noise***

In [None]:

def contains_javascript_function(text):
    """Tell whether `text` contains the substring 'mbInitialization'.

    Args:
        text: The string to inspect.

    Returns:
        bool: True if the substring is present, otherwise False.
    """
    return 'mbInitialization' in text


def remove_noisy_characters(text):
    """Remove the embedded `mbInitialization(); mbCallAd(...);` JavaScript snippet.

    The snippet is matched with arbitrary whitespace between its tokens. The
    arguments of `mbCallAd` are matched as any comma-separated list of
    integers rather than one hard-coded triple, so the same snippet carrying
    different numeric IDs is removed as well.

    Args:
        text: The string to clean.

    Returns:
        str: `text` with every occurrence of the snippet deleted. Whitespace
        surrounding the snippet is left as it was.
    """
    return re.sub(
        r'mbInitialization\s*\(\s*\)\s*;\s*mbCallAd\s*\(\s*\d+(?:\s*,\s*\d+)*\s*\)\s*;',
        '',
        text,
    )




In [None]:
# Before removing the JavaScript snippet: list the articles that contain it
has_js_snippet = merged_df_title_content['Processed_Content'].astype(str).apply(contains_javascript_function)
reviews = merged_df_title_content.loc[has_js_snippet, 'Processed_Content'].tolist()

reviews

In [None]:
# Strip the JavaScript snippet from every processed article
processed_as_text = merged_df_title_content['Processed_Content'].astype(str)
merged_df_title_content['Processed_Content'] = processed_as_text.apply(remove_noisy_characters)

# Re-run the check: list any articles that still contain the snippet marker
has_js_snippet = merged_df_title_content['Processed_Content'].astype(str).apply(contains_javascript_function)
reviews = merged_df_title_content.loc[has_js_snippet, 'Processed_Content'].tolist()

reviews

In [None]:
# Apply the JavaScript-snippet removal to the processed articles
# (same call as in the preceding cell)
processed_as_text = merged_df_title_content['Processed_Content'].astype(str)
merged_df_title_content['Processed_Content'] = processed_as_text.apply(remove_noisy_characters)




In [None]:
# After removing the JavaScript snippet: list any articles that still contain it
has_js_snippet = merged_df_title_content['Processed_Content'].astype(str).apply(contains_javascript_function)
reviews = merged_df_title_content.loc[has_js_snippet, 'Processed_Content'].tolist()

reviews

# **Saving the Preprocessed Dataset to Google Drive for further usage in the DAPT approach**

In [None]:
# dropping the Title_Content Column (the one that is not preprocessed)
# Reassigning with errors='ignore' (rather than an inplace drop) lets this cell be
# re-run without a KeyError once the column is already gone.

merged_df_title_content = merged_df_title_content.drop(columns=['Title_Content'], errors='ignore')
merged_df_title_content.head()



In [None]:
# Mount Google Drive before copying anything onto it
drive.mount('/content/drive')

# Write the DataFrame to a CSV file under /content first
local_csv_path = '/content/preprocessed_dataset.csv'
merged_df_title_content.to_csv(local_csv_path, index=False)

# Destination file path on Drive
drive_destination_file = "/content/drive/My Drive/######"  # Update this as needed

# Copy the freshly written CSV to the Drive destination
shutil.copy(local_csv_path, drive_destination_file)