# Setup and Installations

In [None]:
# !pip install datasets

In [None]:
# !pip install transformers==4.48.3 datasets accelerate torch -q

!pip install datasets transformers accelerate -U

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.52.3-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.2/40.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate
  Downloading accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12

# Verify the installed transformers version

In [None]:
import transformers
print(transformers.__version__)


4.52.3


# Import Libraries and Configuration

In [None]:
import unicodedata
import sys
import re
import string
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from datasets import Dataset, DatasetDict, ClassLabel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity # to calcalating  similarity


# Mount Google Drive

In [None]:

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Define Paths

In [None]:
# Location of the scraped book catalogue on the mounted Google Drive.
# The path is absolute (the drive is mounted at /content/drive) so that it does
# not depend on the notebook's current working directory.
BOOK_DATASET_CSV_PATH = '/content/drive/MyDrive/ArabicBooksClassifier/jamalon7.csv'

# Hugging Face checkpoint used for both the tokenizer and the embedding model.
MODEL_NAME = "CAMeL-Lab/bert-base-arabic-camelbert-mix"

# Run the model on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# !ls 'drive/MyDrive/ArabicBooksClassifier'

Using device: cuda


In [None]:
# Quick sanity check on the raw CSV before any cleaning.
df = pd.read_csv(BOOK_DATASET_CSV_PATH)

# Only the last expression of a cell is displayed, so the first count is
# printed explicitly instead of being silently discarded.
print(f"Non-null descriptions: {df['Description'].notnull().sum()}")
df['Title'].nunique()

4050

# Arabic Text Preprocessor Class

In [None]:
class ArabicTextPreprocessor:
    """Normalise raw Arabic text before it is tokenized.

    ``preprocess`` applies, in order: Unicode NFC normalisation, emoji removal,
    alef / teh-marbuta / alef-maksura folding, Eastern-to-Western digit
    conversion, diacritic and tatweel removal, punctuation removal (``.``,
    ``-`` and ``/`` are kept), lower-casing and whitespace collapsing.

    Every helper returns a non-string argument unchanged.
    ``_remove_unicode_control_chars`` is available as a helper but is not part
    of the ``preprocess`` pipeline.
    """

    # Digits are mapped position by position, so both strings must stay aligned.
    _EASTERN_ARABIC_NUMERALS = '٠١٢٣٤٥٦٧٨٩'
    _WESTERN_ARABIC_NUMERALS = '0123456789'
    # Letter folding: hamza-carrying alefs -> bare alef, teh marbuta -> heh,
    # alef maksura -> yeh.
    _ARABIC_CHAR_MAP = {
        'أ': 'ا', 'إ': 'ا', 'آ': 'ا',
        'ة': 'ه',
        'ى': 'ي'
    }
    # U+064B..U+0652 are the Arabic short-vowel marks; U+0640 is the tatweel.
    _ARABIC_DIACRITICS_TATWEEL_REGEX = re.compile(r'[\u064B-\u0652\u0640]')
    # ASCII punctuation that must survive punctuation removal.
    _CHARS_TO_PRESERVE = '.-/'
    _ARABIC_PUNCTUATIONS_BASE = '`÷×؛<>_()*&^%][ـ،:"؟\'{}~¦+|!”…“–ـ«»'
    _ENGLISH_PUNCTUATIONS_BASE = string.punctuation
    _MULTI_WHITESPACE_REGEX = re.compile(r'\s+')

    # Invisible formatting characters (bidi controls, zero-width characters,
    # no-break spaces, BOM).
    _UNICODE_CONTROL_CHAR_REGEX = re.compile(
        r'[\u202A-\u202F\u200B-\u200F\u00A0\uFEFF\u2060-\u206F]'
    )

    # Emoji and pictograph ranges. The last two lines replace the single range
    # U+24C2..U+1F251 that this pattern used to contain: that range also spans
    # CJK, Hangul and the Arabic Presentation Forms blocks (U+FB50..U+FDFF and
    # U+FE70..U+FEFF), so it deleted real letters. Everything listed here is a
    # subset of what was removed before.
    _EMOJI_PATTERN = re.compile(
        "["
        "\U0001F600-\U0001F64F" "\U0001F300-\U0001F5FF" "\U0001F680-\U0001F6FF"
        "\U0001F700-\U0001F77F" "\U0001F780-\U0001F7FF" "\U0001F800-\U0001F8FF"
        "\U0001F900-\U0001F9FF" "\U0001FA70-\U0001FAFF" "\U00002702-\U000027B0"
        "\U000024C2" "\U00002600-\U000026FF" "\U0000FE0F"
        "\U0001F1E6-\U0001F1FF" "\U0001F200-\U0001F251"
        "]+", flags=re.UNICODE)

    def __init__(self):
        """Build the three ``str.translate`` tables used by the pipeline.

        A table that cannot be built is left as ``None`` (the error is written
        to stderr); ``preprocess`` then refuses to run.
        """
        self.numeral_translation_table = self._build_table(
            "numeral",
            self._EASTERN_ARABIC_NUMERALS,
            self._WESTERN_ARABIC_NUMERALS,
        )
        self.char_norm_translation_table = self._build_table(
            "char norm", self._ARABIC_CHAR_MAP
        )
        # Remove all Arabic punctuation plus every ASCII punctuation mark that
        # is not explicitly preserved.
        ascii_to_remove = ''.join(
            c for c in self._ENGLISH_PUNCTUATIONS_BASE if c not in self._CHARS_TO_PRESERVE
        )
        self.punctuation_removal_table = self._build_table(
            "punctuation", '', '', self._ARABIC_PUNCTUATIONS_BASE + ascii_to_remove
        )

    @staticmethod
    def _build_table(label, *maketrans_args):
        """Return ``str.maketrans(*maketrans_args)``, or ``None`` on failure.

        ``label`` only names the table in the stderr message.
        """
        try:
            return str.maketrans(*maketrans_args)
        except (TypeError, ValueError) as e:
            print(f"Error initializing {label} table: {e}", file=sys.stderr)
            return None

    def _normalize_unicode(self, text: str, form: str = 'NFC') -> str:
        """Apply Unicode normalisation ``form``; return ``text`` as-is on failure."""
        if not isinstance(text, str):
            return text
        try:
            return unicodedata.normalize(form, text)
        except (TypeError, ValueError):
            # Unknown or malformed normalisation form: keep the text untouched.
            return text

    def _remove_emojis(self, text: str) -> str:
        """Delete every character matched by ``_EMOJI_PATTERN``."""
        if not isinstance(text, str):
            return text
        return self._EMOJI_PATTERN.sub('', text)

    def _normalize_arabic_chars(self, text: str) -> str:
        """Fold letter variants according to ``_ARABIC_CHAR_MAP``."""
        if self.char_norm_translation_table is None or not isinstance(text, str):
            return text
        return text.translate(self.char_norm_translation_table)

    def _standardize_numerals(self, text: str) -> str:
        """Convert Eastern Arabic digits to Western (ASCII) digits."""
        if self.numeral_translation_table is None or not isinstance(text, str):
            return text
        return text.translate(self.numeral_translation_table)

    def _remove_diacritics_and_tatweel(self, text: str) -> str:
        """Strip Arabic short-vowel marks and the tatweel elongation character."""
        if not isinstance(text, str):
            return text
        return self._ARABIC_DIACRITICS_TATWEEL_REGEX.sub('', text)

    def _remove_punctuations(self, text: str) -> str:
        """Delete punctuation (characters are removed, not replaced by spaces)."""
        if self.punctuation_removal_table is None or not isinstance(text, str):
            return text
        return text.translate(self.punctuation_removal_table)

    def _lowercase_latin(self, text: str) -> str:
        """Lower-case the text via ``str.lower``."""
        if not isinstance(text, str):
            return text
        return text.lower()

    def _normalize_whitespace(self, text: str) -> str:
        """Trim the ends and collapse every whitespace run to a single space."""
        if not isinstance(text, str):
            return text
        return self._MULTI_WHITESPACE_REGEX.sub(' ', text.strip())

    def _remove_unicode_control_chars(self, text: str) -> str:
        """Delete the characters matched by ``_UNICODE_CONTROL_CHAR_REGEX``."""
        if not isinstance(text, str):
            return text
        return self._UNICODE_CONTROL_CHAR_REGEX.sub('', text)

    def preprocess(self, text: str) -> str:
        """Run the full cleaning pipeline on ``text``.

        Args:
            text: the raw string to clean.

        Returns:
            The cleaned string. If ``text`` is not a string, or any translation
            table failed to initialise, an error is written to stderr and
            ``text`` is returned unchanged.
        """
        if not isinstance(text, str):
            print(f"preprocess Error: Input must be a string, received {type(text)}.", file=sys.stderr)
            return text
        tables = (
            self.char_norm_translation_table,
            self.numeral_translation_table,
            self.punctuation_removal_table,
        )
        if any(table is None for table in tables):
            print("Error: Preprocessor tables not initialized correctly.", file=sys.stderr)
            return text

        processed_text = self._normalize_unicode(text, 'NFC')
        processed_text = self._remove_emojis(processed_text)
        processed_text = self._normalize_arabic_chars(processed_text)
        processed_text = self._standardize_numerals(processed_text)
        processed_text = self._remove_diacritics_and_tatweel(processed_text)
        processed_text = self._remove_punctuations(processed_text)
        processed_text = self._lowercase_latin(processed_text)
        # Last, so the gaps left by removed characters are collapsed too.
        processed_text = self._normalize_whitespace(processed_text)
        return processed_text

In [None]:
# Explore rows that share the same Description text in the raw CSV.
raw_book_dataset_df = pd.read_csv(BOOK_DATASET_CSV_PATH)

# keep=False flags every member of a duplicate group, not only the repeats.
duplicated_description_rows = raw_book_dataset_df[
    raw_book_dataset_df.duplicated(subset=["Description"], keep=False)
]

# Show one representative row per duplicated description, ordered by the text.
(
    duplicated_description_rows[["Description", "Title", "Author"]]
    .drop_duplicates(subset=["Description"])
    .sort_values("Description")
)

# Unique-value counts noted on an earlier run of the raw file:
#   Author: 3346, Description: 3988, Title: 4050

Unnamed: 0,Description,Title,Author
476,"""أنا لا أهتم بالسياسة قدر اهتمامي بالعدالة"" ي...",موت فوضوي صدفة,داريو فو
2266,"""مغيرو قواعد اللعبة"" هم ثوريون ومبتكرون، هم ش...",مغير قواعد اللعبة,بيتر فيسك
400,"""وما دام لا زايد ولا ناقص ليش أنا هالشكل؟.. أ...",إلعب.. وقول الستر,أحمد ثاني
1713,(وما أوتيت من العلم إلا قليلا) بسم الله والحم...,مبادئ في الإحصاء,إبراهيم أبو عقيل
2671,1- غير خاف أن دراسة نطاق إعمال الشكل، في نظام...,الشكل في الفقه الإسلامي,محمد سوار
...,...,...,...
4124,مؤلفان في علم المنطق يعدان من اهم مؤلفات الشيخ...,منطق المشرقيين ويليه : الأرجوزة المزدوجة في ال...,ابن سينا
4407,"مجموعتي في صور (7 - 10 سنة) - 18 عنوانا"" تتوجه...",البحر في صور,ترجمة فيفيان أبي راشد
3680,من أهم المصطلحات في الخطاب التحليلي الاجتماعي ...,العلمانية تحت المجهر,عبد الوهاب المسيري ، عزيز العظمة
4028,نشأ هذا الكتاب من محاضرات خاصة بمادة دراسية تت...,العدالة كإنصاف : إعادة صياغة,جون رولز


In [None]:
!pip install langdetect



In [None]:
from langdetect import detect, DetectorFactory, LangDetectException

# Seed langdetect so repeated runs give the same language label for the same text.
DetectorFactory.seed = 0


def detect_language_safe(text):
    """Return the langdetect language code for ``text``, or a sentinel string.

    Sentinels:
        "unknown_or_short" - missing value, non-string, or fewer than 20
                             characters after stripping.
        "error_detecting"  - langdetect raised ``LangDetectException``.
        "unknown_error"    - any other exception.
    """
    try:
        if pd.isna(text) or not isinstance(text, str) or len(text.strip()) < 20:
            return "unknown_or_short"
        return detect(text)
    except LangDetectException:
        return "error_detecting"
    except Exception:
        return "unknown_error"


# Both names stay None unless every cleaning step succeeds. Later cells gate on
# `raw_book_dataset_df is not None`, so a failed run must not leave a partially
# cleaned frame behind under that name.
raw_book_dataset_df = None
cleaned_book_df = None

try:
    print(f"Loading raw book data from: {BOOK_DATASET_CSV_PATH}")
    books_df = pd.read_csv(BOOK_DATASET_CSV_PATH)
    print(f"Successfully loaded {len(books_df)} raw rows.")

    essential_cols = ['Description', 'Title']
    for col in essential_cols:
        if col not in books_df.columns:
            raise ValueError(f"Essential column '{col}' not found in the dataset.")

    # --- Language detection ---------------------------------------------------
    print("Performing language detection on 'Description'...")
    books_df['lang'] = books_df['Description'].apply(detect_language_safe)
    print("Language detection complete.")

    # --- Keep Arabic descriptions only ----------------------------------------
    # Rows labelled with any sentinel (missing, too short, detection error) are
    # dropped here as well, because their label is not 'ar'.
    num_before_lang_filter = len(books_df)
    books_df = books_df[books_df['lang'] == 'ar'].copy()
    print(f"Removed {num_before_lang_filter - len(books_df)} non-Arabic or problematic language rows.")
    print(f"Dataset size after language filtering: {len(books_df)}")

    if books_df.empty:
        raise ValueError("No Arabic descriptions found after language filtering. Cannot proceed.")

    # --- Drop rows with missing key fields ------------------------------------
    key_cols_for_nan_check = ['Description', 'Title']
    num_before_nan_drop = len(books_df)
    books_df = books_df.dropna(subset=key_cols_for_nan_check)
    print(f"Removed {num_before_nan_drop - len(books_df)} rows with NaN in {key_cols_for_nan_check}.")
    print(f"Dataset size after NaN drop: {len(books_df)}")

    if books_df.empty:
        raise ValueError("Dataset became empty after NaN drop. Cannot proceed.")

    # --- Cast text columns to str ---------------------------------------------
    # Category is included when present so label encoding later sees strings.
    cols_to_str = ['Description', 'Title', 'Author']
    if 'Category' in books_df.columns:
        cols_to_str.append('Category')

    for col in cols_to_str:
        if col in books_df.columns:
            books_df[col] = books_df[col].astype(str)

    # --- Deduplicate ----------------------------------------------------------
    # A row is a duplicate only if Title, Author and Description all match.
    deduplication_subset = [
        col for col in ('Title', 'Author', 'Description') if col in books_df.columns
    ]

    if deduplication_subset:
        num_before_dedup = len(books_df)
        books_df = books_df.drop_duplicates(subset=deduplication_subset, keep='first')
        print(f"Removed {num_before_dedup - len(books_df)} duplicate rows based on {deduplication_subset}.")
        print(f"Final cleaned dataset size: {len(books_df)}")
    else:
        print("Warning: No suitable columns found for deduplication.", file=sys.stderr)

    if books_df.empty:
        raise ValueError("Dataset became empty after deduplication. Cannot proceed.")

except FileNotFoundError:
    print(f"Error: Book dataset file not found at {BOOK_DATASET_CSV_PATH}", file=sys.stderr)
except ValueError as ve:
    print(f"ValueError: {ve}", file=sys.stderr)
except ImportError:
    print("ImportError: `langdetect` library might not be installed. Please install it (`pip install langdetect`) and restart.", file=sys.stderr)
except Exception as e:
    print(f"An unexpected error occurred during initial data loading/cleaning: {e}", file=sys.stderr)
else:
    # Publish the result only after every step succeeded.
    raw_book_dataset_df = books_df
    cleaned_book_df = books_df




Loading raw book data from: drive/MyDrive/ArabicBooksClassifier/jamalon7.csv
Successfully loaded 4443 raw rows.
Performing language detection on 'Description'...
Language detection complete.
Removed 57 non-Arabic or problematic language rows.
Dataset size after language filtering: 4386
Removed 0 rows with NaN in ['Description', 'Title'].
Dataset size after NaN drop: 4386
Removed 375 duplicate rows based on ['Title', 'Author', 'Description'].
Final cleaned dataset size: 4011


In [None]:
# Report the outcome of the cleaning cell.
if raw_book_dataset_df is not None:
    print(f"\n--- Initial Data Cleaning Summary ---")
    print(f"Shape of cleaned_book_df: {raw_book_dataset_df.shape}")
else:
    print("\nInitial data cleaning failed. `cleaned_book_df` is None.")

# Preview the first rows. The guard keeps this cell from raising
# AttributeError when cleaning failed and the frame is None.
raw_book_dataset_df.head() if raw_book_dataset_df is not None else None


--- Initial Data Cleaning Summary ---
Shape of cleaned_book_df: (4011, 13)


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Title,Author,Description,Pages,Publication year,Publisher,Cover,Category,Subcategory,Price,lang
0,0,0,فى فقه الصراع على القدس وفلسطين,محمد عمارة,الإسلامية كانت القدس رمز الصراع وبوابة الانتص...,180,2006,دار الشروق – مصر,غلاف ورقي,الأدب والخيال,الأدب الإسلامي,15.0,ar
1,1,6,عذراء قريش,جرجي زيدان,"روايات تاريخ الإسلام"" سلسلة من الروايات التار...",176,0,دار البشير للطباعة والنشر والتوزيع,غلاف عادي,الأدب والخيال,الأدب الإسلامي,18.75,ar
2,2,13,نفحات من الأدب الإسلامي,محمد الصابوني,ھﺬه ﻣﺬﻛﺮات ﻓﻲ اﻷدب اﻹﺳﻼﻣﻲ، وﺿﻌﮫﺎ اﻟﻤﺆﻟﻒ ﻟﻄﻼﺑﻪ...,168,1996,دار البشائر الإسلامية للطباعة والنشر والتوزيع,غلاف ورقي,الأدب والخيال,الأدب الإسلامي,18.75,ar
3,3,18,بسط الأعذار عن حب العذار,بدر الدين المنهاجي,كتاب في الادب وصفه المؤلف بعد أن أطلع على كتا...,464,2016,دار الكتب العلمية,غلاف كرتوني,الأدب والخيال,الأدب الإسلامي,45.0,ar
4,4,21,قصة كاملة... لم يؤلفها بشر,علي الطنطاوي,قصة حقيقية شهد نهايتها العلامة الدمشقي علي ال...,34,2004,دار ابن حزم للطباعة والنشر والتوزيع,غلاف ورقي,الأدب والخيال,الأدب الإسلامي,1.5,ar


In [None]:
# Probe langdetect with a short English sentence. The output recorded for this
# cell is 'so', not 'en', which shows the detector can mislabel short inputs.
detect('hello man how are you today')
# raw_book_dataset_df['lang'].value_counts()


'so'

# Initialize preprocessor, tokenizer, and model for embeddings


In [None]:

# All three stay None unless the cleaned data exists; tokenizer and model are
# published together, only once both have loaded.
preprocessor = tokenizer = embedding_model = None

if raw_book_dataset_df is not None:
    preprocessor = ArabicTextPreprocessor()

    try:
        loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        loaded_model = AutoModel.from_pretrained(MODEL_NAME)

        # Move the weights to the selected device and switch to inference mode.
        loaded_model.to(device)
        loaded_model.eval()
    except Exception as e:
        print(f"damn : error loading tokenizer or model for {MODEL_NAME}: {e}", file=sys.stderr)
    else:
        tokenizer, embedding_model = loaded_tokenizer, loaded_model
        print(f" Successfully loaded tokenizer and base model for: {MODEL_NAME}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Successfully loaded tokenizer and base model for: CAMeL-Lab/bert-base-arabic-camelbert-mix


# Data Processing Function: Apply Preprocessing

In [None]:

def apply_text_preprocessing(data_text):
    """Batched map callback: clean every entry of the 'Description' column.

    Args:
        data_text: mapping with a 'Description' sequence (one batch).

    Returns:
        ``{'processed_description': [...]}`` holding the output of the
        module-level ``preprocessor`` for each description (each value is
        passed through ``str`` first). If anything fails, the error is written
        to stderr and the raw descriptions are returned under the same key.
    """
    try:
        cleaned_descriptions = []
        for raw_description in data_text['Description']:
            cleaned_descriptions.append(preprocessor.preprocess(str(raw_description)))
    except Exception as e:
        print(f" Error in apply_text_preprocessing fuction {e}", file=sys.stderr)
        # Best effort: hand back the unprocessed text so the map can continue.
        return {'processed_description': data_text['Description']}

    return {'processed_description': cleaned_descriptions}


# Data Processing Function: Tokenize Data

In [None]:
def tokenize_processed_text(examples):
    """Batched map callback: tokenize the 'processed_description' column.

    Sequences are truncated to 512 tokens and left unpadded.

    Returns:
        The output of the module-level ``tokenizer``. If tokenization fails,
        the error is written to stderr and one empty list per example is
        returned for both 'input_ids' and 'attention_mask'.
    """
    try:
        encoded = tokenizer(
            examples['processed_description'],
            truncation=True,
            padding=False,
            max_length=512,
        )
    except Exception as e:
        print(f"rrror in tokenize_processed_text func..; {e}", file=sys.stderr)
        # Keep the batch row count intact with empty placeholder rows.
        fallback = {}
        for key in ('input_ids', 'attention_mask'):
            fallback[key] = [[] for _ in examples['processed_description']]
        return fallback

    return encoded

# Data Processing Function: Generate Embeddings

In [None]:
# def generate_cls_embeddings(batch):

#     inputs = {
#         'input_ids': torch.tensor(batch['input_ids'], dtype=torch.long).to(device),
#         'attention_mask': torch.tensor(batch['attention_mask'], dtype=torch.long).to(device)
#     }

#     with torch.no_grad():
#         outputs = embedding_model(**inputs)

#     cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
#     return {'embeddings': cls_embeddings.tolist()}




def generate_mean_pooled_embeddings(batch):
    """Batched map callback: embed each tokenized row by masked mean pooling.

    Rows are right-padded to the longest row in the batch, run through the
    module-level ``embedding_model`` without gradients, and the last hidden
    states are averaged over the positions whose attention mask is 1.

    Args:
        batch: mapping with 'input_ids' and 'attention_mask' (lists of lists).

    Returns:
        ``{'embeddings': [...]}`` with one vector (list of floats) per row.
        An empty batch gives an empty list. On any error the problem is
        written to stderr and one empty list per row is returned.
    """
    try:
        id_rows = batch['input_ids']
        if not id_rows:
            return {'embeddings': []}

        target_len = max(map(len, id_rows))
        pad_id = tokenizer.pad_token_id

        # Right-pad ids with the pad token and masks with 0 up to target_len.
        padded_input_ids, padded_attention_mask = [], []
        for ids, mask in zip(id_rows, batch['attention_mask']):
            gap = target_len - len(ids)
            padded_input_ids.append((ids + [pad_id] * gap)[:target_len])
            padded_attention_mask.append((mask + [0] * gap)[:target_len])

        model_inputs = {
            'input_ids': torch.from_numpy(
                np.array(padded_input_ids, dtype=np.int64)).to(device),
            'attention_mask': torch.from_numpy(
                np.array(padded_attention_mask, dtype=np.int64)).to(device),
        }

        with torch.no_grad():
            outputs = embedding_model(**model_inputs)

        token_states = outputs.last_hidden_state
        # Broadcast the mask over the hidden dimension so padded positions
        # contribute nothing to the sum.
        weights = model_inputs['attention_mask'].unsqueeze(-1).expand(token_states.size()).float()
        summed = torch.sum(token_states * weights, 1)
        # The clamp avoids division by zero for a row with an all-zero mask.
        counts = torch.clamp(weights.sum(1), min=1e-9)
        pooled = (summed / counts).cpu().numpy()
        return {'embeddings': pooled.tolist()}

    except Exception as e:
        print(f"Error in generate_mean_pooled_embeddings: {e}", file=sys.stderr)
        if 'input_ids' in batch:
            row_lengths = [len(row) for row in batch['input_ids']]
            print(f"Batch input_ids lengths: {row_lengths}", file=sys.stderr)
        return {'embeddings': [[] for _ in batch.get('input_ids', [])]}


# Execute Data Preparation

In [None]:


# End-to-end data preparation, run as a chain of guarded stages:
#   DataFrame -> HF Dataset -> text preprocessing -> tokenization
#   -> label encoding -> embeddings -> column pruning.
# Each stage runs only if the previous one produced a result; a failing stage
# writes to stderr and leaves its output as None, which skips the later stages.
processed_book_dataset = None
tokenized_book_dataset_for_labels = None
final_dataset_before_embeddings = None
final_book_dataset_with_embeddings = None
label2id = None
id2label = None
num_labels = None

if raw_book_dataset_df is not None and preprocessor and tokenizer and embedding_model:
    # Stage 1: select the working columns and convert to a Hugging Face Dataset.
    try:
        columns_to_use = ['Description','Pages','Author','Publisher','Subcategory','Publication year']
        if 'Title' in raw_book_dataset_df.columns:
            columns_to_use.append('Title')
        if 'Category' in raw_book_dataset_df.columns:
            columns_to_use.append('Category')
        else:
            print(" Warnnning 'Category' column not found ", file=sys.stderr)
        hf_book_dataset = Dataset.from_pandas(raw_book_dataset_df[columns_to_use])
    except Exception as e:
        print(f" error converting DataFrame to Hugging face Dataset: {e}", file=sys.stderr)
        hf_book_dataset = None

    # Stage 2: add the 'processed_description' column.
    if hf_book_dataset:
        try:
            processed_book_dataset = hf_book_dataset.map(
                apply_text_preprocessing,
                batched=True,
                num_proc=4
            )
        except Exception as e:
            print(f" e rror during text preprocessing ma : {e}", file=sys.stderr)
            processed_book_dataset = None

    # Stage 3: tokenize 'processed_description' (the column itself is kept).
    if processed_book_dataset:
        try:
            tokenized_book_dataset_for_labels = processed_book_dataset.map(
                tokenize_processed_text,
                batched=True,
                num_proc=4
                # remove_columns=['processed_description']
            )
        except Exception as e:
            print(f"Error during tokenization map :; {e}", file=sys.stderr)
            tokenized_book_dataset_for_labels = None




    # Stage 4: encode the Category column as integer labels.
    # If there is no Category column, or encoding fails, the tokenized dataset
    # is passed on without a 'labels' column.

    if tokenized_book_dataset_for_labels and 'Category' in tokenized_book_dataset_for_labels.column_names:
        try:
            def ensure_category_str(examples):
                # Missing categories become one placeholder token so that
                # every row ends up with a string label.
                return {'Category_str': [str(cat) if pd.notna(cat) else "Unknown_Category_Token" for cat in examples['Category']]}

            # Create Category_str on the dataset
            dataset_with_category_str = tokenized_book_dataset_for_labels.map(ensure_category_str, batched=True, num_proc=4)

            # Sorting makes the label ids independent of row order.
            unique_categories = sorted(list(set(cat for cat in dataset_with_category_str['Category_str'])))

            if not unique_categories or (len(unique_categories) == 1 and unique_categories[0] == "Unknown_Category_Token"):
                 print("Warning: No valid unique categories found for label encoding.", file=sys.stderr)
                 final_dataset_before_embeddings = tokenized_book_dataset_for_labels
            else:
                class_label_feature = ClassLabel(names=unique_categories)
                label2id = {name: i for i, name in enumerate(class_label_feature.names)}
                id2label = {i: name for i, name in enumerate(class_label_feature.names)}
                num_labels = class_label_feature.num_classes


                def encode_labels_fn(examples):
                    # A category string missing from label2id is encoded as -1.
                    encoded_labels = []
                    for cat_str in examples['Category_str']:
                        encoded_labels.append(label2id.get(cat_str, -1))
                    return {'labels': encoded_labels}


                final_dataset_before_embeddings = dataset_with_category_str.map(
                                                        encode_labels_fn,
                                                        batched=True,
                                                        num_proc=4,

                                                    )
                print(f"Label encoding complete. Found {num_labels} unique categories.")

                # 'Category_str' is not removed here; the final pruning step
                # below drops it because it is not in the keep list.
                # final_dataset_before_embeddings = final_dataset_before_embeddings.remove_columns(['Category_str'])

        except Exception as e:
            print(f"Error during label encoding: {e}", file=sys.stderr)
            final_dataset_before_embeddings = tokenized_book_dataset_for_labels
    elif tokenized_book_dataset_for_labels:
        final_dataset_before_embeddings = tokenized_book_dataset_for_labels
    else:
        final_dataset_before_embeddings = None

    # End of label encoding.




    # Stage 5: generate embeddings in batches of 16 rows. No num_proc is
    # passed to this map call, unlike the earlier stages.
    if final_dataset_before_embeddings:
        try:

            final_book_dataset_with_embeddings = final_dataset_before_embeddings.map(
                generate_mean_pooled_embeddings,
                batched=True,
                batch_size=16
            )
            print("Embedding generation complete.")
        except Exception as e:
            print(f"Error during embedding generation map: {e}", file=sys.stderr)
            final_book_dataset_with_embeddings = None
    else:
        final_book_dataset_with_embeddings = None

    # End of embedding generation.




    # Stage 6: keep only the columns needed downstream.
    if final_book_dataset_with_embeddings:

        # NOTE: 'Description' is not in this list, so the raw description text
        # is dropped below; only 'processed_description' is kept.
        columns_to_keep_final = ['embeddings','processed_description','Pages','Author','Publisher','Subcategory','Publication year']
        if 'Title' in final_book_dataset_with_embeddings.column_names:
            columns_to_keep_final.append('Title')
        if 'Category' in final_book_dataset_with_embeddings.column_names:
            columns_to_keep_final.append('Category')
        if 'labels' in final_book_dataset_with_embeddings.column_names:
            columns_to_keep_final.append('labels')
        # if 'Description' in final_book_dataset_with_embeddings.column_names:
        #     columns_to_keep_final.append('Description')


        # Snapshot of the column names taken before pruning.
        current_columns = list(final_book_dataset_with_embeddings.column_names)
        cols_to_remove_final = [col for col in current_columns if col not in columns_to_keep_final]

        if cols_to_remove_final:
            final_book_dataset_with_embeddings = final_book_dataset_with_embeddings.remove_columns(cols_to_remove_final)

        final_book_dataset_with_embeddings.reset_format()
        print("Final dataset created with embeddings.")

else:
    print("Dataset processing pipeline skipped due to errors in previous cells.")


Map (num_proc=4):   0%|          | 0/4011 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4011 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4011 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4011 [00:00<?, ? examples/s]

Label encoding complete. Found 13 unique categories.


Map:   0%|          | 0/4011 [00:00<?, ? examples/s]

Embedding generation complete.
Final dataset created with embeddings.


In [None]:
final_book_dataset_with_embeddings


Dataset({
    features: ['Description', 'Pages', 'Author', 'Publisher', 'Subcategory', 'Publication year', 'Title', 'Category', '__index_level_0__', 'processed_description', 'input_ids', 'token_type_ids', 'attention_mask', 'Category_str', 'labels', 'embeddings'],
    num_rows: 4011
})

In [None]:
current_columns

['Description',
 'Title',
 'Category',
 '__index_level_0__',
 'processed_description',
 'input_ids',
 'token_type_ids',
 'attention_mask',
 'Category_str',
 'labels',
 'embeddings']

In [None]:
# Inspect the final dataset: summary, one sample row, one description, shape.
if final_book_dataset_with_embeddings is None:
    print("Final dataset is not available; run the data preparation cell first.", file=sys.stderr)
else:
    print(final_book_dataset_with_embeddings)

    row_count = len(final_book_dataset_with_embeddings)
    if row_count > 0:
        # Row 2000 is shown when it exists; smaller datasets show their last row.
        sample_index = min(2000, row_count - 1)
        print(final_book_dataset_with_embeddings[sample_index])

        # The preparation cell drops the raw 'Description' column, so fall back
        # to 'processed_description' when it is not there.
        available_columns = final_book_dataset_with_embeddings.column_names
        text_column = next(
            (col for col in ('Description', 'processed_description') if col in available_columns),
            None,
        )
        if text_column is not None:
            print(final_book_dataset_with_embeddings[0][text_column])

    print(final_book_dataset_with_embeddings.shape)



Dataset({
    features: ['Description', 'Title', 'Category', 'processed_description', 'labels', 'embeddings'],
    num_rows: 4011
})
{'Description': ' إن الحديث عن المناهج في دراسة الظواهر المختلفة، يبقى حديثًا مرتبطًا بحركة التطور الحضارية العامة التي شهدها المجتمع الإنساني عبر مسيرته التاريخية، وغالبًا ما يلاحظ بالمتابعة الموضوعية أن حركة التطور المنهاجي كانت إما سابقة على حركة التطور الحضاري أو موازي عرضوغالبًا ما يلاحظ بالمتابعة الموضوعية أن حركة التطور المنهاجي كانت إما سابقة على حركة التطور الحضاري أو موازي', 'Title': ' التطورات المنهجية وعملية البحث العلمي', 'Category': 'الاقتصاد والأعمال', 'processed_description': 'ان الحديث عن المناهج في دراسه الظواهر المختلفه يبقي حديثا مرتبطا بحركه التطور الحضاريه العامه التي شهدها المجتمع الانساني عبر مسيرته التاريخيه وغالبا ما يلاحظ بالمتابعه الموضوعيه ان حركه التطور المنهاجي كانت اما سابقه علي حركه التطور الحضاري او موازي عرضوغالبا ما يلاحظ بالمتابعه الموضوعيه ان حركه التطور المنهاجي كانت اما سابقه علي حركه التطور الحضاري او موازي', 'labe

In [None]:
# Build the (n_books, dim) embedding matrix and its pairwise cosine-similarity
# matrix. Both names stay None when the inputs are missing or a step fails.
similarity_matrix = all_embeddings_matrix = None

if final_book_dataset_with_embeddings is None:
    print("Error: `final_book_dataset_with_embeddings` is not available. Skipping similarity calculation.", file=sys.stderr)
elif 'embeddings' not in final_book_dataset_with_embeddings.column_names:
    print("Error: 'embeddings' column not found in `final_book_dataset_with_embeddings`. Skipping similarity calculation.", file=sys.stderr)
else:
    try:
        print("Extracting embeddings to a NumPy matrix...")

        # The first row decides how the whole column is converted.
        first_embedding = final_book_dataset_with_embeddings[0]['embeddings']
        if isinstance(first_embedding, list):
            all_embeddings_matrix = np.array(final_book_dataset_with_embeddings['embeddings'], dtype=np.float32)
        elif isinstance(first_embedding, np.ndarray):
            all_embeddings_matrix = np.stack(final_book_dataset_with_embeddings['embeddings']).astype(np.float32)
        else:
            raise TypeError("Embeddings column is not in an expected list or NumPy array format.")

        print(f"Shape of embeddings matrix: {all_embeddings_matrix.shape}")

        # Replace any non-finite value before computing similarities.
        if not np.isfinite(all_embeddings_matrix).all():
            print("Warning: NaN/ Inf values found in embeddings. Similarity calculation might be affected.", file=sys.stderr)
            all_embeddings_matrix = np.nan_to_num(all_embeddings_matrix)

        print("Calculating cosine similarity matrix...")
        similarity_matrix = cosine_similarity(all_embeddings_matrix)

        print(f"Shape of similarity matrix: {similarity_matrix.shape}")
        print("Cos similarity matrix maybe done..")

    except Exception as e:
        print(f"Error calculating similarity matrix: {e}", file=sys.stderr)
        # Do not leave a half-built result behind.
        similarity_matrix = all_embeddings_matrix = None

Extracting embeddings to a NumPy matrix...
Shape of embeddings matrix: (4011, 768)
Calculating cosine similarity matrix...
Shape of similarity matrix: (4011, 4011)
Cos similarity matrix maybe done..


In [None]:
# Report the matrix shapes. The guards keep this cell from raising when the
# similarity step failed and left the matrices as None.
if similarity_matrix is not None:
    print("the shape of the similarity of the matrix ")
    print(similarity_matrix.shape)
else:
    print("Similarity matrix is not available; run the similarity cell first.", file=sys.stderr)

all_embeddings_matrix.shape if all_embeddings_matrix is not None else None

the shape of the similarity of the matrix 
(4011, 4011)
(4011, 4011)


(4011, 768)

In [None]:

# Lookup tables between book titles and their row positions in the final
# dataset. When several rows share a title, the last row wins in title_to_index.
book_titles = title_to_index = index_to_title = None

if final_book_dataset_with_embeddings is not None:
    if 'Title' in final_book_dataset_with_embeddings.column_names:
        try:
            book_titles = final_book_dataset_with_embeddings['Title']

            # title -> row position, used to resolve a query title
            title_to_index = {name: position for position, name in enumerate(book_titles)}

            # row position -> title, used when displaying results
            index_to_title = dict(enumerate(book_titles))
            print(f"Created title-to-index mapping for {len(title_to_index)} titles.")

        except Exception as e:
            print(f"Error creating title mappings: {e}. Recommendations by title might not work.", file=sys.stderr)
    else:
        # No titles available: label each book by its row position instead.
        index_to_title = {
            position: f"Book Index {position}"
            for position in range(len(final_book_dataset_with_embeddings))
        }
        print("Warning: 'Title' column not found. Recommendations will use book indices.", file=sys.stderr)


def get_recommendations_by_title(book_title: str, top_n: int = 5):
    """
    Recommend the books most similar to the given book title.

    Relies on the notebook-level `similarity_matrix`, `title_to_index`,
    `index_to_title` and `final_book_dataset_with_embeddings`.

    Args:
        book_title: Title to look up; must match a key of `title_to_index` exactly.
        top_n: Maximum number of recommendations to return. Values <= 0 yield [].

    Returns:
        A list of dicts ordered from most to least similar. Each dict has a
        "title" key and, when the dataset has a 'Category' column, a "category"
        key. An empty list is returned when prerequisites are missing, the title
        is unknown, or an error occurs (the reason is printed to stderr).
    """
    if similarity_matrix is None:
        print("Error: Similarity matrix not calculated.", file=sys.stderr)
        return []
    if title_to_index is None or index_to_title is None:
        print("Error: Title mappings not available. Cannot recommend by title.", file=sys.stderr)
        return []
    if book_title not in title_to_index:
        print(f"Error: Book title '{book_title}' not found in the dataset.", file=sys.stderr)
        return []
    if top_n <= 0:
        return []

    try:
        # Row index of the query book.
        book_idx = title_to_index[book_title]

        # Rank every *other* book by its similarity to the query, best first.
        # The query row is excluded by index rather than by dropping the first
        # sorted entry: when another row has an identical score (e.g. a duplicate
        # listing) the query is not guaranteed to sort first, and slicing [1:]
        # would then recommend the book to itself.
        ranked = sorted(
            (
                (idx, score)
                for idx, score in enumerate(similarity_matrix[book_idx])
                if idx != book_idx
            ),
            key=lambda item: item[1],
            reverse=True,
        )

        # Loop-invariant: check once whether categories can be attached.
        has_category = 'Category' in final_book_dataset_with_embeddings.column_names

        recommended_books = []
        for idx, _score in ranked[:top_n]:
            recommendation_info = {"title": index_to_title.get(idx, f"Unknown Title (Index {idx})")}
            if has_category:
                recommendation_info["category"] = final_book_dataset_with_embeddings[idx]['Category']
            recommended_books.append(recommendation_info)

        return recommended_books

    except Exception as e:
        print(f"Error generating recommendations for '{book_title}': {e}", file=sys.stderr)
        return []




# def get_recommendations_by_index(book_idx: int, top_n: int = 5):
#     """
#     Recommends books similar to the book at the given index.
#     """
#     if similarity_matrix is None:
#         print("Error: Similarity matrix not calculated.", file=sys.stderr)
#         return []
#     if not (0 <= book_idx < len(similarity_matrix)):
#         print(f"Error: Book index {book_idx} is out of bounds.", file=sys.stderr)
#         return []
#     if index_to_title is None: # Should be created even if no Title column
#         print("Error: Index-to-title mapping not available.", file=sys.stderr)

#     try:
#         sim_scores = list(enumerate(similarity_matrix[book_idx]))
#         sim_scores = sorted(sim_scores, key=lambda item: item[1], reverse=True)
#         top_similar_books_indices = [i[0] for i in sim_scores[1:top_n + 1]]

#         recommended_books = []
#         for i in top_similar_books_indices:
#             recommendation_info = {"title": index_to_title.get(i, f"Book Index {i}")}
#             if 'Category' in final_book_dataset_with_embeddings.column_names:
#                 recommendation_info["category"] = final_book_dataset_with_embeddings[i]['Category']
#             recommended_books.append(recommendation_info)

#         return recommended_books
#     except Exception as e:
#         print(f"Error generating recommendations for index {book_idx}: {e}", file=sys.stderr)
#         return []

Created title-to-index mapping for 3997 titles.


In [None]:
# Spot-check: display the title stored at position 1000 of `book_titles`
# (requires that at least 1001 titles were loaded).
book_titles[1000]

' تعاون الطفل'

In [None]:
# Smoke test: print the top recommendations for a handful of sample titles so the
# results can be eyeballed against the input book's category.
if 'get_recommendations_by_title' in globals() and 'title_to_index' in globals() and title_to_index is not None:
    print("\n--- test Recommendations by Title ---")

    # Sample positions are arbitrary. Keep only those that exist, so a smaller
    # dataset does not raise IndexError on the hard-coded indices.
    sample_titles_to_test = []
    if book_titles is not None and len(book_titles) > 5:
        candidate_positions = [10, 550, len(book_titles) // 2, 500]
        sample_titles_to_test = [
            book_titles[position]
            for position in candidate_positions
            if position < len(book_titles)
        ]
    else:
        print("Warning: 'book_titles' list is not .. good ")

    if not sample_titles_to_test:
        print("there is no sample titles to test ..")
    else:
        # Loop-invariant: whether the dataset can report categories at all.
        has_category = 'Category' in final_book_dataset_with_embeddings.column_names

        for i, title in enumerate(sample_titles_to_test):

            print(f"\n--- Recommendations for: '{title}' (Book {i+1}) ---")

            # Show the input book's category, resolved through the same
            # title -> index mapping the recommender itself uses.
            if has_category and title_to_index and title in title_to_index:
                input_book_index = title_to_index[title]
                input_book_category = final_book_dataset_with_embeddings[input_book_index]['Category']
                print(f"(Category of input book: {input_book_category})")

            recommendations = get_recommendations_by_title(title, top_n=5)
            if recommendations:
                for rec_idx, rec in enumerate(recommendations):
                    print(f"  {rec_idx+1}. Title: {rec.get('title', 'N/A')}")
                    if 'category' in rec:
                        print(f"     Category: {rec.get('category', 'N/A')}")
            else:
                print(f"  No recommendations found or an error occurred for '{title}'.")



--- Testing Recommendations by Title ---

--- Recommendations for: ' الدعوة الإسلامية فى القرن الحالى' (Book 1) ---
(Category of input book: الأدب والخيال)
  1. Title:  بداية اللانهاية
     Category: العلوم والطبيعة
  2. Title:  معالم الشريعة الإسلامية
     Category: الكتب الإسلامية
  3. Title:  تاريخ الرياضيات العربية بين الجبر والحساب
     Category: العلوم والطبيعة
  4. Title:  المنشقون تنقيب عن مفهوم الخوارج بين التاريخ و الواقع
     Category: الكتب السياسية
  5. Title:  ما بعد العولمة
     Category: الاقتصاد والأعمال

--- Recommendations for: ' غصن مطعم بشجرة غريبة' (Book 2) ---
(Category of input book: الأدب والخيال)
  1. Title:  بروفة لاثنين
     Category: الأدب والخيال
  2. Title:  مواقف من السيرة النبوية
     Category: الكتب الإسلامية
  3. Title:  إنها امرأة أربعينية
     Category: الأدب والخيال
  4. Title:  رحلة حنظله
     Category: الفنون
  5. Title:  في مرايا حانة
     Category: الأدب والخيال

--- Recommendations for: ' إدارة المؤسسات العامة' (Book 3) ---
(Category of input

## Saving artifacts for use in the FastAPI backend and React frontend



In [None]:
import pickle
import os

# Export everything the serving side needs: per-book metadata (CSV), the raw
# embeddings and the similarity matrix (.npy), and both title mappings (pickle).
SAVE_DIR = 'drive/MyDrive/BookRecSysArtifacts/'
os.makedirs(SAVE_DIR, exist_ok=True)

_required_names = (
    'final_book_dataset_with_embeddings',
    'all_embeddings_matrix',
    'similarity_matrix',
    'title_to_index',
    'index_to_title',
)

if all(name in globals() for name in _required_names):

    print(f"Saving artifacts to: {SAVE_DIR}")

    # The names existing is not enough: upstream cells assign None on failure.
    # Each artifact is therefore checked individually and skipped when it is
    # None, rather than writing a placeholder file a consumer would later load.

    if final_book_dataset_with_embeddings is None:
        print("Skipping book metadata: `final_book_dataset_with_embeddings` is None.", file=sys.stderr)
    else:
        try:
            book_metadata_list = []
            dataset = final_book_dataset_with_embeddings
            total_books = len(dataset)
            available_columns = set(dataset.column_names)

            print(f"Preparing metadata for {total_books} books...")

            for i in range(total_books):
                # Read the row once instead of re-indexing the dataset per field.
                row = dataset[i]
                item = {"book_id": i}

                if 'Title' in available_columns:
                    item["title"] = row['Title']
                else:
                    item["title"] = (index_to_title or {}).get(i, f"Unknown Title {i}")

                if 'Category' in available_columns:
                    item["category"] = row['Category']

                # Prefer the cleaned description; fall back to the raw one.
                if 'processed_description' in available_columns:
                    item["description"] = row['processed_description']
                elif 'Description' in available_columns:
                    item["description"] = row['Description']

                if 'Author' in available_columns:
                    item["author"] = row['Author']

                if 'Publication_Date' in available_columns:
                    item["publication_date"] = row['Publication_Date']

                if 'ISBN' in available_columns:
                    item["isbn"] = row['ISBN']

                book_metadata_list.append(item)

            df = pd.DataFrame(book_metadata_list)
            metadata_path = os.path.join(SAVE_DIR, 'book_metadata.csv')
            # utf-8-sig writes a BOM at the start of the file.
            df.to_csv(metadata_path, index=False, encoding='utf-8-sig')
            print(f"Metadata saved to {metadata_path}")

        except Exception as e:
            print(f"Error saving book metadata: {e}", file=sys.stderr)

    if all_embeddings_matrix is None:
        print("Skipping embeddings: `all_embeddings_matrix` is None.", file=sys.stderr)
    else:
        try:
            embeddings_path = os.path.join(SAVE_DIR, 'book_embeddings.npy')
            np.save(embeddings_path, all_embeddings_matrix)
            print(f"Embeddings saved to {embeddings_path}")
        except Exception as e:
            print(f"Error saving embeddings: {e}", file=sys.stderr)

    if similarity_matrix is None:
        print("Skipping similarity matrix: `similarity_matrix` is None.", file=sys.stderr)
    else:
        try:
            similarity_path = os.path.join(SAVE_DIR, 'similarity_matrix.npy')
            np.save(similarity_path, similarity_matrix)
            print(f"Similarity matrix saved to {similarity_path}")
        except Exception as e:
            print(f"Error saving similarity matrix: {e}", file=sys.stderr)

    if title_to_index is None:
        print("Skipping title_to_index: mapping is None.", file=sys.stderr)
    else:
        try:
            title_index_path = os.path.join(SAVE_DIR, 'title_to_index.pkl')
            with open(title_index_path, 'wb') as f:
                pickle.dump(title_to_index, f)
            print(f"title_to_index saved to {title_index_path}")
        except Exception as e:
            print(f"Error saving title_to_index: {e}", file=sys.stderr)

    if index_to_title is None:
        print("Skipping index_to_title: mapping is None.", file=sys.stderr)
    else:
        try:
            index_title_path = os.path.join(SAVE_DIR, 'index_to_title.pkl')
            with open(index_title_path, 'wb') as f:
                pickle.dump(index_to_title, f)
            print(f"index_to_title saved to {index_title_path}")
        except Exception as e:
            print(f"Error saving index_to_title: {e}", file=sys.stderr)

else:
    print("Required data is missing. Unable to save artifacts.", file=sys.stderr)


Saving artifacts to: drive/MyDrive/BookRecSysArtifacts/
Preparing metadata for 4011 books...
Book metadata (using processed description) saved to drive/MyDrive/BookRecSysArtifacts/book_metadata.csv
Embeddings matrix saved to drive/MyDrive/BookRecSysArtifacts/book_embeddings.npy
Similarity matrix saved to drive/MyDrive/BookRecSysArtifacts/similarity_matrix.npy
title_to_index mapping saved to drive/MyDrive/BookRecSysArtifacts/title_to_index.pkl
index_to_title mapping saved to drive/MyDrive/BookRecSysArtifacts/index_to_title.pkl
