In [1]:
# ===============================
# Project bootstrap cell
# ===============================

import sys
import os
import importlib

# Make sure the project directory (the current working directory) is importable.
PROJECT_ROOT = os.getcwd()
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Load the project modules.
import config
import llm_clients
import prompts
import generator

# Reload every module, in import order, so that edits made on disk replace
# whatever copy the kernel already has cached.
for _module in (config, llm_clients, prompts, generator):
    importlib.reload(_module)

# Imported after the reload so the name is bound to the freshly loaded function.
from generator import generate


print("✅ Project loaded successfully")


✅ Project loaded successfully


## Install PyPDF2 (PDF text extraction)

In [2]:
!pip install PyPDF2





[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import os
from PyPDF2 import PdfReader


# Collect the data and read it

In [4]:
import os

# 1️⃣ Collect all PDF files in the source folder.
# - The extension check is case-insensitive so "paper.PDF" is picked up too.
# - The names are sorted because os.listdir() returns entries in arbitrary
#   order, which would make the concatenated text differ between runs.
folder = "."
pdf_files = sorted(f for f in os.listdir(folder) if f.lower().endswith(".pdf"))

# 2️⃣ Extract text from all PDF files
page_texts = []
for pdf_file in pdf_files:
    # Join with `folder` so the loop keeps working when `folder` is not the
    # current working directory.
    with open(os.path.join(folder, pdf_file), "rb") as f:
        reader = PdfReader(f)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:  # skip empty pages
                page_texts.append(page_text + "\n")

# Join once at the end instead of growing a string page by page.
all_text = "".join(page_texts)

print(f"Read {len(pdf_files)} PDF file(s).")
print("Total number of characters:", len(all_text))
print("First 1000 characters:\n", all_text[:1000])


Read 15 PDF file(s).
Total number of characters: 1311317
First 1000 characters:
 ScienceDirect
Available online at www.sciencedirect.com
Procedia Computer Science 171 (2020)  2267–2274
1877-0509 © 2020 The Authors. Published by Elsevier B.V .
This is an open access article under the CC BY-NC-ND license ( http://creativecommons.org/licenses/by-nc-nd/4.0/ )
Peer-review under responsibility of the scientific committee of the Third International Conference on Computing and Network  
Communications (CoCoNet’19).
10.1016/j.procs.2020.04.245
10.1016/j.procs.2020.04.245 1877-0509© 2020 The Authors. Published by Elsevier B.V .
This is an open access article under the CC BY-NC-ND license ( http://creativecommons.org/licenses/by-nc-nd/4.0/ )
Peer-review under responsibility of the scientific committee of the Third International Conference on Computing and Network  
Communications (CoCoNet’19). Available online at www.sciencedirect.com  
ScienceDirect  
Procedia Computer Science 00 (20 19) 000 –00

# Clean the data

In [5]:
import re

def clean_text_keep_sentences(text):
    """
    Clean raw text while preserving sentence boundaries.

    Steps, in order:
    1. Remove URLs (anything starting with ``http`` or ``www.`` up to the
       next whitespace).
    2. Remove every character that is not a word character, whitespace,
       a character in the Arabic block (U+0600-U+06FF), or a
       sentence-ending mark (``.``, ``!``, ``?``, ``؟``).
    3. Collapse every run of whitespace (including newlines) into one space.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text with leading and trailing whitespace stripped.
    """
    # Remove URLs starting with http://, https:// or www
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Drop everything except word characters, whitespace, Arabic characters and
    # sentence-ending punctuation. The ASCII "?" is kept alongside the Arabic
    # "؟" so that English questions keep their terminator, matching the
    # punctuation set the sentence splitter relies on.
    text = re.sub(r"[^\w\s\u0600-\u06FF.!?؟]", "", text)
    # Replace multiple spaces/newlines with a single space
    text = re.sub(r"\s+", " ", text)
    return text.strip()




In [6]:
def clean_sentences(sentences):
    """
    Drop sentences that look like boilerplate and return the rest, stripped.

    A sentence is discarded when any of the following holds:
    - it has fewer than six whitespace-separated tokens;
    - it mentions a DOI or publisher/licensing term (case-insensitive,
      whole-word match);
    - more than 30% of its characters are digits.

    Parameters
    ----------
    sentences : iterable of str
        Candidate sentences.

    Returns
    -------
    list of str
        The surviving sentences, in their original order, each stripped of
        leading and trailing whitespace.
    """
    boilerplate_pattern = r'\b(doi|Elsevier|ScienceDirect|license|Published|Available online)\b'

    def is_worth_keeping(sentence):
        # Ignore very short sentences.
        if len(sentence.split()) < 6:
            return False
        # Ignore sentences that contain a DOI or publishing information.
        if re.search(boilerplate_pattern, sentence, re.IGNORECASE):
            return False
        # Ignore sentences that are mostly digits.
        digit_count = sum(1 for ch in sentence if ch.isdigit())
        return digit_count / max(len(sentence), 1) <= 0.3

    return [sentence.strip() for sentence in sentences if is_worth_keeping(sentence)]


In [7]:
def advanced_filter(sentences):
    """
    Remove sentences that contain any blacklisted marker.

    Matching is a case-insensitive substring test, so a marker also matches
    inside longer words.

    Parameters
    ----------
    sentences : iterable of str
        Candidate sentences.

    Returns
    -------
    list of str
        The sentences that contain none of the markers, unchanged and in
        their original order.
    """
    banned_markers = (
        "abstract",
        "keywords",
        "corresponding author",
        "@",
        "university",
        "department",
    )

    kept = []
    for sentence in sentences:
        lowered = sentence.lower()
        if not any(marker in lowered for marker in banned_markers):
            kept.append(sentence)
    return kept


# Split the text into sentences

In [8]:
import re

# Sentence splitter that understands both English and Arabic terminators.
def split_into_sentences(text):
    """
    Split `text` into sentences.

    A boundary is any run of whitespace that directly follows one of
    ``.``, ``!``, ``?`` or ``؟``. The punctuation stays attached to the
    sentence it ends. Whitespace that does not follow such a mark (for
    example a bare newline) does not split.

    Parameters
    ----------
    text : str
        The text to split.

    Returns
    -------
    list of str
        The non-empty sentences, each stripped of surrounding whitespace.
    """
    result = []
    for piece in re.split(r'(?<=[.!?؟])\s+', text):
        trimmed = piece.strip()
        if trimmed:
            result.append(trimmed)
    return result

# Use the existing variable that contains all text
# (`all_text` is the raw text extracted from the PDFs in the earlier cell).
sentences = split_into_sentences(all_text)

# Quick sanity check: how many sentences were produced and what they look like.
print("Total number of sentences:", len(sentences))
print("First 5 sentences:\n", sentences[:5])


Total number of sentences: 14850
First 5 sentences:
 ['ScienceDirect\nAvailable online at www.sciencedirect.com\nProcedia Computer Science 171 (2020)  2267–2274\n1877-0509 © 2020 The Authors.', 'Published by Elsevier B.V .', 'This is an open access article under the CC BY-NC-ND license ( http://creativecommons.org/licenses/by-nc-nd/4.0/ )\nPeer-review under responsibility of the scientific committee of the Third International Conference on Computing and Network  \nCommunications (CoCoNet’19).', '10.1016/j.procs.2020.04.245\n10.1016/j.procs.2020.04.245 1877-0509© 2020 The Authors.', 'Published by Elsevier B.V .']


# Split the sentences into chunks so that each chunk covers one idea

In [9]:
# 5️⃣ Split sentences into chunks
def chunk_text(sentences, chunk_size=50, overlap=10):
    """
    Group sentences into overlapping, space-joined chunks.

    Windows of `chunk_size` sentences start every ``chunk_size - overlap``
    sentences, so consecutive chunks share `overlap` sentences. The last
    window may be shorter than `chunk_size`; when no more than `overlap`
    sentences remain, it only repeats the tail of the previous chunk.

    Parameters
    ----------
    sentences : sequence of str
        The sentences to group.
    chunk_size : int, default 50
        Maximum number of sentences per chunk. Must be positive.
    overlap : int, default 10
        Number of sentences shared by consecutive chunks. Must satisfy
        ``0 <= overlap < chunk_size``.

    Returns
    -------
    list of str
        One string per chunk; an empty list when `sentences` is empty.

    Raises
    ------
    ValueError
        If `chunk_size` is not positive or `overlap` is outside
        ``[0, chunk_size)``. Without this check, an overlap of at least
        `chunk_size` makes the window never advance and loops forever.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap  # distance between consecutive window starts
    return [
        " ".join(sentences[start:start + chunk_size])
        for start in range(0, len(sentences), step)
    ]

# Chunk the unfiltered sentences: 50 sentences per chunk, 10 shared with the next one.
chunks = chunk_text(sentences, chunk_size=50, overlap=10)
print("Total number of chunks:", len(chunks))
# Preview only the first 1000 characters to keep the output short.
print("First chunk (first 1000 characters):\n", chunks[0][:1000])

Total number of chunks: 372
First chunk (first 1000 characters):
 ScienceDirect
Available online at www.sciencedirect.com
Procedia Computer Science 171 (2020)  2267–2274
1877-0509 © 2020 The Authors. Published by Elsevier B.V . This is an open access article under the CC BY-NC-ND license ( http://creativecommons.org/licenses/by-nc-nd/4.0/ )
Peer-review under responsibility of the scientific committee of the Third International Conference on Computing and Network  
Communications (CoCoNet’19). 10.1016/j.procs.2020.04.245
10.1016/j.procs.2020.04.245 1877-0509© 2020 The Authors. Published by Elsevier B.V . This is an open access article under the CC BY-NC-ND license ( http://creativecommons.org/licenses/by-nc-nd/4.0/ )
Peer-review under responsibility of the scientific committee of the Third International Conference on Computing and Network  
Communications (CoCoNet’19). Available online at www.sciencedirect.com  
ScienceDirect  
Procedia Computer Science 00 (20 19) 000 –000  
www.elsevie

In [10]:
# Drop boilerplate sentences (too short, publisher/DOI lines, mostly digits).
# NOTE(review): advanced_filter() and clean_text_keep_sentences() are defined
# above but are not called in the visible part of the notebook — confirm
# whether they were meant to be applied here.
filtered_sentences = clean_sentences(sentences)

# NOTE: this rebinds `chunks`, replacing the chunks built from the unfiltered
# sentences in the previous cell; the later cells read this filtered version.
chunks = chunk_text(filtered_sentences, chunk_size=50, overlap=10)

print("Clean chunks:", len(chunks))
print("First clean chunk:\n", chunks[0][:1000])


Clean chunks: 210
First clean chunk:
 Infosys Limited, India  
Vijayaraghavan_V01@infosys.com  
 
 
Abstract  
With the usage of Chatbots growing at an unprecedented rate, it is imperative that they are thoroughly tested, verified and 
validated. This is done to ensure that Chatbots do not  fail during operation. Chatbot failures are undesirable and they often occur 
when the bot is provided with ambiguous or illegible input. The Chatbots have to be thoroughly tested to ensure that they do not 
fail under any circumstance and have mechanisms t o deal with such scenarios. Although various methods are in use to test 
Chatbots, algorithm testing is a promising solution to the problem. This involves the use of techniques such as cross -validation, 
grammar and parsing, verification and validation and s tatistical parsing. This paper aims to explore the prominent types of 
chatbot testing methods with detailed emphasis on algorithm testing techniques. Infosys Limited, India  
Vijayaraghavan

# Install the libraries for the BART summarization model

In [11]:
!pip install transformers torch sentencepiece





[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


## Distilled BART – CNN model

In [12]:
from transformers import pipeline

# Build the summarization pipeline around the distilled BART-CNN checkpoint.
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    framework="pt",  # use the PyTorch backend
    device=-1,  # run on the CPU
)

# Smoke test: one short sentence, only to verify that the pipeline loads and runs.
# No length limits are passed here, so the pipeline's defaults apply; the later
# cells pass max_length/min_length explicitly.
result = summarizer(
    "This is a simple test text to verify that the summarization pipeline works correctly."
)

print(result)


Device set to use cpu
Your max_length is set to 142, but your input_length is only 18. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)


[{'summary_text': " This is a simple test text to verify that the summarization pipeline works correctly . The test text is written in a simple text to test that summarization works correctly. This is the first time we've seen this type of test text in a blog post that has been used in the past ."}]


In [13]:
# Sanity check before summarizing: chunk count and a 400-character preview of the first chunk.
print("number of chunks:", len(chunks))
print("\nthe firse chunks:\n", chunks[0][:400])


number of chunks: 210

the firse chunks:
 Infosys Limited, India  
Vijayaraghavan_V01@infosys.com  
 
 
Abstract  
With the usage of Chatbots growing at an unprecedented rate, it is imperative that they are thoroughly tested, verified and 
validated. This is done to ensure that Chatbots do not  fail during operation. Chatbot failures are undesirable and they often occur 
when the bot is provided with ambiguous or illegible input. The Chat


In [14]:
# Keep only the first 1000 characters of the first chunk.
# NOTE(review): presumably this keeps the input within the model's input limit — confirm.
safe_chunk = chunks[0][:1000]  # ✂️ safe cut

# The pipeline returns a list with one dict per input; take its "summary_text".
summary_1 = summarizer(
    safe_chunk,
    max_length=120,
    min_length=40,
    do_sample=False
)[0]["summary_text"]

print("\n summarise the first chunks :\n", summary_1)



 summarise the first chunks :
  Chatbot failures are undesirable and they often occur when the bot is provided with ambiguous or illegible input . The Chatbots have to be thoroughly tested to ensure that they do not  fail under any circumstance . Algorithm testing is a promising solution to the problem . This involves the use of techniques such as cross-validation and parsing, verification and validation and s tatistical parsing .


In [15]:
# Summarize the first three chunks (fewer if fewer exist) and collect the results.
summaries = []

# Slicing instead of indexing with range(3) avoids an IndexError when fewer
# than three chunks are available.
for i, chunk in enumerate(chunks[:3]):
    safe_chunk = chunk[:1000]  # ✂️ important: cap the input at 1000 characters

    # The pipeline returns a list with one dict per input; take its "summary_text".
    s = summarizer(
        safe_chunk,
        max_length=120,
        min_length=40,
        do_sample=False
    )[0]["summary_text"]

    summaries.append(s)
    print(f"\n Summary of the section {i+1}:\n{s}")



 Summary of the section 1:
 Chatbot failures are undesirable and they often occur when the bot is provided with ambiguous or illegible input . The Chatbots have to be thoroughly tested to ensure that they do not  fail under any circumstance . Algorithm testing is a promising solution to the problem . This involves the use of techniques such as cross-validation and parsing, verification and validation and s tatistical parsing .

 Summary of the section 2:
 Chattest is a collaborative open source project that offers 120 questions across various paradigm such as Answering, Error Manage ment, Intelligence, Navigation, Onboarding, Personality and Understanding . The performance of the bot across these key areas can be used to improve the design and functionality of the chatbot .

 Summary of the section 3:
 If through the Turing Test a human is unable to identify the chatbot, then we c an assume its NLP ipientalgorithm is functioning appropriately . In the subsequent sections, popular chat

In [16]:
# Bind the name used in the project modules to the actual summaries.
# This is an alias, not a copy: both names refer to the same list object.
chunk_summaries = summaries


In [17]:
# Sanity check: container type, number of summaries, and a 200-character preview of the first.
print(type(chunk_summaries))
print(len(chunk_summaries))
print(chunk_summaries[0][:200])


<class 'list'>
3
 Chatbot failures are undesirable and they often occur when the bot is provided with ambiguous or illegible input . The Chatbots have to be thoroughly tested to ensure that they do not  fail under any


# Install the LLM API clients

In [18]:
pip install python-dotenv


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
pip install --upgrade google-genai


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


# Test the API

In [20]:
# Reload `config` so that edits made to config.py on disk are picked up
# without restarting the kernel. Both modules are already imported by the
# bootstrap cell; the imports are repeated so this cell can run on its own.
import importlib
import config
importlib.reload(config)


<module 'config' from 'C:\\Users\\lenovo\\project R2S\\config.py'>

In [21]:
pip install openai


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [22]:
from generator import generate_social_post
