In [9]:
!pip install transformers sentence-transformers PyPDF2 nltk --quiet

In [12]:
import torch
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import PyPDF2
import nltk
import random
import re
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import string
from google.colab import files

# Fetch the NLTK resources this script relies on.
# Recent NLTK releases load the sentence tokenizer from the 'punkt_tab'
# resource instead of the pickled 'punkt' package, so request both: whichever
# one the installed NLTK expects will then be available to sent_tokenize().
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# ===============================================
# 🧠 Load Models
# ===============================================
# Announce the model load, then build both models at module level.
print("Loading models... please wait.")
# NOTE(review): `summarizer` is not referenced anywhere else in the visible
# code — confirm it is still needed before paying the cost of loading it.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# Sentence encoder used to embed both the document sentences and user queries.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
print("✅ Models loaded successfully!\n")

# ===============================================
# 📂 Upload and Extract PDF
# ===============================================
# Ask the user for a PDF through the Colab upload widget.
print("Upload a document (PDF)...")
uploaded = files.upload()
# Nothing to work with if the upload produced no files (e.g. the dialog was
# cancelled); fail with a clear message instead of an opaque IndexError.
if not uploaded:
    raise RuntimeError("No file was uploaded; please re-run and choose a PDF.")
# Only the first uploaded file is used; its name doubles as the path on disk.
pdf_path = next(iter(uploaded))

def extract_text_from_pdf(path):
    """Return the concatenated text of every page of the PDF at *path*.

    Pages for which ``extract_text()`` yields nothing (``None`` or an empty
    string) are skipped; each remaining page's text is followed by a newline.
    """
    pdf = PyPDF2.PdfReader(path)
    page_texts = (pg.extract_text() for pg in pdf.pages)
    return "".join(chunk + "\n" for chunk in page_texts if chunk)

# Pull the text out of the uploaded PDF; the result is cleaned further below.
document_text = extract_text_from_pdf(pdf_path)
# NOTE(review): this message is printed unconditionally, even when the
# extracted text is empty.
print("\n✅ Document uploaded and extracted successfully!\n")

# ===============================================
# 🧹 Preprocess Text
# ===============================================
def preprocess_text(text):
    """Split raw document text into cleaned, single-line sentences.

    The text is sentence-tokenized with NLTK; each sentence is then
    whitespace-normalized and filtered.

    Args:
        text: Raw text extracted from the document.

    Returns:
        A list of sentence strings. A sentence is dropped when it is empty,
        contains the manual's running header or a "Page N" / "Page N of M"
        marker, or has fewer than four words.
    """
    pattern_header = re.compile(r"Company XYZ Human Resources Policy Manual", re.IGNORECASE)
    pattern_page_num = re.compile(r"Page \d+( of \d+)?", re.IGNORECASE)

    cleaned_sentences = []
    for sent in sent_tokenize(text):
        # PDF extraction keeps the page's hard line breaks, so a sentence can
        # arrive as "includes 12\nweeks of paid leave". Collapse every run of
        # whitespace to a single space so answers print on one line and the
        # patterns below still match text that was wrapped mid-phrase.
        sent_clean = " ".join(sent.split())
        if not sent_clean:
            continue
        # Boilerplate: running header or page-number marker.
        if pattern_header.search(sent_clean) or pattern_page_num.search(sent_clean):
            continue
        # Very short fragments (headings, list labels) carry too little meaning.
        if len(sent_clean.split()) < 4:
            continue
        cleaned_sentences.append(sent_clean)
    return cleaned_sentences

# Clean the extracted text and embed whatever sentences survive the filters.
sentences = preprocess_text(document_text)

if sentences:
    print(f"Processed {len(sentences)} clean sentences.")
    sentence_embeddings = embedder.encode(sentences, convert_to_tensor=True)
else:
    # Keep a zero-element tensor so later code can detect the "no content"
    # case through nelement().
    print("Error: No valid text content found after cleaning.")
    sentence_embeddings = torch.empty(0)

# ===============================================
# 😀 Emoji and Response Handling
# ===============================================
# Pool of emojis that bot replies draw from.
emojis = [
    "😊", "🤖", "🔥", "👍", "💼", "✨",
    "😎", "🙌", "📘", "💬", "🤩",
]


def random_emoji():
    """Return one emoji picked at random from ``emojis``."""
    picked = random.choice(emojis)
    return picked

# ===============================================
# 💬 Chatbot Logic
# ===============================================
# Whole-word / whole-phrase matcher for compliments. A plain substring check
# would also fire on unrelated words such as "greatest" or "unicef".
_COMPLIMENT_RE = re.compile(
    r"\b(?:thanks|thank you|good job|well done|nice|great)\b"
)


def chatbot(query, user_name):
    """Answer *query* with small talk or the best-matching document sentence.

    Checks are applied in order: exact greeting, compliment, "bye bot",
    "my name", empty-document guard, and finally a semantic search over the
    module-level ``sentences`` / ``sentence_embeddings``.

    Args:
        query: The user's message.
        user_name: Name used to personalise the replies.

    Returns:
        The bot's reply as a string prefixed with "🤖: ".
    """
    q_lower = query.lower().strip()

    # --- Greetings (exact match only) ---
    if q_lower in ["hello", "hi", "hey", "good morning", "good afternoon"]:
        return f"🤖: Hello {user_name}! How can I assist you today? {random_emoji()}"

    # --- Compliments (whole words, so "greatest" is not a compliment) ---
    if _COMPLIMENT_RE.search(q_lower):
        return f"🤖: You're welcome, {user_name}! I'm glad I could help. {random_emoji()}"

    # --- Session End ---
    if "bye bot" in q_lower:
        return f"🤖: Goodbye {user_name}! Have a great day! 👋"

    # --- Name question ---
    if "my name" in q_lower:
        return f"🤖: Your name is {user_name}! {random_emoji()}"

    # --- No document case ---
    if sentence_embeddings.nelement() == 0:
        return "🤖: Sorry, I couldn’t process the document properly. No content found."

    # --- Find best matching sentence ---
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    cos_scores = util.pytorch_cos_sim(query_embedding, sentence_embeddings)[0]
    best_score, best_index = torch.topk(cos_scores, k=1)

    # topk returns one-element tensors; convert them to plain Python numbers
    # rather than relying on implicit tensor-to-index conversion.
    if best_score.item() < 0.45:
        return f"🤖: Sorry {user_name}, I couldn’t find that information in the document. {random_emoji()}"

    best_sentence = sentences[best_index.item()]
    return f"🤖: {best_sentence.strip()} {random_emoji()}"

# ===============================================
# 🧑 Interaction Loop
# ===============================================
# Start the console chat session.
print("\n--- Chatbot Ready ---")
user_name = input("Enter your name: ")

print(f"🤖: Hello {user_name}! Ask me anything from the document or say 'bye bot' to exit.\n")

while True:
    q = input(f"{user_name}: ")
    print(chatbot(q, user_name))
    # Use the same substring test as chatbot()'s goodbye branch; an exact-match
    # test here would let the bot say goodbye and then keep prompting.
    if "bye bot" in q.lower():
        break
    # Blank line between exchanges.
    print()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loading models... please wait.


Device set to use cpu


✅ Models loaded successfully!

Upload a document (PDF)...


Saving Policy_HR.pdf to Policy_HR.pdf

✅ Document uploaded and extracted successfully!

Processed 34 clean sentences.

--- Chatbot Ready ---
Enter your name: Dharshini K
🤖: Hello Dharshini K! Ask me anything from the document or say 'bye bot' to exit.

Dharshini K: Hello
🤖: Hello Dharshini K! How can I assist you today? 💬

Dharshini K: What is the dress code?
🤖: The standard office dress code is business casual. ✨

Dharshini K: What is the leave policy?
🤖: We also offer a parental leave policy, which includes 12
weeks of paid leave for the primary caregiver. 😎

Dharshini K: How many vacation days do I get?
🤖: All full-time employees are
entitled to 20 paid vacation days per year, which are accrued on a monthly basis. 🔥

Dharshini K: Bye bot
🤖: Goodbye Dharshini K! Have a great day! 👋
