In [None]:
import os
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
from pinecone import Pinecone
from pinecone import ServerlessSpec
import io
import requests
import pdfplumber
import pandas as pd
from bs4 import BeautifulSoup
from atlassian import Confluence
from langchain.embeddings import HuggingFaceEmbeddings
from docx import Document
from langchain_pinecone import PineconeVectorStore
from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from flask import Flask, render_template, jsonify, request


In [3]:
print(os.getcwd())

c:\Users\Nabeel\OneDrive\Desktop\Programming\Confluence_Chatbot\research


In [4]:
os.chdir(r"C:\Users\Nabeel\OneDrive\Desktop\Programming\Hackathon\HackAIdea")

In [2]:
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
CONFLUENCE_URL = os.getenv("CONFLUENCE_HOST")
EMAIL = os.getenv("EMAIL_ID")
TOKEN = os.getenv("CONFLUENCE_API_TOKEN")


In [3]:
#PINECONE DB CONFIGURATION
# PINECONE_API_KEY = "PINECONE_API_KEY"
index_name = "hackathon"

# ------------------------------------------------------------
# PINECONE INIT
# ------------------------------------------------------------
pc = Pinecone(api_key=PINECONE_API_KEY)

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
index = pc.Index(index_name)


In [4]:
# ------------------------------------------------------------
# EMBEDDINGS - Using Your Model
# ------------------------------------------------------------
def download_embeddings():
    model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
    return HuggingFaceEmbeddings(model_name=model_name)

embeddings = download_embeddings()

# ------------------------------------------------------------
# EMBED TEXT
# ------------------------------------------------------------
def embed_text(text: str):
    return embeddings.embed_query(text)

# ------------------------------------------------------------
# TEXT CHUNKING
# ------------------------------------------------------------
def chunk_text(text, chunk_size=800, overlap=200):
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    chunks = []
    start = 0
    length = len(text)

    while start < length:
        end = min(start + chunk_size, length)
        chunks.append(text[start:end])
        start += chunk_size - overlap

    return chunks

# ------------------------------------------------------------
# CLEAN HTML → TEXT (includes tables)
# ------------------------------------------------------------
def confluence_html_to_text(html):
    soup = BeautifulSoup(html, "html.parser")

    # remove useless tags
    for tag in soup(["script", "style"]):
        tag.decompose()

    # convert tables manually
    for table in soup.find_all("table"):
        rows = []
        for tr in table.find_all("tr"):
            cells = [c.get_text(strip=True) for c in tr.find_all(["td", "th"])]
            rows.append(" | ".join(cells))
        table.replace_with("\n".join(rows))

    text = soup.get_text(separator="\n")
    # collapse multiple blank lines
    return "\n".join([line.strip() for line in text.splitlines() if line.strip()])


# ------------------------------------------------------------
# Extract text from PDF
# ------------------------------------------------------------
def extract_pdf_text(file_bytes):
    text_parts = []
    with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
        for page in pdf.pages:
            extracted = page.extract_text(layout=True)
            text_parts.append(extracted or "")
    return "\n\n".join(text_parts)

# ------------------------------------------------------------
# Extract text from Excel (xlsx/xls)
# ------------------------------------------------------------
def extract_excel_text(file_bytes):
    dfs = pd.read_excel(io.BytesIO(file_bytes), sheet_name=None)
    output = []
    for sheet_name, df in dfs.items():
        output.append(f"Sheet: {sheet_name}")
        output.append(df.to_string(index=False))
    return "\n\n".join(output)

# ------------------------------------------------------------
# DOWNLOAD FILE FROM URL
# ------------------------------------------------------------
def download_file_from_url(url):
    """
    Downloads a file and returns (bytes, content_type)
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        content_type = response.headers.get("Content-Type", "").lower()
        return response.content, content_type
    except Exception as e:
        print(f"Failed to download file from {url}: {e}")
        return None, None

# Extract text from DOCX
def extract_docx_text(file_bytes):
    doc = Document(io.BytesIO(file_bytes))
    text = []

    for para in doc.paragraphs:
        if para.text.strip():
            text.append(para.text)

    for table in doc.tables:
        for row in table.rows:
            row_data = [cell.text.strip() for cell in row.cells]
            text.append(" | ".join(row_data))

    return "\n".join(text)

  return HuggingFaceEmbeddings(model_name=model_name)


In [5]:
# ------------------------------------------------------------
# INGEST A SINGLE PAGE + ATTACHMENTS + LINKED FILES
# ------------------------------------------------------------
def ingest_page(page_id):
    print(f"\n=== Processing Confluence Page: {page_id} ===\n")

    # Connect
    confluence = Confluence(
        url=CONFLUENCE_URL.replace("/wiki", ""),
        username=EMAIL,
        password=TOKEN
    )

    # Fetch page
    page = confluence.get_page_by_id(page_id=page_id, expand="body.storage")
    title = page["title"]
    html = page["body"]["storage"]["value"]

    # # Clean and chunk page text
    text = confluence_html_to_text(html) # extracting text using function defined above
    chunks = chunk_text(text) # chunking text using function defined above

    # Upload page text chunks
    vectors = []
    for i, chunk in enumerate(chunks):
        vectors.append({
            "id": f"{page_id}-page-{i}",
            "values": embed_text(chunk),
            "metadata": {
                "source": "page",
                "page_id": page_id,
                "page_title": title,
                "page_url": CONFLUENCE_URL + page["_links"]["webui"],
                "chunk": i,
                "text": chunk
            }
        })
    index.upsert(vectors) # using upsert class present in pinecone
    print(f"Uploaded {len(vectors)} page chunks for: {title}")

    # -------- Ingest Attachments --------
    attachments = confluence.get_attachments_from_content(page_id).get("results", []) # getting attachments using "get_attachments_from_content" class from atlassian
    for att in attachments:
        file_name = att["title"]
        download_link = att["_links"]["download"]

        file_bytes = requests.get(
            CONFLUENCE_URL + download_link,
            auth=(EMAIL, TOKEN)
        ).content

        if file_name.lower().endswith(".pdf"):
            extracted = extract_pdf_text(file_bytes)
            file_type = "pdf"
        elif file_name.lower().endswith((".xlsx", ".xls")):
            extracted = extract_excel_text(file_bytes)
            file_type = "excel"
        elif file_name.lower().endswith((".docx", ".doc")):
            try:
                import docx
                doc = docx.Document(io.BytesIO(file_bytes))
                extracted = "\n".join([p.text for p in doc.paragraphs])
                file_type = "word"
            except Exception as e:
                print(f"Failed to read Word file {file_name}: {e}")
                continue
        else:
            print(f"Skipping unsupported attachment: {file_name}")
            continue

        print(f"Ingesting attachment: {file_name}")

        file_chunks = chunk_text(extracted) # chunking text exatracted from attachements
        file_vectors = []
        for i, chunk in enumerate(file_chunks):
            file_vectors.append({
                "id": f"{page_id}-{file_name}-{i}",
                "values": embed_text(chunk),
                "metadata": {
                    "source": "attachment",
                    "file_type": file_type,
                    "filename": file_name,
                    "page_id": page_id,
                    "page_title": title,
                    "page_url": CONFLUENCE_URL + page["_links"]["webui"],
                    "chunk": i,
                    "text": chunk
                }
            })
        index.upsert(file_vectors) # using upsert class present in pinecone
        print(f"Uploaded {len(file_vectors)} chunks from attachment: {file_name}")

    # # -------- Ingest Linked Files in Page --------
    # soup = BeautifulSoup(html, "html.parser") # converting to Document Object Model (DOM)
    # links = [a["href"] for a in soup.find_all("a", href=True)] # extracting all links

    # for link in links:
    #     file_bytes, new = download_file_from_url(link)
    #     # print(file_bytes)
    #     if not file_bytes:
    #         continue
    #     print(file_bytes)
    #     print("***")
    #     print(new)

    
    # ## Detect type based on SharePoint hint in URL
    #     print("check point")
    #     if ":w:" in link:
    #         # import docx
    #         # doc = docx.Document(io.BytesIO(file_bytes))
    #         # extracted = "\n".join([p.text for p in doc.paragraphs])
    #         # file_type = "word"
    #         extracted = extract_docx_text(file_bytes)
    #         file_type = "word"

    #     elif ":x:" in link:
    #         extracted = extract_excel_text(file_bytes)
    #         file_type = "excel"
    #     # elif link.lower().endswith(".pdf"):
    #     #     extracted = extract_pdf_text(file_bytes)
    #     #     file_type = "pdf"
    #     else:
    #         print(f"Skipping unsupported linked file: {link}")
    #         continue

    #     file_chunks = chunk_text(extracted) # chunking text exatracted from linked files
    #     file_vectors = []
    #     for i, chunk in enumerate(file_chunks):
    #         file_vectors.append({
    #             "id": f"{page_id}-{i}",
    #             "metadata": {
    #                 "source": "linked_file",
    #                 "file_type": file_type,
    #                 "page_id": page_id,
    #                 "page_url": CONFLUENCE_URL + page["_links"]["webui"],
    #                 "file_url": link,
    #                 "chunk": i,
    #                 "text": chunk
    #             }
    #         })
    #     index.upsert(file_vectors) # using upsert class present in pinecone
    #     print(f"Uploaded {len(file_vectors)} chunks from linked file: {link}")



In [6]:
# Extracting multiple pages and Storing in Pinecone DB
PAGE_IDS = ["819201", "1277953"]  # Add more page IDs as needed

# ------------------------------------------------------------
# INGEST MULTIPLE PAGES
# ------------------------------------------------------------
def ingest_multiple_pages(page_ids):
    for page_id in page_ids:
        try:
            ingest_page(page_id)
        except Exception as e:
            print(f"Error ingesting page {page_id}: {e}")

# ------------------------------------------------------------
# RUN BATCH INGESTION
# ------------------------------------------------------------
ingest_multiple_pages(PAGE_IDS)
print("\n==== ALL PAGES INGESTED ====\n")



=== Processing Confluence Page: 819201 ===

Uploaded 2 page chunks for: Data Asset: sample_data_table
Ingesting attachment: data_dictionary.xlsx
Uploaded 1 chunks from attachment: data_dictionary.xlsx
Ingesting attachment: design_document.xlsx
Uploaded 3 chunks from attachment: design_document.xlsx

=== Processing Confluence Page: 1277953 ===

Uploaded 3 page chunks for: Advancements in AI

==== ALL PAGES INGESTED ====



In [13]:
vector_id = "000410ab-51e6-4894-8093-3f02e614fa8c"
result = index.fetch(ids=[vector_id])
metadata = result.vectors[vector_id].metadata
print(metadata)

{'source': 'data\\artificial-intelligence-modern-approach.9780131038059.25368.pdf', 'text': 'getting at the same goal: reducing the variance in the language model.\nOne complication: note that the expression P(c\ni |ci−2:i−1) asks for P(c1 |c-1:0) when\ni =1 , but there are no characters before c1. We can introduce artiﬁcial characters, for\nexample, deﬁning c0 to be a space character or a special “begin text” character. Or we can\nfall back on lower-order Markov models, in effect deﬁning c-1:0 to be the empty sequence\nand thus P(c1 |c-1:0)= P(c1).\n22.1.3 Model evaluation\nWith so many possible n-gram models—unigram, bigram, trigram, interpolated smoothing\nwith different values of λ, etc.—how do we know what model to choose? We can evaluate a\nmodel with cross-validation. Split the corpus into a training corpus and a validation corpus.'}


In [24]:
system_prompt = (
    "You are a knowledgeable assistant with access to internal documentation from Confluence pages. "
    "Use the provided context to answer the user’s questions as accurately as possible. "
    "Only use the information given in the context. If the answer is not present, say 'I don’t know'. "
    "{context}"
)

In [17]:
# embeddings = download_embeddings()

index_name = "hackathon" 
# Embed each chunk and upsert the embeddings into your Pinecone index.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [29]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

# chatModel = ChatOpenAI(model="gpt-4o")
chatModel = ChatOllama(model="llama2:latest")
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [39]:
def chat(msg):
    # msg = request.form["msg"]
    print("User Input:", msg)

    response = rag_chain.invoke({"input": msg})

    answer = response["answer"]
    docs = response["context"]   # retrieved documents

    citations = []
    for doc in docs:
        meta = doc.metadata

        citations.append({
            # "source": meta.get("source"),
            "page_url": meta.get("page_url"),
            # "score": meta.get("score"),  # optional if stored
        })

    # return jsonify({
    #     "answer": answer,
    #     "citations": citations
    # })
    return {
        "answer": answer,
        "citations": citations
    }

In [43]:
msg = "What all attributes are present in the Sample Data Asset?"
chat(msg)

User Input: What all attributes are present in the Sample Data Asset?


{'answer': "Based on the information provided in the context, the following attributes are present in the Sample Data Asset:\n\n1. id\n2. name\n3. created_at\n4. value\n\nTherefore, the answer to the user's question is:\n\nThe Sample Data Asset contains the following attributes: id, name, created_at, and value.",
 'citations': [{'page_url': 'https://nabeelnizam78.atlassian.net/wiki/spaces/MFS/pages/819201/Data+Asset+sample_data_table'},
  {'page_url': None},
  {'page_url': 'https://nabeelnizam78.atlassian.net/wiki/spaces/MFS/pages/819201/Data+Asset+sample_data_table'}]}

RuntimeError: Working outside of application context.

This typically means that you attempted to use functionality that needed
the current application. To solve this, set up an application context
with app.app_context(). See the documentation for more information.