In [None]:
!pip install langchain_community



In [None]:
!pip install pypdf



In [None]:
!pip install sentence-transformers



In [None]:
pip install python-pptx



In [None]:
!pip install pinecone



In [None]:
pip install PyMuPDF

In [None]:
from langchain.document_loaders import PyPDFLoader
import json
import requests
from langchain.text_splitter import RecursiveCharacterTextSplitter
import nltk
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from pptx import Presentation
from pptx.util import Pt
from pptx.dml.color import RGBColor
from pptx.util import Inches
from pptx.enum.shapes import MSO_SHAPE
from pptx.enum.text import PP_ALIGN
import os
import fitz
import math

# Fetch the NLTK 'punkt' resource when this cell runs (performs a download on first use).
# NOTE(review): presumably this is the tokenizer data needed by word_tokenize, which
# extract_keywords calls — confirm the resource name against the installed NLTK version.
nltk.download('punkt')

# Function to match tags using LLM
def match_tags(text, tags):
    """Ask the LLM endpoint to tag ``text``, preferring entries from ``tags``.

    The prompt is POSTed as JSON to the module-level ``url``. The reply is
    expected to be a non-empty list whose first item is a dict with a
    ``'text'`` key holding something like ``"[Tag1,Tag2,Tag3]"``.

    Args:
        text: The chunk of document text to classify.
        tags: Candidate tags; interpolated into the prompt as-is.

    Returns:
        A list of tag strings parsed from the first ``[...]`` group in the
        reply, or ``["Unknown"]`` when the request fails, the reply has an
        unexpected shape, the reply contains "None", or no non-empty tag can
        be parsed.
    """
    prompt = (
        f"Please read the following text and assign the most appropriate tag from the provided list and Ensure that the tags focus on the core concept of the text and do not include common terms or context that may pertain to the entire document. Also ignore the page related details while generating the tags:\n\n"
        f"{text}\n\n"
        f"{tags}\n\n"
        f"First consider the tag list to match the most appropriate tags from the list incase if list not matching provide a tag. Provide only the tag without any additional text or explanation. Do not add sentences in beginning and formatted as follows:\n\n [Tag1,Tag2,Tag3]"
    )
    payload = json.dumps({
        "prompt": prompt,
        "max_tokens": 2048,
        "temperature": 0.1,
        "top_p": 0.95
    })
    headers = {'Content-Type': 'application/json'}

    try:
        response = requests.post(url, data=payload, headers=headers)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        response_json = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException.
        print(f"Request failed: {e}")
        return ["Unknown"]

    # Guard against an empty list or a first element that is not a dict;
    # indexing either one blindly would raise instead of falling back.
    well_formed = (
        isinstance(response_json, list)
        and len(response_json) > 0
        and isinstance(response_json[0], dict)
        and 'text' in response_json[0]
    )
    if not well_formed:
        return ["Unknown"]

    response_text = str(response_json[0]['text']).strip()
    if "None" in response_text:
        return ["Unknown"]

    # The model is asked for "[Tag1,Tag2,Tag3]"; it may still answer without
    # brackets, in which case there is nothing reliable to parse.
    match = re.search(r'\[(.*?)\]', response_text)
    if match is None:
        return ["Unknown"]

    parsed_tags = [tag.strip() for tag in match.group(1).split(',')]
    parsed_tags = [tag for tag in parsed_tags if tag]
    return parsed_tags or ["Unknown"]

def character_based_chunking(data):
    """Split the loaded documents into overlapping character-based chunks.

    A ``RecursiveCharacterTextSplitter`` is built with ``chunk_size=2000`` and
    ``chunk_overlap=200``; whatever its ``split_documents(data)`` returns is
    handed back unchanged.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=200,
    ).split_documents(data)

def bytes_based_chunking(data, chunk_size=2000):
    """Split the concatenated page text into chunks of at most ``chunk_size`` UTF-8 bytes.

    The ``page_content`` of every item in ``data`` is joined (no separator),
    encoded as UTF-8 and cut every ``chunk_size`` bytes. A cut that would land
    inside a multi-byte character is moved back to the start of that
    character, so a chunk may be up to three bytes shorter than requested.

    Args:
        data: Iterable of objects exposing a ``page_content`` string.
        chunk_size: Maximum chunk length in bytes (default 2000, the value
            that used to be hard-coded). Must be at least 4, the longest
            UTF-8 sequence; otherwise backing off to a character boundary
            could produce an empty chunk and never advance.

    Returns:
        A list of decoded string chunks, in document order.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 4.
    """
    if chunk_size < 4:
        raise ValueError("chunk_size must be at least 4 bytes")

    text_bytes = ''.join(doc.page_content for doc in data).encode('utf-8')
    total = len(text_bytes)

    chunks = []
    start = 0
    while start < total:
        end = min(start + chunk_size, total)
        # 0b10xxxxxx marks a UTF-8 continuation byte: if the next chunk would
        # begin on one, step back until the cut sits on a character boundary.
        while end < total and (text_bytes[end] & 0xC0) == 0x80:
            end -= 1
        chunks.append(text_bytes[start:end].decode('utf-8', errors='ignore'))
        start = end
    return chunks

def page_based_chunking(data):
    """Return one chunk per loaded document: the ``page_content`` of each item in ``data``."""
    pages_text = []
    for document in data:
        pages_text.append(document.page_content)
    return pages_text

def extract_font_sizes(pdf_path):
    """Return the second-largest distinct font size used in the PDF.

    Every text span on every page is scanned and the absolute value of its
    ``size`` is collected. The distinct sizes are printed in descending
    order, then one of them is returned.

    Args:
        pdf_path: Path of the PDF, passed to ``fitz.open``.

    Returns:
        ``None`` when no text span was found, the only size when there is
        exactly one, otherwise the second-largest size.
    """
    font_sizes = set()

    # Context manager so the document handle is released even if a page
    # fails to parse.
    with fitz.open(pdf_path) as doc:
        for page_num in range(doc.page_count):
            page = doc[page_num]
            for block in page.get_text("dict")["blocks"]:
                if "lines" not in block:
                    continue  # block carries no text lines
                for line in block["lines"]:
                    for span in line["spans"]:
                        font_sizes.add(abs(span["size"]))

    sorted_font_sizes = sorted(font_sizes, reverse=True)
    print(sorted_font_sizes)

    if not sorted_font_sizes:
        return None  # No font sizes found
    if len(sorted_font_sizes) == 1:
        return sorted_font_sizes[0]  # Only one font size in the document
    return sorted_font_sizes[1]

def extract_content_below_headings(pdf_path, size_threshold):
    """Group the PDF's text into sections keyed by heading.

    A span whose absolute font size is greater than ``size_threshold`` is
    treated as a heading; every other non-empty span is appended to the
    content of the most recent heading. The absolute value is used so the
    comparison matches how ``extract_font_sizes`` measures sizes.

    Text that appears before the first heading, and headings that are
    followed by no content, are not emitted.

    Args:
        pdf_path: Path of the PDF, passed to ``fitz.open``.
        size_threshold: Font size above which a span counts as a heading.

    Returns:
        A list of ``{"heading": str, "content": str}`` dicts in reading order.
    """
    chunks = []
    current_heading = None
    current_content = ""

    # Context manager so the document handle is always released.
    with fitz.open(pdf_path) as doc:
        for page_num in range(doc.page_count):
            page = doc[page_num]
            for block in page.get_text("dict")["blocks"]:
                if "lines" not in block:
                    continue  # block carries no text lines
                for line in block["lines"]:
                    for span in line["spans"]:
                        text = span["text"].strip()
                        if not text:
                            continue

                        if abs(span["size"]) > size_threshold:
                            # A new heading closes the section collected so far.
                            if current_heading and current_content.strip():
                                chunks.append({"heading": current_heading, "content": current_content.strip()})
                            current_heading = text
                            current_content = ""
                        else:
                            current_content += " " + text

    # Flush the final section if it has content.
    if current_heading and current_content.strip():
        chunks.append({"heading": current_heading, "content": current_content.strip()})

    return chunks

def create_chunks(pdf_path):
    """Load a PDF, ask the user for a chunking strategy and return the chunks as strings.

    The PDF is loaded with ``PyPDFLoader`` and the user is prompted for one of:
      1) ``character_based_chunking``
      2) ``bytes_based_chunking``
      3) ``page_based_chunking``
      4) ``extract_content_below_headings``, using the floor of the size
         returned by ``extract_font_sizes`` minus one as the heading threshold.

    Args:
        pdf_path: Path of the PDF to chunk.

    Returns:
        A list with ``str(chunk)`` for every chunk produced.

    Raises:
        ValueError: If the answer is not an integer, is not one of 1-4, or
            option 4 is chosen for a PDF in which no font size was found.
    """
    loader = PyPDFLoader(pdf_path)
    data = loader.load()
    ctype = int(input("Select the chunk type by number: 1) character based chunking 2) Byte size based chunking 3) Pagewise chunking 4) Section wise chunking( based on font size)"))

    if ctype == 1:
        chunks = character_based_chunking(data)
    elif ctype == 2:
        chunks = bytes_based_chunking(data)
    elif ctype == 3:
        chunks = page_based_chunking(data)
    elif ctype == 4:
        heading_font_size = extract_font_sizes(pdf_path)
        if heading_font_size is None:
            # Nothing to derive a heading threshold from.
            raise ValueError(f"No font sizes found in {pdf_path!r}; section wise chunking is not possible")
        print("Font sizes found in the PDF:", math.floor(heading_font_size))
        chunks = extract_content_below_headings(pdf_path, math.floor(heading_font_size) - 1)
    else:
        # Without this branch ``chunks`` would be unbound further down.
        raise ValueError(f"Unknown chunk type {ctype}; expected 1, 2, 3 or 4")

    # Convert Document objects (or dicts / strings) into strings
    texts = [str(doc) for doc in chunks]
    print("Chunk Size :", len(chunks))
    return texts

def create_tags(texts):
    """Tag every chunk via ``match_tags`` and collect the results in a DataFrame.

    Each row holds the chunk text under ``'Document'`` and the value returned
    by ``match_tags`` under ``'Tag'``.
    """
    # Candidate titles / subtitles offered to the LLM; adjust as required.
    tags = ['Introduction', 'Subheadings','Subheadings','Conclusion']
    records = [
        {'Document': chunk_text, 'Tag': match_tags(chunk_text, tags)}
        for chunk_text in texts
    ]
    return pd.DataFrame(records)

def generate_embeddings(text):
    """Encode ``text`` with the module-level ``model`` and return the result of its ``tolist()``."""
    encoded = model.encode(text)
    return encoded.tolist()

def add_chunks_to_pinecone(document_tags_df):
    """Embed each chunk and upsert it into the module-level Pinecone ``index``.

    For every row, the ``'Document'`` text is embedded with
    ``generate_embeddings`` and upserted as a single vector with id
    ``tags-<row label>`` and the text stored in the metadata. The namespace
    is the string form of the row's ``'Tag'`` value.
    """
    for row_label, record in document_tags_df.iterrows():
        chunk_text = record['Document']
        vector = {
            'id': f'tags-{row_label}',
            'values': generate_embeddings(chunk_text),
            'metadata': {'text': chunk_text},
        }
        index.upsert([vector], namespace=f"{record['Tag']}")

    print("Tags' embeddings stored in Pinecone successfully.")


# Function to get tags using LLaMA 3 model
def get_tags(prompt):
    """Send ``prompt`` to the LLM endpoint and return the generated text.

    Despite the name, this is a generic completion call; in this notebook
    ``create_summary_from_relevant_chunks`` uses it to produce slide
    summaries.

    Args:
        prompt: The full prompt string to send.

    Returns:
        The stripped ``'text'`` of the first item in the reply, or the string
        ``"Unknown"`` when the request fails or the reply does not have the
        expected shape. The return value is always a string, so callers can
        process the success and fallback values in the same way.
    """
    url = "llama38b"
    payload = json.dumps({
        "prompt": prompt,
        "max_tokens": 2048,
        "temperature": 0.1,
        "top_p": 0.95
    })
    headers = {'Content-Type': 'application/json'}

    try:
        response = requests.post(url, data=payload, headers=headers)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        response_json = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException.
        print(f"Request failed: {e}")
        return "Unknown"

    # Expected shape: non-empty list whose first item is a dict with 'text'.
    well_formed = (
        isinstance(response_json, list)
        and len(response_json) > 0
        and isinstance(response_json[0], dict)
        and 'text' in response_json[0]
    )
    if not well_formed:
        return "Unknown"

    return str(response_json[0]['text']).strip()

def extract_keywords(text):
    """Lower-case ``text``, tokenize it with ``word_tokenize`` and keep only alphanumeric tokens."""
    keywords = []
    for token in word_tokenize(text.lower()):
        if token.isalnum():
            keywords.append(token)
    return keywords

# Compare keywords with tags and find relevant chunks across multiple namespaces
def find_relevant_chunks(question):
    """Query the Pinecone namespaces whose name mentions a keyword of ``question``.

    The question is reduced to keywords, which are embedded as one
    space-joined string. Each namespace reported by
    ``index.describe_index_stats()`` is queried (top 5, with metadata) only
    if at least one keyword occurs, case-insensitively, as a substring of its
    name. All matches are returned sorted by descending ``score``.

    NOTE(review): when no namespace name contains a keyword, nothing is
    queried and an empty list is returned.
    """
    keywords = extract_keywords(question)
    query_embeddings = generate_embeddings(' '.join(keywords))

    stats = index.describe_index_stats()
    collected = []

    for namespace in list(stats['namespaces'].keys()):
        namespace_lower = namespace.lower()
        mentioned = any(kw.lower() in namespace_lower for kw in keywords)
        if not mentioned:
            continue

        result = index.query(
            vector=query_embeddings,
            top_k=5,
            include_metadata=True,
            namespace=namespace
        )
        collected.extend(result.get('matches', []))

    collected.sort(key=lambda hit: hit['score'], reverse=True)
    return collected

# Function to generate the final summary using LLM
def create_summary_from_relevant_chunks(relevant_chunks):
    """Ask the LLM (through ``get_tags``) to summarize the text of the given matches.

    The ``metadata['text']`` of each match is joined with newlines, wrapped
    in a summarization prompt, and the LLM's reply is returned.
    """
    chunk_texts = (match['metadata']['text'] for match in relevant_chunks)
    concatenated_text = "\n".join(chunk_texts)

    summary_prompt = f"Summarize the following information into a concise summary:\n\n{concatenated_text}\n\nSummary:"
    return get_tags(summary_prompt)

# Function to load PowerPoint templates from a directory
def load_templates(directory):
    """Map every PowerPoint template file in ``directory`` to its full path.

    File names are processed in sorted order so the numbering shown by
    ``select_template`` is stable between runs (``os.listdir`` makes no
    ordering guarantee), and the ``.pptx`` extension is matched
    case-insensitively.

    Args:
        directory: Folder to scan (not recursive).

    Returns:
        A dict of ``{filename: os.path.join(directory, filename)}``.
    """
    templates = {}
    for filename in sorted(os.listdir(directory)):
        if filename.lower().endswith(".pptx"):
            templates[filename] = os.path.join(directory, filename)
    return templates

# Function to select a PowerPoint template interactively
def select_template(templates):
    """List the available templates and let the user pick one by number.

    The prompt is repeated until the answer is a number between 1 and the
    number of templates. Without this range check, answering 0 would be
    turned into index -1 and silently select the last template.

    Args:
        templates: Mapping of template name to path, as built by
            ``load_templates``.

    Returns:
        The path (dict value) of the chosen template.

    Raises:
        ValueError: If ``templates`` is empty, since there is nothing to pick.
    """
    if not templates:
        raise ValueError("No PowerPoint templates available to select from.")

    paths = list(templates.values())

    print("Available templates:")
    for i, template in enumerate(templates.keys(), 1):
        print(f"{i}. {template}")

    while True:
        answer = input("Select a template by number: ")
        try:
            number = int(answer)
        except ValueError:
            print(f"'{answer}' is not a number; enter a value between 1 and {len(paths)}.")
            continue
        if 1 <= number <= len(paths):
            return paths[number - 1]
        print(f"{number} is out of range; enter a value between 1 and {len(paths)}.")

def add_slide(prs, title, content, word_limit=80):
    """Add ``content`` to the presentation, spilling onto extra slides every ``word_limit`` words.

    The content is split on whitespace. While more than ``word_limit`` words
    remain, the next ``word_limit`` words go on their own slide; the rest
    goes on a final slide. The first slide carries ``title``; every
    following slide is titled "to be contd..".
    """
    remaining = content.split()
    slide_title = title

    while True:
        if len(remaining) <= word_limit:
            # Whatever is left fits on one slide: emit it and stop.
            create_slide(prs, slide_title, ' '.join(remaining))
            return

        head, remaining = remaining[:word_limit], remaining[word_limit:]
        create_slide(prs, slide_title, ' '.join(head))
        # Continuation slides no longer carry the original title.
        slide_title = "to be contd.."

def create_slide(prs, title, content):
    """Append one title-and-content slide to ``prs``.

    The slide uses layout index 1 of the presentation. The title is set to
    ``title`` (24 pt, bold, Arial). ``content`` is split on the bullet
    markers '•' and '*'; each non-empty piece becomes a top-level paragraph
    (16 pt, Calibri, black) in placeholder 1.

    Args:
        prs: The presentation to add the slide to.
        title: Text for the title placeholder.
        content: Body text, optionally containing '•' / '*' bullet markers.
    """
    slide_layout = prs.slide_layouts[1]  # Use the layout that has a title and content
    slide = prs.slides.add_slide(slide_layout)
    title_placeholder = slide.shapes.title
    content_placeholder = slide.placeholders[1]

    # Set title
    title_placeholder.text = title
    for paragraph in title_placeholder.text_frame.paragraphs:
        for run in paragraph.runs:
            run.font.size = Pt(24)  # Set title font size
            run.font.bold = True  # Set title font bold
            run.font.name = 'Arial'  # Set title font style

    # Split content into bullet points on '•' or '*', dropping empty pieces
    bullet_points = [bp.strip() for bp in re.split(r'[•*]', content) if bp.strip()]

    # Clear any existing content
    text_frame = content_placeholder.text_frame
    text_frame.clear()

    # Add bullet points
    for position, point in enumerate(bullet_points):
        # clear() leaves a single empty paragraph behind; reuse it for the
        # first bullet so the slide does not start with a blank line.
        if position == 0:
            p = text_frame.paragraphs[0]
        else:
            p = text_frame.add_paragraph()
        p.text = point
        p.level = 0  # Level 0 for top-level bullets
        p.font.size = Pt(16)
        p.font.name = 'Calibri'
        p.font.color.rgb = RGBColor(0, 0, 0)  # Black color

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:

# End-to-end run: chunk and tag the PDF, index the chunks in Pinecone, then
# build a presentation with one summary per user-supplied slide title.
url = "llama38b"  # LLM endpoint read by match_tags
pdf_path = r"Sample.pdf" # Any pdf 
texts=create_chunks(pdf_path)
document_tags_df=create_tags(texts)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize Pinecone
# The API key is read from the PINECONE_API_KEY environment variable so it
# does not have to be pasted into the notebook.
pinecone = Pinecone(api_key=os.environ.get("PINECONE_API_KEY", ""))
dimension = model.get_sentence_embedding_dimension()  # Dimension for 'all-MiniLM-L6-v2' model
# Start from an empty index; only delete when it exists so the first run
# against a fresh project does not fail.
if "chunks" in pinecone.list_indexes().names():
    pinecone.delete_index("chunks")
pinecone.create_index(
      name="chunks",
      dimension=dimension,
      metric="cosine",
      spec=ServerlessSpec(
          cloud="aws",
          region="us-east-1"
      )
  )
index = pinecone.Index("chunks")
add_chunks_to_pinecone(document_tags_df)

#PPT
templates_directory = r"/content/templates"
templates = load_templates(templates_directory)
selected_template_path = select_template(templates)

prs = Presentation(selected_template_path)

# One slide (or slide series) per title until the user types 'done'.
while True:
    user_question = input("Enter the title (type 'done' to finish): ")
    if user_question.lower() == "done":
        break
    relevant_chunks = find_relevant_chunks(user_question)
    summary = create_summary_from_relevant_chunks(relevant_chunks)
    add_slide(prs, user_question, summary)

output_path = r"output_presentation.pptx"
prs.save(output_path)
print(f"Presentation saved to {output_path}")

Select the chunk type by number: 1) character based chunking 2) Byte size based chunking 3) Pagewise chunking 4) Section wise chunking1
Chunk Size : 30
Tags' embeddings stored in Pinecone successfully.
Available templates:
1. template1.pptx
2. template2.pptx
3. template4.pptx
4. template3.pptx
Select a template by number: 2
Enter the title (type 'done' to finish): Projection for the export
Enter the title (type 'done' to finish): Capex
Enter the title (type 'done' to finish): Price Discounting
Enter the title (type 'done' to finish): Consolidating Margin
Enter the title (type 'done' to finish): done
Presentation saved to output_presentation.pptx
