## Setup

In [1]:
import os
import re
import json
import PyPDF2

import pandas as pd

from pptx import Presentation
import regex as re
from tqdm.notebook import tqdm, trange
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import subprocess

  from .autonotebook import tqdm as notebook_tqdm


## Initialise variables

In [2]:
DATA_FOLDER = '../data'
SAMPLE_DATA_FOLDER = "../data/sample_files"

# Assuming constants and imports are defined elsewhere
DEFAULT_EMBED_MODEL = "thenlper/gte-large"

DEFAULT_CHUNK_SIZE = 500  # This means each chunk has at most 500 tokens
SENTENCE_CHUNK_OVERLAP = 50  # Example overlap
CHUNKING_REGEX = r"[^,\.;]+[,\.;]?"  # Simple sentence splitter regex
DEFAULT_PARAGRAPH_SEP = "\n\n"  # Paragraph separator

## Utilities

In [3]:
def generate_chunk_entry(doc_type, chunk_name, chunk_description, embedding_model):
    try:
        embedding = embedding_model.get_text_embedding(chunk_description)
        return {
            "Type": doc_type,
            "Name": chunk_name,
            "Description": chunk_description,
            "Embedding": embedding  # Ensure the embedding is serializable
        }
    except Exception as e:
        print(f"Error computing embedding for chunk {chunk_name}: {e}")
        return None

# Define the function to generate a JSON entry for each document
def generate_json_entry(doc_id, doc_type, doc_name, description, link, splitter, embedding_model):
    try:
        # Split the description into chunks
        sentence_chunks = splitter.split_text(description)
        chunks = []
        for i, chunk in enumerate(sentence_chunks, 1):
            chunk_entry = generate_chunk_entry(doc_type, f"{doc_name} - Part {i}", chunk, embedding_model)
            if chunk_entry:
                chunks.append(chunk_entry)

        return {
            "Id": doc_id,
            "Name": doc_name,
            "Description": description,
            "Link": link,
            "Chunks": chunks
        }
    except Exception as e:
        print(f"Failed to compute embedding for {doc_name}: {e}")
        return None
    
def clean_text(text):
    cleaned_text = re.sub(r'\n', ' ', text)
    cleaned_text = re.sub(r' {2,}', ' ', cleaned_text)
    cleaned_text = re.sub(r' \.', '.', cleaned_text)
    cleaned_text = re.sub(r' ,', ',', cleaned_text)
    cleaned_text = re.sub(r'\t', ' ', cleaned_text)
    cleaned_text = re.sub(r'\\u\d{3}[a-zA-Z]', '', cleaned_text)
    return cleaned_text


### Check if we have any .pptx files from the crawler

In [4]:
path_all_pdfs = [file for file in os.listdir(DATA_FOLDER)]
print(path_all_pdfs)

pptx_found = False

for path in path_all_pdfs:
    if path.endswith(".pptx"):
        print(path)
        pptx_found = True

if not pptx_found:
    print("No file in the folder is a .pptx file")


['login?ReturnURL=https%3a%2f%2finfo.lse.ac.uk%2fstaff%2fdivisions%2fHuman-Resources%2fAssets%2fInternal%2fstaff%2fPolicy%2fHR-Divisional-Review-2019.pdf', 'login?ReturnURL=https%3a%2f%2finfo.lse.ac.uk%2fstaff%2fservices%2fPolicies-and-procedures%2fAssets%2fDocuments%2finternal%2fpaySupAcPol.pdf', 'login?ReturnURL=https%3a%2f%2finfo.lse.ac.uk%2fstaff%2fdivisions%2fHuman-Resources%2fAssets%2fInternal%2fstaff%2fAtoZ%2fSupport-Leave.pdf', 'Ex-Offenders-Policy-Statement-2019.pdf', 'HR-Adviser-HR-Administrator-Roles-and-Responsibilities.pdf', '18-0074-Staff-Lifecycle-Infographic-v9-latest.pdf', 'login?ReturnURL=https%3a%2f%2finfo.lse.ac.uk%2fstaff%2fdivisions%2fHuman-Resources%2fAssets%2fInternal%2fstaff%2fAtoZ%2fHR-Partnering-Team-Roles-and-Responsibilities.pdf', 'Non-Standard-Leave-Request-Form.docx', 'perRelPolAndPro.pdf', 'adoPol.pdf', 'MaternityandAdoption-Leave-Return-Form.docx', 'Authorising-Annual-Leave-Online-and-Delegation-Guidance.pdf', 'redPolPro.pdf', 'resAll.pdf', 'ProcessingN

In that case, we will need to manually upload our own .pptx slides to provide support for embedding. Let's upload these in a separate folder in "data".

## Process .pptx, .ppt, and .pdf first

In [8]:
path_all_pptx = [file for file in os.listdir(SAMPLE_DATA_FOLDER) if file.endswith('.pptx')]
path_all_ppt = [file for file in os.listdir(SAMPLE_DATA_FOLDER) if file.endswith('.ppt')]
path_all_pdf = [file for file in os.listdir(DATA_FOLDER) if file.endswith('.pdf')]

def read_pdf(file_path):
    all_text = ""
    try:
        with open(file_path, "rb") as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                text = page.extract_text()
                if text:
                    all_text += text
    except PyPDF2.errors.PdfReadError as e:
        print(f"Error reading {file_path}: {e}")
        return None
    except Exception as e:
        print(f"An error occurred with {file_path}: {e}")
        return None
    return all_text

def convert_ppt_to_pptx(ppt_path, output_folder):
    pptx_path = os.path.join(output_folder, os.path.splitext(os.path.basename(ppt_path))[0] + '.pptx')
    try:
        subprocess.run(['libreoffice', '--headless', '--convert-to', 'pptx', '--outdir', output_folder, ppt_path], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error converting {ppt_path}: {e}")
        return None
    return pptx_path

def process_ppt_files(ppt_folder, output_folder):
    path_all_ppts = [file for file in os.listdir(ppt_folder) if file.endswith('.ppt')]
    pptx_data_list = []

    for path in path_all_ppts:
        ppt_path = os.path.join(ppt_folder, path)
        pptx_path = convert_ppt_to_pptx(ppt_path, output_folder)
        if pptx_path:
            pptx_data_list.extend(process_pptx_files(pptx_path))
    return pptx_data_list

def process_pptx_files(pptx_path):
    prs = Presentation(pptx_path)
    pptx_texts = []
    for slide_number, slide in enumerate(prs.slides):
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                pptx_texts.append(clean_text(shape.text))
    all_text = ' '.join(pptx_texts)
    file_name, file_extension = os.path.splitext(os.path.basename(pptx_path))
    return [{
        "name": file_name,
        "type": file_extension.lstrip('.'),
        "text": all_text
    }]


def process_pdfs(pdf_folder):
    path_all_pdfs = [file for file in os.listdir(pdf_folder) if file.endswith('.pdf')]
    docs = []
    for file in path_all_pdfs:
        file_path = os.path.join(pdf_folder, file)
        text = read_pdf(file_path)
        if text is not None:
            file_name, file_extension = os.path.splitext(file)
            doc_entry = {
                "name": file_name,
                "type": file_extension.lstrip('.'),
                "text": text
            }
            docs.append(doc_entry)
    return docs

def clean_docs(docs):
    cleaned_docs = []
    for doc in docs:
        cleaned_doc = {
            "name": doc["name"],
            "type": doc["type"],
            "text": clean_text(doc["text"])
        }
        cleaned_docs.append(cleaned_doc)
    return cleaned_docs

## Complete the embeddings

In [9]:
ppt_docs = process_ppt_files(SAMPLE_DATA_FOLDER, SAMPLE_DATA_FOLDER)
pptx_docs = process_pptx_files(SAMPLE_DATA_FOLDER)
pdf_docs = process_pdfs(DATA_FOLDER)

# Clean the documents
cleaned_ppt_docs = clean_docs(ppt_docs)
cleaned_pptx_docs = clean_docs(pptx_docs)
cleaned_pdf_docs = clean_docs(pdf_docs)

# Combine all documents
all_docs = cleaned_ppt_docs + cleaned_pptx_docs + cleaned_pdf_docs

# Generate JSON data
documents = [(counter + 1, doc["type"], doc["name"], doc["text"]) for counter, doc in enumerate(all_docs)]

json_data = []

splitter = SentenceSplitter(chunk_size=512, chunk_overlap=256)
embed_model = HuggingFaceEmbedding(model_name=DEFAULT_EMBED_MODEL) 

for doc_id, doc_type, doc_name, description in documents:
    json_entry = generate_json_entry(doc_id, doc_type, doc_name, description, "some_link", splitter, embed_model)
    if json_entry:
        json_data.append(json_entry)
    else:
        print(f"Failed to create JSON entry for {doc_name}")

# Save the JSON data to a file
json_file_path = "seed_lse_data.json"
try:
    with open(json_file_path, "w") as f:
        json.dump(json_data, f, indent=4)
    print(f"JSON file created successfully at {json_file_path}")
except Exception as e:
    print(f"Failed to write JSON file: {e}")

FileNotFoundError: [Errno 2] No such file or directory: 'libreoffice'

In [None]:
import os
import comtypes.client

def convert_ppt_to_pptx(ppt_path):
    # Ensure the input file exists
    if not os.path.exists(ppt_path):
        raise FileNotFoundError(f"The file {ppt_path} does not exist")

    # Define the output file path
    pptx_path = os.path.splitext(ppt_path)[0] + '.pptx'

    # Initialize PowerPoint application
    powerpoint = comtypes.client.CreateObject("Powerpoint.Application")
    powerpoint.Visible = 1

    # Open the presentation
    presentation = powerpoint.Presentations.Open(ppt_path)

    # Save the presentation as pptx
    presentation.SaveAs(pptx_path, FileFormat=24)  # 24 is the file format for pptx

    # Close the presentation and quit PowerPoint
    presentation.Close()
    powerpoint.Quit()

    print(f"Converted {ppt_path} to {pptx_path}")
    return pptx_path

# Example usage
ppt_file = "example.ppt"
convert_ppt_to_pptx(ppt_file)
