## Setup

In [4]:
import os
import re
import json
import PyPDF2

import pandas as pd

from pptx import Presentation
import regex as re
from tqdm.notebook import tqdm, trange
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from docx import Document


## Initialise variables

In [5]:
# Input folders (relative paths, so they resolve against the notebook's working directory)
DATA_FOLDER = '../data'
SAMPLE_DATA_FOLDER = "../data/sample_files"

# Model name handed to HuggingFaceEmbedding when the embedding model is built
DEFAULT_EMBED_MODEL = "thenlper/gte-large"

# NOTE(review): the two chunking constants below are not referenced by the visible code;
# the SentenceSplitter is created with literal chunk_size=512 / chunk_overlap=256.
# Confirm which values are intended.
DEFAULT_CHUNK_SIZE = 500  # This means each chunk has at most 500 tokens
SENTENCE_CHUNK_OVERLAP = 50  # Example overlap
# Matches a run of characters containing no ',', '.' or ';', plus one optional trailing ',', '.' or ';'
CHUNKING_REGEX = r"[^,\.;]+[,\.;]?"  # Simple sentence splitter regex
DEFAULT_PARAGRAPH_SEP = "\n\n"  # Paragraph separator

### NOTE: When I ran the crawler, I did not get any .pptx or .docx files, so I manually looked up one of each. I did not find any .ppt or .doc files; LSE does not seem to host these outdated file types (from my search, at least). This notebook is therefore meant to support .pdf, .pptx, and .docx files. Kylin provided the function that reads .docx files.

## Process .pptx

In [8]:
# File names (not full paths) of the PowerPoint decks in the sample folder.
path_all_pptx = [
    entry
    for entry in os.listdir(SAMPLE_DATA_FOLDER)
    if entry.endswith('.pptx')
]

def clean_text(text):
    """Normalise whitespace and punctuation spacing in extracted document text.

    Steps, in order:
      1. newlines and tabs are replaced by a single space each;
      2. runs of two or more spaces collapse to one space;
      3. a space directly before a full stop or a comma is removed;
      4. literal sequences made of a backslash, the letter ``u``, three digits
         and one ASCII letter are removed.

    Tabs are converted *before* the space-collapsing and punctuation steps so
    that a tab next to a space (or before a full stop) cannot leave a double
    space or a stray space behind.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # All line breaks and tabs become plain spaces first, so every later rule
    # only has to reason about the space character.
    cleaned_text = re.sub(r'[\n\t]', ' ', text)
    cleaned_text = re.sub(r' {2,}', ' ', cleaned_text)
    cleaned_text = re.sub(r' \.', '.', cleaned_text)
    cleaned_text = re.sub(r' ,', ',', cleaned_text)
    cleaned_text = re.sub(r'\\u\d{3}[a-zA-Z]', '', cleaned_text)
    return cleaned_text

def process_pptx(pptx_folder):
    """Extract the text of every .pptx file found directly in ``pptx_folder``.

    Follows the same shape as ``process_pdfs`` and ``process_docx`` so that all
    three formats produce interchangeable records.

    Args:
        pptx_folder: Directory to scan (not recursive).

    Returns:
        A list of dicts with keys ``"name"`` (file name without extension),
        ``"type"`` (extension without the leading dot) and ``"text"`` (the
        cleaned text of all shapes, joined with single spaces).
    """
    pptx_files = [file for file in os.listdir(pptx_folder) if file.endswith('.pptx')]
    docs = []
    for file in pptx_files:
        prs = Presentation(os.path.join(pptx_folder, file))
        # Shapes that do not expose a `text` attribute are skipped.
        shape_texts = [
            clean_text(shape.text)
            for slide in prs.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]
        file_name, file_extension = os.path.splitext(file)
        docs.append({
            "name": file_name,
            "type": file_extension.lstrip('.'),
            # One string per presentation: all cleaned shape texts joined by a space
            "text": ' '.join(shape_texts)
        })
    return docs


pptx_data_list = process_pptx(SAMPLE_DATA_FOLDER)

pptx_data_list

[{'name': 'scs-workshop-exams-revise-and-de-stress',
  'type': 'pptx',
  'text': '1     Exam Workshop – Revise and De-stress  4 May 2016   Helen Green   Adam Sandelson    Teaching and Learning LSE Student Counselling Service Centre         2 Today’s session      Exam revision: the last few weeks Exam psychology   Plan your time Practice exam skills Take advantage of revision support Exam revision: the last few weeks Helen Amelia Green Teaching and Learning Centre Consider the exams you have, the number of days until your exams, then plan for a number of revision sessions per exam. Figure out what works for you. e.g. 1.5 hours x 4 or 5; 3 hours x 3 Work regularly, but take breaks / weekends.  Plan your time Four exams, three in one week (8-18 June) Three half-unit courses, one full unit course Three “sessions” per day, from 12 May SO468 LL4BB GV4A5 HY436       An example… Find the rhythm that works best for you. Vary each day’s work. Take breaks. Allot revision time for each exam. 23 se

## Process .pdf

In [9]:
# File names (not full paths) of the PDFs in the sample folder.
path_all_pdfs = [
    entry
    for entry in os.listdir(SAMPLE_DATA_FOLDER)
    if entry.endswith('.pdf')
]

def read_pdf(file_path):
    """Read a PDF and return its cleaned text.

    Each page's extracted text is cleaned with ``clean_text``; pages that yield
    no text are skipped. Page texts are stripped and joined with a single
    space so the last word of one page is not fused with the first word of the
    next.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The cleaned text of the whole document (``""`` when no page yields any
        text), or ``None`` if the file could not be opened or parsed. The error
        is printed in that case.
    """
    page_texts = []
    try:
        with open(file_path, "rb") as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                text = page.extract_text()
                if text:
                    # strip() so the joining space below is the only separator
                    page_texts.append(clean_text(text).strip())
    except PyPDF2.errors.PdfReadError as e:
        print(f"Error reading {file_path}: {e}")
        return None
    except Exception as e:
        # Best-effort: any other failure is reported and the file is skipped by the caller
        print(f"An error occurred with {file_path}: {e}")
        return None
    return ' '.join(page_texts)


def process_pdfs(pdf_folder):
    """Build one record per readable PDF located directly in ``pdf_folder``.

    Files for which ``read_pdf`` returns ``None`` are left out; files that
    yield an empty string are still included.

    Args:
        pdf_folder: Directory to scan (not recursive).

    Returns:
        A list of dicts with keys ``"name"``, ``"type"`` and ``"text"``.
    """
    records = []
    for pdf_name in os.listdir(pdf_folder):
        if not pdf_name.endswith('.pdf'):
            continue
        content = read_pdf(os.path.join(pdf_folder, pdf_name))
        if content is None:
            continue
        stem, extension = os.path.splitext(pdf_name)
        records.append({
            "name": stem,
            "type": extension.lstrip('.'),
            "text": content
        })
    return records

# NOTE(review): PDFs are read from DATA_FOLDER, whereas the .pptx and .docx cells read
# from SAMPLE_DATA_FOLDER — confirm this difference is intended.
pdf_data_list = process_pdfs(DATA_FOLDER)

# Bare expression so the notebook displays the extracted records
pdf_data_list

Error reading ../data/login?ReturnURL=https%3a%2f%2finfo.lse.ac.uk%2fstaff%2fdivisions%2fHuman-Resources%2fAssets%2fInternal%2fstaff%2fPolicy%2fHR-Divisional-Review-2019.pdf: EOF marker not found
Error reading ../data/login?ReturnURL=https%3a%2f%2finfo.lse.ac.uk%2fstaff%2fservices%2fPolicies-and-procedures%2fAssets%2fDocuments%2finternal%2fpaySupAcPol.pdf: EOF marker not found
Error reading ../data/login?ReturnURL=https%3a%2f%2finfo.lse.ac.uk%2fstaff%2fdivisions%2fHuman-Resources%2fAssets%2fInternal%2fstaff%2fAtoZ%2fSupport-Leave.pdf: EOF marker not found
Error reading ../data/login?ReturnURL=https%3a%2f%2finfo.lse.ac.uk%2fstaff%2fdivisions%2fHuman-Resources%2fAssets%2fInternal%2fstaff%2fAtoZ%2fHR-Partnering-Team-Roles-and-Responsibilities.pdf: EOF marker not found
Error reading ../data/login?ReturnURL=https%3a%2f%2finfo.lse.ac.uk%2fstaff%2fdivisions%2fHuman-Resources%2fAssets%2fInternal%2fService-Enquiries-Human-Resources-Division.pdf: EOF marker not found
Error reading ../data/login?

[{'name': 'Ex-Offenders-Policy-Statement-2019',
  'type': 'pdf',
  'text': '1 Policy Statement: Ex-Offenders LSE actively promotes equality of opportunity for all and actively welcomes applications from a wide range of applicants, including those with criminal records. LSE uses the Disclosure and Barring Service (DBS) to help assess the suitability of applicants when recruiting to positions that are likely to involve working with children or vulnerable adults, or other positions of trust. Therefore, LSE complies fully wi th the DBS Code of Practice and undertakes to treat all applicants fairly. LSE also undertakes not to discriminate unfairly against any subject of a DBS check on the basis of a conviction or any information revealed. All applicants who apply for position s requiring a DBS check will be made aware of the Code of Practice during the recruitment process and will be provided with a copy on request. This policy will be made available to all Disclosure applicants at the outs

## Process .docx (extract_text_from_docx function from Kylin's branch)

In [10]:
# File names of the .docx files in the sample folder. Stored under its own name so the
# PDF listing held in `path_all_pdfs` is not overwritten with Word documents.
path_all_docx = [file for file in os.listdir(SAMPLE_DATA_FOLDER) if file.endswith('.docx')]


def extract_text_from_docx(docx_path):
    """Return the cleaned paragraph text of a .docx file.

    Every paragraph is followed by a newline before the combined string is
    passed through ``clean_text``.

    Args:
        docx_path: Path of the .docx file.

    Returns:
        The cleaned text, or ``""`` if anything goes wrong while loading or
        reading the document (the error is printed).
    """
    try:
        paragraphs = Document(docx_path).paragraphs
        raw_text = "".join(para.text + "\n" for para in paragraphs)
        return clean_text(raw_text)
    except Exception as e:
        print(f"Error extracting text from {docx_path}: {e}")
        return ""
    
def process_docx(docx_folder):
    """Build one record per .docx file in ``docx_folder`` that yields text.

    Documents for which ``extract_text_from_docx`` returns an empty string
    (including those that failed to load) are left out.

    Args:
        docx_folder: Directory to scan (not recursive).

    Returns:
        A list of dicts with keys ``"name"``, ``"type"`` and ``"text"``.
    """
    records = []
    docx_names = (name for name in os.listdir(docx_folder) if name.endswith('.docx'))
    for docx_name in docx_names:
        content = extract_text_from_docx(os.path.join(docx_folder, docx_name))
        if not content:
            continue
        stem, extension = os.path.splitext(docx_name)
        records.append({
            "name": stem,
            "type": extension.lstrip('.'),
            "text": content
        })
    return records

docx_data = process_docx(SAMPLE_DATA_FOLDER)
docx_data

[{'name': 'LSE-Graduate-Admissions-Reference-request-form',
  'type': 'docx',
  'text': ' LSE Graduate Admissions – reference request form Please use this form only if you are unable to use our online system as detailed in our request email to you. Sending your reference by email is slower and may delay the application. Thank you for agreeing to write a reference. Your letter is important to the applicant; we cannot consider applications without supporting references. If you are unable to provide a reference, please let the applicant know immediately to allow them to make alternative arrangements. Please follow the instructions below: Supply the applicant’s details so that we can match your reference to the correct application. Answer the remaining questions, including ranking the applicant relative to their peers. This structured section makes it easier for us to compare references. Write your letter of reference, bearing in mind the questions at the end of this form. Send your letter

## Complete the embeddings

In [11]:
def generate_chunk_entry(doc_type, chunk_name, chunk_description, embedding_model):
    """Embed one chunk of text and wrap it in a record.

    Args:
        doc_type: Value stored under ``"Type"``.
        chunk_name: Value stored under ``"Name"``; also used in the error message.
        chunk_description: The chunk text; it is embedded and stored under
            ``"Description"``.
        embedding_model: Object providing ``get_text_embedding(text)``.

    Returns:
        A dict with keys ``"Type"``, ``"Name"``, ``"Description"`` and
        ``"Embedding"``, or ``None`` if computing the embedding raised (the
        error is printed).
    """
    try:
        vector = embedding_model.get_text_embedding(chunk_description)
    except Exception as e:
        print(f"Error computing embedding for chunk {chunk_name}: {e}")
        return None

    return {
        "Type": doc_type,
        "Name": chunk_name,
        "Description": chunk_description,
        "Embedding": vector
    }

def generate_json_entry(doc_id, doc_type, doc_name, description, link, splitter, embedding_model):
    """Split a document into chunks, embed each chunk, and build its JSON record.

    Chunks are named ``"<doc_name> - Part <i>"`` with ``i`` starting at 1.
    Chunks for which ``generate_chunk_entry`` returns a falsy value are dropped.

    Args:
        doc_id: Value stored under ``"Id"``.
        doc_type: Passed through to each chunk record.
        doc_name: Value stored under ``"Name"`` and used to name the chunks.
        description: Full document text; split into chunks and stored under
            ``"Description"``.
        link: Value stored under ``"Link"``.
        splitter: Object providing ``split_text(text)``.
        embedding_model: Passed through to ``generate_chunk_entry``.

    Returns:
        A dict with keys ``"Id"``, ``"Name"``, ``"Description"``, ``"Link"``
        and ``"Chunks"``, or ``None`` if any step raised (the error is printed).
    """
    try:
        numbered_pieces = enumerate(splitter.split_text(description), start=1)
        candidates = (
            generate_chunk_entry(doc_type, f"{doc_name} - Part {part_no}", piece, embedding_model)
            for part_no, piece in numbered_pieces
        )
        # Keep only the chunks whose embedding was computed successfully
        chunks = [entry for entry in candidates if entry]

        return {
            "Id": doc_id,
            "Name": doc_name,
            "Description": description,
            "Link": link,
            "Chunks": chunks
        }
    except Exception as e:
        print(f"Failed to compute embedding for {doc_name}: {e}")
        return None
    


# Merge the records produced for each file format into one list
all_docs = docx_data + pdf_data_list + pptx_data_list

# (id, type, name, text) tuples; the id is the 1-based position in all_docs
documents = [
    (position, entry["type"], entry["name"], entry["text"])
    for position, entry in enumerate(all_docs, start=1)
]

# NOTE(review): chunk size/overlap are literals here rather than the
# DEFAULT_CHUNK_SIZE / SENTENCE_CHUNK_OVERLAP constants — confirm which is intended.
splitter = SentenceSplitter(chunk_size=512, chunk_overlap=256)
embed_model = HuggingFaceEmbedding(model_name=DEFAULT_EMBED_MODEL)

# Build one JSON entry per document, reporting those that could not be processed
json_data = []
for doc_id, doc_type, doc_name, description in documents:
    json_entry = generate_json_entry(doc_id, doc_type, doc_name, description, "some_link", splitter, embed_model)
    if not json_entry:
        print(f"Failed to create JSON entry for {doc_name}")
        continue
    json_data.append(json_entry)

# Write everything to disk as a single JSON file
json_file_path = "../data/seed_lse_data.json"
try:
    with open(json_file_path, "w") as f:
        json.dump(json_data, f, indent=4)
    print(f"JSON file created successfully at {json_file_path}")
except Exception as e:
    print(f"Failed to write JSON file: {e}")



JSON file created successfully at seed_lse_data.json
