In [1]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install openai langchain "unstructured[all-docs]"
!pip install pinecone-client langchain-experimental langchain-pinecone langchain-openai
!pip install langchain-core
!pip install langchainhub
!pip install chromadb
!pip install PyPDF2
!pip install PyMuPDF
!pip install faiss-gpu

In [None]:
!apt-get install poppler-utils
!apt-get install tesseract-ocr

In [1]:
import os
import json
import chromadb
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain.docstore.document import Document
from langchain_community.vectorstores import FAISS
from tqdm import tqdm
import base64
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
from langchain_openai import ChatOpenAI
# from unstructured.partition.pdf import partition_pdf
from typing import List, Tuple
# import cv2.typing
# from PyPDF2 import PdfReader, PdfWriter
# import fitz

load_dotenv()

True

In [15]:
embedding_fn = OpenAIEmbeddings()

## Piazza Posts

In [63]:
def concatenate_posts(data):
    posts = []
    for post in data["response"]:
        final_string = ""
        final_string += "Post Subject: " + post["value"]["subject"] + "\nPost Content: " + post["value"]["content"] + "\nFollow Up Discussion: "
        for follow_up in post["value"]["follow_ups"]:
            final_string += follow_up["subject"] + " " + follow_up["content"] + " "
        posts.append(final_string)
    return posts

def load_documents(filepath):
    with open(filepath, 'r') as file:
        data = json.load(file)

    hw_posts = concatenate_posts(data)
    hw_docs = [Document(x) for x in hw_posts]
    return hw_docs

def create_faiss_index(docs, embedding_fn, pd):
    db = FAISS.from_documents(docs, embedding_fn)
    db.save_local(pd)

In [None]:
piazza_path = "./data/piazza/"
vectordb_path = "./data/vector_index/"
for fn in tqdm(os.listdir(piazza_path)):
    name = fn.split('.')[0]
    _format = fn.split('.')[1]
    if _format != "json":
        continue
    pd = vectordb_path + name + "_db"
    hw_docs = load_documents(piazza_path+fn)
    create_faiss_index(hw_docs, embedding_fn, pd)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:01<00:00,  3.97it/s]


## PDF

In [41]:
def extract_pdf_data(fpath) -> List:
    """
    Extract text, and tables from PDF
    :param fpath: filepath to the PDF
    :return: List of raw data
    """
    return partition_pdf(
        filename=fpath,
        strategy="hi_res",
        extract_images_in_pdf=False,
        extract_image_block_output_dir="temp",
        infer_table_structure=False,
        chunking_strategy="basic",
        max_characters=5000,
    )

def get_elements(raw_data) -> Tuple[List, List]:
    """
    Extract text, and tables str elements
    :param raw_data: List of raw data
    :return: List of text and table elements
    """
    texts = []
    for element in raw_data:
      if "unstructured.documents.elements.CompositeElement" in str(type(element)):
        texts.append(str(element))
    return texts

In [75]:
def process_pdfs(input_dir, output_dir, output_image_dir):
    # Iterate through PDF files in input directory
    pdf_docs = []
    for filename in os.listdir(input_dir):
        if filename.endswith('.pdf'):
            input_path = os.path.join(input_dir, filename)
            output_path = os.path.join(output_dir, f"{filename[:-4]}_pages")
            output_image_dir_path = os.path.join(output_image_dir, f"{filename[:-4]}_pages")
            os.makedirs(output_path, exist_ok=True)
            os.makedirs(output_image_dir_path, exist_ok=True)

            # Read PDF file
            img_index = 0
            with open(input_path, 'rb') as file:
                pdf_reader = PdfReader(file)
                num_pages = len(pdf_reader.pages)
                pdf_path = input_path.replace("/content/drive/My Drive/", "./data/")
                # Iterate through each page of the PDF
                for page_num in tqdm(range(num_pages)):
                    page = pdf_reader.pages[page_num]
                    output_page_path = os.path.join(output_path, f"{page_num+1}.pdf")

                    # Write current page to separate PDF file
                    with open(output_page_path, 'wb') as output_file:
                        pdf_writer = PdfWriter()
                        pdf_writer.add_page(page)
                        pdf_writer.write(output_file)


                    # iterate each page and name the loc as fpath
                    raw_data = extract_pdf_data(output_page_path)
                    texts = get_elements(raw_data)

                    pdf_document = fitz.open(output_page_path)

                    page = pdf_document.load_page(0)
                    image_list = page.get_images(full=True)
                    if len(image_list) == 0:
                      pdf_docs.append(Document(page_content=texts[0], metadata=dict(pdf=pdf_path, pdf_page_num=page_num+1)))
                      continue

                    img_info = image_list[0]
                    xref = img_info[0]
                    base_image = pdf_document.extract_image(xref)
                    image_bytes = base_image["image"]
                    image_ext = base_image["ext"]

                    # Save the image to the output directory
                    image_path = os.path.join(output_image_dir_path, f"img_{img_index+1}.jpg")
                    with open(image_path, "wb") as image_file:
                        image_file.write(image_bytes)
                    img_index += 1
                    image_path = image_path.replace("./", "./data/")


                    # save text chunk along with image, page_num, and pdf name as metadata in the vectordb
                    pdf_docs.append(Document(page_content=texts[0], metadata=dict(image_path=image_path, pdf=pdf_path, pdf_page_num=page_num+1)))

    return pdf_docs

In [None]:
input_directory = "/content/drive/My Drive/Lectures"
output_directory = "./output"
output_image_dir = "./images"
pdf_docs = process_pdfs(input_directory, output_directory, output_image_dir)

In [65]:
pdf_docs[:5]

[Document(page_content='16/30\n\nIntroduction', metadata={'image_path': './data/images/585_1_pages/img_1.jpg', 'pdf': './data/Lectures/585_1.pdf', 'pdf_page_num': 1})]

In [68]:
vectordb_path = "./data/vector_index/"
pd = vectordb_path + "pdf" + "_db"
os.makedirs(pd, exist_ok=True)
create_faiss_index(pdf_docs, embedding_fn, pd)

In [None]:
!cp -r ./images /content/drive/My\ Drive/ta_images

In [None]:
!cp -r ./data/vector_index /content/drive/My\ Drive/ta_vector_index

## Video (OCR+Transcription)

In [16]:
def concatenate_frames(data, filepath):
    frames = []
    image_path = "./data/Video/images/"+filepath.split('/')[-1].split('.')[0]+'/'
    for frame in data:
        final_string = ""
        final_string += "Frame Text: " + frame["OCR_text"] + "\nTranscription: " + frame["audio_transcription"]
        metadata=dict(image_path=image_path+frame["frame"], video=frame["link"])
        frames.append((final_string, metadata))
        
    return frames

def load_documents(filepath):
    with open(filepath, 'r') as file:
        data = json.load(file)

    video_frames = concatenate_frames(data, filepath)
    video_docs = [Document(page_content=x, metadata=y) for x, y in video_frames]
    return video_docs

def create_faiss_index(docs, embedding_fn, pd):
    db = FAISS.from_documents(docs, embedding_fn)
    db.save_local(pd)

In [17]:
video_path = "./data/Video/"
vectordb_path = "./data/video_index/"
for fn in tqdm(os.listdir(video_path)):
    if "json" not in fn:
        continue
    name = fn.split('.')[0]
    _format = fn.split('.')[1]
    if _format != "json":
        continue
    pd = vectordb_path + name + "_db"
    video_docs = load_documents(video_path+fn)
    create_faiss_index(video_docs, embedding_fn, pd)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:07<00:00,  1.86s/it]


In [18]:
video_docs[0]

Document(page_content='Frame Text: di\n\nKrishnan Luhar\n\nTranscription: all right 5:00 hi everyone 107 participants', metadata={'image_path': './data/Video/images/Lec_2/output_frame_0001.jpg', 'video': 'https://youtu.be/oNDOzpIiVEM?t=0h0m0s'})