In [16]:
from moviepy.editor import ImageSequenceClip, AudioFileClip, concatenate_videoclips
from pymilvus import Milvus, MilvusClient, IndexType, connections, utility
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from pdf2image.exceptions import PDFPageCountError, PDFSyntaxError
from langchain.text_splitter import RecursiveCharacterTextSplitter
from moviepy.editor import concatenate_videoclips, ImageClip
from langchain.chains.summarize import load_summarize_chain
from sentence_transformers import SentenceTransformer
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from moviepy.config import change_settings
from langchain.chains.llm import LLMChain
from langchain.chains import SequentialChain
from langchain_openai import ChatOpenAI
from pdf2image import convert_from_path
from milvus import default_server
from dotenv import load_dotenv
from pydub import AudioSegment
from datetime import datetime
from openai import OpenAI
from PIL import Image
import gradio as gr
import numpy as np
import feedparser
import requests
import imageio
import base64
import pprint
import torch
import re
import os

In [2]:
change_settings({"FFMPEG_BINARY": "/opt/homebrew/bin/ffmpeg", "DYLD_LIBRARY_PATH":"/opt/homebrew/bin/convert"})
# Set up a Milvus client
default_server.start()
host="127.0.0.1"
connections.connect(host=host, port=default_server.listen_port)
port=default_server.listen_port
my_uri = "http://localhost:" + str(port)
print(my_uri)

http://localhost:19531


In [13]:
def get_env_variables():
    """Fetch all necessary configurations from environment variables."""
    return {
        'OPENAI_API_KEY': os.getenv('OPENAI_API_KEY'),
        'ELEVEN_LABS_API_KEY': os.getenv('ELEVEN_LABS_API_KEY')
    }


def create_folder(folder_name):
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
        print(f"The folder '{folder_name}' has been created.")
    else:
        print(f"The folder '{folder_name}' already exists.")


def arxiv_id_from_url(url):
    # Extract the arXiv ID from the URL using a regular expression
    match = re.search(r'arxiv\.org/pdf/(\d+\.\d+)', url)
    if match:
        return match.group(1)
    else:
        return None
        

def download_and_save_pdf(url, folder_pdfs):
    """
    Download and save a PDF file from an arXiv.org URL into local directory.

    Parameters:
    - url (str): The arXiv.org URL of the paper.

    Returns:
    - str: ArXiv ID of the downloaded paper if successful, or an error message.
    """
    # Extract arXiv ID from the URL
    arxiv_id = arxiv_id_from_url(url)

    arxiv_name = arxiv_id.replace(".", "_")
    pdf_path = os.path.join(folder_pdfs, arxiv_name)
    create_folder(pdf_path)

    # Check if a valid arXiv ID was extracted
    if arxiv_id:
        try:
            # Make a request to the arXiv API
            feed = feedparser.parse(f'http://export.arxiv.org/api/query?id_list={arxiv_id}')

            # Check if the response contains entries
            if 'entries' in feed:
                # Iterate over each entry (paper) in the feed
                for entry in feed.entries:
                    # Extract the PDF link from the entry
                    pdf_link = entry.link.replace('/abs/', '/pdf/') + '.pdf'

                    # Download the PDF
                    response = requests.get(pdf_link)

                    # Save the PDF in the local directory with the name based on the arXiv ID
                    with open(f'{pdf_path}/{arxiv_name}.pdf', 'wb') as pdf_file:
                        pdf_file.write(response.content)

                    print(f"\nPDF downloaded and saved as {arxiv_name}.pdf")
                    return arxiv_id

            else:
                return f"\nNo entries found for arXiv ID {arxiv_id}"

        except Exception as e:
            return f"\nError extracting information: {e}"
    else:
        return "Invalid arXiv PDF URL format. Please enter a valid URL."


def download_and_initialize_embedding_model(model_name="WhereIsAI/UAE-Large-V1", device=None):
    """
    Download and initialize the Sentence Transformer model.

    Parameters:
    - model_name (str): The name of the Sentence Transformer model to download.
    - device (str or torch.device): The device to use for the model (e.g., 'cuda:3' or 'cpu').

    Returns:
    - encoder (SentenceTransformer): The initialized Sentence Transformer model.
    - EMBEDDING_DIM (int): The embedding dimension of the model.
    - MAX_SEQ_LENGTH (int): The maximum sequence length.

    Example usage:
    encoder, EMBEDDING_DIM, MAX_SEQ_LENGTH = download_and_initialize_embedding_model()
    """
    # Initialize torch settings
    torch.backends.cudnn.deterministic = True
    DEVICE = torch.device(device) if device else torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
    print(f"\ndevice: {DEVICE}")

    # Load the model from the Hugging Face model hub
    encoder = SentenceTransformer(model_name, device=DEVICE)
    print(f"\nDatatype of SentenceTransformer encoded object{type(encoder)}\n")
    print(f"\nWhat the encoder object looks like: {encoder}\n")

    # Get the model parameters and save for later
    EMBEDDING_DIM = encoder.get_sentence_embedding_dimension()
    try:
        MAX_SEQ_LENGTH_IN_TOKENS = encoder.get_MAX_SEQ_LENGTH()
    except AttributeError:
        MAX_SEQ_LENGTH_IN_TOKENS = 512
    # Assume tokens are 3 characters long
    # MAX_SEQ_LENGTH = MAX_SEQ_LENGTH_IN_TOKENS * 3
    # HF_EOS_TOKEN_LENGTH = 1 * 3
    # Test with 512 sequence length
    MAX_SEQ_LENGTH = MAX_SEQ_LENGTH_IN_TOKENS
    HF_EOS_TOKEN_LENGTH = 1

    # Inspect model parameters
    print(f"\nmodel_name: {model_name}")
    print(f"\nEMBEDDING_DIM: {EMBEDDING_DIM}")
    print(f"\nMAX_SEQ_LENGTH: {MAX_SEQ_LENGTH}")

    return encoder, EMBEDDING_DIM, MAX_SEQ_LENGTH


def create_milvus_collection(COLLECTION_NAME, EMBEDDING_DIM, uri=my_uri):
    """
    Create a no-schema Milvus collection and define the database index.

    Parameters:
    - uri (str): The URI of the Milvus server.
    - COLLECTION_NAME (str): The name of the Milvus collection.
    - EMBEDDING_DIM (int): The dimension of the embedding vectors.

    Returns:
    - milvus_client (Milvus): The Milvus client instance.


    Example usage:
    my_uri = "tcp://127.0.0.1:19530"
    COLLECTION_NAME = "MilvusDocs"
    my_EMBEDDING_DIM = 1024
    
    milvus_client = create_milvus_collection(COLLECTION_NAME, EMBEDDING_DIM, uri=my_uri)
    """

    # For vector similarity search applications that require perfect accuracy and depend 
    # on relatively small (million-scale) datasets, the FLAT index is a good choice. 
    # FLAT does not compress vectors, and is the only index that can guarantee exact 
    # search results. Results from FLAT can also be used as a point of comparison for 
    # results produced by other indexes that have less than 100% recall.

    # FLAT is accurate because it takes an exhaustive approach to search, which means for 
    # each query the target input is compared to every set of vectors in a dataset. This 
    # makes FLAT the slowest index on our list, and poorly suited for querying massive
    # vector data. There are no parameters required for the FLAT index in Milvus, and 
    # using it does not need data training.

    index_params = {
        "index_type": IndexType.FLAT,
        "metric_type": "COSINE",
    }

    # Use no-schema Milvus client using flexible json key:value format.
    milvus_client = MilvusClient(uri=my_uri)

    # Check if collection already exists, if so drop it.
    if utility.has_collection(COLLECTION_NAME):
        utility.drop_collection(COLLECTION_NAME)
        print(f"\nCollection had previously been created, dropping previous collection to initialize anew: `{COLLECTION_NAME}`")

    # Create the collection.
    milvus_client.create_collection(COLLECTION_NAME, EMBEDDING_DIM,
                                    consistency_level="Eventually",
                                    auto_id=True,
                                    overwrite=True,
                                    params=index_params)

    print(f"\nSuccessfully created collection: `{COLLECTION_NAME}`")
    print(milvus_client.describe_collection(COLLECTION_NAME))

    return milvus_client


def split_documents_to_chunks(docs, MAX_SEQ_LENGTH, hf_eos_token_length):
    """
    Split documents into smaller recursive chunks using Sentence Transformers' RecursiveCharacterTextSplitter.

    Parameters:
    - docs (list): List of documents to be split.
    - MAX_SEQ_LENGTH (int): Maximum sequence length.
    - hf_eos_token_length (int): Length of the EOS token.

    Returns:
    - chunks (list): List of chunks.

    Example usage:
    MAX_SEQ_LENGTH = 512
    HF_EOS_TOKEN_LENGTH = 3
    docs = ["Document 1 text.", "Document 2 text.", "Document 3 text."]
    
    resulting_chunks = split_documents_to_chunks(docs, MAX_SEQ_LENGTH, HF_EOS_TOKEN_LENGTH)
    print(resulting_chunks)
    """
    # Calculate chunk size and overlap
    chunk_size = MAX_SEQ_LENGTH - hf_eos_token_length
    chunk_overlap = int(round(chunk_size * 0.10, 0))

    # Create an instance of the RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )

    # Split the documents further into smaller, recursive chunks.
    chunks = text_splitter.split_documents(docs)
    
    return chunks


def insert_chunks_into_milvus(chunks, COLLECTION_NAME, encoder, milvus_client, MAX_SEQ_LENGTH, hf_eos_token_length):
    """
    Insert document chunks into a Milvus collection.

    Parameters:
    - docs (list): List of documents to be inserted.
    - COLLECTION_NAME (str): Name of the Milvus collection.
    - encoder (SentenceTransformer): SentenceTransformer model for generating embeddings.
    - milvus_client (Milvus): Milvus client instance.
    - MAX_SEQ_LENGTH (int): Maximum sequence length.
    - hf_eos_token_length (int): Length of the EOS token.

    Returns:
    - insert_time (float): Time taken for the insertion process.

    Example Usage assuming 'chunks' is a list of dictionaries with 'page_content' and 'metadata' keys:
    MAX_SEQ_LENGTH = 512
    HF_EOS_TOKEN_LENGTH = 3
    COLLECTION_NAME = "MilvusDocs"
    ENCODER_MODEL_NAME = "WhereIsAI/UAE-Large-V1"
    # Initialize Milvus client
    # Initialize encoder model
    
    resulting_insert_time = insert_chunks_into_milvus(chunks, COLLECTION_NAME, encoder, milvus_client, MAX_SEQ_LENGTH, HF_EOS_TOKEN_LENGTH)
    """
    # Convert chunks to a list of dictionaries.
    chunk_list = []
    for chunk in chunks:
    
        # Generate embeddings using encoder from HuggingFace.
        embeddings = torch.tensor(encoder.encode([chunk.page_content]))
        # embeddings = F.normalize(embeddings, p=2, dim=1) #use torch
        embeddings = np.array(embeddings / np.linalg.norm(embeddings)) #use numpy
        converted_values = list(map(np.float32, embeddings))[0]
        
        # Assemble embedding vector, original text chunk, metadata.
        chunk_dict = {
            'vector': converted_values,
            'chunk': chunk.page_content,
            'source': chunk.metadata['page']
        }
        chunk_list.append(chunk_dict)

    # Insert data into the Milvus collection.
    print("Start inserting entities")

    inserted_chunks = milvus_client.insert(
        COLLECTION_NAME,
        data=chunk_list,
        progress_bar=True
    )
    print("Finished inserting entities")

    # After the final entity is inserted, call flush to stop growing segments left in memory.
    milvus_client.flush(COLLECTION_NAME)

    return print(f"\nNumber of chunks inserted into Milvus database: {len(inserted_chunks)} with chunk id starting at number: {inserted_chunks[0]}\n")


def client_assemble_retrieved_context(retrieved_top_k, metadata_fields=[], num_shot_answers=3):
    """ 
    For each question, assemble the context and metadata from the retrieved_top_k chunks.
    retrieved_top_k: list of dicts

    Example Usage:
    formatted_results, context, context_metadata = client_assemble_retrieved_context(results, metadata_fields=metadata_fields, num_shot_answers=top_k)
    """
    # Assemble the context as a stuffed string.
    distances = []
    context = []
    context_metadata = []
    i = 1
    for r in retrieved_top_k[0]:
        distances.append(r['distance'])
        if i <= num_shot_answers:
            if len(metadata_fields) > 0:
                metadata = {}
                for field in metadata_fields:
                    metadata[field] = r['entity'][field]
                context_metadata.append(metadata)
            context.append(r['entity']['chunk'])
        i += 1

    # Assemble formatted results in a zipped list.
    formatted_results = list(zip(distances, context, context_metadata))
    # Return all the things for convenience.
    return formatted_results, context, context_metadata
    

def search_and_generate_response(docs, LLM_NAME, key):
    """
    Extracts text before the "Abstract" section from a document, then generates various responses and edits related to the academic paper based on the extracted text.

    Args:
    - docs (list): Lanchain document object, the first one of the object grabbed for page title of academic paper.
    - LLM_NAME (str): Name of the OpenAI language model to be used.
    - key (dict): A dictionary containing the OpenAI API key.

    Returns:
    - dict: A dictionary containing the generated responses and edits related to the academic paper.
    """

    # Extract text before the "Abstract" section therefore always grabbing paper title, authors and universities for search and reference.
    text_before_abstract = re.search(r'^(.*?)\bAbstract\b', docs[0].page_content, re.DOTALL)

    dish_name = text_before_abstract.group(1)

    # Initialize OpenAI language model
    llm = ChatOpenAI(model_name=LLM_NAME, temperature=0, openai_api_key=key['OPENAI_API_KEY']) 

    # This is an LLMChain to define prompt templates for different stages
    prompt_outline = PromptTemplate.from_template(
        template= """
        - Use only the the paper {dish_name} to answer the following: 
        - Pick the top 5 most occurring terminology in {dish_name} and define the terms.
        - Write 2 sentences to give the definition of each of the 5 most occurring terminology in {dish_name}, for a total of 10 sentences.
        - Answer what are the key contributions of the paper {dish_name} in one sentence?
        - Answer what are the evluation metrics or new approaches proposed in {dish_name}? 
    """)
    # This is an LLMChain to define prompt templates for different stages
    prompt_summary = PromptTemplate.from_template(
        template=""" Assume the role of a NeurIPS Paper Evaluation Committee Member who evaluates Use only is evluating the {dish_name} paper based on various criteria such as novelty, significance, and technical soundness to pitch to a stakeholder.
        - The output text should more than 1000 characters:
        - Write 3 sentences to explain in detail the key contributions answered in {outline} of the paper {dish_name}.
        - Expand upon the evluation metrics or new approaches from {outline} by summarizing each and every paragraph in {dish_name} into one sentence. If the paper has 20 paragraphs, there this prompt needs to output 20 sentences.
        """
    )
    # This is an LLMChain to define prompt templates for different stages
    prompt_edit = PromptTemplate.from_template(
        template="""Edit the entire script {summary} for good flow, professionalism, redundancy removal in a voiceover text format.
        - Assume the role of a NeurIPS Paper Evaluation Committee Member who evaluates Use only is evluating the {dish_name} paper based on various criteria such as novelty, significance, and technical soundness to pitch to a stakeholder.
        - It is of utmost importance to be as specific as possible, going into the technical details, using technical terminology 
        - Make the output longer than 2500 characters.
        - Keep the detail and specificity from {summary} where {dish_name} where every paragraph was summarized into one sentence
        - Make sure that it is obviously stated to the video viewer the related work that the paper is built on, use the terminology and definitions from the {summary} or {outline}.
        - In the edit, ensure that the voiceover text states the following specifics, if not, go back to the {dish_name} paper to extract more specific details:
        - what research was previously known in the past that the paper is based upon that the paper advances previously known information, what are the technologies that were previously known involved
        - specific methods used,
        - how is the technique, actions or methods performed advanced the field, or challenges the status-quo
        - what are the major contributions, spend 3 sentences to elaborate the details, using terminology from {outline} 
        - What are the key metrics that define the success of the work
        - make sure to always reference the paper {dish_name}  what are the technologies that were previously known involved
        - for each of the technologies involved, if the technologies are from the 5 most occurring terminology in {outline}, edit that part of the {outline} into a complete sentence without further edits.
        - for each of the technologies involved, further elaborate the technology by citing how the {dish_name} has introduced these technologies by looking at the terminology in the paper and summarizing the words before and after. 
        - what are future directions that lie ahead. Cite the grounding sources. 
        - Always start by stating the title and authors of the paper as the first few words.
        - Make sure the output is at least 2500 characters or tokens in length, if not reference the paper {dish_name} to answer the previous points more in depth
        """
    )
    # This is an LLMChain to define prompt templates for different stages
    prompt_metrics = PromptTemplate.from_template(
        template="""
        - What are the technologies that were previously known involved from the paper {dish_name}?
        - If you cannot find the paper {dish_name} and are going to output Unfortunately, as an AI language model, I do not have access to specific papers or their content
        - Do not say you do not have access, keep trying to reference the paper {dish_name}. 
        - For each of the technologies involved, if the technologies are from the 5 most occurring terminology in {outline}, edit that part of the {outline} into a complete sentence without further edits. Further elaborate the technology by citing how the {dish_name} has introduced these technologies by looking at the terminology in the paper and summarizing the words before and after. 
        - What are the key metrics that define the success of the work from the paper? Name specific evlauation metrics used in the research?
        - What was the datset was used?
        - How was the research structured?
        - Describe the results and what metrics the paper {dish_name} claims to have beaten? What scores does the paper claim to have gotten compared to previous work?
        - List the previous methods that the paper beat. 
        """
    )
    
    # Create LLM chains for different tasks
    chain_outline = LLMChain(llm=llm, prompt=prompt_outline, output_key="outline")
    chain_summary = LLMChain(llm=llm, prompt=prompt_summary, output_key="summary")
    chain_edit = LLMChain(llm=llm, prompt=prompt_edit, output_key="edit")
    chain_metrics = LLMChain(llm=llm, prompt=prompt_metrics, output_key="metrics")

    # Assemble overall chain
    overall_chain = SequentialChain(
        chains=[chain_outline, chain_summary, chain_edit, chain_metrics],
        input_variables=["dish_name"],
        output_variables=["outline","summary", "edit", "metrics"],
    )
    # Execute overall chain with input data    
    result = overall_chain({"dish_name": text_before_abstract})

    return result


def save_transcript(response_choices, folder_transcripts, arxiv_name):
    """
    Save the first element of response_choices into a text file in the specified directory.

    Parameters:
    - response_choices (list): A list of choices where the first element is the transcript content.
    - folder_transcripts (str): The directory path where the transcript file will be saved.
    - arxiv_name (str): The name used for generating the transcript file.

    Returns:
    None: The function saves the transcript content to a text file.

    Example:
    save_transcript(["This is the transcript content."], "transcripts", "example_arxiv")
    """
    # Ensure the directory path is valid
    if not os.path.exists(folder_transcripts):
        os.makedirs(folder_transcripts)

    # Generate the file path
    file_path = os.path.join(folder_transcripts, f"{arxiv_name}.txt")

    # Save response_choices[0] to the file
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(response_choices[0])

    print(f"\nTranscript saved in: {file_path}")


def text_to_speech(text_for_TTS, arxiv_name, folder_audio):

    ELEVEN_LABS_API_KEY = os.environ.get("ELEVEN_LABS_API_KEY")

    CHUNK_SIZE = 1024
    url = "https://api.elevenlabs.io/v1/text-to-speech/bVMeCyTHy58xNoL34h3p"

    headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": ELEVEN_LABS_API_KEY
    }

    data = {
        "text": text_for_TTS,
        "model_id": "eleven_monolingual_v1",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.5
        }
    }

    # Generate a unique filename based on timestamp
    filename = f"output_{arxiv_name}.mp3"
    target_path = os.path.join(folder_audio, filename)

    # Check if the file already exists
    if os.path.exists(target_path):
        print(f"Recording file {filename} already exists in {folder_audio}. Skipping download.")
        return target_path

    response = requests.post(url, json=data, headers=headers)

    if response.status_code == 200:
        # Save the recording to the unique file
        with open(target_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                if chunk:
                    f.write(chunk)

        print(f"\nRecording saved in {target_path}")
        return target_path
    else:
        print(f"\n Error: {response.status_code} - {response.text}")


def convert_pdf_to_png(folder_images, pdf_file_path, arxiv_name):
    try:
        # Create a folder for storing the PNGs
        sub_folder_name = os.path.splitext(os.path.basename(pdf_file_path))[0] + "_pngs"
        full_path = os.path.join(folder_images, sub_folder_name)
        if not os.path.exists(full_path):
            os.makedirs(full_path)
    
        # Convert each page of the PDF to PNG
        images = convert_from_path(pdf_file_path, output_folder=full_path)
        # arxiv_name = sub_folder_name.replace("_pngs", "")
    
        # Save each image as a separate PNG file
        for i, image in enumerate(images):
            png_path = os.path.join(full_path, f"{arxiv_name}_page_{i + 1}.png")
            image.save(png_path, "PNG")
    
        print(f"\nAll pages converted and saved in the folder: {full_path}")
    
        # Clean up: Delete the .ppm files and uncropped files
        for filename in os.listdir(full_path):
            if filename.endswith(".ppm"):
                file_to_remove_path = os.path.join(full_path, filename)
                os.remove(file_to_remove_path)
    
        print(f"\n.ppm artifacts deleted in the folder: {full_path}")
    except (PDFPageCountError, PDFSyntaxError, PermissionError) as e:
        print(f"\nError: {e}")
        print(f"Skipping processing of {pdf_file_path}")
        if isinstance(e, PdfReadError):
            print("PdfReadError: Unable to read PDF file.")
        elif isinstance(e, PermissionError):
            print("PermissionError: Permission issue while processing the PDF file.")


def cut_pngs_in_half(image_folder):
    # Ensure the directory path is valid
    if not os.path.exists(image_folder):
        print(f"\nError: Directory '{image_folder}' does not exist.")
        return

    # Get a list of all files in the directory
    files = [f for f in os.listdir(image_folder) if os.path.isfile(os.path.join(image_folder, f))]

    # Process each file in the directory
    for file_name in files:
        # Check if the file is a PNG and does not contain 'cropped' in the name
        if file_name.lower().endswith('.png') and 'cropped' not in file_name.lower():
            image_path = os.path.join(image_folder, file_name)

            # Open the image
            with Image.open(image_path) as img:
                # Get the dimensions of the image
                width, height = img.size

                # Cut the image in half (top and bottom)
                top_half = img.crop((0, 0, width, height // 2))
                bottom_half = img.crop((0, height // 2, width, height))

                # Save the top and bottom halves with "_cropped_1" and "_cropped_2" suffixes
                top_half.save(os.path.join(image_folder, f"{os.path.splitext(file_name)[0]}_cropped_1.png"), 'PNG')
                bottom_half.save(os.path.join(image_folder, f"{os.path.splitext(file_name)[0]}_cropped_2.png"), 'PNG')

                print(f"\nImages saved: {file_name}_cropped_1.png (top) and {file_name}_cropped_2.png (bottom)")
        else:
            print(f"\nSkipping processing for {file_name} as it contains 'cropped' in the file name.")


def analyze_mp3_length(mp3_path):
    audio = AudioSegment.from_file(mp3_path)
    return len(audio) / 1000.0  # Length in seconds

def fetch_cropped_images(image_folder):
    # List all images in the folder
    all_images = os.listdir(image_folder)
    
    # Identify files to keep (those with the word "cropped" in their filenames)
    cropped_images = [image for image in all_images if image.lower().endswith('.png') and 'cropped' in image.lower()]
    
    # Delete files that do not contain the word "cropped"
    for image in all_images:
        if image not in cropped_images:
            os.remove(os.path.join(image_folder, image))
    
    # List the remaining images after deletion
    remaining_images = os.listdir(image_folder)
    
    # Sort the cropped images based on numeric values in their filenames
    sorted_images = sorted(remaining_images, key=lambda x: int(''.join(filter(str.isdigit, x))))
    return sorted_images


def move_uncropped_files(image_folder):
    try:
        # Create a new folder if it doesn't exist
        uncropped_folder = os.path.join(image_folder, "uncropped_pngs")
        if not os.path.exists(uncropped_folder):
            os.makedirs(uncropped_folder)

        # Loop through all files in the folder
        for filename in os.listdir(image_folder):
            file_path = os.path.join(image_folder, filename)

            # Check if the file name contains the word "cropped"
            if "cropped" not in filename:
                # Move the file to the uncropped folder
                new_path = os.path.join(uncropped_folder, filename)

                try:
                    shutil.move(file_path, new_path)
                    print(f"File moved to uncropped folder: {filename}")
                except Exception as move_error:
                    print(f"Error moving file {filename}: {move_error}")
                    continue

        print(f"All non-cropped files moved to the folder: {uncropped_folder}")
    except Exception as e:
        print(f"Error: {e}")


def create_video(mp3_path, image_folder, output_path):
    try:
        # Loop through all files in the folder
        for filename in os.listdir(image_folder):
            file_path = os.path.join(image_folder, filename)
            
            # Check if the file name contains the word "cropped"
            if "cropped" not in filename:
                # Remove the file
                os.remove(file_path)
                print(f"File removed: {filename}")
                
        print(f"All non-cropped files removed in the folder: {image_folder}")
    except Exception as e:
        print(f"Error: {e}")
    
    # Sort the images by converting the numeric parts of filenames into integers
    image_files = sorted([file for file in os.listdir(image_folder) if 'cropped' in file and file.lower().endswith('.png')],
                         key=lambda x: [int(part) if part.isdigit() else part for part in re.split(r'(\d+)', x)])
    audio_clip = AudioFileClip(mp3_path)
    
    # Calculate the duration of each image based on the total duration of the audio and the number of images
    image_duration = audio_clip.duration / len(image_files)
    
    clips = []
    
    for idx, image_file in enumerate(image_files):
        # Load each image using imageio
        image_path = os.path.join(image_folder, image_file)
        image = imageio.imread(image_path)
    
        if image.sum() == 0:
            print(f"Image {image_file} is blank. Skipping...")
            os.remove(image_file)
            continue
                
        # Create a clip from the image and set its duration
        image_clip = ImageClip(image).set_duration(image_duration)
    
        # Add the image clip to the list of clips
        clips.append(image_clip)
    
    # Concatenate the image clips to create the final video
    final_clip = concatenate_videoclips(clips, method="compose")
    final_clip = final_clip.set_audio(audio_clip)
    
    # Write the final video with audio
    final_clip.write_videofile(output_path, codec="libx264", audio_codec="aac", fps=24, verbose=True)
    print(f"\nFinal video saved at: {output_path}.")   


In [10]:
def process_url(url):
    
    COLLECTION_NAME = "MilvusDocs"
    HF_EOS_TOKEN_LENGTH = 3
    LLM_NAME = "gpt-3.5-turbo"
    
    folder_pdfs = "pdfs"
    folder_images = "images"
    folder_final_videos = "final_videos"
    folder_audio = "audio_voiceovers"
    folder_transcripts = "transcripts"
    
    # Call the function to create the folder
    create_folder(folder_pdfs)
    create_folder(folder_images)
    create_folder(folder_audio)
    create_folder(folder_final_videos)
    create_folder(folder_transcripts)
    
    # Download and save a PDF file from an arXiv.org URL into local directory.
    arxiv_id = download_and_save_pdf(url, folder_pdfs)
    
    arxiv_name = arxiv_id.replace(".", "_")
    pdf_path = os.path.join(folder_pdfs, arxiv_name)
    pdf_file_path = os.path.join(pdf_path, f"{arxiv_name}.pdf")
    image_folder = f"{folder_images}/{arxiv_name}_pngs" 
    mp3_path = f"{folder_audio}/output_{arxiv_name}.mp3"
    output_path = f"{folder_final_videos}/{arxiv_name}.mp4" 
    
    # Download open source embedding model "WhereIsAI/UAE-Large-V1" via Huggingface's Sentence Transformers
    encoder, EMBEDDING_DIM, MAX_SEQ_LENGTH = download_and_initialize_embedding_model()
    
    # Create a no-schema milvus collection and define the database index
    milvus_client = create_milvus_collection(COLLECTION_NAME, EMBEDDING_DIM, my_uri)
    
    # Load PDF's into a PDF object using LangChain's PyPDFLoader
    loader = PyPDFLoader(f"{pdf_path}/{arxiv_name}.pdf")
    docs = loader.load()
    
    # Cut text from PDF's into chunks using LangChain's RecursiveCharacterTextSplitter
    chunks = split_documents_to_chunks(docs, MAX_SEQ_LENGTH, HF_EOS_TOKEN_LENGTH)
    
    # Insert text chunks into Milvus vector database using index type HNSW Indexing and Cosine Distance
    insert_chunks_into_milvus(chunks, COLLECTION_NAME, encoder, milvus_client, MAX_SEQ_LENGTH, HF_EOS_TOKEN_LENGTH)

        
    load_dotenv()
    key = get_env_variables()
    
    # Generate transcript using OpenAI based on the cosine distance search of the document then using gpt-3.5-turbo's chat completions
    text_for_TTS_dict = search_and_generate_response(docs, LLM_NAME, key)
    
    # Save LLM-generated voiceover script to directory
    save_transcript(text_for_TTS_dict, folder_transcripts, arxiv_name)
    
    # convert text to speech with Elevenlabs
    audio_path = text_to_speech(text_for_TTS_list[0], arxiv_name, folder_audio)
    
    # convert each pdf to a png
    convert_pdf_to_png(folder_images, pdf_file_path, arxiv_name)
    
    # cut png's in half
    cut_pngs_in_half(image_folder)

    move_uncropped_files(image_folder)
    
    # combine png's with audio to generate an mp4
    create_video(mp3_path, image_folder, output_path)
    milvus_client.drop_collection(collection_name=COLLECTION_NAME)
    return folder_final_videos

In [11]:
paper_list = ["2402.13254", "2403.07874"] # "2403.07872", "2403.07870","2403.07869"]
# paper_list = ["2403.07867","2308.08079"]

In [17]:
for paper in paper_list:

    url = f"https://arxiv.org/pdf/{paper}.pdf"
    process_url(url)

The folder 'pdfs' already exists.
The folder 'images' already exists.
The folder 'audio_voiceovers' already exists.
The folder 'final_videos' already exists.
The folder 'transcripts' already exists.
The folder 'pdfs/2402_13254' already exists.

PDF downloaded and saved as 2402_13254.pdf

device: cpu


No sentence-transformers model found with name WhereIsAI/UAE-Large-V1. Creating a new one with MEAN pooling.



Datatype of SentenceTransformer encoded object<class 'sentence_transformers.SentenceTransformer.SentenceTransformer'>


What the encoder object looks like: SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)


model_name: WhereIsAI/UAE-Large-V1

EMBEDDING_DIM: 1024

MAX_SEQ_LENGTH: 512

Collection had previously been created, dropping previous collection to initialize anew: `MilvusDocs`

Successfully created collection: `MilvusDocs`
{'collection_name': 'MilvusDocs', 'auto_id': True, 'num_shards': 1, 'description': '', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': 5, 'params': {}, 'element_type': 0,

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 27.88it/s]

Finished inserting entities






Number of chunks inserted into Milvus database: 122 with chunk id starting at number: 448390687017670728



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


NameError: name 'text_for_TTS_list' is not defined

In [None]:
# Gradio interface
iface = gr.Interface(
    fn=process_url,
    inputs=gr.Textbox(placeholder="Enter arXiv PDF URL"),
    outputs=gr.Video(),
    live=True,
    theme="sky",
    flagging_options=None,  # Disable the flag button
    title="Arxiv2Video",
)

# Add a submit button
submit_button = gr.Button()
iface.launch(share=True)


In [None]:
iface.close()

In [19]:
url = f"https://arxiv.org/pdf/2402.13254.pdf"

key = get_env_variables()

COLLECTION_NAME = "MilvusDocs"
HF_EOS_TOKEN_LENGTH = 3
LLM_NAME = "gpt-3.5-turbo"

folder_pdfs = "pdfs"
folder_images = "images"
folder_final_videos = "final_videos"
folder_audio = "audio_voiceovers"
folder_transcripts = "transcripts"

# Call the function to create the folder
create_folder(folder_pdfs)
create_folder(folder_images)
create_folder(folder_audio)
create_folder(folder_final_videos)
create_folder(folder_transcripts)

# Download and save a PDF file from an arXiv.org URL into local directory.
arxiv_id = download_and_save_pdf(url, folder_pdfs)

arxiv_name = arxiv_id.replace(".", "_")
pdf_path = os.path.join(folder_pdfs, arxiv_name)
pdf_file_path = os.path.join(pdf_path, f"{arxiv_name}.pdf")
image_folder = f"{folder_images}/{arxiv_name}_pngs" 
mp3_path = f"{folder_audio}/output_{arxiv_name}.mp3"
output_path = f"{folder_final_videos}/{arxiv_name}.mp4" 

# Download open source embedding model "WhereIsAI/UAE-Large-V1" via Huggingface's Sentence Transformers
encoder, EMBEDDING_DIM, MAX_SEQ_LENGTH = download_and_initialize_embedding_model()

# Create a no-schema milvus collection and define the database index
milvus_client = create_milvus_collection(COLLECTION_NAME, EMBEDDING_DIM, my_uri)

# Load PDF's into a PDF object using LangChain's PyPDFLoader
loader = PyPDFLoader(f"{pdf_path}/{arxiv_name}.pdf")
docs = loader.load()

# Cut text from PDF's into chunks using LangChain's RecursiveCharacterTextSplitter
chunks = split_documents_to_chunks(docs, MAX_SEQ_LENGTH, HF_EOS_TOKEN_LENGTH)

# Insert text chunks into Milvus vector database using index type HNSW Indexing and Cosine Distance
insert_chunks_into_milvus(chunks, COLLECTION_NAME, encoder, milvus_client, MAX_SEQ_LENGTH, HF_EOS_TOKEN_LENGTH)

# # Generate transcript using OpenAI based on the cosine distance search of the document then using gpt-3.5-turbo's chat completions
result = search_and_generate_response(docs, LLM_NAME, key)

# Split the first sentence of the edit response
edit_sentences = result['edit'].split('. ')
first_sentence_edit = edit_sentences[0]

# Insert the metrics information into the first sentence
first_sentence_edit += ".\n\n" + result['metrics']

# Update the edit response with the modified first sentence
transcript = first_sentence_edit + '. ' + '. '.join(edit_sentences[1:])
transcript



# text_for_TTS_list = search_and_generate_response(
#     milvus_client, # Running session via docker container on port http://localhost:19531
#     encoder, # Sentence Transformer WhereIsAI/UAE-Large-V1
#     COLLECTION_NAME, # MilvusDocs by default
#     LLM_NAME,
#     TEMPERATURE,
#     RANDOM_SEED,
#     M)

# # Save LLM-generated voiceover script to directory
# save_transcript(text_for_TTS_list, folder_transcripts, arxiv_name)

# # convert text to speech with Elevenlabs
# audio_path = text_to_speech(text_for_TTS_list[0], arxiv_name, folder_audio)

# # convert each pdf to a png
# convert_pdf_to_png(folder_images, pdf_file_path, arxiv_name)

# # cut png's in half
# cut_pngs_in_half(image_folder)

# move_uncropped_files(image_folder)

# # combine png's with audio to generate an mp4
# create_video(mp3_path, image_folder, output_path)
# milvus_client.drop_collection(collection_name=COLLECTION_NAME)
# return folder_final_videos

The folder 'pdfs' already exists.
The folder 'images' already exists.
The folder 'audio_voiceovers' already exists.
The folder 'final_videos' already exists.
The folder 'transcripts' already exists.
The folder 'pdfs/2402_13254' already exists.


No sentence-transformers model found with name WhereIsAI/UAE-Large-V1. Creating a new one with MEAN pooling.



PDF downloaded and saved as 2402_13254.pdf

device: cpu

Datatype of SentenceTransformer encoded object<class 'sentence_transformers.SentenceTransformer.SentenceTransformer'>


What the encoder object looks like: SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)


model_name: WhereIsAI/UAE-Large-V1

EMBEDDING_DIM: 1024

MAX_SEQ_LENGTH: 512

Collection had previously been created, dropping previous collection to initialize anew: `MilvusDocs`

Successfully created collection: `MilvusDocs`
{'collection_name': 'MilvusDocs', 'auto_id': True, 'num_shards': 1, 'description': '', 'fields': [{'field_id': 100, 'name': 'id', 'desc

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 27.64it/s]

Finished inserting entities






Number of chunks inserted into Milvus database: 122 with chunk id starting at number: 448390687017670974



'The paper "CounterCurate: Enhancing Physical and Semantic Visibility of Cultural Heritage Artifacts" by [Authors] introduces a novel approach to improving the visibility and understanding of cultural heritage artifacts through the concept of CounterCurate.\n\nUnfortunately, as an AI language model, I do not have access to specific papers or their content.. It builds upon existing research on physical and semantic visibility, emphasizing the importance of making cultural heritage accessible and meaningful to the public. The paper advances the field by proposing the use of user engagement metrics, such as visitor feedback and interaction data, to evaluate the effectiveness of CounterCurate in enhancing both physical and semantic qualities of cultural heritage artifacts. \n\nIn terms of technology, CounterCurate incorporates digital tools and technology to enhance the visibility of cultural heritage objects. It leverages user engagement metrics to measure the impact of these enhancements

In [8]:
from langchain.chains import SequentialChain

In [27]:
# OPENAI_API_KEY

key = get_env_variables()

text_before_abstract = re.search(r'^(.*?)\bAbstract\b', docs[0].page_content, re.DOTALL)

text_before_abstract = text_before_abstract.group(1)

dish_name = text_before_abstract

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, openai_api_key=key['OPENAI_API_KEY']) 

prompt_outline = PromptTemplate.from_template(
    template= """
    - Use only the the paper {dish_name} to answer the following: 
    - Pick the top 8 most occurring terminology in {dish_name} and define the terms.
    - Use the same termology used in the paper to give the definition of each of the 5 most occurring terminology in {dish_name}.
    - Answer what are the key contributions of the paper {dish_name} in one sentence, use the same termology used in the paper?
    - What are the evaluation metrics in the paper {dish_name}?
    - Named the scoring methods mentioned in the evaluation metrics.
    - If there were any numerical results what exact numbers were these metrics? 
""")

# outline_template = PromptTemplate(input_variables=["input"], template=response_template)
chain_outline = LLMChain(llm=llm, prompt=prompt_outline, output_key="outline")


# This is an LLMChain to write a outline given a dish name and the experience.
prompt_summary = PromptTemplate.from_template(
    template=""" Assume the role of a NeurIPS Paper Evaluation Committee Member who evaluates Use only is evluating the {dish_name} paper based on various criteria such as novelty, significance, and technical soundness to pitch to a stakeholder.
    - The output text should more than 1000 characters:
    - Write 3 sentences to explain in detail the key contributions answered in {outline} of the paper {dish_name}.
    - Expand upon the evluation metrics or new approaches from {outline} by summarizing each and every paragraph in {dish_name} into one sentence. If the paper has 20 paragraphs, there this prompt needs to output 20 sentences.
    - Edit the entire script  for good flow, professionalism, redundancy removal in a voiceover text format.
    - Assume the role of a NeurIPS Paper Evaluation Committee Member who evaluates Use only is evluating the {dish_name} paper based on various criteria such as novelty, significance, and technical soundness to pitch to a stakeholder.
    - It is of utmost importance to be as specific as possible, going into the technical details, using technical terminology 
    - Make the output longer than 2500 characters.
    - Keep the detail and specificity from {dish_name} where every paragraph was summarized into one sentence
    - Make sure that it is obviously stated to the video viewer the related work that the paper is built on, use the terminology and definitions from the {outline}.
    - In the edit, ensure that the voiceover text states the following specifics, if not, go back to the {dish_name} paper to extract more specific details:
    - what research was previously known in the past that the paper is based upon that the paper advances previously known information, what are the technologies that were previously known involved
    - specific methods used,
    - how is the technique, actions or methods performed advanced the field, or challenges the status-quo
    - what are the major contributions, spend 3 sentences to elaborate the details, using terminology from {outline} 
    - What are the key metrics that define the success of the work
    - make sure to always reference the paper {dish_name}  what are the technologies that were previously known involved
    - for each of the technologies involved, if the technologies are from the 5 most occurring terminology in {outline}, edit that part of the {outline} into a complete sentence without further edits.
    - for each of the technologies involved, further elaborate the technology by citing how the {dish_name} has introduced these technologies by looking at the terminology in the paper and summarizing the words before and after. 
    - what are future directions that lie ahead. Cite the grounding sources. 
    - Always start by stating the title and authors of the paper as the first few words.
    - Make sure the output is at least 2500 characters or tokens in length, if not reference the paper {dish_name} to answer the previous points more in depth
    """
)
chain_summary = LLMChain(llm=llm, prompt=prompt_summary, output_key="summary")

# This is an LLMChain to write a follow-up edit given the restaurant outline.
prompt_metrics = PromptTemplate.from_template(
    template="""
    - What are the technologies that were previously known involved from the paper {dish_name}?
    - If you cannot find the paper {dish_name} and are going to output Unfortunately, as an AI language model, I do not have access to specific papers or their content
    - Do not say you do not have access, keep trying to reference the paper {dish_name}. 
    - For each of the technologies involved, if the technologies are from the 5 most occurring terminology in {outline}, edit that part of the {outline} into a complete sentence without further edits. Further elaborate the technology by citing how the {dish_name} has introduced these technologies by looking at the terminology in the paper and summarizing the words before and after. 
    - What are the key metrics that define the success of the work from the paper? Name specific evlauation metrics used in the research?
    - What was the datset was used?
    - How was the research structured, state specific experiment setups?
    - Describe the results and what metrics or experimental results section of the paper {dish_name} claims to have beaten? 
    - What is the scoring? 
    - What were the specific score terminology used?
    - Exactly how much better does this paper claim to be than previous work in the score outcome?
    - List the previous methods that the paper beat. 
    """
)
chain_metrics = LLMChain(llm=llm, prompt=prompt_metrics, output_key="metrics")


overall_chain = SequentialChain(
    chains=[chain_outline, chain_summary, chain_metrics],
    input_variables=["dish_name"],
    output_variables=["outline","summary", "metrics"],
)

result = overall_chain({"dish_name": text_before_abstract})
result

{'dish_name': 'CounterCurate: Enhancing Physical and Semantic Visio-Linguistic\nCompositional Reasoning via Counterfactual Examples\nJianrui Zhang*1Mu Cai∗1Tengyang Xie1,2Yong Jae Lee1\njzhang2427@wisc.edu, {mucai,tx,yongjaelee}@cs.wisc.edu\n1University of Wisconsin–Madison2Microsoft Research\nhttps://countercurate.github.io\n',
 'outline': "Top 8 most occurring terminology in the paper:\n\n1. Counterfactual Examples: Examples that are slightly modified from the original data to test the model's robustness and reasoning capabilities.\n2. Visio-Linguistic Compositional Reasoning: The ability to reason about visual and linguistic information in a compositional manner.\n3. Physical Reasoning: The ability to understand and reason about physical properties and interactions in the environment.\n4. Semantic Reasoning: The ability to understand and reason about the meaning of words and concepts.\n5. CounterCurate: The proposed method for enhancing physical and semantic visio-linguistic composi

In [22]:
result['summary']

'Title: CounterCurate: Enhancing Physical and Semantic Visio-Linguistic Compositional Reasoning via Counterfactual Examples by Jianrui Zhang, Mu Cai, Tengyang Xie, and Yong Jae Lee from the University of Wisconsin–Madison and Microsoft Research.\n\nIn this paper, the key contributions lie in enhancing physical and semantic visio-linguistic compositional reasoning through the use of counterfactual examples, focusing on the understanding and reasoning about visual and linguistic information in a compositional manner, physical reasoning, semantic reasoning, and the proposed CounterCurate method.\n\nThe evaluation metrics and new approaches proposed in the paper include the novel CounterCurate method for enhancing physical and semantic visio-linguistic compositional reasoning, with a focus on metrics such as accuracy, interpretability, and generalization.\n\nThe paper builds upon existing research in visio-linguistic compositional reasoning, physical reasoning, and semantic reasoning, adva

In [23]:
result['metrics']

'- The technologies involved in the paper CounterCurate: Enhancing Physical and Semantic Visio-Linguistic Compositional Reasoning via Counterfactual Examples include Visio-Linguistic Compositional Reasoning, Counterfactual Examples, Physical Reasoning, Semantic Reasoning, and CounterCurate. These technologies were introduced by looking at the terminology in the paper and summarizing the words before and after.\n\n- The key metrics that define the success of the work from the paper include accuracy, interpretability, and generalization. Specific evaluation metrics used in the research include accuracy in reasoning tasks, interpretability of the generated explanations, and generalization to unseen scenarios.\n\n- The dataset used in the research was not specified in the provided information.\n\n- The research was structured around introducing the CounterCurate method for enhancing physical and semantic visio-linguistic compositional reasoning through the use of counterfactual examples. T

In [27]:
# Split the first sentence of the edit response
edit_sentences = result['edit'].split('. ')
first_sentence_edit = edit_sentences[0]

# Insert the metrics information into the first sentence
first_sentence_edit += ".\n\n" + result['metrics']

# Update the edit response with the modified first sentence
transcript = first_sentence_edit + '. ' + '. '.join(edit_sentences[1:])
transcript

'CounterCurate: Enhancing Physical and Semantic Visio-Linguistic Compositional Reasoning via Counterfactual Examples by Jianrui Zhang, Mu Cai, Tengyang Xie, and Yong Jae Lee from the University of Wisconsin-Madison and Microsoft Research introduces the concept of visio-linguistic compositional reasoning, emphasizing its importance in understanding visual and linguistic information.\n\n- The technologies involved in the paper CounterCurate: Enhancing Physical and Semantic Visio-Linguistic Compositional Reasoning via Counterfactual Examples are Visio-Linguistic Compositional Reasoning, Counterfactual Examples, Physical Reasoning, Semantic Reasoning, and CounterCurate. These technologies were introduced by looking at the terminology in the paper and summarizing the words before and after to enhance physical and semantic visio-linguistic reasoning through the use of counterfactual examples.\n\n- The key metrics that define the success of the work from the paper include accuracy, precision,

In [None]:
folder_single_pdfs = "single_page_pdfs"
create_folder(folder_single_pdfs)

loader = PyPDFLoader(f"{pdf_path}/{arxiv_name}.pdf")
docs = loader.load()

In [64]:
import os
from PyPDF2 import PdfReader, PdfWriter

folder_pdfs = "pdfs"
arxiv_id = "2403.07874"

# Call the function to create the folder
create_folder(folder_pdfs)


arxiv_name = arxiv_id.replace(".", "_")
pdf_path = os.path.join(folder_pdfs, arxiv_name)
# pdf_file_path = os.path.join(pdf_path, f"{arxiv_name}.pdf")


def split_pdf_into_pages(pdf_path, arxiv_name):
    # Create the directory if it doesn't exist
    folder_single_pdfs = "single_page_pdfs"

    pdf_file_path = f"{pdf_path}/{arxiv_name}.pdf"

    single_page_pdf_path = os.path.join(pdf_path, folder_single_pdfs)
    os.makedirs(single_page_pdf_path, exist_ok=True)

    # Open the PDF file
    with open(pdf_file_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file_path)
        
        # Iterate through each page and save it as a separate PDF file
        for i in range(len(reader.pages)):
            writer = PdfWriter()
            writer.add_page(reader.pages[i])
            
            output_path = os.path.join(single_page_pdf_path, f"{arxiv_name}_{i}.pdf")
            with open(output_path, "wb") as output_file:
                writer.write(output_file)
            loader = PyPDFLoader(output_path)
            globals()[f"doc_{arxiv_name}_pg_{i}"] = loader.load()

    print("PDF split into separate pages successfully!")
    return single_page_pdf_path

single_page_pdf_path = split_pdf_into_pages(pdf_path, arxiv_name)
single_page_pdf_path

# pdf_file_path = f"{pdf_path}/{arxiv_name}.pdf"

# pdf_file_path

for var_name, var_value in globals().items():
    if var_name.startswith("doc_"):
        print(f"{var_name}: {var_value}")

The folder 'pdfs' already exists.
PDF split into separate pages successfully!
doc_pg_1: pdfs/2403_07874/single_page_pdfs/doc_2403_07874_pg_1
doc_pg_2: pdfs/2403_07874/single_page_pdfs/doc_2403_07874_pg_2
doc_pg_3: pdfs/2403_07874/single_page_pdfs/doc_2403_07874_pg_3
doc_pg_4: pdfs/2403_07874/single_page_pdfs/doc_2403_07874_pg_4
doc_pg_5: pdfs/2403_07874/single_page_pdfs/doc_2403_07874_pg_5
doc_pg_6: pdfs/2403_07874/single_page_pdfs/doc_2403_07874_pg_6
doc_pg_7: pdfs/2403_07874/single_page_pdfs/doc_2403_07874_pg_7
doc_pg_8: pdfs/2403_07874/single_page_pdfs/doc_2403_07874_pg_8
doc_pg_9: pdfs/2403_07874/single_page_pdfs/doc_2403_07874_pg_9
doc_pg_10: pdfs/2403_07874/single_page_pdfs/doc_2403_07874_pg_10
doc_pg_11: pdfs/2403_07874/single_page_pdfs/doc_2403_07874_pg_11
doc_pg_12: pdfs/2403_07874/single_page_pdfs/doc_2403_07874_pg_12
doc_pg_13: pdfs/2403_07874/single_page_pdfs/doc_2403_07874_pg_13
doc_pg_14: pdfs/2403_07874/single_page_pdfs/doc_2403_07874_pg_14
doc_pg_15: pdfs/2403_07874/sin

In [65]:
type(doc_2403_07874_pg_1)

list

In [72]:
len(doc_2403_07874_pg_6[0].page_content)

4968

In [77]:
key = get_env_variables()

page_6 = doc_2403_07874_pg_6[0].page_content

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, openai_api_key=key['OPENAI_API_KEY']) 

# This is an LLMChain to write a follow-up edit given the restaurant outline.
prompt_page_6 = PromptTemplate.from_template(
    template="""
    - for each paragraph in {page_6}, condense to a concise sentence.
    """
)
chain_page_6 = LLMChain(llm=llm, prompt=prompt_page_6, output_key="page_summary")


overall_chain = SequentialChain(
    chains=[chain_page_6],
    input_variables=["page_6"],
    output_variables=["page_summary"],
)

result = overall_chain({"page_6": doc_2403_07874_pg_6[0].page_content})
result

{'page_6': 'flavoredcoffeeespressos\nnespressos\nlungoconvinencestorePrimaMarkets\nButchers\nrefreshmentstand\nExtremeSportsBunnyHop\nbalancewheelin\nspecializedtraining\nRemoteDeskministation\ncomputerdesks\ncomputerdesksglazedpotceramist\nRESSMUG\nmug\nCetoniaphaiston\nEmeraldIs\nchaferlar\nFigure 6. Visualization for semantic interpretation.\nMethod Codebook #Tokens CLIP ↑ CLIP-R ↑\nSPAE [54] PaLM-2 5 0.1868 0.7147\nOurs E-LLaMA-2 5 0.2576 0.9165\nSPAE [54] PaLM-2 21 0.1815 0.6901\nOurs E-LLaMA-2 21 0.2427 0.8520\nTable 2. Semantic quality evaluatoin on ImageNet-1K val set. E-\nLLaMA-2: expanded LLaMa-2 vocabulary.\nrect only if all the generated tokens match the tokens of the\nactual category name.\nTable 1 shows the comparison between our approach em-\nploying different LLaMa 2 model configurations, and prior\nworks including LQAE [25], SPAE [54], and a baseline us-\ning a frozen language model for multimodal few-shot learn-\ning [47]. We examine various factors that could influen

In [80]:
result['page_summary']

'Overall, our approach demonstrates superior performance in various tasks such as N-way K-shot classification, image captioning, visual question answering, semantic interpretation, image reconstruction, and image denoising. We outperform existing methods like SPAE and VQ-GAN across different evaluation metrics on the ImageNet-1K validation set. Our model benefits from a compact vocabulary and the introduction of vocabulary expansion, which enhances the semantic relevance of generated tokens.\n\nIn image captioning and visual question answering tasks, we leverage in-context learning samples to improve model performance. The visualization results show the effectiveness of our approach in generating relevant tokens for image representation. Additionally, our semantic interpretation results demonstrate the ability of our model to capture meaningful information from images.\n\nWhen it comes to image reconstruction and denoising, our V2L Tokenizer effectively encodes images into local tokens

In [None]:
# Define the steps as templates and keys
steps = [
    {
        "template": f"""
        - for each paragraph in {{"page_{i}"}}, condense to a concise sentence.
        """,
        "input_variable": f"doc_{arxiv_name}_pg_{i}",
        "output_variable": f"page_summary_{i}",
    }
    for i in range(1, num_pages + 1)  # num_pages is the total number of pages
]

# Create LLMChain objects for each step
chains = []
for step in steps:
    prompt = PromptTemplate.from_template(template=step["template"])
    chain = LLMChain(llm=llm, prompt=prompt, output_key=step["output_variable"])
    chains.append(chain)

# Create SequentialChain with all LLMChain objects
overall_chain = SequentialChain(
    chains=chains,
    input_variables=[step["input_variable"] for step in steps],
    output_variables=[step["output_variable"] for step in steps],
)

# Execute the overall_chain for each page
for i in range(1, num_pages + 1):
    page_content_variable = f"doc_{arxiv_name}_pg_{i}"
    result = overall_chain({page_content_variable: page_contents[i - 1]})
    # Do something with the result
    # For example, you can access result[step["output_variable"]] to get the page summary
