In [1]:
from moviepy.editor import ImageSequenceClip, AudioFileClip, concatenate_videoclips
from pymilvus import Milvus, MilvusClient, IndexType, connections, utility
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from pdf2image.exceptions import PDFPageCountError, PDFSyntaxError
from langchain.text_splitter import RecursiveCharacterTextSplitter
from moviepy.editor import concatenate_videoclips, ImageClip
from langchain.chains.summarize import load_summarize_chain
from sentence_transformers import SentenceTransformer
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from moviepy.config import change_settings
from langchain.chains.llm import LLMChain
from langchain_openai import ChatOpenAI
from pdf2image import convert_from_path
from milvus import default_server
from dotenv import load_dotenv
from pydub import AudioSegment
from datetime import datetime
from openai import OpenAI
from PIL import Image
import gradio as gr
import numpy as np
import feedparser
import requests
import imageio
import base64
import pprint
import torch
import re
import os

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.vectorstores import Milvus
from langchain.schema.runnable import RunnablePassthrough

In [2]:
change_settings({"FFMPEG_BINARY": "/opt/homebrew/bin/ffmpeg", "DYLD_LIBRARY_PATH":"/opt/homebrew/bin/convert"})
# Set up a Milvus client
default_server.start()
host="127.0.0.1"
connections.connect(host=host, port=default_server.listen_port)
port=default_server.listen_port
my_uri = "http://localhost:" + str(port)
print(my_uri)

http://localhost:19531


In [34]:
def get_env_variables():
    """Fetch all necessary configurations from environment variables."""
    return {
        'OPENAI_API_KEY': os.getenv('OPENAI_API_KEY'),
        'ELEVEN_LABS_API_KEY': os.getenv('ELEVEN_LABS_API_KEY')
    }


def create_folder(folder_name):
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
        print(f"The folder '{folder_name}' has been created.")
    else:
        print(f"The folder '{folder_name}' already exists.")


def arxiv_id_from_url(url):
    # Extract the arXiv ID from the URL using a regular expression
    match = re.search(r'arxiv\.org/pdf/(\d+\.\d+)', url)
    if match:
        return match.group(1)
    else:
        return None
        

def download_and_save_pdf(url, folder_pdfs):
    """
    Download and save a PDF file from an arXiv.org URL into local directory.

    Parameters:
    - url (str): The arXiv.org URL of the paper.

    Returns:
    - str: ArXiv ID of the downloaded paper if successful, or an error message.
    """
    # Extract arXiv ID from the URL
    arxiv_id = arxiv_id_from_url(url)

    arxiv_name = arxiv_id.replace(".", "_")
    pdf_path = os.path.join(folder_pdfs, arxiv_name)
    create_folder(pdf_path)

    # Check if a valid arXiv ID was extracted
    if arxiv_id:
        try:
            # Make a request to the arXiv API
            feed = feedparser.parse(f'http://export.arxiv.org/api/query?id_list={arxiv_id}')

            # Check if the response contains entries
            if 'entries' in feed:
                # Iterate over each entry (paper) in the feed
                for entry in feed.entries:
                    # Extract the PDF link from the entry
                    pdf_link = entry.link.replace('/abs/', '/pdf/') + '.pdf'

                    # Download the PDF
                    response = requests.get(pdf_link)

                    # Save the PDF in the local directory with the name based on the arXiv ID
                    with open(f'{pdf_path}/{arxiv_name}.pdf', 'wb') as pdf_file:
                        pdf_file.write(response.content)

                    print(f"\nPDF downloaded and saved as {arxiv_name}.pdf")
                    return arxiv_id

            else:
                return f"\nNo entries found for arXiv ID {arxiv_id}"

        except Exception as e:
            return f"\nError extracting information: {e}"
    else:
        return "Invalid arXiv PDF URL format. Please enter a valid URL."


def download_and_initialize_embedding_model(model_name="WhereIsAI/UAE-Large-V1", device=None):
    """
    Download and initialize the Sentence Transformer model.

    Parameters:
    - model_name (str): The name of the Sentence Transformer model to download.
    - device (str or torch.device): The device to use for the model (e.g., 'cuda:3' or 'cpu').

    Returns:
    - encoder (SentenceTransformer): The initialized Sentence Transformer model.
    - EMBEDDING_DIM (int): The embedding dimension of the model.
    - MAX_SEQ_LENGTH (int): The maximum sequence length.

    Example usage:
    encoder, EMBEDDING_DIM, max_seq_length = download_and_initialize_embedding_model()
    """
    # Initialize torch settings
    torch.backends.cudnn.deterministic = True
    DEVICE = torch.device(device) if device else torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
    print(f"\ndevice: {DEVICE}")

    # Load the model from the Hugging Face model hub
    encoder = SentenceTransformer(model_name, device=DEVICE)
    print(f"\nDatatype of SentenceTransformer encoded object{type(encoder)}\n")
    print(f"\nWhat the encoder object looks like: {encoder}\n")

    # Get the model parameters and save for later
    EMBEDDING_DIM = encoder.get_sentence_embedding_dimension()
    MAX_SEQ_LENGTH_IN_TOKENS = encoder.get_max_seq_length()
    # Assume tokens are 3 characters long
    MAX_SEQ_LENGTH = MAX_SEQ_LENGTH_IN_TOKENS * 3
    HF_EOS_TOKEN_LENGTH = 1 * 3
    # Test with 512 sequence length
    MAX_SEQ_LENGTH = MAX_SEQ_LENGTH_IN_TOKENS
    HF_EOS_TOKEN_LENGTH = 1

    # Inspect model parameters
    print(f"\nmodel_name: {model_name}")
    print(f"\nEMBEDDING_DIM: {EMBEDDING_DIM}")
    print(f"\nMAX_SEQ_LENGTH: {MAX_SEQ_LENGTH}")

    return encoder, EMBEDDING_DIM, MAX_SEQ_LENGTH


def create_milvus_collection(COLLECTION_NAME, uri=my_uri):
    """
    Create a no-schema Milvus collection and define the database index.

    Parameters:
    - uri (str): The URI of the Milvus server.
    - COLLECTION_NAME (str): The name of the Milvus collection.
    
    Returns:
    - milvus_client (Milvus): The Milvus client instance.


    Example usage:
    my_uri = "tcp://127.0.0.1:19530"
    COLLECTION_NAME = "MilvusDocs"
    my_EMBEDDING_DIM = 3072
    
    milvus_client = create_milvus_collection(COLLECTION_NAME, uri=my_uri)
    """
    EMBEDDING_DIM = 3072 # This is the embedding dimension for OpenAI's Embedding model "text-embedding-3-large"
    
    # Add custom FLAT search index to the collection, which is 1:1 the original and mmore inefficient that other types
    index_params = {
        "index_type": IndexType.FLAT, # Flat index type does not have params
        "metric_type": "COSINE",
    }

    # Use no-schema Milvus client using flexible json key:value format.
    milvus_client = MilvusClient(uri=my_uri)

    # Check if collection already exists, if so drop it.
    if utility.has_collection(COLLECTION_NAME):
        utility.drop_collection(COLLECTION_NAME)
        print(f"\nCollection had previously been created, dropping previous collection to initialize anew: `{COLLECTION_NAME}`")

    # Create the collection.
    milvus_client.create_collection(COLLECTION_NAME, EMBEDDING_DIM,
                                    consistency_level="Eventually",
                                    auto_id=True,
                                    overwrite=True,
                                    params=index_params)

    print(f"\nSuccessfully created collection: `{COLLECTION_NAME}`")
    print(milvus_client.describe_collection(COLLECTION_NAME))

    return milvus_client


def split_documents_to_chunks(docs, MAX_SEQ_LENGTH, CHUNK_OVERLAP):
    """
    Split documents into smaller recursive chunks using Sentence Transformers' RecursiveCharacterTextSplitter.

    Parameters:
    - docs (list): List of documents to be split.
    - max_seq_length (int): Maximum sequence length.
    - hf_eos_token_length (int): Length of the EOS token.

    Returns:
    - chunks (list): List of chunks.

    Example usage:
    MAX_SEQ_LENGTH = 512
    HF_EOS_TOKEN_LENGTH = 3
    docs = ["Document 1 text.", "Document 2 text.", "Document 3 text."]
    
    resulting_chunks = split_documents_to_chunks(docs, MAX_SEQ_LENGTH, HF_EOS_TOKEN_LENGTH)
    print(resulting_chunks)
    """

    # Create an instance of the RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=MAX_SEQ_LENGTH, # Arbitrary numbers, personal preference
        chunk_overlap=CHUNK_OVERLAP, # Arbitrary numbers, personal preference
        separators=["\n\n", "\n", " ", ""]
    )

    # Split the documents further into smaller, recursive chunks.
    chunks = text_splitter.split_documents(docs)
    
    return chunks


def insert_chunks_into_milvus(chunks, COLLECTION_NAME, host="127.0.0.1", port="19531"):
    """
    Insert document chunks into a Milvus collection.

    Parameters:
    - chunked (list): Full langchain documents object chunked already from document loader to be inserted.
    - COLLECTION_NAME (str): Name of the Milvus collection.
    - host
    - port

    Returns:
    - insert_time (float): Time taken for the insertion process.

    Example Usage: 
    
    vector_store = insert_chunks_into_milvus(COLLECTION_NAME, encoder, milvus_client, MAX_SEQ_LENGTH, HF_EOS_TOKEN_LENGTH)
    """
    # Instantiate OpenAI embedding model
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    
    vector_store = Milvus(
        embedding_function=embeddings,
        connection_args={"host": host, "port": port},
        collection_name=COLLECTION_NAME,
        drop_old=True,
    ).from_documents(
        chunks,
        embedding=embeddings,
        collection_name=COLLECTION_NAME,
        connection_args={"host": host, "port": port},
    )

    # After the final entity is inserted, call flush to stop growing segments left in memory.
    milvus_client.flush(COLLECTION_NAME)
    print(f"\nChunks inserted into Milvus database.")

    return vector_store


def retrieve_context(vector_store):
    """ 
    Retrieve relevant data from vector database, which in this case happens to be everything loaded from the pdf.
    retrieved_top_k: list of dicts

    Example Usage:
    formatted_results, context, context_metadata = client_assemble_retrieved_context(results, metadata_fields=metadata_fields, num_shot_answers=top_k)
    """
    # Assemble the context as a stuffed string.

    context = []
    context_metadata = []
    i = 1
    for r in retrieved_top_k[0]:

        if i <= num_shot_answers:
            if len(metadata_fields) > 0:
                metadata = {}
                for field in metadata_fields:
                    metadata[field] = r['entity'][field]
                context_metadata.append(metadata)
            context.append(r['entity']['chunk'])
        i += 1


    

def retrieve_context_and_generate_response(vector_store, text_before_abstract):
    """
    Search Milvus collection for relevant context and generate a response using the OpenAI API.

    Parameters:
    - openai_client (OpenAI): OpenAI client instance.
    - milvus_client (Milvus): Milvus client instance.
    - encoder (SentenceTransformer): SentenceTransformer model for generating embeddings.
    - COLLECTION_NAME (str): Name of the Milvus collection.
    - SAMPLE_QUESTION (str): Sample question for search.
    - llm_name (str): Name of the OpenAI language model.
    - temperature (float): Temperature for response generation.
    - random_seed (int): Random seed for response generation.
    - top_k (int): Top K results to retrieve from Milvus search.
    - M (Milvus): Choice of M: 4~64, larger M for larger data and larger embedding lengths.

    Returns:
    - response_choices (list): List of response choices.

    Example usage:
    
    response_choices = search_and_generate_response(
        milvus_client,
        encoder,
        COLLECTION_NAME,
        LLM_NAME,
        TEMPERATURE,
        RANDOM_SEED
    )
    """
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0) 
    
    context = vector_store.as_retriever()

    QUESTION = f"What are the key contributions and evaluations of this paper {text_before_abstract}"

    # Assemble retrieved context
    # metadata_fields = [f for f in output_fields if f != 'chunk']

    SYSTEM_PROMPT = f"""Use only the the paper {text_before_abstract} to answer the user's question. Answer in no less than 4000 characters. Be clear, factual, complete, concise. Answer the question and follow the instructions to the best of your ability.You will be provided a research paper and your task is to summarize the research paper into a 5 minute video as follows:
    - Outline the key points of the paper but do not output it. Use the outline as a guide to expand on a video voiceover of 3 minutes.
    - Edit the outline into a voiceover script for a 5 minute video
    - Clearly state why was the research done, what are the technologies that were previously known involved,
    how is the technique or actions performed advancing the field, what are the key metrics that define the success of the work 
    and what are future directions that lie ahead.
    - Do not write any fact which is not present in the paper
    
    - First, assume the role of a research scientist who has won accolates for being able to explain expert information to a high-schooler and is giving an overview briefing of a research project.
    - Write a clearly organized and to-the-point outline summary of the following research:,
    - The outline should have 3000 words and objectives should be clearly defined for each section of the paper while preserving the specifics address in the technology used or methods tried that have advanced the particular field.
    - Introduce the research scientists involved and the institutions involved if known.
    - Every single line in the outline should be in complete sentences, talk with dignity and sophistication. 
    - Use phrases such as "Our research presents", "This paper details the", do not use words such as realm, or start the sentence with "In the"
    - Assume the audience is asking why and how about the reasoning and logic of the content. 
    - Use present tense and do not use past tense.
    - Do not use phrases such as "x has been discussed, x has been highlighted", be as specific on the details as possible.
    - Make sure to answer clearly what is the major contribution of this body of work.
    - The outline should answer to the point and in specific detail why was the research done, what are the technologies that were previously known involved, how is the technique or actions performed advancing the field, what are the key metrics that define the success of the work and what are future directions that lie ahead.
    
    - After you have produced the outline, next convert each point in the outline to be one or more complete sentences in third person point of view, going into detail especially regarding the technicalities and key concepts of the research. Make sure that it is absolutely clear in specific detail why was the research done, what are the technologies that were previously known involved,
    how is the technique or actions performed advancing the field, what are the key metrics that define the success of the work 
    and what are future directions that lie ahead.
    - Always start by stating the title of the paper as the first few words.
    - Assume the role of the editor of the best ranking tv production company in the world. 
    - Format into a script but not screenplay to be broadcasted publicly in a 5 minute production of 4000 words for higher education consumption.
    - Introduce yourself to assume the role of a third party and do not assume the time of day, do not say good evening you are not the researcher but you represent
    the researcher in advocating for their work. Provide the narration only, do not format as a screenplay.
    - Spend at least 6 sentences delving deep into the research key findings and evaluation.
    - Do not start a paragraph with "Good day, esteemed viewers."
    - Do out output the outline, just output the transcript. Be as detailed as possible.
    
    - Lastly edit the entire script to make sure that it is obviously stated to the video viewer why was the research done, what are the technologies that were previously known involved,
    how is the technique or actions performed advancing the field, what are the key metrics that define the success of the work 
    and what are future directions that lie ahead. Cite the grounding sources. 
    Context: {context}
    Question: {QUESTION}
    """
    
    rag_prompt = PromptTemplate.from_template(SYSTEM_PROMPT, return_source_documents=True)
    rag_chain = (
        {"context": context, "question": RunnablePassthrough()}
        | rag_prompt
        | llm
    )
    
    response = rag_chain.invoke(response = rag_chain.invoke(f"Only output the voiceover transcript, highlighting the key contributions and evaluation metrics for the paper {text_before_abstract}"))

    return response


def save_transcript(response_choices, folder_transcripts, arxiv_name):
    """
    Save the first element of response_choices into a text file in the specified directory.

    Parameters:
    - response_choices (list): A list of choices where the first element is the transcript content.
    - folder_transcripts (str): The directory path where the transcript file will be saved.
    - arxiv_name (str): The name used for generating the transcript file.

    Returns:
    None: The function saves the transcript content to a text file.

    Example:
    save_transcript(["This is the transcript content."], "transcripts", "example_arxiv")
    """
    # Ensure the directory path is valid
    if not os.path.exists(folder_transcripts):
        os.makedirs(folder_transcripts)

    # Generate the file path
    file_path = os.path.join(folder_transcripts, f"{arxiv_name}.txt")

    # Save response_choices[0] to the file
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(response_choices[0])

    print(f"\nTranscript saved in: {file_path}")


def text_to_speech(text_for_TTS, arxiv_name, folder_audio):

    ELEVEN_LABS_API_KEY = os.environ.get("ELEVEN_LABS_API_KEY")

    CHUNK_SIZE = 1024
    url = "https://api.elevenlabs.io/v1/text-to-speech/bVMeCyTHy58xNoL34h3p"

    headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": ELEVEN_LABS_API_KEY
    }

    data = {
        "text": text_for_TTS,
        "model_id": "eleven_monolingual_v1",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.5
        }
    }

    # Generate a unique filename based on timestamp
    filename = f"output_{arxiv_name}.mp3"
    target_path = os.path.join(folder_audio, filename)

    # Check if the file already exists
    if os.path.exists(target_path):
        print(f"Recording file {filename} already exists in {folder_audio}. Skipping download.")
        return target_path

    response = requests.post(url, json=data, headers=headers)

    if response.status_code == 200:
        # Save the recording to the unique file
        with open(target_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                if chunk:
                    f.write(chunk)

        print(f"\nRecording saved in {target_path}")
        return target_path
    else:
        print(f"\n Error: {response.status_code} - {response.text}")


def convert_pdf_to_png(folder_images, pdf_file_path, arxiv_name):
    try:
        # Create a folder for storing the PNGs
        sub_folder_name = os.path.splitext(os.path.basename(pdf_file_path))[0] + "_pngs"
        full_path = os.path.join(folder_images, sub_folder_name)
        if not os.path.exists(full_path):
            os.makedirs(full_path)
    
        # Convert each page of the PDF to PNG
        images = convert_from_path(pdf_file_path, output_folder=full_path)
        # arxiv_name = sub_folder_name.replace("_pngs", "")
    
        # Save each image as a separate PNG file
        for i, image in enumerate(images):
            png_path = os.path.join(full_path, f"{arxiv_name}_page_{i + 1}.png")
            image.save(png_path, "PNG")
    
        print(f"\nAll pages converted and saved in the folder: {full_path}")
    
        # Clean up: Delete the .ppm files and uncropped files
        for filename in os.listdir(full_path):
            if filename.endswith(".ppm"):
                file_to_remove_path = os.path.join(full_path, filename)
                os.remove(file_to_remove_path)
    
        print(f"\n.ppm artifacts deleted in the folder: {full_path}")
    except (PDFPageCountError, PDFSyntaxError, PermissionError) as e:
        print(f"\nError: {e}")
        print(f"Skipping processing of {pdf_file_path}")
        if isinstance(e, PdfReadError):
            print("PdfReadError: Unable to read PDF file.")
        elif isinstance(e, PermissionError):
            print("PermissionError: Permission issue while processing the PDF file.")


def cut_pngs_in_half(image_folder):
    # Ensure the directory path is valid
    if not os.path.exists(image_folder):
        print(f"\nError: Directory '{image_folder}' does not exist.")
        return

    # Get a list of all files in the directory
    files = [f for f in os.listdir(image_folder) if os.path.isfile(os.path.join(image_folder, f))]

    # Process each file in the directory
    for file_name in files:
        # Check if the file is a PNG and does not contain 'cropped' in the name
        if file_name.lower().endswith('.png') and 'cropped' not in file_name.lower():
            image_path = os.path.join(image_folder, file_name)

            # Open the image
            with Image.open(image_path) as img:
                # Get the dimensions of the image
                width, height = img.size

                # Cut the image in half (top and bottom)
                top_half = img.crop((0, 0, width, height // 2))
                bottom_half = img.crop((0, height // 2, width, height))

                # Save the top and bottom halves with "_cropped_1" and "_cropped_2" suffixes
                top_half.save(os.path.join(image_folder, f"{os.path.splitext(file_name)[0]}_cropped_1.png"), 'PNG')
                bottom_half.save(os.path.join(image_folder, f"{os.path.splitext(file_name)[0]}_cropped_2.png"), 'PNG')

                print(f"\nImages saved: {file_name}_cropped_1.png (top) and {file_name}_cropped_2.png (bottom)")
        else:
            print(f"\nSkipping processing for {file_name} as it contains 'cropped' in the file name.")


def analyze_mp3_length(mp3_path):
    audio = AudioSegment.from_file(mp3_path)
    return len(audio) / 1000.0  # Length in seconds

def fetch_cropped_images(image_folder):
    # List all images in the folder
    all_images = os.listdir(image_folder)
    
    # Identify files to keep (those with the word "cropped" in their filenames)
    cropped_images = [image for image in all_images if image.lower().endswith('.png') and 'cropped' in image.lower()]
    
    # Delete files that do not contain the word "cropped"
    for image in all_images:
        if image not in cropped_images:
            os.remove(os.path.join(image_folder, image))
    
    # List the remaining images after deletion
    remaining_images = os.listdir(image_folder)
    
    # Sort the cropped images based on numeric values in their filenames
    sorted_images = sorted(remaining_images, key=lambda x: int(''.join(filter(str.isdigit, x))))
    return sorted_images


def move_uncropped_files(image_folder):
    try:
        # Create a new folder if it doesn't exist
        uncropped_folder = os.path.join(image_folder, "uncropped_pngs")
        if not os.path.exists(uncropped_folder):
            os.makedirs(uncropped_folder)

        # Loop through all files in the folder
        for filename in os.listdir(image_folder):
            file_path = os.path.join(image_folder, filename)

            # Check if the file name contains the word "cropped"
            if "cropped" not in filename:
                # Move the file to the uncropped folder
                new_path = os.path.join(uncropped_folder, filename)

                try:
                    shutil.move(file_path, new_path)
                    print(f"File moved to uncropped folder: {filename}")
                except Exception as move_error:
                    print(f"Error moving file {filename}: {move_error}")
                    continue

        print(f"All non-cropped files moved to the folder: {uncropped_folder}")
    except Exception as e:
        print(f"Error: {e}")


def create_video(mp3_path, image_folder, output_path):
    try:
        # Loop through all files in the folder
        for filename in os.listdir(image_folder):
            file_path = os.path.join(image_folder, filename)
            
            # Check if the file name contains the word "cropped"
            if "cropped" not in filename:
                # Remove the file
                os.remove(file_path)
                print(f"File removed: {filename}")
                
        print(f"All non-cropped files removed in the folder: {image_folder}")
    except Exception as e:
        print(f"Error: {e}")
    
    # Sort the images by converting the numeric parts of filenames into integers
    image_files = sorted([file for file in os.listdir(image_folder) if 'cropped' in file and file.lower().endswith('.png')],
                         key=lambda x: [int(part) if part.isdigit() else part for part in re.split(r'(\d+)', x)])
    audio_clip = AudioFileClip(mp3_path)
    
    # Calculate the duration of each image based on the total duration of the audio and the number of images
    image_duration = audio_clip.duration / len(image_files)
    
    clips = []
    
    for idx, image_file in enumerate(image_files):
        # Load each image using imageio
        image_path = os.path.join(image_folder, image_file)
        image = imageio.imread(image_path)
    
        if image.sum() == 0:
            print(f"Image {image_file} is blank. Skipping...")
            os.remove(image_file)
            continue
                
        # Create a clip from the image and set its duration
        image_clip = ImageClip(image).set_duration(image_duration)
    
        # Add the image clip to the list of clips
        clips.append(image_clip)
    
    # Concatenate the image clips to create the final video
    final_clip = concatenate_videoclips(clips, method="compose")
    final_clip = final_clip.set_audio(audio_clip)
    
    # Write the final video with audio
    final_clip.write_videofile(output_path, codec="libx264", audio_codec="aac", fps=24, verbose=True)
    print(f"\nFinal video saved at: {output_path}.")   


In [35]:
# def process_url(url):

paper="2402.13254"
host="127.0.0.1"
port="19531"

url = f"https://arxiv.org/pdf/{paper}.pdf"

COLLECTION_NAME = "MilvusDocs"
HF_EOS_TOKEN_LENGTH = 3
LLM_NAME = "gpt-3.5-turbo"
TEMPERATURE = 0.1
RANDOM_SEED = 415
M=16

MAX_SEQ_LENGTH = 600 
CHUNK_OVERLAP=75

folder_pdfs = "pdfs"
folder_images = "images"
folder_final_videos = "final_videos"
folder_audio = "audio_voiceovers"
folder_transcripts = "transcripts"

# # Call the function to create the folder
# create_folder(folder_pdfs)
# create_folder(folder_images)
# create_folder(folder_audio)
# create_folder(folder_final_videos)
# create_folder(folder_transcripts)

# Download and save a PDF file from an arXiv.org URL into local directory.
arxiv_id = download_and_save_pdf(url, folder_pdfs)

arxiv_name = arxiv_id.replace(".", "_")
pdf_path = os.path.join(folder_pdfs, arxiv_name)
pdf_file_path = os.path.join(pdf_path, f"{arxiv_name}.pdf")
image_folder = f"{folder_images}/{arxiv_name}_pngs" 
mp3_path = f"{folder_audio}/output_{arxiv_name}.mp3"
output_path = f"{folder_final_videos}/{arxiv_name}.mp4" 

# Download open source embedding model "WhereIsAI/UAE-Large-V1" via Huggingface's Sentence Transformers
# encoder, EMBEDDING_DIM, MAX_SEQ_LENGTH = download_and_initialize_embedding_model()

# Create a no-schema milvus collection and define the database index
milvus_client = create_milvus_collection(COLLECTION_NAME, my_uri)

# Load PDF's into a PDF object using LangChain's PyPDFLoader
loader = PyPDFLoader(f"{pdf_path}/{arxiv_name}.pdf")
docs = loader.load()

text_before_abstract = re.search(r'^(.*?)\bAbstract\b', docs[0].page_content, re.DOTALL)

# Cut text from PDF's into chunks using LangChain's RecursiveCharacterTextSplitter
chunks = split_documents_to_chunks(docs, MAX_SEQ_LENGTH, CHUNK_OVERLAP)

# Insert text chunks into Milvus vector database using index type HNSW Indexing and Cosine Distance
vector_store = insert_chunks_into_milvus(chunks, COLLECTION_NAME, host, port)

response = retrieve_context_and_generate_response(vector_store, text_before_abstract)

response
    
    # # Generate transcript using OpenAI based on the cosine distance search of the document then using gpt-3.5-turbo's chat completions
    # text_for_TTS_list = search_and_generate_response(
    #     milvus_client, # Running session via docker container on port http://localhost:19531
    #     encoder, # Sentence Transformer WhereIsAI/UAE-Large-V1
    #     COLLECTION_NAME, # MilvusDocs by default
    #     LLM_NAME,
    #     TEMPERATURE,
    #     RANDOM_SEED,
    #     M)
    
    # # Save LLM-generated voiceover script to directory
    # save_transcript(text_for_TTS_list, folder_transcripts, arxiv_name)
    
    # # convert text to speech with Elevenlabs
    # audio_path = text_to_speech(text_for_TTS_list[0], arxiv_name, folder_audio)
    
    # # convert each pdf to a png
    # convert_pdf_to_png(folder_images, pdf_file_path, arxiv_name)
    
    # # cut png's in half
    # cut_pngs_in_half(image_folder)

    # move_uncropped_files(image_folder)
    
    # # combine png's with audio to generate an mp4
    # create_video(mp3_path, image_folder, output_path)
    # milvus_client.drop_collection(collection_name=COLLECTION_NAME)
    # return folder_final_videos

The folder 'pdfs/2402_13254' already exists.

PDF downloaded and saved as 2402_13254.pdf

Collection had previously been created, dropping previous collection to initialize anew: `MilvusDocs`

Successfully created collection: `MilvusDocs`
{'collection_name': 'MilvusDocs', 'auto_id': True, 'num_shards': 1, 'description': '', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': 5, 'params': {}, 'element_type': 0, 'auto_id': True, 'is_primary': True}, {'field_id': 101, 'name': 'vector', 'description': '', 'type': 101, 'params': {'dim': 3072}, 'element_type': 0}], 'aliases': [], 'collection_id': 448370747955370650, 'consistency_level': 3, 'properties': {}, 'num_partitions': 1, 'enable_dynamic_field': True}

Chunks inserted into Milvus database.


AIMessage(content='The research paper titled "CounterCurate: Enhancing Physical and Semantic Vi" presents a novel approach to enhancing physical and semantic visualizations. The key contributions of this paper include the development of a system that integrates physical and digital elements to create an interactive and immersive visualization experience. The researchers involved in this project are from institutions that are not specified in the provided text.\n\nThe motivation behind this research was to address the limitations of existing visualization techniques that often rely solely on digital interfaces. By combining physical and digital elements, the researchers aimed to create a more engaging and intuitive visualization platform.\n\nThe technologies previously known and involved in this research include Milvus and OpenAI Embeddings. Milvus is a vector database that enables efficient storage and retrieval of high-dimensional vectors, while OpenAI Embeddings are pre-trained model

In [36]:
response.content

'The research paper titled "CounterCurate: Enhancing Physical and Semantic Vi" presents a novel approach to enhancing physical and semantic visualizations. The key contributions of this paper include the development of a system that integrates physical and digital elements to create an interactive and immersive visualization experience. The researchers involved in this project are from institutions that are not specified in the provided text.\n\nThe motivation behind this research was to address the limitations of existing visualization techniques that often rely solely on digital interfaces. By combining physical and digital elements, the researchers aimed to create a more engaging and intuitive visualization platform.\n\nThe technologies previously known and involved in this research include Milvus and OpenAI Embeddings. Milvus is a vector database that enables efficient storage and retrieval of high-dimensional vectors, while OpenAI Embeddings are pre-trained models that capture sem

In [6]:
milvus_client.list_collections()

['MilvusDocs']

In [28]:

# Extract substring
text_before_abstract = re.search(r'^(.*?)\bAbstract\b', docs[0].page_content, re.DOTALL)

page_header.group(1)

'CounterCurate: Enhancing Physical and Semantic Visio-Linguistic\nCompositional Reasoning via Counterfactual Examples\nJianrui Zhang*1Mu Cai∗1Tengyang Xie1,2Yong Jae Lee1\njzhang2427@wisc.edu, {mucai,tx,yongjaelee}@cs.wisc.edu\n1University of Wisconsin–Madison2Microsoft Research\nhttps://countercurate.github.io\n'

In [38]:
from pymilvus import Collection
collection = Collection("MilvusDocs")      # Get an existing collection.
collection.load()

result = collection.query(
  expr="", 
  output_fields = ["text", "source", "page"], # only other option is vector
  limit = 10
)

result[1]

{'text': 'explored problems: the neglect of physically\ngrounded reasoning (counting and position un-\nderstanding) and the potential of using highly\ncapable text and image generation models for\nsemantic counterfactual fine-tuning. Our work\npioneers an approach in addressing these gaps.\nWe first spotlight the near-chance performance\nof multimodal models like CLIP and LLaV A in\nphysically grounded compositional reasoning.\nWe then apply simple data augmentation using\ngrounded image generation model GLIGEN to\ngenerate fine-tuning data, resulting in signif-\nicant performance improvements: +33% and',
 'source': 'pdfs/2402_13254/2402_13254.pdf',
 'page': 0,
 'pk': 448370747954954739}

In [74]:
result[0]['text']

'CounterCurate: Enhancing Physical and Semantic Visio-Linguistic\nCompositional Reasoning via Counterfactual Examples\nJianrui Zhang*1Mu Cai∗1Tengyang Xie1,2Yong Jae Lee1\njzhang2427@wisc.edu, {mucai,tx,yongjaelee}@cs.wisc.edu\n1University of Wisconsin–Madison2Microsoft Research\nhttps://countercurate.github.io\nAbstract\nWe propose CounterCurate, a framework to\ncomprehensively improve the visio-linguistic\ncompositional reasoning capability for both\ncontrastive and generative multimodal models.\nIn particular, we identify two critical under-\nexplored problems: the neglect of physically'

In [None]:
result.

In [41]:
from langchain.chains import ConversationalRetrievalChain
# chat_history = vector_store
# SYSTEM_PROMPT = "What are the key contributions of this paper and the evaluation metrics for the paper titled CounterCurate: Enhancing Physical and Semantic Visio-Linguistic Compositional Reasoning via Counterfactual Examples"

SYSTEM_PROMPT = f"summarize the paper {text_before_abstract} "

chat = llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0) 
qa = ConversationalRetrievalChain.from_llm(
    llm=chat, retriever=vector_store.as_retriever(), return_source_documents=True
)

result =  qa({"question": SYSTEM_PROMPT, "chat_history": ""})
result

{'question': "summarize the paper <re.Match object; span=(0, 315), match='CounterCurate: Enhancing Physical and Semantic Vi> ",
 'chat_history': '',
 'answer': 'The paper "CounterCurate" proposes a framework to enhance visio-linguistic compositional reasoning in multimodal models. It addresses the neglect of physically grounded reasoning and utilizes text and image generation models for semantic counterfactual fine-tuning. The framework shows significant performance improvements on benchmarks, outperforming other models. By curating challenging semantic counterfactuals and fine-tuning with accurate negative samples, CounterCurate demonstrates effectiveness in improving compositional reasoning capabilities. The authors will release their code, dataset, benchmark, and checkpoints for further research.',
 'source_documents': [Document(page_content='CounterCurate: Enhancing Physical and Semantic Visio-Linguistic\nCompositional Reasoning via Counterfactual Examples\nJianrui Zhang*1Mu Cai∗1T

In [42]:
result['answer']

'The paper "CounterCurate" proposes a framework to enhance visio-linguistic compositional reasoning in multimodal models. It addresses the neglect of physically grounded reasoning and utilizes text and image generation models for semantic counterfactual fine-tuning. The framework shows significant performance improvements on benchmarks, outperforming other models. By curating challenging semantic counterfactuals and fine-tuning with accurate negative samples, CounterCurate demonstrates effectiveness in improving compositional reasoning capabilities. The authors will release their code, dataset, benchmark, and checkpoints for further research.'

In [43]:
SYSTEM_PROMPT = f"tell me about the research approach of the paper {text_before_abstract} and key findings. What metrics are used to evaluate performance?"

chat = llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0) 
qa = ConversationalRetrievalChain.from_llm(
    llm=chat, retriever=vector_store.as_retriever(), return_source_documents=True
)

In [44]:
result2 =  qa({"question": SYSTEM_PROMPT, "chat_history": ""})
result2

{'question': "tell me about the research approach of the paper <re.Match object; span=(0, 315), match='CounterCurate: Enhancing Physical and Semantic Vi> and key findings. What metrics are used to evaluate performance?",
 'chat_history': '',
 'answer': 'The research approach of the paper "CounterCurate: Enhancing Physical and Semantic Visio-Linguistic Compositional Reasoning via Counterfactual Examples" involves proposing the CounterCurate framework to improve the visio-linguistic compositional reasoning capability for both contrastive and generative multimodal models. The framework addresses the neglect of physically grounded reasoning and utilizes text and image generation models for semantic counterfactual fine-tuning.\n\nKey findings of the paper include significant performance improvements compared to vanilla CLIP/LLaV A-1.5 model and advanced models like GPT-4V. The fine-tuned CounterCurate model outperforms the state-of-the-art LMM GPT-4V in various categories, showing significa

In [46]:
result2['answer']

'The research approach of the paper "CounterCurate: Enhancing Physical and Semantic Visio-Linguistic Compositional Reasoning via Counterfactual Examples" involves proposing the CounterCurate framework to improve the visio-linguistic compositional reasoning capability for both contrastive and generative multimodal models. The framework addresses the neglect of physically grounded reasoning and utilizes text and image generation models for semantic counterfactual fine-tuning.\n\nKey findings of the paper include significant performance improvements compared to vanilla CLIP/LLaV A-1.5 model and advanced models like GPT-4V. The fine-tuned CounterCurate model outperforms the state-of-the-art LMM GPT-4V in various categories, showing significant improvements over the "add" category. The framework also demonstrates better performance on datasets like Flickr30k-Positions and benchmarks like SugarCrepe.\n\nTo evaluate performance, the paper uses metrics such as percentage improvements over base

In [76]:
from langchain.chains import SimpleSequentialChain, LLMChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate


# Define your prompt templates
summary_template = f"Summarize the key points of the paper {text_before_abstract}. What is the key research contribution of the paper {text_before_abstract}"
guest_template = f"What are the previous or related work and existing advancements that the paper {text_before_abstract} is based upon and is advancing?"
host_template = f"tell me about the research approach of the paper {text_before_abstract} and key findings. What metrics are used to evaluate performance?"

# Create PromptTemplates
SUMMARY_PROMPT = PromptTemplate(template=summary_template, input_variables=["text_before_abstract"])
GUEST_PROMPT = PromptTemplate(template=guest_template)
HOST_PROMPT = PromptTemplate(template=host_template)

# LLM to use in each stage
llm = OpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Create LLMChains for each stage
summary_llm_chain = LLMChain(llm=llm, prompt=SUMMARY_PROMPT)
guest_llm_chain = LLMChain(llm=llm, prompt=GUEST_PROMPT)
host_llm_chain = LLMChain(llm=llm, prompt=HOST_PROMPT)

# Create SequentialDocumentsChain
chain = SimpleSequentialChain(
    chains=[summary_llm_chain, guest_llm_chain, host_llm_chain], verbose=True, input_variables=["text_before_abstract"]
)

output = chain.run("CounterCurate: Enhancing Physical and Semantic Visio-Linguistic\nCompositional Reasoning via Counterfactual Examples")

output

KeyError: 'input_variables'

In [47]:
from langchain.chains import SimpleSequentialChain, LLMChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate

# Define your prompt templates
summary_template = f"Summarize the key points of the paper {text_before_abstract}. What is the key research contribution of the paper {text_before_abstract}"
guest_template = f"What are the previous or related work and existing advancements that the paper {text_before_abstract} is based upon and is advancing?"
host_template = f"tell me about the research approach of the paper {text_before_abstract} and key findings. What metrics are used to evaluate performance?"

# Create PromptTemplates
SUMMARY_PROMPT = PromptTemplate(template=summary_template, input_variables=["text"])
GUEST_PROMPT = PromptTemplate(template=guest_template, input_variables=["text"])
HOST_PROMPT = PromptTemplate(template=host_template, input_variables=["text"])

# LLM to use in each stage
llm = OpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Create LLMChains for each stage
summary_llm_chain = LLMChain(llm=llm, prompt=SUMMARY_PROMPT)
guest_llm_chain = LLMChain(llm=llm, prompt=GUEST_PROMPT)
host_llm_chain = LLMChain(llm=llm, prompt=HOST_PROMPT)

# Create SequentialDocumentsChain
chain = SimpleSequentialChain(
    chains=[summary_llm_chain, guest_llm_chain, host_llm_chain]
)

# Run the chain
results = chain.run(retriever=vector_store.as_retriever())

# Print the results
print(results)

ImportError: cannot import name 'SequentialDocumentsChain' from 'langchain.chains' (/Users/lily/anaconda3/envs/tts-yt/lib/python3.9/site-packages/langchain/chains/__init__.py)

In [77]:
response_template = """
You are a helpful bot that creates a 'thank you' reponse text. 
If customers are unsatisfied, offer them a real world assitant to talk to. 
You will get a sentiment and subject as into and evaluate. 

text: {input}
"""
outline_template = PromptTemplate(input_variables=["input"], template=response_template)
outline_chain = LLMChain(llm=llm, prompt=outline_template)

{'positive': LLMChain(prompt=PromptTemplate(input_variables=['input'], template='You are an AI that focuses on the positive side of things. Whenever you analyze a text, you look for the positive aspects and highlight them. Here is the text:\n{input}'), llm=OpenAI(client=<openai.resources.completions.Completions object at 0x280286100>, async_client=<openai.resources.completions.AsyncCompletions object at 0x280324430>, openai_api_key='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', openai_proxy='')),
 'neutral': LLMChain(prompt=PromptTemplate(input_variables=['input'], template='You are an AI that has a neutral perspective. You just provide a balanced analysis of the text, not favoring any positive or negative aspects. Here is the text:\n{input}'), llm=OpenAI(client=<openai.resources.completions.Completions object at 0x280286100>, async_client=<openai.resources.completions.AsyncCompletions object at 0x280324430>, openai_api_key='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',

In [86]:
text_before_abstract = re.search(r'^(.*?)\bAbstract\b', docs[0].page_content, re.DOTALL)

text_before_abstract = page_header.group(1)

In [100]:
dish_name = text_before_abstract



prompt_outline = PromptTemplate.from_template(
    template= """
    - Use only the the paper {dish_name} to answer the user's question. 
    - Answer in no less than 4000 characters or tokens. Be clear, factual, complete, concise. Answer the question and follow the instructions to the best of your ability.You will be provided a research paper and your task is to summarize the research paper into a 5 minute video as follows:
    - Outline the key points of the paper but do not output it. Use the outline as a guide to expand on a video voiceover of 3 minutes.
    - Edit the outline into a voiceover script for a 5 minute video
    - Clearly state why was the research done, what are the technologies that were previously known involved,
    how is the technique or actions performed advancing the field, what are the key metrics that define the success of the work 
    and what are future directions that lie ahead.
    - Do not write any fact which is not present in the paper
    
    - First, assume the role of a research scientist who has won accolates for being able to explain expert information to a high-schooler and is giving an overview briefing of a research project.
    - Write a clearly organized and to-the-point outline summary of the following research:,
    - The outline should have at least 3000 words and objectives should be clearly defined for each section of the paper while preserving the specifics address in the technology used or methods tried that have advanced the particular field.
    - Introduce the research scientists involved and the institutions involved if known.
    - Every single line in the outline should be in complete sentences, talk with dignity and sophistication. 
    - Use phrases such as "Our research presents", "This paper details the", do not use words such as realm, or start the sentence with "In the"
    - Assume the audience is asking why and how about the reasoning and logic of the content. 
    - Use present tense and do not use past tense.
    - Do not use phrases such as "x has been discussed, x has been highlighted", be as specific on the details as possible.
    - Make sure to answer clearly what is the major contribution of this body of work.
    - Pick the top 10 most occurring terminology in {dish_name} and define the terms.
    - The outline should answer to the point and in specific detail why was the research done, what are the technologies that were previously known involved, how is the technique or actions performed advancing the field, what are the key metrics that define the success of the work and what are future directions that lie ahead.
    - Make sure the output is at least 4000 characters or tokens in length, if not reference the paper {dish_name} to answer the previous points more in depth
""")

# outline_template = PromptTemplate(input_variables=["input"], template=response_template)
chain_outline = LLMChain(llm=llm, prompt=prompt_outline, output_key="outline")


from langchain.chains import SequentialChain

# This is an LLMChain to write a outline given a dish name and the experience.
prompt_summary = PromptTemplate.from_template(
    template="""  Use only the the paper {dish_name} to complete this task:
    - Use this {outline} as structure and {dish_name} for factual reference to convert each point in the outline to be one or more complete sentences in third person point of view, going into detail especially regarding the technicalities and key concepts of the research. Make sure that it is absolutely clear in specific detail why was the research done, what are the technologies that were previously known involved,
    - how is the technique or actions performed advancing the field, what are the key metrics that define the success of the work 
    and what are future directions that lie ahead.
    - The output should be 3 times as long as the outline, using the {dish_name} paper to summarize the related work, methods, evaluation metrics in detail.
    - refer to the paper {dish_name} to state facts presented, answering how the research was conducted.

    - Assume the role of the editor of the best ranking tv production company in the world. 
    - Format into a script but not screenplay to be broadcasted publicly in a 5 minute production of 4000 words for higher education consumption.
    - Introduce yourself to assume the role of a third party and do not assume the time of day, do not say good evening you are not the researcher but you represent
    the researcher in advocating for their work. Provide the narration only, do not format as a screenplay.
    - Spend at least 6 sentences delving deep into the research key findings and evaluation.
    - Do not start a paragraph with 'Good day, esteemed viewers.', do not greet audience, do not pretend to be the researcher
    - Make sure the output is at least 4000 characters or tokens in length, if not reference the paper {dish_name} to answer the previous points more in depth
    - """
)
chain_summary = LLMChain(llm=llm, prompt=prompt_summary, output_key="summary")

# This is an LLMChain to write a follow-up edit given the restaurant outline.
prompt_edit = PromptTemplate.from_template(
    template="""Edit the entire script {summary} for good flow, professionalism, redundancy removal in a voiceover text format. 
    - Make sure that it is obviously stated to the video viewer the related work that the paper is built on, 
    - In the edit, ensure that the voiceover text states the following specifics, if not, go back to the {dish_name} paper to extract more specific details:
    - what research was previously known in the past that the paper is based upon that the paper advances previously known information, what are the technologies that were previously known involved
    - specific methods used,
    - how is the technique, actions or methods performed advanced the field, or challenges the status-quo
    - what are the specific major contributions, and what are the evaluation metrics to prove it. What are the key metrics that define the success of the work
    - make sure to always reference the paper {dish_name}  what are the technologies that were previously known involved, 
    - what are future directions that lie ahead. Cite the grounding sources. 
    - Always start by stating the title and authors of the paper as the first few words.
    - Make sure the output is at least 4000 characters or tokens in length, if not reference the paper {dish_name} to answer the previous points more in depth
    """
)
chain_edit = LLMChain(llm=llm, prompt=prompt_edit, output_key="edit")


overall_chain = SequentialChain(
    chains=[chain_outline, chain_summary, chain_edit],
    input_variables=["dish_name"],
    output_variables=["outline","summary", "edit"],
)

result = overall_chain({"dish_name": text_before_abstract})
result

{'dish_name': 'CounterCurate: Enhancing Physical and Semantic Visio-Linguistic\nCompositional Reasoning via Counterfactual Examples\nJianrui Zhang*1Mu Cai∗1Tengyang Xie1,2Yong Jae Lee1\njzhang2427@wisc.edu, {mucai,tx,yongjaelee}@cs.wisc.edu\n1University of Wisconsin–Madison2Microsoft Research\nhttps://countercurate.github.io\n',
 'outline': '\nIntroduction:\n- Introduce the research scientists involved: Jianrui Zhang, Mu Cai, Tengyang Xie, and Yong Jae Lee.\n- Mention the institution involved: University of Wisconsin-Madison and Microsoft Research.\n- Explain that this paper details the research project titled "CounterCurate: Enhancing Physical and Semantic Visio-Linguistic Compositional Reasoning via Counterfactual Examples".\n- Our research presents a novel approach to enhancing visual and linguistic reasoning in machines through the use of counterfactual examples.\n\nWhy was the research done?\n- The main motivation behind this research was the need to improve the reasoning capabili

In [99]:
result['edit']

'\nHello and welcome to our latest production, featuring the groundbreaking research paper titled "CounterCurate: Enhancing Physical and Semantic Visio-Linguistic Compositional Reasoning via Counterfactual Examples." This paper was written by Jianrui Zhang, Mu Cai, Tengyang Xie, and Yong Jae Lee, esteemed research scientists from the University of Wisconsin-Madison and Microsoft Research.\n\nThis paper builds upon previous research in the field of physical and semantic visio-linguistic reasoning. While previous methods have struggled to effectively combine visual and linguistic information, the team behind CounterCurate has introduced the concept of counterfactual examples, greatly enhancing reasoning in this area.\n\nTo achieve this, the team developed a framework consisting of an image encoder, text encoder, and counterfactual generator. By utilizing counterfactual examples, the model is able to learn to reason in a more compositional manner, resulting in improved performance on data

In [39]:
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0) 
    
    context = vector_store.as_retriever()

    # QUESTION = "What are the key contributions of this paper and the evaluation metrics for the paper titled CounterCurate: Enhancing Physical and Semantic Visio-Linguistic Compositional Reasoning via Counterfactual Examples"
    QUESTION = f"who are the authors of the paper {text_before_abstract}"
    # Assemble retrieved context
    # metadata_fields = [f for f in output_fields if f != 'chunk']

    SYSTEM_PROMPT = f"""Use only the the paper {text_before_abstract} to answer the user's question. Answer in no less than 4000 characters. Be clear, factual, complete, concise. Answer the question and follow the instructions to the best of your ability.You will be provided a research paper and your task is to summarize the research paper into a 5 minute video as follows:
    - Outline the key points of the paper
    - Edit the outline into a voiceover script for a 5 minute video
    - Clearly state why was the research done, what are the technologies that were previously known involved,
    how is the technique or actions performed advancing the field, what are the key metrics that define the success of the work 
    and what are future directions that lie ahead.
    - Do not write any fact which is not present in the paper
    
    - First, assume the role of a research scientist who has won accolates for being able to explain expert information to a high-schooler and is giving an overview briefing of a research project.
    - Write a clearly organized and to-the-point outline summary of the following research:,
    - The outline should have 3000 words and objectives should be clearly defined for each section of the paper while preserving the specifics address in the technology used or methods tried that have advanced the particular field.
    - Introduce the research scientists involved and the institutions involved if known.
    - Every single line in the outline should be in complete sentences, talk with dignity and sophistication. 
    - Use phrases such as "Our research presents", "This paper details the", do not use words such as realm, or start the sentence with "In the"
    - Assume the audience is asking why and how about the reasoning and logic of the content. 
    - Use present tense and do not use past tense.
    - Do not use phrases such as "x has been discussed, x has been highlighted", be as specific on the details as possible.
    - Make sure to answer clearly what is the major contribution of this body of work.
    - The outline should answer to the point and in specific detail why was the research done, what are the technologies that were previously known involved,
    how is the technique or actions performed advancing the field, what are the key metrics that define the success of the work 
    and what are future directions that lie ahead.
    
    - After you have produced the outline, next convert each point in the outline to be one or more complete sentences in third person point of view, going into detail especially regarding the technicalities and key concepts of the research. Make sure that it is absolutely clear in specific detail why was the research done, what are the technologies that were previously known involved,
    how is the technique or actions performed advancing the field, what are the key metrics that define the success of the work 
    and what are future directions that lie ahead.
    - Always start by stating the title of the paper as the first few words.
    - Assume the role of the editor of the best ranking tv production company in the world. 
    - Format into a script but not screenplay to be broadcasted publicly in a 5 minute production of 4000 words for higher education consumption.
    - Introduce yourself to assume the role of a third party and do not assume the time of day, do not say good evening you are not the researcher but you represent
    the researcher in advocating for their work. Provide the narration only, do not format as a screenplay.
    - Spend at least 6 sentences delving deep into the research key findings and evaluation.
    - Do not start a paragraph with "Good day, esteemed viewers."
    
    - Lastly edit the entire script to make sure that it is obviously stated to the video viewer why was the research done, what are the technologies that were previously known involved,
    how is the technique or actions performed advancing the field, what are the key metrics that define the success of the work 
    and what are future directions that lie ahead. Cite the grounding sources. 
    Context: {context}
    Question: {QUESTION}
    """
    
    rag_prompt = PromptTemplate.from_template(SYSTEM_PROMPT)
    rag_chain = (
        {"context": context, "question": RunnablePassthrough()}
        | rag_prompt
        | llm
    )
    
    response = rag_chain.invoke(SYSTEM_PROMPT)
    response

AIMessage(content='The authors of the paper "CounterCurate: Enhancing Physical and Semantic Vi" are not explicitly mentioned in the provided text. However, based on the context of the research paper, it is likely that the authors are researchers or scientists involved in the development and implementation of the CounterCurate system.\n\nIn this research paper, the authors present the CounterCurate system, which focuses on enhancing both the physical and semantic aspects of virtual reality (VR) experiences. The key objective of this research is to improve the overall quality and realism of VR environments by integrating physical interactions and semantic understanding.\n\nThe technologies involved in this research include Milvus, an open-source vector database, and OpenAI Embeddings, which are used to represent and process semantic information within the VR environment. These technologies are known for their capabilities in handling large-scale data and extracting meaningful insights fr

In [32]:
response = retrieve_context_and_generate_response(vector_store)
response

AIMessage(content='Title: CounterCurate: Enhancing Physical and Semantic Visio-Linguistic Compositional Reasoning via Counterfactual Examples\n\nOutline Summary:\n\n1. Introduction\n    - Introduce the research paper titled "CounterCurate: Enhancing Physical and Semantic Visio-Linguistic Compositional Reasoning via Counterfactual Examples"\n    - Highlight the key researchers involved in the study and their affiliations\n    - Provide an overview of the purpose of the research and the technologies involved\n\n2. Background\n    - Discuss the existing technologies and methods related to physical and semantic visio-linguistic compositional reasoning\n    - Explain the limitations of current approaches and the need for advancements in the field\n\n3. Methodology\n    - Detail the technique of using counterfactual examples to enhance reasoning in physical and semantic visio-linguistic tasks\n    - Describe how the researchers implemented this technique and the specific actions taken in the

In [5]:
# paper_list = ["2402.13254", "2403.07874", "2403.07872", "2403.07870","2403.07869"]
paper_list = ["2403.07867","2308.08079"]

In [6]:
for paper in paper_list:

    url = f"https://arxiv.org/pdf/{paper}.pdf"
    process_url(url)

The folder 'pdfs' already exists.
The folder 'images' already exists.
The folder 'audio_voiceovers' already exists.
The folder 'final_videos' already exists.
The folder 'transcripts' already exists.
The folder 'pdfs/2403_07867' already exists.

PDF downloaded and saved as 2403_07867.pdf

device: cpu


No sentence-transformers model found with name WhereIsAI/UAE-Large-V1. Creating a new one with MEAN pooling.



Datatype of SentenceTransformer encoded object<class 'sentence_transformers.SentenceTransformer.SentenceTransformer'>


What the encoder object looks like: SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)


model_name: WhereIsAI/UAE-Large-V1

EMBEDDING_DIM: 1024

MAX_SEQ_LENGTH: 512

Collection had previously been created, dropping previous collection to initialize anew: `MilvusDocs`

Successfully created collection: `MilvusDocs`
{'collection_name': 'MilvusDocs', 'auto_id': True, 'num_shards': 1, 'description': '', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': 5, 'params': {}, 'element_type': 0,

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 27.98it/s]

Finished inserting entities






Number of chunks inserted into Milvus database: 86 with chunk id starting at number: 448349453448642987



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


('\n'
 'Answer 1: Title: LazyBoE: Lazy Propagation with Edge Selection for '
 'Kinodynamic Planning\n'
 '\n'
 'Outline Summary:\n'
 '\n'
 'I. Introduction\n'
 '    A. Introduce the research paper titled "LazyBoE: Lazy Propagation with '
 'Edge Selection for Kinodynamic Planning."\n'
 '    B. Mention the research scientists involved and the institutions where '
 'the research was conducted.\n'
 '    C. Highlight the importance of kinodynamic motion planning and the need '
 'for efficient algorithms in this field.\n'
 '\n'
 'II. Background\n'
 '    A. Discuss the existing challenges in kinodynamic motion planning.\n'
 '    B. Explain the significance of lazy techniques in improving '
 'performance.\n'
 '    C. Introduce the concept of LazyBoE and its approach to kinodynamic '
 'planning.\n'
 '\n'
 'III. Methodology\n'
 '    A. Describe the LazyBoE algorithm in detail, focusing on lazy '
 'propagation and edge selection.\n'
 '    B. Explain how LazyBoE utilizes probabilistic evaluation of

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



All pages converted and saved in the folder: images/2403_07867_pngs

.ppm artifacts deleted in the folder: images/2403_07867_pngs

Images saved: 2403_07867_page_1.png_cropped_1.png (top) and 2403_07867_page_1.png_cropped_2.png (bottom)

Images saved: 2403_07867_page_3.png_cropped_1.png (top) and 2403_07867_page_3.png_cropped_2.png (bottom)

Images saved: 2403_07867_page_2.png_cropped_1.png (top) and 2403_07867_page_2.png_cropped_2.png (bottom)

Images saved: 2403_07867_page_6.png_cropped_1.png (top) and 2403_07867_page_6.png_cropped_2.png (bottom)

Images saved: 2403_07867_page_7.png_cropped_1.png (top) and 2403_07867_page_7.png_cropped_2.png (bottom)

Images saved: 2403_07867_page_5.png_cropped_1.png (top) and 2403_07867_page_5.png_cropped_2.png (bottom)

Images saved: 2403_07867_page_4.png_cropped_1.png (top) and 2403_07867_page_4.png_cropped_2.png (bottom)
Error moving file 2403_07867_page_1.png: name 'shutil' is not defined
Error moving file 2403_07867_page_3.png: name 'shutil' is

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Moviepy - Building video final_videos/2403_07867.mp4.
MoviePy - Writing audio in 2403_07867TEMP_MPY_wvf_snd.mp4


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


MoviePy - Done.
Moviepy - Writing video final_videos/2403_07867.mp4



                                                                                

Moviepy - Done !
Moviepy - video ready final_videos/2403_07867.mp4

Final video saved at: final_videos/2403_07867.mp4.
The folder 'pdfs' already exists.
The folder 'images' already exists.
The folder 'audio_voiceovers' already exists.
The folder 'final_videos' already exists.
The folder 'transcripts' already exists.
The folder 'pdfs/2308_08079' has been created.

PDF downloaded and saved as 2308_08079.pdf

device: cpu


No sentence-transformers model found with name WhereIsAI/UAE-Large-V1. Creating a new one with MEAN pooling.



Datatype of SentenceTransformer encoded object<class 'sentence_transformers.SentenceTransformer.SentenceTransformer'>


What the encoder object looks like: SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)


model_name: WhereIsAI/UAE-Large-V1

EMBEDDING_DIM: 1024

MAX_SEQ_LENGTH: 512

Successfully created collection: `MilvusDocs`
{'collection_name': 'MilvusDocs', 'auto_id': True, 'num_shards': 1, 'description': '', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': 5, 'params': {}, 'element_type': 0, 'auto_id': True, 'is_primary': True}, {'field_id': 101, 'name': 'vector', 'description': '', 'type': 1

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 24.64it/s]

Finished inserting entities






Number of chunks inserted into Milvus database: 156 with chunk id starting at number: 448349453448643074

('\n'
 'Answer 1: Title: Advancements in Lower-Dimensional Space Stabilization Using '
 'Rigid Transformations: A Novel Approach\n'
 '\n'
 'Hello, esteemed viewers. Today, we will delve into a groundbreaking research '
 'paper titled "Advancements in Lower-Dimensional Space Stabilization Using '
 'Rigid Transformations: A Novel Approach." This paper, authored by a team of '
 'esteemed researchers, presents a novel technique that significantly advances '
 "the field of lower-dimensional space stabilization. Let's explore the key "
 'contributions of this paper and the evaluation metrics that demonstrate how '
 'this work pushes the boundaries of previously known information.\n'
 '\n'
 '1. Introduction:\n'
 '- The research team, led by prominent scientists, embarked on this study to '
 'address the challenges associated with lower-dimensional space '
 'stabilization.\n'
 '- The pape

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



All pages converted and saved in the folder: images/2308_08079_pngs

.ppm artifacts deleted in the folder: images/2308_08079_pngs

Images saved: 2308_08079_page_15.png_cropped_1.png (top) and 2308_08079_page_15.png_cropped_2.png (bottom)

Images saved: 2308_08079_page_29.png_cropped_1.png (top) and 2308_08079_page_29.png_cropped_2.png (bottom)

Images saved: 2308_08079_page_28.png_cropped_1.png (top) and 2308_08079_page_28.png_cropped_2.png (bottom)

Images saved: 2308_08079_page_14.png_cropped_1.png (top) and 2308_08079_page_14.png_cropped_2.png (bottom)

Images saved: 2308_08079_page_16.png_cropped_1.png (top) and 2308_08079_page_16.png_cropped_2.png (bottom)

Images saved: 2308_08079_page_17.png_cropped_1.png (top) and 2308_08079_page_17.png_cropped_2.png (bottom)

Images saved: 2308_08079_page_13.png_cropped_1.png (top) and 2308_08079_page_13.png_cropped_2.png (bottom)

Images saved: 2308_08079_page_12.png_cropped_1.png (top) and 2308_08079_page_12.png_cropped_2.png (bottom)

Imag

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Moviepy - Building video final_videos/2308_08079.mp4.
MoviePy - Writing audio in 2308_08079TEMP_MPY_wvf_snd.mp4


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


MoviePy - Done.
Moviepy - Writing video final_videos/2308_08079.mp4



                                                                                

Moviepy - Done !
Moviepy - video ready final_videos/2308_08079.mp4

Final video saved at: final_videos/2308_08079.mp4.


In [None]:
# Gradio interface
iface = gr.Interface(
    fn=process_url,
    inputs=gr.Textbox(placeholder="Enter arXiv PDF URL"),
    outputs=gr.Video(),
    live=True,
    theme="sky",
    flagging_options=None,  # Disable the flag button
    title="Arxiv2Video",
)

# Add a submit button
submit_button = gr.Button()
iface.launch(share=True)


In [None]:
iface.close()