In [107]:
import whisper
import cv2
from typing import List
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
import warnings
import pickle

import chromadb
from chromadb.api.segment import API
import os
from dotenv import load_dotenv
from chromadb.config import Settings
from langchain.vectorstores import Chroma

In [28]:
# Chroma client settings: persist under the 'db' directory and turn off
# anonymized telemetry.
# NOTE(review): no visible cell passes CHROMA_SETTINGS to a Chroma constructor,
# and the stores created further down use persist_directory='db2' — confirm
# whether this object is still needed.
CHROMA_SETTINGS = Settings(
        persist_directory='db',
        anonymized_telemetry=False
)

In [29]:
from langchain.document_loaders.base import BaseLoader, Document
from langchain.document_loaders import (
    CSVLoader,
    EverNoteLoader,
    PyMuPDFLoader,
    TextLoader,
    UnstructuredEmailLoader,
    UnstructuredEPubLoader,
    UnstructuredHTMLLoader,
    UnstructuredMarkdownLoader,
    UnstructuredODTLoader,
    UnstructuredPowerPointLoader,
    UnstructuredWordDocumentLoader,
)

In [30]:
class CustomVideoLoader(BaseLoader):
    """Document loader that transcribes an audio/video file with Whisper.

    ``load`` returns a single Document whose text contains one line per
    transcription segment, formatted as
    ``HH:MM:SS-HH:MM:SS || <segment text>``.
    """

    def __init__(self, file_path, model = 'base'):
        """Remember the media path and load the requested Whisper model.

        Args:
            file_path: Path of the media file later passed to ``transcribe``.
            model: Whisper model name. Surrounding whitespace and letter case
                are ignored.

        Raises:
            ValueError: If the normalised name is not listed by
                ``whisper.available_models()``.
        """
        # Imported lazily so the class can be defined without Whisper loaded.
        import whisper

        self.file_path = file_path

        model_name = model.strip().lower()
        available = whisper.available_models()
        if model_name not in available:
            raise ValueError(
                f"{model} is not available. "
                "Please select one of the following models.\n"
                f"{available}")

        # Load with the same normalised name that was validated above;
        # passing the raw string would reject inputs such as "Base" or
        # " base " even though they passed the availability check.
        self.model = whisper.load_model(model_name)

    @staticmethod
    def __timestamp__(t):
        """Format a number of seconds as a zero-padded ``HH:MM:SS`` string.

        Fractional seconds are truncated via ``int``.
        """
        hours, remainder = divmod(int(t), 3600)
        minutes, seconds = divmod(remainder, 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

    def load(self):
        """Transcribe the file and return it as a one-element Document list.

        Returns:
            ``[Document]`` whose ``page_content`` holds one timestamped line
            per segment and whose metadata is ``{"source": file_path}``.
        """
        result = self.model.transcribe(self.file_path)
        # Build the lines first and join once instead of growing a string
        # segment by segment.
        lines = [
            f"{self.__timestamp__(segment['start'])}-{self.__timestamp__(segment['end'])} || {segment['text']}\n"
            for segment in result['segments']
        ]
        metadata = {"source": self.file_path}
        return [Document(page_content="".join(lines), metadata=metadata)]

In [31]:
class CustomEmailLoader(UnstructuredEmailLoader):
    """Email loader that retries with the text/plain part when no HTML part exists."""

    def load(self):
        """Load the email, falling back to plain text if the HTML body is missing.

        Any exception that escapes is re-raised as the same exception type
        with the file path prepended to its message.
        """
        try:
            try:
                return UnstructuredEmailLoader.load(self)
            except ValueError as html_error:
                # Only the "no HTML part" failure is recoverable here.
                if 'text/html content not found in email' not in str(html_error):
                    raise
                # Switch the content source and try once more.
                self.unstructured_kwargs["content_source"] = "text/plain"
                return UnstructuredEmailLoader.load(self)
        except Exception as error:
            # Prefix the message with the offending file's path.
            raise type(error)(f"{self.file_path}: {error}") from error

In [13]:
# Map file extensions to document loaders and their arguments.
# Each value is a (loader_class, kwargs) pair: load_single_document looks the
# extension up here and calls loader_class(file_path, **kwargs).load().
# Keys are lower-case and include the leading dot.
LOADER_MAPPING = {
    ".csv": (CSVLoader, {"encoding": "utf8"}),
    ".doc": (UnstructuredWordDocumentLoader, {}),
    ".docx": (UnstructuredWordDocumentLoader, {}),
    ".enex": (EverNoteLoader, {}),
    ".eml": (CustomEmailLoader, {}),
    ".epub": (UnstructuredEPubLoader, {}),
    ".html": (UnstructuredHTMLLoader, {}),
    ".md": (UnstructuredMarkdownLoader, {}),
    ".odt": (UnstructuredODTLoader, {}),
    ".pdf": (PyMuPDFLoader, {}),
    ".ppt": (UnstructuredPowerPointLoader, {}),
    ".pptx": (UnstructuredPowerPointLoader, {}),
    ".txt": (TextLoader, {"encoding": "utf8"}),
    # Audio and video files are both transcribed by CustomVideoLoader.
    ".mp4": (CustomVideoLoader, {"model":'base'}),
    ".mp3": (CustomVideoLoader, {"model":'base'})
}

In [14]:
# NOTE(review): `model` holds a CustomVideoLoader (a loader, not the Whisper
# model itself), and the path is an absolute, user-specific Windows path —
# consider a configurable data directory so the notebook runs elsewhere.
model = CustomVideoLoader(r'C:\Users\hotal\Downloads\OpenAI_DevDay.mp4', model = 'base')

In [15]:
data = model.load()

In [18]:
#print(data[0].page_content)

In [103]:

def load_single_document(file_path: str) -> List[Document]:
    """Load one file with the loader registered for its extension.

    Args:
        file_path: Path of the file to load.

    Returns:
        The documents produced by the matching loader's ``load()``, or an
        empty list when the extension has no entry in ``LOADER_MAPPING``.
    """
    # splitext inspects only the final path component, so a dot in a
    # directory name is not mistaken for an extension.
    ext = os.path.splitext(file_path)[1].lower()
    if ext not in LOADER_MAPPING:
        print(f"Unsupported file extension: '{ext}'\n"
              "Skipping file")
        # Return an empty list (not None) so callers can always extend().
        return []

    loader_class, loader_args = LOADER_MAPPING[ext]
    loader = loader_class(file_path, **loader_args)
    return loader.load()
        

def load_documents(source_dir: str, ignored_files: List[str] = [], loaded_file_set_file: str = "loaded_files.pkl") -> List[Document]:
    """
    Loads all documents from the source documents directory, ignoring specified files and skipping already loaded files

    Args:
        source_dir: Directory that is walked recursively for supported files.
        ignored_files: Paths to skip. The list is only read, never mutated.
        loaded_file_set_file: Pickle file holding the set of paths loaded by
            previous calls; it is rewritten at the end of every call.

    Returns:
        The documents of all newly loaded files, in the order the files were
        discovered.
    """
    # Imported here because no import cell in the notebook provides these names.
    from multiprocessing import Pool
    from tqdm import tqdm

    # Load existing set of loaded files.
    # NOTE(review): pickle.load can execute arbitrary code; only point
    # loaded_file_set_file at a file this notebook wrote itself.
    loaded_files = set()
    if os.path.exists(loaded_file_set_file):
        with open(loaded_file_set_file, 'rb') as f:
            loaded_files = pickle.load(f)

    # str.endswith accepts a tuple, so one case-insensitive check covers
    # every supported extension.
    supported_exts = tuple(ext.lower() for ext in LOADER_MAPPING)
    all_files = [
        os.path.join(foldername, filename)
        for foldername, _subfolders, filenames in os.walk(source_dir)
        for filename in filenames
        if filename.lower().endswith(supported_exts)
    ]

    # Filter out ignored files and already loaded files
    filtered_files = [
        file_path for file_path in all_files
        if file_path not in ignored_files and file_path not in loaded_files
    ]

    results = []
    if filtered_files:  # do not spin up worker processes for nothing
        # NOTE(review): worker processes must be able to import
        # load_single_document; that may not hold for a function defined
        # interactively on spawn-based platforms — confirm.
        with Pool(processes=os.cpu_count()) as pool:
            with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
                # imap preserves input order, so zip pairs every result with
                # the file that produced it (imap_unordered plus an index
                # would attribute results to the wrong file).
                for file_path, docs in zip(filtered_files, pool.imap(load_single_document, filtered_files)):
                    results.extend(docs or [])
                    # Update the set of loaded files
                    loaded_files.add(file_path)
                    pbar.update()

    # Save the updated set of loaded files to pickle file
    with open(loaded_file_set_file, 'wb') as f:
        pickle.dump(loaded_files, f)

    return results

def process_documents(ignored_files: List[str] = []) -> List[Document]:
    """
    Load the new documents and cut them into text chunks.

    Reads the module-level names ``source_directory``, ``chunk_size`` and
    ``chunk_overlap``.
    NOTE(review): none of those three names is assigned in the visible cells —
    confirm they are defined before this function is called.
    """
    print(f"Loading documents from {source_directory}")
    loaded = load_documents(source_directory, ignored_files)
    if not loaded:
        print("No new documents to load")
        # NOTE(review): exit() behaves differently in a notebook kernel than
        # in a plain script — confirm this is the intended way to stop.
        exit(0)
    print(f"Loaded {len(loaded)} new documents from {source_directory}")

    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_documents(loaded)
    print(f"Split into {len(chunks)} chunks of text (max. {chunk_size} tokens each)")
    return chunks


In [46]:
# Splitter for the transcript: chunk_size=1000 with an overlap of 100 between
# neighbouring chunks (presumably measured in characters — confirm against
# the splitter's length function).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

In [47]:
documents = text_splitter.split_documents(data)

In [48]:
# Embedding function shared by both Chroma(...) calls below, so documents are
# indexed and queried with the same model.
embeddings = HuggingFaceEmbeddings(model_name = 'all-MiniLM-L6-v2')

In [49]:
# Vector store backed by the 'db2' directory; the same directory is reopened
# later in the notebook for querying.
db = Chroma(
    embedding_function= embeddings, 
    persist_directory = 'db2'
)

In [50]:
db.add_documents(documents)

['448d17e9-8642-11ee-904b-803049142908',
 '448d17ea-8642-11ee-82fb-803049142908',
 '448d17eb-8642-11ee-90b8-803049142908',
 '448d17ec-8642-11ee-9c23-803049142908',
 '448d17ed-8642-11ee-a516-803049142908',
 '448d17ee-8642-11ee-aaed-803049142908',
 '448d17ef-8642-11ee-8076-803049142908',
 '448d17f0-8642-11ee-b554-803049142908',
 '448d17f1-8642-11ee-a5b0-803049142908',
 '448d17f2-8642-11ee-935f-803049142908',
 '448d17f3-8642-11ee-aaec-803049142908',
 '448d17f4-8642-11ee-b18c-803049142908',
 '448d17f5-8642-11ee-b16b-803049142908',
 '448d17f6-8642-11ee-bec3-803049142908',
 '448d17f7-8642-11ee-942c-803049142908',
 '448d17f8-8642-11ee-99c5-803049142908',
 '448d17f9-8642-11ee-8bdc-803049142908',
 '448d17fa-8642-11ee-aed8-803049142908',
 '448d17fb-8642-11ee-9871-803049142908',
 '448d17fc-8642-11ee-8384-803049142908',
 '448d17fd-8642-11ee-95e1-803049142908',
 '448d17fe-8642-11ee-ab5c-803049142908',
 '448d17ff-8642-11ee-b9ea-803049142908',
 '448d1800-8642-11ee-90e8-803049142908',
 '448d1801-8642-

In [51]:
# Drop the handle on the store; a new one on 'db2' is created further down.
# NOTE(review): presumably meant to release/flush the on-disk store before it
# is reopened — confirm that the added documents are actually persisted.
del db

In [62]:
from langchain.llms import OpenAI
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA

In [58]:
# Reopen the store from the same persist_directory ('db2') that the documents
# were added to, using the same embedding function.
db = Chroma(embedding_function=embeddings, persist_directory='db2')

In [59]:
retriever = db.as_retriever()

In [60]:
llm = OpenAI()

In [63]:
# Retrieval QA chain over the Chroma retriever. return_source_documents=True
# is what makes res['source_documents'] available in the cells below.
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents= True)

In [65]:
res = qa("Whats new at OpenAI?")

In [69]:
print(res['result'].strip())

OpenAI has introduced GPTs, custom versions of chat GPT for a specific purpose that combine instructions, extended knowledge and actions. They have also launched the Assistance API to make it easier to build assisted experiences with apps, and a new GPT-4 Turbo model that delivers improved function calling, knowledge, lowered pricing, new modalities and more. They are also deepening their partnership with Microsoft.


In [82]:
for i in res['source_documents']:
    print(i.metadata['source'])
    print(i.page_content)
    print("")

C:\Users\hotal\Downloads\OpenAI_DevDay.mp4
00:20:28-00:20:32 ||  The upsides of this are going to be tremendous.
00:20:32-00:20:40 ||  At OpenAI, we really believe that gradual iterative deployment is the best way to address the safety issues, the safety challenges with AI.
00:20:40-00:20:44 ||  We think it's especially important to move carefully towards this future of agents.
00:20:44-00:20:50 ||  It's going to require a lot of technical work and a lot of thoughtful consideration by society.
00:20:50-00:20:55 ||  So today, we're taking our first small step that moves us towards this future.
00:20:55-00:21:01 ||  We're thrilled to introduce GPT's.
00:21:01-00:21:07 ||  GPT's are tailored versions of chat GPT for a specific purpose.
00:21:07-00:21:19 ||  You can build a GPT, a customized version of chat GPT for almost anything, with instructions, expanded knowledge, and actions, and then you can publish it for others to use.

C:\Users\hotal\Downloads\OpenAI_DevDay.mp4
00:41:38-00:41:43

In [98]:
res = qa("What is the new GPT4 turbo model and what can it do?")

In [99]:
res['result']

' GPT4 Turbo is a new model that supports up to 128,000 tokens of context, 300 pages of a standard book, and is much more accurate over a long context. It can browse the web, write and run code, analyze data, generate images, and more.'

In [100]:
for i in res['source_documents']:
    print(i.page_content)
    print("")

00:12:07-00:12:13 ||  All right, there's actually one more developer request that's been even bigger than all of these.
00:12:13-00:12:16 ||  And so I'd like to talk about that now.
00:12:16-00:12:21 ||  And that's pricing.
00:12:21-00:12:25 ||  So what's going on here is the industry leading model.
00:12:25-00:12:32 ||  It delivers a lot of improvements that we just covered and it's a smarter model than GPT 4.
00:12:32-00:12:39 ||  We've heard from developers that there are a lot of things that they want to build, but GPT 4 just cost too much.
00:12:39-00:12:45 ||  They've told us that if we could decrease the cost by 20, 25% that would be great.
00:12:46-00:12:56 ||  I'm super excited to announce that we worked really hard on this and GPT 4 Turbo, a better model is considerably cheaper than GPT 4.
00:12:56-00:13:05 ||  By a factor of 3x for prompt tokens.
00:13:05-00:13:13 ||  And 2x for completion tokens starting today.

00:18:53-00:18:56 ||  See you.
00:19:02-00:19:08 ||  Okay. So 

In [85]:
from pytube import YouTube

def download_youtube_video(video_url, output_path="."):
    """Download a YouTube video using the stream picked by ``get_highest_resolution()``.

    Failures are reported on stdout rather than raised, so a notebook run is
    not interrupted by a single bad URL.

    Args:
        video_url: URL of the video to download.
        output_path: Directory handed to pytube's ``download``.

    Returns:
        The path of the downloaded file as returned by pytube, or ``None``
        when no stream was found or the download failed.
    """
    try:
        # Create a YouTube object
        yt = YouTube(video_url)

        # Get the highest resolution stream
        video_stream = yt.streams.get_highest_resolution()
        if video_stream is None:
            # Report it explicitly instead of failing on an attribute lookup.
            print("Error:", "No downloadable stream found")
            return None

        # Display video details
        print("Downloading:", yt.title)
        print("Resolution:", video_stream.resolution)
        print("File Size:", round(video_stream.filesize / (1024 * 1024), 2), "MB")

        # Download the video and keep the resulting path so callers do not
        # have to guess the file name.
        file_path = video_stream.download(output_path)
        print("Download complete!")
        return file_path

    except Exception as e:
        # Deliberately best-effort: report the problem and signal failure.
        print("Error:", str(e))
        return None

# Example usage:
video_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
# An empty output_path is passed straight through to pytube's download();
# presumably that resolves to the current working directory — confirm.
output_path = ""

download_youtube_video(video_url, output_path)

Downloading: Never Gonna Give You Up
Resolution: 720p
File Size: 19.26 MB
Download complete!


In [90]:
# Re-binds `model` and `data` from the earlier DevDay cells, this time with
# the 'large' Whisper model (the output below shows a 2.88G download).
# NOTE(review): the file name is hard-coded and assumes the download above
# was saved in the working directory under the video title — confirm.
model = CustomVideoLoader("Never Gonna Give You Up.mp4", model='large')
data = model.load()

100%|█████████████████████████████████████| 2.88G/2.88G [06:26<00:00, 7.99MiB/s]


In [91]:
print(data[0].page_content)

00:00:00-00:00:00 ||  🎵
00:00:00-00:00:22 ||  🎵 We're no strangers to love 🎵
00:00:22-00:00:27 ||  🎵 You know the rules and so do I 🎵
00:00:27-00:00:31 ||  🎵 I've built commitments while I'm thinking of 🎵
00:00:31-00:00:35 ||  🎵 You wouldn't get this from any other guy 🎵
00:00:35-00:00:40 ||  🎵 I just wanna tell you how I'm feeling 🎵
00:00:40-00:00:43 ||  🎵 Gotta make you understand 🎵
00:00:43-00:00:45 ||  🎵 Never gonna give you up 🎵
00:00:45-00:00:47 ||  🎵 Never gonna let you down 🎵
00:00:47-00:00:51 ||  🎵 Never gonna run around and desert you 🎵
00:00:51-00:00:53 ||  🎵 Never gonna make you cry 🎵
00:00:53-00:00:55 ||  🎵 Never gonna say goodbye 🎵
00:00:55-00:00:56 ||  🎵 Never gonna tell you goodbye 🎵
00:00:57-00:01:00 ||  🎵 Never gonna lie and hurt you 🎵
00:01:00-00:01:04 ||  🎵 We've known each other for so long 🎵
00:01:04-00:01:07 ||  🎵 Your heart's been aching but 🎵
00:01:07-00:01:09 ||  🎵 You're too shy to say it 🎵
00:01:09-00:01:13 ||  🎵 Inside we both know what's been going on 🎵
00