# Goal

Generate a single `.txt` file containing the contents of a local repository.

In [5]:
import os

def read_files(folder_path):
    """
    Read files from the specified folder path with specific extensions.

    The folder is walked recursively. A file is included when its name ends
    with one of the recognised suffixes (``Dockerfile`` is matched as a name
    suffix, so both ``Dockerfile`` and ``prod.Dockerfile`` qualify).
    Directories and files are visited in sorted order so the result is the
    same on every run and every platform.

    :param folder_path: Path to the folder containing the files to read
    :return: List of tuples containing file paths and their contents, in
        traversal order. Bytes that are not valid UTF-8 are replaced with
        U+FFFD instead of aborting the whole read.
    """
    # A tuple lets str.endswith test every suffix in one call. Note the
    # leading dot on '.ipynb': without it any name ending in those letters matches.
    file_extensions = ('.py', 'Dockerfile', '.yaml', '.env-example', '.md', '.ipynb')
    files_and_code = []

    for root, dirs, files in os.walk(folder_path):
        # Sorting in place makes os.walk descend into sub-folders in a stable order.
        dirs.sort()
        for file_name in sorted(files):
            if not file_name.endswith(file_extensions):
                continue
            file_path = os.path.join(root, file_name)
            # errors='replace' keeps a single mis-encoded file from raising
            # UnicodeDecodeError and discarding everything read so far.
            with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
                files_and_code.append((file_path, file.read()))

    return files_and_code

def create_txt_from_files(folder_path, output_txt_path):
    """
    Dump every file collected from a folder into one text file.

    Each file becomes a section: an 80-character ``=`` rule, a ``# <path>``
    line giving the path relative to ``folder_path``, a second rule, a blank
    line, the file contents, and two trailing newlines.

    :param folder_path: Path to the folder containing the files to process
    :param output_txt_path: Path where the output text file will be saved
    :return: The same text that was written to ``output_txt_path``
    """
    collected = read_files(folder_path)
    rule = '=' * 80
    sections = []
    with open(output_txt_path, 'w', encoding='utf-8') as sink:
        for source_path, source_text in collected:
            # Show the path relative to the root folder in the section header.
            shown_path = os.path.relpath(source_path, folder_path)
            section = f"\n{rule}\n# {shown_path}\n{rule}\n\n{source_text}\n\n"
            sink.write(section)
            sections.append(section)
    return "".join(sections)

# Example usage: dump the repository at `folder_path` into `output_txt_path`
# and keep the combined text in `text`, which the chunking cell below reads.
# NOTE(review): `folder_path` is an absolute path on one specific machine;
# anyone else running this notebook has to edit it before running the cell.
folder_path = '/Users/ganga/Library/CloudStorage/GoogleDrive-goldenguille@gmail.com/My Drive/REPO/llmops-youtube-summarizer'  # replace with your folder path
output_txt_path = 'output.txt'  # replace with your output text file path
text = create_txt_from_files(folder_path, output_txt_path)


In [7]:
import getpass
import os

# Ask for the Cohere key interactively only when the environment does not
# already hold a non-empty value, so the key never has to be typed into a cell.
if os.getenv("COHERE_API_KEY") in (None, ""):
    os.environ["COHERE_API_KEY"] = getpass.getpass("Enter your Cohere API key: ")

In [16]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Create text splitter
# Sizes are measured with `len`, i.e. in characters of the input string:
# a target chunk size of 4000 with 200 of overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,
    chunk_overlap=200,
    length_function=len
)

In [21]:
from langchain_cohere import CohereEmbeddings

# Embedding model used to index the repository chunks in the vector store below.
# NOTE(review): no API key is passed here — presumably the client picks up the
# COHERE_API_KEY environment variable set in the earlier cell; confirm.
embeddings = CohereEmbeddings(
    model="embed-english-v3.0",max_retries=3
)

In [24]:
# Split the combined repository text into chunk documents and keep only the first 10.
# NOTE(review): the [:10] cap drops every later chunk from the index without warning —
# presumably to limit embedding API usage while experimenting; confirm it is intentional.
documents = text_splitter.create_documents([text])[:10]


In [25]:
from langchain.vectorstores import Chroma

# Only the raw chunk text is indexed; any metadata on the documents is not passed along.
texts = [doc.page_content for doc in documents]

# Embed the chunks and store them in a Chroma collection under ./chroma_db.
# NOTE(review): because the store is written to a directory, re-running this cell
# likely adds the same chunks a second time — confirm, and clear ./chroma_db if so.
vectorstore = Chroma.from_texts(
    texts=texts,
    embedding=embeddings,
    persist_directory="./chroma_db"
)

In [26]:
# Sanity check of retrieval: as the cell's last expression, the 3 best-matching
# chunks for this question are displayed as the cell output.
vectorstore.similarity_search(query="What is the purpose of the repository?", k=3)



In [28]:
import groq
import os
from dotenv import load_dotenv
from urllib.parse import urlparse, parse_qs

# Load environment variables
# NOTE(review): load_dotenv() is called without a path — presumably it reads a
# local .env file that defines GROQ_API_KEY; confirm where that file lives.
load_dotenv()

# Initialize Groq client
# os.getenv returns None when GROQ_API_KEY is unset, so a missing key is only
# noticed by the client, not by this cell.
client = groq.Groq(
    api_key=os.getenv('GROQ_API_KEY')  # read from the environment; do not paste the key into the notebook
)

def answer_question(query):
    """
    Answer a question about the repository using retrieved chunks as context.

    The three chunks most similar to ``query`` are fetched from the
    module-level ``vectorstore``; their text is placed in the prompt together
    with the question and sent to the module-level Groq ``client``.

    :param query: Natural-language question to answer
    :return: Text content of the first completion choice
    """
    prompt = """Answer the following question: {query}

    based on the following context: {context}"""

    retrieved_docs = vectorstore.similarity_search(query=query, k=3)
    # Send only the chunk text. Formatting the list itself would put the
    # Document reprs (metadata, escaped newlines) into the prompt.
    context = "\n\n".join(doc.page_content for doc in retrieved_docs)

    completion = client.chat.completions.create(
        model="llama-3.2-11b-vision-preview",
        messages=[
            # The prompt must carry the caller's question, the same one used
            # for retrieval above.
            {"role": "user", "content": prompt.format(query=query, context=context)}
        ],
        temperature=0.3,
        max_tokens=2048
    )
    return completion.choices[0].message.content

# Ask one question end to end (retrieval + completion) and print the model's answer.
result = answer_question(query="What is the purpose of the repository?")
print(result)

The purpose of the repository is to host an advanced YouTube video summarization tool that leverages Groq's LLM capabilities and MLOps practices to generate high-quality summaries of YouTube videos from their transcripts. The repository includes several key components:

1. **Proof of concept implementation**: A simple implementation of the YouTube video summarization tool using the `01_PoC.ipynb` notebook.
2. **MLOps pipeline implementation**: A more comprehensive implementation of the YouTube video summarization tool using the `02_MLFlow_implementation.ipynb` notebook.
3. **LLM prompt engineering template**: A template for engineering LLM prompts to generate high-quality summaries.
4. **Requirements**: A list of dependencies required to run the project, including `youtube-transcript-api`, `groq`, `mlflow`, `pandas`, `rouge-score`, and `transformers`.
5. **Project structure**: A description of the project's directory structure, including the `01_PoC.ipynb`, `02_MLFlow_implementation.ip