# Get Repo Data

In [35]:
import git
import os
from supabase import create_client
import uuid
from IPython.display import display, Markdown
from langchain.chat_models import init_chat_model
from langchain.agents import create_react_agent, AgentExecutor
from langchain_core.prompts import PromptTemplate
import re
import tempfile
import shutil
from git import Repo

In [11]:
def get_git_repo_info(repo_path):
    """
    Extract information from a Git repository.
    Returns a dictionary with repo metadata and file contents.
    Only processes relevant text files for code analysis.
    """
    repo = None
    try:
        repo = git.Repo(repo_path)
        repo_info = {
            'repo_name': os.path.basename(repo_path),
            'commit_count': len(list(repo.iter_commits())),
            'branches': [branch.name for branch in repo.branches],
            'files': []
        }
        
        # Define file extensions to INCLUDE (whitelist approach)
        relevant_extensions = {
            '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.cpp', '.c', '.h',
            '.cs', '.php', '.rb', '.go', '.rs', '.swift', '.kt', '.scala',
            '.html', '.css', '.scss', '.sass', '.less',
            '.md', '.txt', '.rst', '.json', '.yaml', '.yml', '.toml',
            '.sql', '.sh', '.bat', '.ps1',
            '.dockerfile', '.gitignore', '.env.example'
        }
        
        # Define files to INCLUDE by name (regardless of extension)
        relevant_filenames = {
            'README', 'LICENSE', 'CHANGELOG', 'CONTRIBUTING', 'INSTALL',
            'Dockerfile', 'Makefile', 'requirements.txt', 'package.json',
            'setup.py', 'pyproject.toml', 'Cargo.toml', 'pom.xml',
            'build.gradle', 'composer.json', 'Gemfile'
        }
        
        # Define directories to SKIP
        skip_directories = {
            'node_modules', '.git', '__pycache__', '.pytest_cache',
            'venv', 'env', '.env', 'build', 'dist', 'target',
            '.idea', '.vscode', 'logs', 'tmp', 'temp',
            'images', 'assets', 'static/images', 'public/images'
        }
        
        # Get file contents from the latest commit in the active branch
        tree = repo.head.commit.tree
        for item in tree.traverse():
            if item.type == 'blob':  # Only process files, not directories
                # Skip files in unwanted directories
                if any(skip_dir in item.path for skip_dir in skip_directories):
                    continue
                
                # Check if file should be included
                file_ext = os.path.splitext(item.path)[1].lower()
                filename = os.path.basename(item.path)
                filename_no_ext = os.path.splitext(filename)[0].upper()
                
                # Include if extension or filename matches our criteria
                should_include = (
                    file_ext in relevant_extensions or
                    filename_no_ext in relevant_filenames or
                    filename in relevant_filenames
                )
                
                if not should_include:
                    continue
                
                try:
                    # Try to decode as UTF-8, skip if it fails
                    content = item.data_stream.read().decode('utf-8')
                    # Skip files with null bytes (binary content)
                    if '\x00' in content:
                        continue
                    
                    # Skip very large files (>100KB)
                    if len(content) > 100000:
                        continue
                    
                    file_info = {
                        'path': item.path,
                        'content': content
                    }
                    repo_info['files'].append(file_info)
                except UnicodeDecodeError:
                    # Skip files that can't be decoded as text
                    continue
        
        print(f"Processed {len(repo_info['files'])} relevant files from {repo_info['repo_name']}")
        return repo_info
    
    except Exception as e:
        print(f"Error accessing Git repo: {e}")
        return None
    finally:
        # Close the repository to release file handles
        if repo is not None:
            repo.close()



In [12]:
repo_info

{'repo_name': 'tmpqv23rcta',
 'commit_count': 6,
 'branches': ['main'],
 'files': [{'path': 'README.md',
   'content': '# LinkedIn-Booster 🚀\n\nAn intelligent AI-powered LinkedIn post creation workflow that combines the power of LangChain, LangGraph, and multiple AI services to create engaging LinkedIn content with matching visuals.\n\n![LangGraph Workflow](images/LangGraph-workflow-diagram.png)\n\n## ✨ Features\n\n- **🤖 AI-Powered Generation**: Google Gemini for high-quality LinkedIn post drafting\n- **🔍 Web Research**: Tavily search integration for up-to-date context and insights\n- **🔄 Interactive Feedback Loop**: Intelligent feedback classification and content refinement\n- **🎨 Image Generation**: DALL·E 3 integration for compelling visual content\n- **📧 Email Delivery**: Gmail API integration for seamless content distribution\n- **🖥️ Multiple Interfaces**: CLI and web-based Chainlit UI options\n- **📊 Workflow Visualization**: LangGraph Studio integration for workflow monitoring\n\

# Git utils

In [42]:
from urllib.parse import urlparse
import os
import tempfile
import shutil
import stat
import git
from git import Repo

In [40]:
def get_git_repo_info(repo_path):
    """
    Extract information from a Git repository.
    Returns a dictionary with repo metadata and file contents.
    Only processes relevant text files for code analysis.
    """
    repo = None
    try:
        repo = git.Repo(repo_path)
        repo_info = {
            'repo_name': os.path.basename(repo_path),
            'commit_count': len(list(repo.iter_commits())),
            'branches': [branch.name for branch in repo.branches],
            'files': []
        }
        
        # Define file extensions to INCLUDE (whitelist approach)
        relevant_extensions = {
            '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.cpp', '.c', '.h',
            '.cs', '.php', '.rb', '.go', '.rs', '.swift', '.kt', '.scala',
            '.html', '.css', '.scss', '.sass', '.less',
            '.md', '.txt', '.rst', '.json', '.yaml', '.yml', '.toml',
            '.sql', '.sh', '.bat', '.ps1',
            '.dockerfile', '.gitignore', '.env.example'
        }
        
        # Define files to INCLUDE by name (regardless of extension)
        relevant_filenames = {
            'README', 'LICENSE', 'CHANGELOG', 'CONTRIBUTING', 'INSTALL',
            'Dockerfile', 'Makefile', 'requirements.txt', 'package.json',
            'setup.py', 'pyproject.toml', 'Cargo.toml', 'pom.xml',
            'build.gradle', 'composer.json', 'Gemfile'
        }
        
        # Define directories to SKIP
        skip_directories = {
            'node_modules', '.git', '__pycache__', '.pytest_cache',
            'venv', 'env', '.env', 'build', 'dist', 'target',
            '.idea', '.vscode', 'logs', 'tmp', 'temp',
            'images', 'assets', 'static/images', 'public/images'
        }
        
        # Get file contents from the latest commit in the active branch
        tree = repo.head.commit.tree
        for item in tree.traverse():
            if item.type == 'blob':  # Only process files, not directories
                # Skip files in unwanted directories
                if any(skip_dir in item.path for skip_dir in skip_directories):
                    continue
                
                # Check if file should be included
                file_ext = os.path.splitext(item.path)[1].lower()
                filename = os.path.basename(item.path)
                filename_no_ext = os.path.splitext(filename)[0].upper()
                
                # Include if extension or filename matches our criteria
                should_include = (
                    file_ext in relevant_extensions or
                    filename_no_ext in relevant_filenames or
                    filename in relevant_filenames
                )
                
                if not should_include:
                    continue
                
                try:
                    # Try to decode as UTF-8, skip if it fails
                    content = item.data_stream.read().decode('utf-8')
                    # Skip files with null bytes (binary content)
                    if '\x00' in content:
                        continue
                    
                    # Skip very large files (>100KB)
                    if len(content) > 100000:
                        continue
                    
                    file_info = {
                        'path': item.path,
                        'content': content
                    }
                    repo_info['files'].append(file_info)
                except UnicodeDecodeError:
                    # Skip files that can't be decoded as text
                    continue
        
        print(f"Processed {len(repo_info['files'])} relevant files from {repo_info['repo_name']}")
        return repo_info
    
    except Exception as e:
        print(f"Error accessing Git repo: {e}")
        return None
    finally:
        # Close the repository to release file handles
        if repo is not None:
            repo.close()



In [41]:
def get_repo_name_from_url(url):
    """Extract repository name from GitHub URL."""
    path = urlparse(url).path  # "/M-Mowina/LinkedIn-Booster"
    return path.strip("/").split("/")[-1]  # "LinkedIn-Booster"

def remove_readonly(func, path, _):
    """Clear the readonly bit and reattempt the removal"""
    os.chmod(path, stat.S_IWRITE)
    func(path)

In [43]:
def get_github_repo_info(github_url, branch=None):
    """
    Clone a GitHub repo temporarily and extract its information.
    Returns repository information or None if error occurs.
    """
    repo_name = get_repo_name_from_url(github_url)
    base_tmp = tempfile.mkdtemp()   # e.g. /tmp/tmpabcd1234
    repo_dir = os.path.join(base_tmp, repo_name)
    repo = None
    
    try:
        repo = Repo.clone_from(github_url, repo_dir, branch=branch)
        result = get_git_repo_info(repo_dir)
        return result
    except Exception as e:
        print(f"Error cloning GitHub repo: {e}")
        return None
    finally:
        # Close the repository to release file handles
        if repo is not None:
            repo.close()
        
        # Use onerror callback to handle readonly files on Windows
        try:
            shutil.rmtree(base_tmp, onerror=remove_readonly)
        except Exception as cleanup_error:
            print(f"Warning: Could not clean up temp directory: {cleanup_error}")
            # Try alternative cleanup method
            try:
                import subprocess
                subprocess.run(['rmdir', '/s', '/q', base_tmp], shell=True, check=False)
            except:
                pass  # If all cleanup methods fail, just continue

In [44]:
# Usage
github_url = "https://github.com/M-Mowina/LinkedIn-Booster"
repo_info = get_github_repo_info(github_url)
print(repo_info['repo_name'])

Processed 14 relevant files from LinkedIn-Booster
LinkedIn-Booster


In [45]:
repo_info

{'repo_name': 'LinkedIn-Booster',
 'commit_count': 6,
 'branches': ['main'],
 'files': [{'path': 'README.md',
   'content': '# LinkedIn-Booster 🚀\n\nAn intelligent AI-powered LinkedIn post creation workflow that combines the power of LangChain, LangGraph, and multiple AI services to create engaging LinkedIn content with matching visuals.\n\n![LangGraph Workflow](images/LangGraph-workflow-diagram.png)\n\n## ✨ Features\n\n- **🤖 AI-Powered Generation**: Google Gemini for high-quality LinkedIn post drafting\n- **🔍 Web Research**: Tavily search integration for up-to-date context and insights\n- **🔄 Interactive Feedback Loop**: Intelligent feedback classification and content refinement\n- **🎨 Image Generation**: DALL·E 3 integration for compelling visual content\n- **📧 Email Delivery**: Gmail API integration for seamless content distribution\n- **🖥️ Multiple Interfaces**: CLI and web-based Chainlit UI options\n- **📊 Workflow Visualization**: LangGraph Studio integration for workflow monitori

## Chunked Git

In [48]:
from typing import Dict, List, Optional

In [56]:
def get_language_from_extension(file_path: str) -> Optional[Language]:
    """
    Map file extensions to LangChain Language enum values.
    
    Args:
        file_path (str): Path to the file
        
    Returns:
        Optional[Language]: Corresponding Language enum or None if not supported
    """
    ext = os.path.splitext(file_path)[1].lower()
    
    # Mapping of file extensions to LangChain Language enum
    extension_map = {
        '.py': Language.PYTHON,
        '.js': Language.JS,
        '.ts': Language.TS,
        '.jsx': Language.JS,
        '.tsx': Language.TS,
        '.java': Language.JAVA,
        '.kt': Language.KOTLIN,
        '.cpp': Language.CPP,
        '.cc': Language.CPP,
        '.cxx': Language.CPP,
        '.c': Language.C,
        '.h': Language.C,
        '.hpp': Language.CPP,
        '.cs': Language.CSHARP,
        '.php': Language.PHP,
        '.rb': Language.RUBY,
        '.go': Language.GO,
        '.rs': Language.RUST,
        '.swift': Language.SWIFT,
        '.scala': Language.SCALA,
        '.md': Language.MARKDOWN,
        '.html': Language.HTML,
        '.htm': Language.HTML,
        '.sol': Language.SOL,
        '.lua': Language.LUA,
        '.pl': Language.PERL,
        '.hs': Language.HASKELL,
        '.ex': Language.ELIXIR,
        '.exs': Language.ELIXIR,
        '.ps1': Language.POWERSHELL,
        '.vb': Language.VISUALBASIC6,
        '.proto': Language.PROTO,
        '.rst': Language.RST,
        '.tex': Language.LATEX,
        '.cob': Language.COBOL,
        '.cbl': Language.COBOL,
    }
    
    return extension_map.get(ext)



In [58]:
def create_splitter_for_language(language: Language, chunk_size: int = 4000, chunk_overlap: int = 400) -> RecursiveCharacterTextSplitter:
    """
    Create a language-specific text splitter.
    
    Args:
        language (Language): The programming language
        chunk_size (int): Maximum chunk size in characters
        chunk_overlap (int): Overlap between chunks
        
    Returns:
        RecursiveCharacterTextSplitter: Configured splitter
    """
    return RecursiveCharacterTextSplitter.from_language(
        language=language,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )



In [53]:
def process_file_with_chunking(file_path: str, content: str, chunk_size: int, chunk_overlap: int) -> List[Dict]:
    """
    Process a single file with intelligent chunking based on language.
    
    Args:
        file_path (str): Path to the file
        content (str): File content
        chunk_size (int): Maximum chunk size
        chunk_overlap (int): Overlap between chunks
        
    Returns:
        List[Dict]: List of chunk information dictionaries
    """
    # Determine if we should chunk this file
    language = get_language_from_extension(file_path)
    
    # If file is small enough or no language-specific splitter, return as single chunk
    if len(content) <= chunk_size or language is None:
        return [{
            'path': file_path,
            'content': content,
            'chunk_index': 0,
            'total_chunks': 1,
            'is_chunked': False,
            'language': language.value if language else 'unknown'
        }]
    
    # Use language-specific chunking
    try:
        splitter = create_splitter_for_language(language, chunk_size, chunk_overlap)
        documents = splitter.create_documents([content])
        
        chunks = []
        for i, doc in enumerate(documents):
            chunk_info = {
                'path': f"{file_path}#chunk_{i}",  # Add chunk identifier to path
                'original_path': file_path,
                'content': doc.page_content,
                'chunk_index': i,
                'total_chunks': len(documents),
                'is_chunked': True,
                'language': language.value,
                'chunk_size_actual': len(doc.page_content)
            }
            chunks.append(chunk_info)
        
        return chunks
        
    except Exception as e:
        print(f"Error chunking file {file_path}: {e}")
        # Fallback to single chunk
        return [{
            'path': file_path,
            'content': content,
            'chunk_index': 0,
            'total_chunks': 1,
            'is_chunked': False,
            'language': language.value if language else 'unknown',
            'chunking_error': str(e)
        }]



In [None]:
def get_git_repo_info_chunked(repo_path: str, chunk_size: int = 4000, chunk_overlap: int = 400) -> Optional[Dict]:
    """
    Extract information from a Git repository with intelligent chunking.
    Returns a dictionary with repo metadata and chunked file contents.
    Uses LangChain's language-aware text splitting for better context preservation.
    
    Args:
        repo_path (str): Path to the Git repository
        chunk_size (int): Maximum chunk size in characters (default: 4000)
        chunk_overlap (int): Overlap between chunks (default: 400)
    
    Returns:
        Optional[Dict]: Repository information with chunked files or None if error
    """
    repo = None
    try:
        repo = git.Repo(repo_path)
        repo_info = {
            'repo_name': os.path.basename(repo_path),
            'commit_count': len(list(repo.iter_commits())),
            'branches': [branch.name for branch in repo.branches],
            'files': [],
            'chunking_info': {
                'chunk_size': chunk_size,
                'chunk_overlap': chunk_overlap,
                'total_chunks': 0,
                'files_chunked': 0,
                'files_not_chunked': 0
            }
        }
        
        # Define file extensions to INCLUDE (whitelist approach)
        relevant_extensions = {
            '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.cpp', '.c', '.h',
            '.cs', '.php', '.rb', '.go', '.rs', '.swift', '.kt', '.scala',
            '.html', '.css', '.scss', '.sass', '.less',
            '.md', '.txt', '.rst', '.json', '.yaml', '.yml', '.toml',
            '.sql', '.sh', '.bat', '.ps1', '.proto', '.tex', '.lua', '.pl',
            '.hs', '.ex', '.exs', '.vb', '.sol', '.cob', '.cbl',
            '.dockerfile', '.gitignore', '.env.example'
        }
        
        # Define files to INCLUDE by name (regardless of extension)
        relevant_filenames = {
            'README', 'LICENSE', 'CHANGELOG', 'CONTRIBUTING', 'INSTALL',
            'Dockerfile', 'Makefile', 'requirements.txt', 'package.json',
            'setup.py', 'pyproject.toml', 'Cargo.toml', 'pom.xml',
            'build.gradle', 'composer.json', 'Gemfile'
        }
        
        # Define directories to SKIP
        skip_directories = {
            'node_modules', '.git', '__pycache__', '.pytest_cache',
            'venv', 'env', '.env', 'build', 'dist', 'target',
            '.idea', '.vscode', 'logs', 'tmp', 'temp',
            'images', 'assets', 'static/images', 'public/images'
        }
        
        # Get file contents from the latest commit in the active branch
        tree = repo.head.commit.tree
        for item in tree.traverse():
            if item.type == 'blob':  # Only process files, not directories
                # Skip files in unwanted directories
                if any(skip_dir in item.path for skip_dir in skip_directories):
                    continue
                
                # Check if file should be included
                file_ext = os.path.splitext(item.path)[1].lower()
                filename = os.path.basename(item.path)
                filename_no_ext = os.path.splitext(filename)[0].upper()
                
                # Include if extension or filename matches our criteria
                should_include = (
                    file_ext in relevant_extensions or
                    filename_no_ext in relevant_filenames or
                    filename in relevant_filenames
                )
                
                if not should_include:
                    continue
                
                try:
                    # Try to decode as UTF-8, skip if it fails
                    content = item.data_stream.read().decode('utf-8')
                    # Skip files with null bytes (binary content)
                    if '\x00' in content:
                        continue
                    
                    # Skip very large files (>500KB for chunking)
                    if len(content) > 500000:
                        continue
                    
                    # Process the file with chunking
                    processed_chunks = process_file_with_chunking(
                        item.path, content, chunk_size, chunk_overlap
                    )
                    
                    # Add chunks to repo info
                    for chunk_info in processed_chunks:
                        repo_info['files'].append(chunk_info)
                    
                    # Update chunking statistics
                    if len(processed_chunks) > 1:
                        repo_info['chunking_info']['files_chunked'] += 1
                        repo_info['chunking_info']['total_chunks'] += len(processed_chunks)
                    else:
                        repo_info['chunking_info']['files_not_chunked'] += 1
                        repo_info['chunking_info']['total_chunks'] += 1
                        
                except UnicodeDecodeError:
                    # Skip files that can't be decoded as text
                    continue
        
        print(f"Processed {len(repo_info['files'])} chunks from {repo_info['repo_name']}")
        print(f"Chunking stats: {repo_info['chunking_info']['files_chunked']} files chunked, "
              f"{repo_info['chunking_info']['files_not_chunked']} files kept whole")
        
        return repo_info
    
    except Exception as e:
        print(f"Error accessing Git repo: {e}")
        return None
    finally:
        # Close the repository to release file handles
        if repo is not None:
            repo.close()

In [54]:
def get_github_repo_info_chunked(github_url: str, branch: Optional[str] = None, 
                                chunk_size: int = 4000, chunk_overlap: int = 400) -> Optional[Dict]:
    """
    Clone a GitHub repo temporarily and extract its information with chunking.
    
    Args:
        github_url (str): GitHub repository URL
        branch (Optional[str]): Specific branch to clone
        chunk_size (int): Maximum chunk size in characters
        chunk_overlap (int): Overlap between chunks
        
    Returns:
        Optional[Dict]: Repository information with chunked files or None if error
    """
    repo_name = get_repo_name_from_url(github_url)
    base_tmp = tempfile.mkdtemp()   # e.g. /tmp/tmpabcd1234
    repo_dir = os.path.join(base_tmp, repo_name)
    repo = None
    
    try:
        repo = Repo.clone_from(github_url, repo_dir, branch=branch)
        result = get_git_repo_info_chunked(repo_dir, chunk_size, chunk_overlap)
        return result
    except Exception as e:
        print(f"Error cloning GitHub repo: {e}")
        return None
    finally:
        # Close the repository to release file handles
        if repo is not None:
            repo.close()
        
        # Use onerror callback to handle readonly files on Windows
        try:
            shutil.rmtree(base_tmp, onerror=remove_readonly)
        except Exception as cleanup_error:
            print(f"Warning: Could not clean up temp directory: {cleanup_error}")
            # Try alternative cleanup method
            try:
                import subprocess
                subprocess.run(['rmdir', '/s', '/q', base_tmp], shell=True, check=False)
            except:
                pass  # If all cleanup methods fail, just continue

In [59]:
repo_info = get_github_repo_info_chunked(github_url)

Processed 20 chunks from LinkedIn-Booster
Chunking stats: 3 files chunked, 11 files kept whole


In [60]:
repo_info

{'repo_name': 'LinkedIn-Booster',
 'commit_count': 6,
 'branches': ['main'],
 'files': [{'path': 'README.md#chunk_0',
   'original_path': 'README.md',
   'content': '# LinkedIn-Booster 🚀\n\nAn intelligent AI-powered LinkedIn post creation workflow that combines the power of LangChain, LangGraph, and multiple AI services to create engaging LinkedIn content with matching visuals.\n\n![LangGraph Workflow](images/LangGraph-workflow-diagram.png)\n\n## ✨ Features\n\n- **🤖 AI-Powered Generation**: Google Gemini for high-quality LinkedIn post drafting\n- **🔍 Web Research**: Tavily search integration for up-to-date context and insights\n- **🔄 Interactive Feedback Loop**: Intelligent feedback classification and content refinement\n- **🎨 Image Generation**: DALL·E 3 integration for compelling visual content\n- **📧 Email Delivery**: Gmail API integration for seamless content distribution\n- **🖥️ Multiple Interfaces**: CLI and web-based Chainlit UI options\n- **📊 Workflow Visualization**: LangGraph

# Azure Testing

In [62]:
from dotenv import load_dotenv
load_dotenv()
os.getenv("AZURE_API_KEY")

'AG94NddmMCL59NC4j2kmSQ29vT4HN9lGdqXEiQrUwO9hiX4CUsBnJQQJ99BIACAAAAAHdXvcAAASAZDOXgON'

In [32]:
# --- Fill these in ---
organization = "areebgroup"
project      = "Internship-Playground"
repository   = "Internship-ai"
branch       = "Linked-Booster-LangGraph-Task" # Optional
pat          = os.getenv("AZURE_API_KEY")  # must have Code -> Read
# ----------------------

# Embed the PAT directly in the clone URL
# NOTE: Keep this token private! Don't commit it.
clone_url = f"https://{organization}:{pat}@dev.azure.com/{organization}/{project}/_git/{repository}"

In [33]:
azure_repo_info = get_github_repo_info(clone_url, branch)
azure_repo_info

Processed 14 relevant files from Internship-ai


{'repo_name': 'Internship-ai',
 'commit_count': 19,
 'branches': ['Linked-Booster-LangGraph-Task'],
 'files': [{'path': 'README.md',
   'content': '# LinkedIn-Booster 🚀\n\nAn intelligent AI-powered LinkedIn post creation workflow that combines the power of LangChain, LangGraph, and multiple AI services to create engaging LinkedIn content with matching visuals.\n\n![LangGraph Workflow](images/LangGraph-workflow-diagram.png)\n\n## ✨ Features\n\n- **🤖 AI-Powered Generation**: Google Gemini for high-quality LinkedIn post drafting\n- **🔍 Web Research**: Tavily search integration for up-to-date context and insights\n- **🔄 Interactive Feedback Loop**: Intelligent feedback classification and content refinement\n- **🎨 Image Generation**: DALL·E 3 integration for compelling visual content\n- **📧 Email Delivery**: Gmail API integration for seamless content distribution\n- **🖥️ Multiple Interfaces**: CLI and web-based Chainlit UI options\n- **📊 Workflow Visualization**: LangGraph Studio integratio

In [17]:
azure_repo_info['files']

[{'path': 'README.md',
  'content': '# Introduction \r\nTODO: Give a short introduction of your project. Let this section explain the objectives or the motivation behind this project. \r\n\r\n# Getting Started\r\nTODO: Guide users through getting your code up and running on their own system. In this section you can talk about:\r\n1.\tInstallation process\r\n2.\tSoftware dependencies\r\n3.\tLatest releases\r\n4.\tAPI references\r\n\r\n# Build and Test\r\nTODO: Describe and show how to build your code and run the tests. \r\n\r\n# Contribute\r\nTODO: Explain how other users and developers can contribute to make your code better. \r\n\r\nIf you want to learn more about creating good readme files then refer the following [guidelines](https://docs.microsoft.com/en-us/azure/devops/repos/git/create-a-readme?view=azure-devops). You can also seek inspiration from the below readme files:\r\n- [ASP.NET Core](https://github.com/aspnet/Home)\r\n- [Visual Studio Code](https://github.com/Microsoft/vsc

### Azure chunked

In [61]:
def get_azure_repo_info_chunked(organization: str, project: str, repository: str, 
                               pat: str, branch: Optional[str] = None,
                               chunk_size: int = 4000, chunk_overlap: int = 400) -> Optional[Dict]:
    """
    Clone an Azure DevOps repo temporarily and extract its information with chunking.
    
    Args:
        organization (str): Azure DevOps organization name
        project (str): Azure DevOps project name
        repository (str): Repository name
        pat (str): Personal Access Token with Code -> Read permissions
        branch (Optional[str]): Specific branch to clone
        chunk_size (int): Maximum chunk size in characters
        chunk_overlap (int): Overlap between chunks
        
    Returns:
        Optional[Dict]: Repository information with chunked files or None if error
    """
    # Construct Azure DevOps clone URL with embedded PAT
    clone_url = f"https://{organization}:{pat}@dev.azure.com/{organization}/{project}/_git/{repository}"
    
    base_tmp = tempfile.mkdtemp()
    repo_dir = os.path.join(base_tmp, repository)
    repo = None
    
    try:
        print(f"Cloning Azure DevOps repo: {organization}/{project}/{repository}")
        if branch:
            print(f"Branch: {branch}")
        
        repo = Repo.clone_from(clone_url, repo_dir, branch=branch)
        result = get_git_repo_info_chunked(repo_dir, chunk_size, chunk_overlap)
        
        # Add Azure-specific metadata
        if result:
            result['source_type'] = 'azure_devops'
            result['organization'] = organization
            result['project'] = project
            result['azure_repo_name'] = repository
        
        return result
    except Exception as e:
        print(f"Error cloning Azure DevOps repo: {e}")
        return None
    finally:
        # Close the repository to release file handles
        if repo is not None:
            repo.close()
        
        # Use onerror callback to handle readonly files on Windows
        try:
            shutil.rmtree(base_tmp, onerror=remove_readonly)
        except Exception as cleanup_error:
            print(f"Warning: Could not clean up temp directory: {cleanup_error}")
            # Try alternative cleanup method
            try:
                import subprocess
                subprocess.run(['rmdir', '/s', '/q', base_tmp], shell=True, check=False)
            except:
                pass  # If all cleanup methods fail, just continue



In [63]:
organization = "areebgroup"
project      = "Internship-Playground"
repository   = "Internship-ai"
branch       = "Linked-Booster-LangGraph-Task" # Optional
pat          = os.getenv("AZURE_API_KEY")  # must have Code -> Read

azure_repo_info = get_azure_repo_info_chunked(organization, project, repository, pat, branch)

Cloning Azure DevOps repo: areebgroup/Internship-Playground/Internship-ai
Branch: Linked-Booster-LangGraph-Task
Processed 18 chunks from Internship-ai
Chunking stats: 3 files chunked, 11 files kept whole


In [64]:
azure_repo_info

{'repo_name': 'Internship-ai',
 'commit_count': 19,
 'branches': ['Linked-Booster-LangGraph-Task'],
 'files': [{'path': 'README.md#chunk_0',
   'original_path': 'README.md',
   'content': '# LinkedIn-Booster 🚀\n\nAn intelligent AI-powered LinkedIn post creation workflow that combines the power of LangChain, LangGraph, and multiple AI services to create engaging LinkedIn content with matching visuals.\n\n![LangGraph Workflow](images/LangGraph-workflow-diagram.png)\n\n## ✨ Features\n\n- **🤖 AI-Powered Generation**: Google Gemini for high-quality LinkedIn post drafting\n- **🔍 Web Research**: Tavily search integration for up-to-date context and insights\n- **🔄 Interactive Feedback Loop**: Intelligent feedback classification and content refinement\n- **🎨 Image Generation**: DALL·E 3 integration for compelling visual content\n- **📧 Email Delivery**: Gmail API integration for seamless content distribution\n- **🖥️ Multiple Interfaces**: CLI and web-based Chainlit UI options\n- **📊 Workflow Vis

# Setup SupaBase

In [36]:
import getpass
import os
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
load_dotenv()

if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")



embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vector = embeddings.embed_query("Hello, world!")
len(vector)

1536

In [37]:
load_dotenv()
os.getenv("SUPABASE_DB_URL")

'postgresql://postgres.xrnjnxbdriboxwfoyrlx:areebrepoassistant@aws-1-eu-north-1.pooler.supabase.com:6543/postgres'

In [38]:
import vecs
import json  # If needed for serializing branches
import os
from dotenv import load_dotenv
load_dotenv()

# Load the embedding model (only do this once)
model = embeddings

# Connect to Supabase (replace with your actual connection string)
SUPABASE_DB_URL = os.getenv("SUPABASE_DB_URL")
vx = vecs.create_client(SUPABASE_DB_URL)

# Create or get the vector collection (name it whatever you like, e.g., "repo_files")
# Dimension must match the model's output (384 for 'supabase/gte-small')
collection = vx.get_or_create_collection(name="repo_files", dimension=1536)

In [39]:
vx.list_collections()

[vecs.Collection(name="LinkedIn-Booster", dimension=1536),
 vecs.Collection(name="TalentTalk---AI-powered-interview-system", dimension=1536),
 vecs.Collection(name="repo_files", dimension=1536)]

In [10]:
def store_repo_in_vector_db(repo_info):
    vectors_to_upsert = []
    
    for file in repo_info['files']:
        # Generate embedding for the file content
        content = file['content']
        embedding = model.embed_query(content)
        
        # Create a unique ID (e.g., "repo_name/file_path")
        unique_id = f"{repo_info['repo_name']}/{file['path']}"
        
        # Metadata: Store repo-level info here for each file
        metadata = {
            'repo_name': repo_info['repo_name'],
            'path': file['path'],
            'content': content,  # Store the full content for retrieval
            'commit_count': repo_info['commit_count'],
            'branches': repo_info['branches']  # List is JSON-serializable
        }
        
        # Add to batch
        vectors_to_upsert.append((unique_id, embedding, metadata))
    
    # Fix: Remove the 'vectors=' keyword argument
    collection.upsert(vectors_to_upsert)
    print(f"Stored {len(vectors_to_upsert)} files from repo '{repo_info['repo_name']}'")


In [16]:
# Example usage with your repo_info
store_repo_in_vector_db(repo_info)

# Don't forget to disconnect when done
vx.disconnect()

Stored 14 files from repo 'tmpqv23rcta'


In [10]:
# intialize connection with the Database
vx = vecs.create_client(SUPABASE_DB_URL)

# View the scheme of the collection
collection = vx.get_or_create_collection(name="repo_files", dimension=1536)

# Get collection info
print(f"Collection name: {collection.name}")
print(f"Collection dimension: {collection.dimension}")

# List all available methods and attributes
print("Available methods:")
print([method for method in dir(collection) if not method.startswith('_')])


Collection name: repo_files
Collection dimension: 1536
Available methods:
['adapter', 'client', 'create_index', 'delete', 'dimension', 'fetch', 'index', 'is_indexed_for_measure', 'name', 'query', 'table', 'upsert']


In [11]:
vx.disconnect()

# Supabase utils

In [None]:
import os
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
import vecs
import json  # If needed for serializing branches
load_dotenv()

# Connect to Supabase (replace with your actual connection string)
SUPABASE_DB_URL = os.getenv("SUPABASE_DB_URL")
vx = vecs.create_client(SUPABASE_DB_URL)
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Load the embedding model (only do this once)
model = embeddings

In [26]:
def store_repo_in_own_collection(repo_info, refresh=False):
    repo_name = repo_info['repo_name']

    if refresh:
        # Drop existing collection if it exists
        try:
            vx.delete_collection(repo_name)
            print(f"🗑️ Old collection '{repo_name}' deleted.")
        except Exception:
            pass  # collection may not exist yet

    # Create/get a dedicated collection for this repo
    collection = vx.get_or_create_collection(
        name=repo_name,
        dimension=1536
    )

    vectors_to_upsert = []
    for file in repo_info['files']:
        content = file['content']
        embedding = model.embed_query(content)

        unique_id = f"{repo_name}/{file['path']}"
        metadata = {
            'repo_name': repo_name,
            'path': file['path'],
            'commit_count': repo_info['commit_count'],
            'branches': repo_info['branches']
        }

        vectors_to_upsert.append((unique_id, embedding, metadata))

    collection.upsert(vectors_to_upsert)
    print(f"✅ Stored {len(vectors_to_upsert)} files into collection '{repo_name}'")


In [28]:
store_repo_in_own_collection(repo_info)

✅ Stored 14 files into collection 'LinkedIn-Booster'


In [41]:
vx.list_collections()

[vecs.Collection(name="LinkedIn-Booster", dimension=1536),
 vecs.Collection(name="TalentTalk---AI-powered-interview-system", dimension=1536),
 vecs.Collection(name="repo_files", dimension=1536)]

In [43]:
collection = vx.get_or_create_collection(name="LinkedIn-Booster", dimension=1536)

query = model.embed_query("Workflow")

results = collection.query(
    data=query,
    limit=5,
    include_value=True,
    include_metadata=True
)

results



[('LinkedIn-Booster/workflow/__init__.py', 0.604595542638825, {'path': 'workflow/__init__.py', 'branches': ['main'], 'repo_name': 'LinkedIn-Booster', 'commit_count': 6}),
 ('LinkedIn-Booster/workflow/graph.py', 0.617298291051589, {'path': 'workflow/graph.py', 'branches': ['main'], 'repo_name': 'LinkedIn-Booster', 'commit_count': 6}),
 ('LinkedIn-Booster/studio/graph.py', 0.649370674582624, {'path': 'studio/graph.py', 'branches': ['main'], 'repo_name': 'LinkedIn-Booster', 'commit_count': 6}),
 ('LinkedIn-Booster/main.py', 0.651415278059699, {'path': 'main.py', 'branches': ['main'], 'repo_name': 'LinkedIn-Booster', 'commit_count': 6}),
 ('LinkedIn-Booster/workflow/nodes.py', 0.658244322747666, {'path': 'workflow/nodes.py', 'branches': ['main'], 'repo_name': 'LinkedIn-Booster', 'commit_count': 6})]

In [44]:
results[0][2]

{'path': 'workflow/__init__.py',
 'branches': ['main'],
 'repo_name': 'LinkedIn-Booster',
 'commit_count': 6}

## Re-structure

| Field       | Description                                                 |
| ----------- | ----------------------------------------------------------- |
| `id`        | Unique identifier for the chunk/file                        |
| `repo_id`   | Which repo/project this belongs to                          |
| `file_path` | Path to the source file                                     |
| `chunk_id`  | If the file is split, ID for the chunk                      |
| `content`   | Raw code text (or chunk of it)                              |
| `embedding` | Vector representation of the content                        |
| `symbols`   | Extracted functions, classes, variables (for faster lookup) |
| `imports`   | Which files/functions this chunk depends on                 |
| `metadata`  | Language, size, timestamps, etc.                            |


## Symbols extraction

In [65]:
import tree_sitter_python as tspython
from tree_sitter import Language, Parser

def extract_python_symbols(code):
    PY_LANGUAGE = Language(tspython.language())
    parser = Parser(PY_LANGUAGE)
    tree = parser.parse(bytes(code, "utf8"))
    
    symbols = {
        'imports': [],
        'functions': [],
        'classes': [],
        'variables': []
    }
    
    def traverse_tree(node):
        if node.type == 'import_statement':
            symbols['imports'].append(node.text.decode('utf8'))
        elif node.type == 'function_definition':
            name_node = node.child_by_field_name('name')
            if name_node:
                symbols['functions'].append(name_node.text.decode('utf8'))
        elif node.type == 'class_definition':
            name_node = node.child_by_field_name('name')
            if name_node:
                symbols['classes'].append(name_node.text.decode('utf8'))
        
        for child in node.children:
            traverse_tree(child)
    
    traverse_tree(tree.root_node)
    return symbols


In [66]:
extract_python_symbols(file_content)

{'imports': ['import os', 'import vecs', 'import json', 'import traceback'],
 'functions': ['get_embedding_model',
  'get_vector_client',
  'store_repo_in_own_collection',
  'search_repo_collection',
  'list_repo_collections',
  'delete_repo_collection',
  'debug_search_results',
  'get_file_content',
  'search_with_content'],
 'classes': [],
 'variables': []}

# RAG intiations

## Search repo with similarity search

In [21]:
def search_repo_code(query, top_k=5, similarity_threshold=0.7):
    """
    Search the repository code using RAG (Retrieval-Augmented Generation).
    
    Args:
        query (str): The search query
        top_k (int): Number of top results to return
        similarity_threshold (float): Minimum similarity score (0-1)
    
    Returns:
        list: List of relevant code snippets with metadata
    """
    try:
        # Connect to the database
        vx = vecs.create_client(SUPABASE_DB_URL)
        collection = vx.get_or_create_collection(name="repo_files", dimension=1536)
        
        # Generate embedding for the query
        query_embedding = model.embed_query(query)
        
        # Search for similar vectors
        results = collection.query(
            data=query_embedding,
            limit=top_k,
            include_value=True,
            include_metadata=True
        )
        
        # Process and filter results
        relevant_results = []
        for result in results:
            similarity_score = result[1]  # The similarity score
            metadata = result[2]  # The metadata
            
            # Filter by similarity threshold
            if similarity_score >= similarity_threshold:
                relevant_results.append({
                    'similarity_score': similarity_score,
                    'repo_name': metadata.get('repo_name', 'Unknown'),
                    'file_path': metadata.get('path', 'Unknown'),
                    'content': metadata.get('content', ''),
                    'commit_count': metadata.get('commit_count', 0),
                    'branches': metadata.get('branches', [])
                })
        
        # Sort by similarity score (highest first)
        relevant_results.sort(key=lambda x: x['similarity_score'], reverse=True)
        
        vx.disconnect()
        return relevant_results
        
    except Exception as e:
        print(f"Error searching repository: {e}")
        return []

In [25]:
results = search_repo_code('send mail node', 5, 0.5)



In [26]:
results

[{'similarity_score': 0.783879111756007,
  'repo_name': 'tmpqv23rcta',
  'file_path': 'main.py',
  'content': 'import sys\nfrom utils import load_env\nfrom workflow.graph import start_workflow, continue_workflow\n\n\ndef main() -> None:\n    load_env()\n    if len(sys.argv) < 2:\n        print("Usage: python main.py \\"Your post topic...\\"")\n        sys.exit(1)\n    topic = sys.argv[1]\n    thread_id = "cli"\n    state = start_workflow(topic, thread_id)\n    if state.current_step != "wait_feedback":\n        print(f"Error: {state.error_message}")\n        sys.exit(1)\n    print("\\nGenerated Post:\\n")\n    print(state.post_text)\n    print("\\nImage URL:\\n")\n    print(state.image_url)\n    while True:\n        feedback = input("\\nFeedback (approve/refine text .../refine image .../quit): ").strip()\n        if feedback.lower() in {"quit", "exit", "q"}:\n            print("Goodbye!")\n            break\n        state = continue_workflow(feedback, thread_id)\n        if state.curren

In [31]:
display(Markdown(results[4]['content']))

import os
from typing import List

from dotenv import load_dotenv
load_dotenv()

from langchain.chat_models import init_chat_model
from langchain.schema import SystemMessage, HumanMessage
from langchain_community.utilities.dalle_image_generator import DallEAPIWrapper
from langgraph.errors import GraphRecursionError
from langgraph.prebuilt import create_react_agent
# Gmail imports will be done lazily in send_email function

from workflow.state import WorkflowState
from workflow.tools import tavily_tool


# Initialize models
gemini_model = init_chat_model("gpt-4o-mini", model_provider="openai")

def create_post_text(state: WorkflowState) -> WorkflowState:
    """Generate LinkedIn post text based on user input"""
    system_message = """
    You are a professional LinkedIn content creator. Create engaging, professional LinkedIn posts.

    Your tone should be:
    - Professional yet approachable
    - Concise and clear
    - Action-oriented with clear value

    Format your post with:
    - A strong hook in the first sentence
    - Short, scannable paragraphs
    - 1-3 relevant emojis strategically placed
    - A clear call-to-action
    - 3-5 relevant hashtags at the end

    Output ONLY the post text, nothing else."""
    
    try:
        max_iterations = 3
        recursion_limit = 2 * max_iterations + 1
        
        agent = create_react_agent(
            model="openai:gpt-4o-mini",
            tools=[tavily_tool],
            prompt=system_message,
            debug=True
        )
        
        try:
            response = agent.invoke(
                {"messages": [{"role": "user", "content": state.user_input}]},
                {"recursion_limit": recursion_limit},
            )
            
            state.post_text = response['messages'][-1].content
            state.current_step = "create_image"
            
        except GraphRecursionError:
            print("Agent stopped due to max iterations.")
            state.error_message = "Agent stopped due to max iterations"
            state.current_step = "error"
        
    except Exception as e:
        state.error_message = f"Error creating post text: {str(e)}"
        state.current_step = "error"
    
    return state


def generate_image(state: WorkflowState) -> WorkflowState:
    """Generate image directly from post text"""
    
    try:
        post_text = state.post_text
        
        # Create a proper prompt for image generation
        image_prompt_template = """
        Create a modern, minimalistic professional LinkedIn illustration for the following post: {post_content}
        
        Style: 3D Object Generator with clean flat vector, LinkedIn color palette (blue, white, grey), simple, corporate-appropriate.
        Avoid clutter, make it inspiring and easy to understand.
        No text in the image, just visual elements.
        
        Generate only the image description prompt, no other text.
        """
        
        # Create messages for the LLM
        messages = [
            SystemMessage(content="You are an expert LinkedIn Image Prompt Engineer. Generate concise AI image prompts."),
            HumanMessage(content=image_prompt_template.format(post_content=post_text))
        ]
        
        # Get the image prompt from Gemini
        response = gemini_model.invoke(messages)
        image_prompt = response.content.strip()
        
        print(f"Image prompt: {image_prompt}")
        
        # Generate image using DALL-E
        dalle_wrapper = DallEAPIWrapper(model="dall-e-3")
        image_url = dalle_wrapper.run(image_prompt)
        
        # Update state
        state.image_url = image_url
        state.image_prompt = image_prompt
        state.current_step = "wait_feedback" 
        
    except Exception as e:
        state.error_message = f"Error generating image: {str(e)}"
        state.current_step = "error"
    
    return state


def classify_feedback(state: WorkflowState) -> WorkflowState:
    """Classify user feedback into categories"""
    system_message = """You are a text classifier. Classify the input text into one of these categories:

        1. Approved - Approval message refers that its ok to proceed with the current status.
        2. Refine Text - Request to refine the post text
        3. Terminate - Termination request for the process.
        4. Refine Image - Request to refine the post image

        Output only the category name, nothing else."""

    try:
        messages = [
            SystemMessage(content=system_message),
            HumanMessage(content=state.user_feedback)
        ]
        
        response = gemini_model.invoke(messages)
        classification = response.content.strip()
        
        # Map classification to next step
        if "Approved" in classification:
            state.classification = "Approved"
            state.current_step = "post_content"
            state.final_post_ready = True
        elif "Refine Text" in classification:
            state.classification = "Refine Text"
            state.current_step = "refine_text"
        elif "Terminate" in classification:
            state.classification = "Terminate"
            state.current_step = "terminate"
        elif "Refine Image" in classification:
            state.classification = "Refine Image"
            state.current_step = "refine_image"
        else:
            state.classification = "Unknown"
            state.current_step = "wait_feedback"
            state.error_message = f"Unrecognized feedback classification: {classification}"
        
        print(f"DEBUG: Updated state - classification: {state.classification}, current_step: {state.current_step}")
            
    except Exception as e:
        state.error_message = f"Error classifying feedback: {str(e)}"
        state.current_step = "error"
        print(f"DEBUG: Exception in classify_feedback: {str(e)}")
    
    return state


def refine_text(state: WorkflowState) -> WorkflowState:
    """Refine the post text based on user feedback"""
    system_message = """
        You are a helpful assistant who improve and refine LinkedIn post.

        Output only the post text."""

    try:
        prompt = f"Original post: {state.post_text}\\n\\nRequest: {state.user_feedback}"
        
        messages = [
            SystemMessage(content=system_message),
            HumanMessage(content=prompt)
        ]
        
        response = gemini_model.invoke(messages)
        state.refined_text = response.content
        state.post_text = state.refined_text  # Update the main post text
        state.current_step = "wait_feedback"
        
    except Exception as e:
        state.error_message = f"Error refining text: {str(e)}"
        state.current_step = "error"
    
    return state


def refine_image(state: WorkflowState) -> WorkflowState:
    """Refine the image based on user feedback"""
    try:
        # Use the existing image prompt and add user feedback
        modified_prompt = f"{state.image_prompt}. {state.user_feedback}"
        
        print(f"Refined image prompt: {modified_prompt}")
        
        # Generate image using DALL-E with the modified prompt
        dalle_wrapper = DallEAPIWrapper(model="dall-e-3")
        image_url = dalle_wrapper.run(modified_prompt)
        
        # Update state with the refined image
        state.image_url = image_url
        state.image_prompt = modified_prompt  # Update the prompt with feedback
        state.current_step = "wait_feedback"  # Go back to wait for more feedback
        
        print(f"Refined image URL: {image_url}")
        
    except Exception as e:
        state.error_message = f"Error refining image: {str(e)}"
        state.current_step = "error"
    
    return state


def send_email(state: WorkflowState) -> WorkflowState:
    """Send email with the final post content"""
    try:
        # Lazy import Gmail functionality
        from langchain_google_community import GmailToolkit
        from langchain_google_community.gmail.utils import (
            build_resource_service,
            get_gmail_credentials,
        )

        # Test imports without initializing toolkit
        print("Imports successful!")

            
        # Initialize Gmail toolkit
        credentials = get_gmail_credentials(
            token_file="token.json",
            scopes=["https://mail.google.com/"],
            client_secrets_file="credentials.json",
        )
        
        api_resource = build_resource_service(credentials=credentials)
        gmail_toolkit = GmailToolkit(api_resource=api_resource)
        
        tools = gmail_toolkit.get_tools()
        print(f'🚩DEBUG: send mail tool: {tools[1]}')
        # Compose email content
        subject = "LinkedIn Post Ready for Publishing"
        body = f"""
            Your LinkedIn post is ready:

            {state.post_text}

            Image URL: {state.image_url}

            Best regards,
            LinkedIn Post Creator
            """
        
        # Send email using Gmail toolkit
        send_message_tool = gmail_toolkit.get_tools()[1]  # send_gmail_message tool
        
        email_result = send_message_tool.run({
            "to": "mohammed-mowina@outlook.com",
            "subject": subject,
            "message": body
        })
        
        state.current_step = "completed"
        
    except Exception as e:
        state.error_message = f"🚩Error sending email: {str(e)}"
        state.current_step = "error"
    
    return state

## Results Formating

In [32]:
def format_search_results(results, max_content_length=500):
    """
    Format search results for better readability.
    
    Args:
        results (list): Results from search_repo_code
        max_content_length (int): Maximum length of content to display
    
    Returns:
        str: Formatted results string
    """
    if not results:
        return "No relevant results found."
    
    formatted_output = f"Found {len(results)} relevant results:\n\n"
    
    for i, result in enumerate(results, 1):
        # Truncate content if too long
        content = result['content']
        if len(content) > max_content_length:
            content = content[:max_content_length] + "..."
        
        formatted_output += f"--- Result {i} ---\n"
        formatted_output += f"File: {result['file_path']}\n"
        formatted_output += f"Repository: {result['repo_name']}\n"
        formatted_output += f"Similarity Score: {result['similarity_score']:.3f}\n"
        formatted_output += f"Content:\n{content}\n\n"
    
    return formatted_output

In [34]:
print(format_search_results(results))

Found 5 relevant results:

--- Result 1 ---
File: main.py
Repository: tmpqv23rcta
Similarity Score: 0.784
Content:
import sys
from utils import load_env
from workflow.graph import start_workflow, continue_workflow


def main() -> None:
    load_env()
    if len(sys.argv) < 2:
        print("Usage: python main.py \"Your post topic...\"")
        sys.exit(1)
    topic = sys.argv[1]
    thread_id = "cli"
    state = start_workflow(topic, thread_id)
    if state.current_step != "wait_feedback":
        print(f"Error: {state.error_message}")
        sys.exit(1)
    print("\nGenerated Post:\n")
    print(state.pos...

--- Result 2 ---
File: workflow/graph.py
Repository: tmpqv23rcta
Similarity Score: 0.781
Content:
from langgraph.graph import StateGraph, END
from langgraph.checkpoint.memory import MemorySaver

from workflow.state import WorkflowState
from workflow.nodes import (
    create_post_text,
    generate_image,
    classify_feedback,
    refine_text,
    refine_image,
    send_email,

## RAG S&A 

In [35]:
def rag_search_and_answer(question, context_limit=3):
    """
    Perform RAG search and generate an answer using the retrieved context.
    
    Args:
        question (str): The question to answer
        context_limit (int): Number of top results to use as context
    
    Returns:
        dict: Contains the answer and source information
    """
    # Search for relevant code
    search_results = search_repo_code(question, top_k=context_limit)
    
    if not search_results:
        return {
            'answer': "I couldn't find relevant information in the repository to answer your question.",
            'sources': []
        }
    
    # Build context from search results
    context_parts = []
    sources = []
    
    for result in search_results:
        context_parts.append(f"File: {result['file_path']}\n{result['content']}")
        sources.append({
            'file_path': result['file_path'],
            'repo_name': result['repo_name'],
            'similarity_score': result['similarity_score']
        })
    
    context = "\n\n---\n\n".join(context_parts)
    
    # Create a prompt for the LLM (you can customize this)
    prompt = f"""
        Based on the following code repository context, please answer the question.

        Context from repository:
        {context}

        Question: {question}

        Please provide a detailed answer based on the code context above. If the context doesn't contain enough information to fully answer the question, please mention what additional information might be needed.
    """

    return {
        'prompt': prompt,
        'context': context,
        'sources': sources,
        'search_results': search_results
    }

In [36]:
results2 = rag_search_and_answer('What is the part in the project responsible on the generate image functionality')



In [37]:
results2

{'prompt': '\n        Based on the following code repository context, please answer the question.\n\n        Context from repository:\n        File: README.md\n# LinkedIn-Booster 🚀\n\nAn intelligent AI-powered LinkedIn post creation workflow that combines the power of LangChain, LangGraph, and multiple AI services to create engaging LinkedIn content with matching visuals.\n\n![LangGraph Workflow](images/LangGraph-workflow-diagram.png)\n\n## ✨ Features\n\n- **🤖 AI-Powered Generation**: Google Gemini for high-quality LinkedIn post drafting\n- **🔍 Web Research**: Tavily search integration for up-to-date context and insights\n- **🔄 Interactive Feedback Loop**: Intelligent feedback classification and content refinement\n- **🎨 Image Generation**: DALL·E 3 integration for compelling visual content\n- **📧 Email Delivery**: Gmail API integration for seamless content distribution\n- **🖥️ Multiple Interfaces**: CLI and web-based Chainlit UI options\n- **📊 Workflow Visualization**: LangGraph Studi

In [43]:
# Example usage functions
def demo_search():
    """Demo function to show how to use the search functionality."""
    
    # Example searches
    queries = [
        "How does the workflow work?",
        "What are the main Python functions?",
        "How to configure the application?",
        "What dependencies are required?"
    ]
    
    print("=== Repository Search Demo ===\n")
    
    for query in queries:
        print(f"Query: {query}")
        print("-" * 50)
        
        results = search_repo_code(query, top_k=3, similarity_threshold=0.6)
        formatted_results = format_search_results(results, max_content_length=200)
        print(formatted_results)
        print("=" * 80 + "\n")

# Quick search function for interactive use
def quick_search(query, show_content=True):
    """
    Quick search function for interactive use in notebooks.
    
    Args:
        query (str): Search query
        show_content (bool): Whether to show file content
    
    Returns:
        None: Prints results directly
    """
    results = search_repo_code(query, top_k=5, similarity_threshold= 0.5)
    
    if not results:
        print("No results found.")
        return
    
    print(f"Found {len(results)} results for: '{query}'\n")
    
    for i, result in enumerate(results, 1):
        print(f"{i}. {result['file_path']} (Score: {result['similarity_score']:.3f})")
        if show_content:
            content = result['content'][:300] + "..." if len(result['content']) > 300 else result['content']
            print(f"   Content preview: {content}\n")


In [44]:
quick_search("The generate text functionality")



Found 5 results for: 'The generate text functionality'

1. main.py (Score: 0.788)
   Content preview: import sys
from utils import load_env
from workflow.graph import start_workflow, continue_workflow


def main() -> None:
    load_env()
    if len(sys.argv) < 2:
        print("Usage: python main.py \"Your post topic...\"")
        sys.exit(1)
    topic = sys.argv[1]
    thread_id = "cli"
    state ...

2. app.py (Score: 0.775)
   Content preview: """Chainlit GUI for LinkedIn Booster workflow."""
import chainlit as cl
from typing import Dict, Any
import asyncio
from utils import load_env
from workflow.graph import start_workflow, continue_workflow


# Load environment variables
load_env()

# Store user sessions
user_sessions: Dict[str, Dict[s...

3. README.md (Score: 0.746)
   Content preview: # LinkedIn-Booster 🚀

An intelligent AI-powered LinkedIn post creation workflow that combines the power of LangChain, LangGraph, and multiple AI services to create engaging LinkedIn content with 

# Agentic RAG

## Agent Tools

In [13]:
# code Repository Search
from langchain.tools import Tool
from typing import List, Dict, Any
import json

def code_repository_search(query: str) -> str:
    """
    Search through code repositories using semantic similarity.
    
    This tool searches through stored repository code and documentation
    to find relevant files and code snippets based on the query.
    
    Args:
        query (str): The search query describing what you're looking for
        
    Returns:
        str: JSON string containing search results with file paths, 
             similarity scores, and relevant code snippets
    """
    try:
        # Connect to the database
        vx = vecs.create_client(SUPABASE_DB_URL)
        collection = vx.get_or_create_collection(name="repo_files", dimension=1536)
        
        # Generate embedding for the query
        query_embedding = model.embed_query(query)
        
        # Search for similar vectors (limit to top 5 for agent efficiency)
        results = collection.query(
            data=query_embedding,
            limit=5,
            include_value=True,
            include_metadata=True
        )
        
        # Process results for agent consumption
        agent_results = []
        for result in results:
            similarity_score = result[1]
            metadata = result[2]
            
            # Only include results with decent similarity (>0.6)
            if similarity_score >= 0.6:
                # Truncate content for agent efficiency (max 1000 chars)
                content = metadata.get('content', '')
                if len(content) > 1000:
                    content = content[:1000] + "... [truncated]"
                
                agent_results.append({
                    'file_path': metadata.get('path', 'Unknown'),
                    'repo_name': metadata.get('repo_name', 'Unknown'),
                    'similarity_score': round(similarity_score, 3),
                    'content_preview': content,
                    'file_type': metadata.get('path', '').split('.')[-1] if '.' in metadata.get('path', '') else 'unknown'
                })
        
        vx.disconnect()
        
        # Return structured JSON for agent
        return json.dumps({
            'query': query,
            'results_found': len(agent_results),
            'results': agent_results
        }, indent=2)
        
    except Exception as e:
        return json.dumps({
            'error': f"Search failed: {str(e)}",
            'query': query,
            'results_found': 0,
            'results': []
        })

# Create the LangChain Tool
repository_search_tool = Tool(
    name="repository_search",
    description="""Search through code repositories to find relevant files, functions, and documentation. 
    Use this when you need to:
    - Find specific code implementations
    - Locate configuration files
    - Search for documentation
    - Find examples of how something is implemented
    - Understand project structure
    
    Input should be a descriptive search query about what you're looking for.""",
    func=code_repository_search
)

In [23]:
code_repository_search("classify feedback")



'{\n  "query": "classify feedback",\n  "results_found": 5,\n  "results": [\n    {\n      "file_path": "workflow/graph.py",\n      "repo_name": "tmpqv23rcta",\n      "similarity_score": 0.661,\n      "content_preview": "from langgraph.graph import StateGraph, END\\nfrom langgraph.checkpoint.memory import MemorySaver\\n\\nfrom workflow.state import WorkflowState\\nfrom workflow.nodes import (\\n    create_post_text,\\n    generate_image,\\n    classify_feedback,\\n    refine_text,\\n    refine_image,\\n    send_email,\\n)\\n\\n\\ndef route_feedback(state: WorkflowState) -> str:\\n    \\"\\"\\"Route based on feedback classification.\\"\\"\\"\\n    classification = state.classification.lower()\\n    if \\"approved\\" in classification:\\n        return \\"approved\\"\\n    if \\"refine text\\" in classification or \\"refine_text\\" in classification:\\n        return \\"refine_text\\"\\n    if \\"refine image\\" in classification or \\"refine_image\\" in classification:\\n        return \\

In [None]:
from langgraph.errors import GraphRecursionError
from langgraph.prebuilt import create_react_agent

max_iterations = 3
recursion_limit = 2 * max_iterations + 1
agent = create_react_agent(
    model="openai:gpt-4o-mini",
    tools=[repository_search_tool]
)

try:
    response = agent.invoke(
        {"messages": [
            {'role': "system", "content": """ 
            You are a helpfull DevOps assistant you are spetialized in searching the code repositories and analyze code."""},
            {"role": "user", "content": "What part in the repo is responsible for the classifing the user feadback"}]},
        {"recursion_limit": recursion_limit},
    )
except GraphRecursionError:
    print("Agent stopped due to max iterations.")



In [34]:
for m in response['messages']:
    m.pretty_print()


 
            You are a helpfull DevOps assistant you are spetialized in searching the code repositories and analyze code.

What part in the repo is responsible for the classifing the user feadback
Tool Calls:
  repository_search (call_aL0YJEAr3mK5XGFuPudaDpSu)
 Call ID: call_aL0YJEAr3mK5XGFuPudaDpSu
  Args:
    __arg1: classifying user feedback
Name: repository_search

{
  "query": "classifying user feedback",
  "results_found": 5,
  "results": [
    {
      "file_path": "workflow/graph.py",
      "repo_name": "tmpqv23rcta",
      "similarity_score": 0.614,
      "content_preview": "from langgraph.graph import StateGraph, END\nfrom langgraph.checkpoint.memory import MemorySaver\n\nfrom workflow.state import WorkflowState\nfrom workflow.nodes import (\n    create_post_text,\n    generate_image,\n    classify_feedback,\n    refine_text,\n    refine_image,\n    send_email,\n)\n\n\ndef route_feedback(state: WorkflowState) -> str:\n    \"\"\"Route based on feedback classification.\"\"\"\n

### Alternative Tools

In [7]:
# Alternative: More specific tool for code analysis
def analyze_code_structure(query: str) -> str:
    """
    Analyze code repository structure and find specific implementations.
    
    Optimized for finding:
    - Function definitions
    - Class implementations  
    - Configuration patterns
    - API endpoints
    - Database schemas
    
    Args:
        query (str): What you want to analyze (e.g., "main workflow functions", "API routes", "database models")
        
    Returns:
        str: Structured analysis results
    """
    try:
        # Connect and search
        vx = vecs.create_client(SUPABASE_DB_URL)
        collection = vx.get_or_create_collection(name="repo_files", dimension=1536)
        
        query_embedding = model.embed_query(query)
        results = collection.query(
            data=query_embedding,
            limit=3,  # Fewer results for focused analysis
            include_value=True,
            include_metadata=True
        )
        
        analysis_results = {
            'query': query,
            'analysis': [],
            'summary': ''
        }
        
        for result in results:
            similarity_score = result[1]
            metadata = result[2]
            
            if similarity_score >= 0.65:  # Higher threshold for analysis
                file_path = metadata.get('path', '')
                content = metadata.get('content', '')
                
                # Extract key information based on file type
                file_analysis = {
                    'file': file_path,
                    'type': file_path.split('.')[-1] if '.' in file_path else 'unknown',
                    'relevance': round(similarity_score, 3),
                    'key_elements': []
                }
                
                # Simple pattern matching for common code elements
                if '.py' in file_path:
                    # Find function definitions
                    import re
                    functions = re.findall(r'def\s+(\w+)\s*\(', content)
                    classes = re.findall(r'class\s+(\w+)\s*[\(:]', content)
                    if functions:
                        file_analysis['key_elements'].append(f"Functions: {', '.join(functions[:5])}")
                    if classes:
                        file_analysis['key_elements'].append(f"Classes: {', '.join(classes[:3])}")
                
                elif '.js' in file_path or '.ts' in file_path:
                    # Find function definitions and exports
                    functions = re.findall(r'function\s+(\w+)\s*\(', content)
                    exports = re.findall(r'export\s+(?:default\s+)?(?:function\s+)?(\w+)', content)
                    if functions:
                        file_analysis['key_elements'].append(f"Functions: {', '.join(functions[:5])}")
                    if exports:
                        file_analysis['key_elements'].append(f"Exports: {', '.join(exports[:3])}")
                
                # Add content preview
                preview = content[:500] + "..." if len(content) > 500 else content
                file_analysis['preview'] = preview
                
                analysis_results['analysis'].append(file_analysis)
        
        # Generate summary
        if analysis_results['analysis']:
            file_count = len(analysis_results['analysis'])
            file_types = list(set([item['type'] for item in analysis_results['analysis']]))
            analysis_results['summary'] = f"Found {file_count} relevant files of types: {', '.join(file_types)}"
        else:
            analysis_results['summary'] = "No relevant code found for the query"
        
        vx.disconnect()
        return json.dumps(analysis_results, indent=2)
        
    except Exception as e:
        return json.dumps({
            'error': f"Analysis failed: {str(e)}",
            'query': query,
            'analysis': [],
            'summary': 'Analysis failed due to error'
        })

# Create the analysis tool
code_analysis_tool = Tool(
    name="code_analysis",
    description="""Analyze code repository structure and implementations. 
    Use this when you need to:
    - Understand how specific features are implemented
    - Find function and class definitions
    - Analyze code patterns and structure
    - Get an overview of specific modules or components
    
    Input should describe what code structure or implementation you want to analyze.""",
    func=analyze_code_structure
)


In [37]:
# Simplified tool optimized for ReAct agents
def search_codebase(query: str) -> str:
    """
    Simple codebase search tool optimized for ReAct agents.
    Returns concise, actionable results that agents can easily process.
    
    Args:
        query (str): Search query for code, documentation, or configuration
        
    Returns:
        str: Concise search results with file paths and key information
    """
    try:
        vx = vecs.create_client(SUPABASE_DB_URL)
        collection = vx.get_or_create_collection(name="repo_files", dimension=1536)
        
        query_embedding = model.embed_query(query)
        results = collection.query(
            data=query_embedding,
            limit=3,  # Keep it focused for agent efficiency
            include_value=True,
            include_metadata=True
        )
        
        if not results or len(results) == 0:
            return f"No relevant code found for query: '{query}'"
        
        # Format results for agent consumption
        formatted_results = []
        for result in results:
            similarity_score = result[1]
            metadata = result[2]
            
            if similarity_score >= 0.6:
                file_path = metadata.get('path', 'Unknown')
                content = metadata.get('content', '')
                
                # Extract key lines or functions
                key_info = ""
                if '.py' in file_path:
                    # Extract function/class definitions
                    import re
                    functions = re.findall(r'def\s+(\w+)', content)
                    classes = re.findall(r'class\s+(\w+)', content)
                    if functions:
                        key_info = f"Functions: {', '.join(functions[:3])}"
                    if classes:
                        key_info += f" Classes: {', '.join(classes[:2])}"
                elif '.md' in file_path:
                    # Extract first few lines for documentation
                    lines = content.split('\n')[:3]
                    key_info = ' '.join(lines).strip()[:100]
                else:
                    # General content preview
                    key_info = content[:150].replace('\n', ' ').strip()
                
                formatted_results.append(
                    f"File: {file_path} (Score: {similarity_score:.2f})\n"
                    f"Content: {key_info}..."
                )
        
        vx.disconnect()
        
        if not formatted_results:
            return f"No sufficiently relevant results found for: '{query}'"
        
        return f"Found {len(formatted_results)} relevant files:\n\n" + "\n\n".join(formatted_results)
        
    except Exception as e:
        return f"Search error: {str(e)}"

# The optimal ReAct agent tool
codebase_search_tool = Tool(
    name="search_codebase",
    description="""Search through code repositories to find relevant files and implementations.
    
    Use this tool when you need to:
    - Find specific functions or classes
    - Locate configuration files  
    - Search documentation
    - Understand how features are implemented
    - Find code examples
    
    The tool returns file paths with relevance scores and key content snippets.
    Input: A descriptive search query about what you're looking for.""",
    func=search_codebase
)



# Test function for the tool
def test_codebase_search():
    """Test the codebase search functionality."""
    test_queries = [
        "workflow implementation",
        "main Python functions", 
        "configuration files",
        "API endpoints",
        "database connection"
    ]
    
    print("Testing Codebase Search Tool:")
    print("=" * 50)
    
    for query in test_queries:
        print(f"\nQuery: {query}")
        print("-" * 30)
        result = search_codebase(query)
        print(result)
        print()

# Test the tool
test_codebase_search()


Testing Codebase Search Tool:

Query: workflow implementation
------------------------------




Found 3 relevant files:

File: workflow/graph.py (Score: 0.61)
Content: Functions: route_feedback, create_workflow, start_workflow...

File: workflow/nodes.py (Score: 0.62)
Content: Functions: create_post_text, generate_image, classify_feedback...

File: workflow/state.py (Score: 0.63)
Content:  Classes: from, class...


Query: main Python functions
------------------------------
Found 2 relevant files:

File: main.py (Score: 0.66)
Content: Functions: main...

File: studio/langgraph.json (Score: 0.73)
Content: {     "dockerfile_lines": [],     "graphs": {       "chatbot": "./graph.py:app"     },     "env": "./.env",     "python_version": "3.11",     "depende...


Query: configuration files
------------------------------
Found 3 relevant files:

File: utils/config.py (Score: 0.72)
Content: Functions: load_env Classes: from, class...

File: studio/langgraph.json (Score: 0.73)
Content: {     "dockerfile_lines": [],     "graphs": {       "chatbot": "./graph.py:app"     },     "env": "./.en

In [45]:
def create_langgraph_agent():
    """
    Create a LangGraph agent which handles parsing more robustly.
    """
    from langgraph.prebuilt import create_react_agent
    
    # Initialize LLM
    #llm = init_chat_model("gpt-4o-mini", model_provider="openai")
    
    # Create LangGraph agent (more robust)
    agent = create_react_agent(
        model = "openai:gpt-4o-mini",
        tools=[codebase_search_tool],
        prompt = "You are a helpful coding assistant. Use the search tool to find relevant code and provide detailed answers.",
        debug = True
    )
    
    return agent

agent = create_langgraph_agent()

response = agent.invoke({
    "messages": [
        {"role": "user", "content": "How does the workflow functions"}
    ],
    "recursion_limit": recursion_limit
})

for m in response['messages']:
    m.pretty_print()

[1m[values][0m {'messages': [HumanMessage(content='How does the workflow functions', additional_kwargs={}, response_metadata={}, id='61c3b5d1-a339-41a6-8db6-3b27d500d0e3')]}
[1m[updates][0m {'agent': {'messages': [AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_rMhuVcO5Ra7jnGzzBCMwfqHW', 'function': {'arguments': '{"__arg1":"workflow functions"}', 'name': 'search_codebase'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 18, 'prompt_tokens': 147, 'total_tokens': 165, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_560af6e559', 'id': 'chatcmpl-CFcKIxsCWmCgNIGkXEne1ZpjljplW', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run--87467977-2c56-4adf-b52b-996840b2062



[1m[updates][0m {'tools': {'messages': [ToolMessage(content='Found 3 relevant files:\n\nFile: main.py (Score: 0.61)\nContent: Functions: main...\n\nFile: workflow/__init__.py (Score: 0.62)\nContent: ...\n\nFile: workflow/graph.py (Score: 0.63)\nContent: Functions: route_feedback, create_workflow, start_workflow...', name='search_codebase', id='8fb79245-f351-4d70-b1df-54c4a2fbfc36', tool_call_id='call_rMhuVcO5Ra7jnGzzBCMwfqHW')]}}
[1m[values][0m {'messages': [HumanMessage(content='How does the workflow functions', additional_kwargs={}, response_metadata={}, id='61c3b5d1-a339-41a6-8db6-3b27d500d0e3'), AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_rMhuVcO5Ra7jnGzzBCMwfqHW', 'function': {'arguments': '{"__arg1":"workflow functions"}', 'name': 'search_codebase'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 18, 'prompt_tokens': 147, 'total_tokens': 165, 'completion_tokens_details': {'accepted_prediction_tokens': 0



[1m[updates][0m {'tools': {'messages': [ToolMessage(content="No sufficiently relevant results found for: 'create_workflow'", name='search_codebase', tool_call_id='call_9jAlF5eFb05pRIUD5PFK4yXA')]}}
[1m[updates][0m {'tools': {'messages': [ToolMessage(content='Found 3 relevant files:\n\nFile: n8n/main (NO img2img).json (Score: 0.72)\nContent: {   "name": "main (NO img2img)",   "nodes": [     {       "parameters": {         "options": {}       },       "type": "@n8n/n8n-nodes-langchain.lmCha...\n\nFile: main.py (Score: 0.77)\nContent: Functions: main...\n\nFile: requirements.txt (Score: 0.80)\nContent: chainlit>=1.0.0 langchain>=0.1.0 langchain-community>=0.1.0 langchain-google-genai>=1.0.0 langchain-google-community>=0.1.0 langgraph>=0.0.40 requests...', name='search_codebase', tool_call_id='call_Jb9qe6Gltidb8HZUdKrqfdnJ')]}}
[1m[updates][0m {'tools': {'messages': [ToolMessage(content="No sufficiently relevant results found for: 'start_workflow'", name='search_codebase', tool_call_



[1m[updates][0m {'tools': {'messages': [ToolMessage(content="No sufficiently relevant results found for: 'workflow/graph.py'", name='search_codebase', tool_call_id='call_rqB9fvXtD7J3TnOOAFAYTxf4')]}}
[1m[updates][0m {'tools': {'messages': [ToolMessage(content='Found 2 relevant files:\n\nFile: studio/__init__.py (Score: 0.61)\nContent: ...\n\nFile: studio/langgraph.json (Score: 0.67)\nContent: {     "dockerfile_lines": [],     "graphs": {       "chatbot": "./graph.py:app"     },     "env": "./.env",     "python_version": "3.11",     "depende...', name='search_codebase', tool_call_id='call_wt4VcgKLQi72WWpkRi1fsF3v')]}}
[1m[updates][0m {'tools': {'messages': [ToolMessage(content='Found 3 relevant files:\n\nFile: studio/__init__.py (Score: 0.63)\nContent: ...\n\nFile: main.py (Score: 0.67)\nContent: Functions: main...\n\nFile: studio/langgraph.json (Score: 0.69)\nContent: {     "dockerfile_lines": [],     "graphs": {       "chatbot": "./graph.py:app"     },     "env": "./.env",     "



[1m[updates][0m {'tools': {'messages': [ToolMessage(content='Found 3 relevant files:\n\nFile: workflow/graph.py (Score: 0.62)\nContent: Functions: route_feedback, create_workflow, start_workflow...\n\nFile: workflow/__init__.py (Score: 0.63)\nContent: ...\n\nFile: main.py (Score: 0.64)\nContent: Functions: main...', name='search_codebase', id='f70807af-6efb-4c46-aaff-9820479ba4a7', tool_call_id='call_bWVQ5uOn6JJq3e4aTR5QMijo')]}}
[1m[values][0m {'messages': [HumanMessage(content='How does the workflow functions', additional_kwargs={}, response_metadata={}, id='61c3b5d1-a339-41a6-8db6-3b27d500d0e3'), AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_rMhuVcO5Ra7jnGzzBCMwfqHW', 'function': {'arguments': '{"__arg1":"workflow functions"}', 'name': 'search_codebase'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 18, 'prompt_tokens': 147, 'total_tokens': 165, 'completion_tokens_details': {'accepted_prediction_tokens': 0

In [47]:
display(Markdown(response['messages'][-1].content))

To understand how the workflow functions are implemented, let's focus on the relevant code snippets and files found during the searches:

### Key Files and Functions

1. **`workflow/graph.py`**
   - This file contains several important functions related to workflow management:
     - **`route_feedback`**: Likely handles feedback routing within the workflow process.
     - **`create_workflow`**: Responsible for initializing or setting up new workflows.
     - **`start_workflow`**: Begins the execution of a defined workflow.

2. **`main.py`**
   - Contains the main execution logic of the application, including initializing the application and possibly invoking the workflow functions.

3. **`workflow/__init__.py`**
   - This file typically initializes the workflow module and may import key functions from `graph.py` for use elsewhere in the application.

### Overview of Functions

- **`create_workflow`**: 
   - This function is essential for defining a new workflow, where it might accept parameters that dictate its structure (such as nodes, edges, and conditions for execution). The function likely sets up the workflow data structure and prepares it for execution.
  
- **`start_workflow`**: 
   - Invoked to kick off the execution of a previously defined workflow. This function probably handles necessary checks (like validation) and executes the tasks or nodes defined in the workflow sequence.

- **`route_feedback`**: 
   - Designed to manage the feedback process, which can include sending information back from completed tasks in the workflow for logging, updates, or decision-making.

### Code Structure Recommendations

If you want to explore the actual code implementation, look into `workflow/graph.py` for the definitions of `create_workflow`, `start_workflow`, and `route_feedback`. You will likely find detailed comments and logic that will clarify how each function interacts within the workflow system.

### Additional Considerations

For a more integrative view of how these functions operate within the entire application:
- Check how `main.py` orchestrates the workflow initiation.
- Review any dependencies in `requirements.txt`, as these can often provide insight into what libraries or frameworks are being used that might affect workflow behavior.

You may also want to run the application to observe how these functions are invoked in real time during execution. This can be especially valuable to visualize their impact in a live scenario.

# Tools Play Ground

In [132]:
class CodebaseTools:
    """Collection of tools for codebase search and analysis."""
    
    def __init__(self, supabase_url: str, embedding_model: str = "text-embedding-3-small"):
        self.supabase_url = supabase_url
        self.embeddings = OpenAIEmbeddings(model=embedding_model)
        self.collection_name = "repo_files"
        self.dimension = 1536
    
    def search_codebase(self, query: str) -> str:
        """Search through code repositories to find relevant files and implementations."""
        try:
            vx = vecs.create_client(self.supabase_url)
            collection = vx.get_or_create_collection(
                name=self.collection_name, 
                dimension=self.dimension
            )
            
            query_embedding = self.embeddings.embed_query(query)
            results = collection.query(
                data=query_embedding,
                limit=5,
                include_value=True,
                include_metadata=True
            )
            
            if not results or len(results) == 0:
                return f"No relevant code found for query: '{query}'"
            
            formatted_results = []
            for result in results:
                cosine_distance = result[1]
                metadata = result[2]
                
                # Convert distance to similarity: similarity = 1 - distance
                # For cosine distance: 0 = identical, 1 = completely different
                similarity_score = 1 - cosine_distance
                
                if similarity_score >= 0.3:  # Adjusted threshold for similarity
                    file_path = metadata.get('path', 'Unknown')
                    content = metadata.get('content', '')
                    repo_name = metadata.get('repo_name', 'Unknown')
                    
                    # Extract key information based on file type
                    key_info = self._extract_file_info(file_path, content)
                    
                    formatted_results.append(
                        f"Repository: {repo_name}\n"
                        f"File: {file_path} (Score: {similarity_score:.2f})\n"
                        f"Content: {key_info}..."
                    )
            
            vx.disconnect()
            
            if not formatted_results:
                return f"No sufficiently relevant results found for: '{query}'"
            
            return f"Found {len(formatted_results)} relevant files:\n\n" + "\n\n".join(formatted_results)
            
        except Exception as e:
            return f"Search error: {str(e)}"

    def list_directories(self, repo_name: str = "") -> list:
        """List all file paths (indices) in the repositories."""
        try:
            vx = vecs.create_client(self.supabase_url)
            collection = vx.get_or_create_collection(
                name=self.collection_name, 
                dimension=self.dimension
            )
            
            # Get all file paths
            results = collection.query(
                data=[0.0] * self.dimension,
                limit=1000,
                include_metadata=False
            )
            
            vx.disconnect()
            return results
            
        except Exception as e:
            return [f"Error: {str(e)}"]

    def get_file_content(self, file_path: str, repo_name: str = "") -> str:
        """Get the full content of a specific file using direct fetch by ID."""
        try:
            vx = vecs.create_client(self.supabase_url)
            collection = vx.get_or_create_collection(
                name=self.collection_name, 
                dimension=self.dimension
            )
            
            # The file_path IS the ID, so fetch directly
            result = collection.fetch(ids=[file_path])
            
            if result and len(result) > 0:
                metadata = result[0][2]  # Get metadata from fetch result
                content = metadata.get('content', '')
                current_path = metadata.get('path', '')
                current_repo = metadata.get('repo_name', '')
                vx.disconnect()
                return f"File: {current_repo}:{current_path}\n\n{content}"
            
            vx.disconnect()
            return f"File '{file_path}' not found"
            
        except Exception as e:
            return f"File content error: {str(e)}"
    
    def analyze_code_structure(self, query: str) -> str:
        """Analyze code structure and patterns across the codebase."""
        try:
            vx = vecs.create_client(self.supabase_url)
            collection = vx.get_or_create_collection(
                name=self.collection_name, 
                dimension=self.dimension
            )
            
            query_embedding = self.embeddings.embed_query(query)
            results = collection.query(
                data=query_embedding,
                limit=10,
                include_value=True,
                include_metadata=True
            )
            
            if not results:
                return f"No code found for analysis: '{query}'"
            
            # Debug: Show similarity scores
            debug_info = []
            for i, result in enumerate(results[:3]):
                if len(result) > 2:
                    cosine_distance = result[1]
                    similarity_score = 1 - cosine_distance
                    metadata = result[2]
                    file_path = metadata.get('path', 'Unknown')
                    debug_info.append(f"Result {i+1}: {file_path} (similarity: {similarity_score:.3f})")
            
            debug_text = "Debug - Top results:\n" + "\n".join(debug_info) + "\n\n"
            
            analysis = {
                'functions': set(),
                'classes': set(),
                'imports': set(),
                'patterns': [],
                'files_analyzed': 0
            }
            
            for result in results:
                if len(result) > 2:
                    cosine_distance = result[1]
                    similarity_score = 1 - cosine_distance
                    if similarity_score >= 0.2:  # Lower threshold to include more results
                        metadata = result[2]
                        content = metadata.get('content', '')
                        file_path = metadata.get('path', '')
                        
                        analysis['files_analyzed'] += 1
                        
                        # Extract Python patterns
                        if file_path.endswith('.py'):
                            functions = re.findall(r'def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(', content)
                            classes = re.findall(r'class\s+([A-Z][a-zA-Z0-9_]*)\s*[\(:]', content)
                            imports = re.findall(r'(?:from\s+(\S+)\s+)?import\s+([^\n]+)', content)
                            
                            analysis['functions'].update(functions)
                            analysis['classes'].update(classes)
                            for imp in imports:
                                if imp[0]:  # from X import Y
                                    analysis['imports'].add(f"from {imp[0]} import {imp[1]}")
                                else:  # import X
                                    analysis['imports'].add(f"import {imp[1]}")
                        
                        # Extract JavaScript/TypeScript patterns
                        elif file_path.endswith(('.js', '.ts', '.jsx', '.tsx')):
                            functions = re.findall(r'(?:function\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(|([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*(?:async\s+)?(?:function|\([^)]*\)\s*=>))', content)
                            classes = re.findall(r'class\s+([A-Z][a-zA-Z0-9_]*)\s*[\{]', content)
                            imports = re.findall(r'import\s+.*?from\s+[\'"]([^\'"]+)[\'"]', content)
                            
                            for func_match in functions:
                                func_name = func_match[0] or func_match[1]
                                if func_name:
                                    analysis['functions'].add(func_name)
                            
                            analysis['classes'].update(classes)
                            analysis['imports'].update(imports)
            
            vx.disconnect()
            
            # Format analysis results
            result_parts = [
                f"Code Structure Analysis for: '{query}'",
                f"Files analyzed: {analysis['files_analyzed']}",
                ""
            ]
            
            if analysis['functions']:
                result_parts.append(f"Functions found ({len(analysis['functions'])}):")
                result_parts.extend([f"  - {func}" for func in sorted(list(analysis['functions']))[:10]])
                if len(analysis['functions']) > 10:
                    result_parts.append(f"  ... and {len(analysis['functions']) - 10} more")
                result_parts.append("")
            
            if analysis['classes']:
                result_parts.append(f"Classes found ({len(analysis['classes'])}):")
                result_parts.extend([f"  - {cls}" for cls in sorted(list(analysis['classes']))[:10]])
                result_parts.append("")
            
            if analysis['imports']:
                result_parts.append(f"Key imports ({len(analysis['imports'])}):")
                result_parts.extend([f"  - {imp}" for imp in sorted(list(analysis['imports']))[:10]])
                if len(analysis['imports']) > 10:
                    result_parts.append(f"  ... and {len(analysis['imports']) - 10} more")
            
            return debug_text + "\n".join(result_parts)
            
        except Exception as e:
            return f"Code analysis error: {str(e)}"
    
    def _extract_file_info(self, file_path: str, content: str) -> str:
        """Extract key information from file content based on file type."""
        if '.py' in file_path:
            functions = re.findall(r'def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(', content)
            classes = re.findall(r'class\s+([A-Z][a-zA-Z0-9_]*)\s*[\(:]', content)
            key_info = ""
            if functions:
                key_info = f"Functions: {', '.join(functions[:3])}"
            if classes:
                key_info += f" Classes: {', '.join(classes[:2])}"
            clean_content = content[:150].replace('\n', ' ').strip()
            return key_info or clean_content
        
        elif file_path.endswith(('.js', '.ts', '.jsx', '.tsx')):
            functions = re.findall(r'(?:function\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(|([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*(?:async\s+)?(?:function|\([^)]*\)\s*=>))', content)
            func_names = [f[0] or f[1] for f in functions if f[0] or f[1]]
            classes = re.findall(r'class\s+([A-Z][a-zA-Z0-9_]*)\s*[\{]', content)
            key_info = ""
            if func_names:
                key_info = f"Functions: {', '.join(func_names[:3])}"
            if classes:
                key_info += f" Classes: {', '.join(classes[:2])}"
            clean_content = content[:150].replace('\n', ' ').strip()
            return key_info or clean_content
        
        elif '.md' in file_path:
            lines = content.split('\n')[:3]
            return ' '.join(lines).strip()[:200]
        
        elif file_path.endswith(('.json', '.yaml', '.yml')):
            clean_content = content[:100].replace('\n', ' ').strip()
            return f"Config file: {clean_content}"
        
        else:
            clean_content = content[:150].replace('\n', ' ').strip()
            return clean_content
    
    def _is_in_directory(self, file_path: str, target_directory: str) -> bool:
        """Check if a file path is within the target directory."""
        # Normalize paths
        file_dir = os.path.dirname(file_path).replace('\\', '/')
        target_dir = target_directory.strip('/').replace('\\', '/')
        
        if not target_dir:  # Root directory
            return '/' not in file_dir
        
        # Check if file is in the directory or subdirectory
        return file_dir == target_dir or file_dir.startswith(target_dir + '/')
 

In [133]:
# Get your Supabase URL from environment variables
supabase_url = os.getenv("SUPABASE_DB_URL")
openai_key = os.getenv("OPENAI_API_KEY")

if not supabase_url:
    print("❌ Error: SUPABASE_DB_URL environment variable not set")
elif not openai_key:
    print("❌ Error: OPENAI_API_KEY environment variable not set")
else:
    print("✅ Environment variables loaded successfully")
    
    # Initialize the tools
    tools = CodebaseTools(supabase_url)
    print("✅ CodebaseTools initialized successfully")

✅ Environment variables loaded successfully
✅ CodebaseTools initialized successfully


### List Directories

In [27]:
collection = vx.get_or_create_collection(name="repo_files", dimension=1536)

# get the indeces of the collection
print(collection.__doc__)


    The `vecs.Collection` class represents a collection of vectors within a PostgreSQL database with pgvector support.
    It provides methods to manage (create, delete, fetch, upsert), index, and perform similarity searches on these vector collections.

    The collections are stored in separate tables in the database, with each vector associated with an identifier and optional metadata.

    Example usage:

        with vecs.create_client(DB_CONNECTION) as vx:
            collection = vx.create_collection(name="docs", dimension=3)
            collection.upsert([("id1", [1, 1, 1], {"key": "value"})])
            # Further operations on 'collection'

    Public Attributes:
        name: The name of the vector collection.
        dimension: The dimension of vectors in the collection.

    Note: Some methods of this class can raise exceptions from the `vecs.exc` module if errors occur.
    


In [15]:
tools.list_directories()

['tmpqv23rcta/README.md',
 'tmpqv23rcta/app.py',
 'tmpqv23rcta/main.py',
 'tmpqv23rcta/requirements.txt',
 'tmpqv23rcta/n8n/main (NO img2img).json',
 'tmpqv23rcta/studio/__init__.py',
 'tmpqv23rcta/studio/graph.py',
 'tmpqv23rcta/studio/langgraph.json',
 'tmpqv23rcta/utils/config.py',
 'tmpqv23rcta/workflow/__init__.py',
 'tmpqv23rcta/workflow/graph.py',
 'tmpqv23rcta/workflow/nodes.py',
 'tmpqv23rcta/workflow/state.py',
 'tmpqv23rcta/workflow/tools.py']

### Search Code Base

In [80]:
results = tools.search_codebase("The web search tool")
print(results)

Found 4 relevant files:

Repository: tmpqv23rcta
File: workflow/tools.py (Score: 0.49)
Content: Functions: tavily_search...

Repository: tmpqv23rcta
File: n8n/main (NO img2img).json (Score: 0.24)
Content: Config file: {   "name": "main (NO img2img)",   "nodes": [     {       "parameters": {         "options": {}...

Repository: tmpqv23rcta
File: README.md (Score: 0.21)
Content: # LinkedIn-Booster 🚀  An intelligent AI-powered LinkedIn post creation workflow that combines the power of LangChain, LangGraph, and multiple AI services to create engaging LinkedIn content with match...

Repository: tmpqv23rcta
File: requirements.txt (Score: 0.20)
Content: chainlit>=1.0.0 langchain>=0.1.0 langchain-community>=0.1.0 langchain-google-genai>=1.0.0 langchain-google-community>=0.1.0 langgraph>=0.0.40 requests...


In [76]:
query_embedding = model.embed_query("The web search tool")

results = collection.query(
            data=query_embedding,
            limit=5,
            include_value=True,
            include_metadata=False
        )

In [77]:
results

[('tmpqv23rcta/workflow/tools.py', 0.511380403555607),
 ('tmpqv23rcta/n8n/main (NO img2img).json', 0.764289990919228),
 ('tmpqv23rcta/README.md', 0.789835356503363),
 ('tmpqv23rcta/requirements.txt', 0.795650249849252),
 ('tmpqv23rcta/workflow/nodes.py', 0.829359775261906)]

In [83]:
print(collection.query.__doc__)


        Executes a similarity search in the collection.

        The return type is dependent on arguments *include_value* and *include_metadata*

        Args:
            data (Any): The vector to use as the query.
            limit (int, optional): The maximum number of results to return. Defaults to 10.
            filters (Optional[Dict], optional): Filters to apply to the search. Defaults to None.
            measure (Union[IndexMeasure, str], optional): The distance measure to use for the search. Defaults to 'cosine_distance'.
            include_value (bool, optional): Whether to include the distance value in the results. Defaults to False.
            include_metadata (bool, optional): Whether to include the metadata in the results. Defaults to False.
            probes (Optional[Int], optional): Number of ivfflat index lists to query. Higher increases accuracy but decreases speed
            ef_search (Optional[Int], optional): Size of the dynamic candidate list for HNSW ind

### Get File Content

In [24]:
collection.fetch(ids=["tmpqv23rcta/studio/graph.py"])[0][2].get('content')

'"""Standalone graph file for LangGraph Studio."""\nimport sys\nimport os\nfrom typing import Optional\nfrom dataclasses import dataclass\n\n# Add the parent directory to the path so we can import from workflow\nsys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))\n\nfrom langgraph.graph import StateGraph, END\n\n# Define state directly to avoid import conflicts\n@dataclass\nclass WorkflowState:\n    """State management for the LinkedIn post creation workflow."""\n    user_input: str = ""\n    post_text: str = ""\n    image_prompt: str = ""\n    image_data: Optional[bytes] = None\n    image_url: str = ""\n    user_feedback: str = ""\n    classification: str = ""\n    refined_text: str = ""\n    refined_image_data: Optional[bytes] = None\n    current_step: str = "start"\n    error_message: str = ""\n    final_post_ready: bool = False\n\n# Import nodes after defining state\ntry:\n    from workflow.nodes import (\n        create_post_text,\n        generate_imag

In [89]:
collection.fetch(ids=["tmpqv23rcta/workflow/tools.py"])

[('tmpqv23rcta/workflow/tools.py', array([-0.01585723,  0.01547127, -0.02408358, ...,  0.01043181,
         0.00584997, -0.03328033], shape=(1536,), dtype=float32), {'path': 'workflow/tools.py', 'content': 'import os\nimport requests\nfrom typing import Any, Dict\n\nfrom dotenv import load_dotenv\nload_dotenv()\n\ ... (980 characters truncated) ... cription="Search the web for current information",\n    func=tavily_search\n)\n', 'branches': ['main'], 'repo_name': 'tmpqv23rcta', 'commit_count': 6})]

In [88]:
print(collection.fetch.__doc__)


        Fetches vectors from the collection by their identifiers.

        Args:
            ids (Iterable[str]): An iterable of vector identifiers.

        Returns:
            List[Record]: A list of the fetched vectors.
        


In [117]:
results = tools.get_file_content("tmpqv23rcta/studio/graph.py")

In [98]:
print(results)

File: tmpqv23rcta:studio/graph.py

"""Standalone graph file for LangGraph Studio."""
import sys
import os
from typing import Optional
from dataclasses import dataclass

# Add the parent directory to the path so we can import from workflow
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from langgraph.graph import StateGraph, END

# Define state directly to avoid import conflicts
@dataclass
class WorkflowState:
    """State management for the LinkedIn post creation workflow."""
    user_input: str = ""
    post_text: str = ""
    image_prompt: str = ""
    image_data: Optional[bytes] = None
    image_url: str = ""
    user_feedback: str = ""
    classification: str = ""
    refined_text: str = ""
    refined_image_data: Optional[bytes] = None
    current_step: str = "start"
    error_message: str = ""
    final_post_ready: bool = False

# Import nodes after defining state
try:
    from workflow.nodes import (
        create_post_text,
        generate_im

### Analyze Code Structure

In [136]:
result = tools.analyze_code_structure("Image generation")

In [137]:
print(result)

Debug - Top results:
Result 1: n8n/main (NO img2img).json (similarity: 0.299)
Result 2: workflow/nodes.py (similarity: 0.270)
Result 3: README.md (similarity: 0.228)

Code Structure Analysis for: 'Image generation'
Files analyzed: 3

Functions found (6):
  - classify_feedback
  - create_post_text
  - generate_image
  - refine_image
  - refine_text
  - send_email

Key imports (13):
  - from dotenv import load_dotenv
  - from langchain.chat_models import init_chat_model
  - from langchain.schema import SystemMessage, HumanMessage
  - from langchain_community.utilities.dalle_image_generator import DallEAPIWrapper
  - from langchain_google_community import GmailToolkit
  - from langchain_google_community.gmail.utils import (
  - from langgraph.errors import GraphRecursionError
  - from langgraph.prebuilt import create_react_agent
  - from typing import List
  - from workflow.state import WorkflowState
  ... and 3 more


# Chunking

## Tree-sitter

In [1]:
import tree_sitter_python as tspython
from tree_sitter import Language, Parser

PY_LANGUAGE = Language(tspython.language())

In [2]:
parser = Parser(PY_LANGUAGE)

In [None]:
tree = parser.parse(
    bytes(
        '''
        import os
        from langchain_openai import OpenAIEmbeddings
        from dotenv import load_dotenv
        import vecs
        import json

        # Load environment variables
        load_dotenv()


        def get_embedding_model():
            """Initialize and return the OpenAI embedding model."""
            return OpenAIEmbeddings(model="text-embedding-3-small")


        def get_vector_client():
            """Initialize and return the Supabase vector client."""
            supabase_db_url = os.getenv("SUPABASE_DB_URL")
            if not supabase_db_url:
                raise ValueError("SUPABASE_DB_URL environment variable not set")
            return vecs.create_client(supabase_db_url)


        def store_repo_in_own_collection(repo_info, refresh=False):
            """
            Store repository information in a dedicated vector collection.
            
            Args:
                repo_info (dict): Repository information containing files and metadata
                refresh (bool): Whether to delete existing collection before creating new one
            
            Returns:
                str: Name of the collection created
            """
            repo_name = repo_info['repo_name']
            
            # Initialize clients
            vx = get_vector_client()
            embeddings = get_embedding_model()
            
            if refresh:
                # Drop existing collection if it exists
                try:
                    vx.delete_collection(repo_name)
                    print(f"🗑️ Old collection '{repo_name}' deleted.")
                except Exception:
                    pass  # collection may not exist yet
            
            # Create/get a dedicated collection for this repo
            collection = vx.get_or_create_collection(name=repo_name, dimension=1536)
            
            vectors_to_upsert = []
            
            for file in repo_info['files']:
                content = file['content']
                
                # Generate embedding for file content
                embedding = embeddings.embed_query(content)
                
                # Create unique ID for this file
                unique_id = f"{repo_name}/{file['path']}"
                
                # Prepare metadata (store full content like your working version)
                metadata = {
                    'repo_name': repo_name,
                    'path': file['path'],
                    'content': content,  # Store the full content for retrieval
                    'commit_count': repo_info['commit_count'],
                    'branches': repo_info['branches']  # List is JSON-serializable, don't stringify
                }
                
                vectors_to_upsert.append((unique_id, embedding, metadata))
            
            # Upsert all vectors at once
            collection.upsert(vectors_to_upsert)
            
            print(f"✅ Stored {len(vectors_to_upsert)} files into collection '{repo_name}'")
            return repo_name


        def search_repo_collection(repo_name, query, limit=5):
            """
            Search within a specific repository collection.
            
            Args:
                repo_name (str): Name of the repository collection
                query (str): Search query
                limit (int): Maximum number of results to return
            
            Returns:
                list: Search results with metadata
            """
            vx = get_vector_client()
            embeddings = get_embedding_model()
            
            try:
                collection = vx.get_collection(name=repo_name)
                query_embedding = embeddings.embed_query(query)
                
                # Query the collection
                results = collection.query(
                    data=query_embedding,
                    limit=limit,
                    include_metadata=True,
                    include_value=True
                )
                
                # Convert results to a more usable format
                # vecs returns SQLAlchemy Row objects with: (id, metadata)
                formatted_results = []
                for i, result in enumerate(results):
                    formatted_result = {
                        'id': result[0] if len(result) > 0 else None,
                        'cos_distance': result[1],
                        'metadata': result[2] if len(result) > 1 else {}
                    }
                    formatted_results.append(formatted_result)
                
                return formatted_results
                
            except Exception as e:
                print(f"Error searching collection '{repo_name}': {e}")
                return []


        def list_repo_collections():
            """
            List all available repository collections.
            
            Returns:
                list: Names of all collections
            """
            vx = get_vector_client()
            try:
                collections = vx.list_collections()
                return [col.name for col in collections]
            except Exception as e:
                print(f"Error listing collections: {e}")
                return []


        def delete_repo_collection(repo_name):
            """
            Delete a repository collection.
            
            Args:
                repo_name (str): Name of the repository collection to delete
            
            Returns:
                bool: True if successful, False otherwise
            """
            vx = get_vector_client()
            try:
                vx.delete_collection(repo_name)
                print(f"🗑️ Collection '{repo_name}' deleted successfully.")
                return True
            except Exception as e:
                print(f"Error deleting collection '{repo_name}': {e}")
                return False


        # Example usage
        if __name__ == "__main__":
            # This would typically be called with repo_info from git_utils
            # store_repo_in_own_collection(repo_info)
            pass

        def debug_search_results(repo_name, query, limit=2):
            """
            Debug function to inspect the raw search results format.
            
            Args:
                repo_name (str): Name of the repository collection
                query (str): Search query
                limit (int): Maximum number of results to return
            """
            vx = get_vector_client()
            embeddings = get_embedding_model()
            
            try:
                collection = vx.get_collection(name=repo_name)
                query_embedding = embeddings.embed_query(query)
                
                print(f"🔍 Debug search in '{repo_name}' for query: '{query}'")
                
                # Query the collection
                raw_results = collection.query(
                    data=query_embedding,
                    limit=limit,
                    include_metadata=True
                )
                
                print(f"📊 Raw results type: {type(raw_results)}")
                print(f"📊 Raw results length: {len(raw_results) if hasattr(raw_results, '__len__') else 'N/A'}")
                
                if raw_results:
                    print(f"📊 First result type: {type(raw_results[0])}")
                    print(f"📊 First result: {raw_results[0]}")
                    
                    if len(raw_results[0]) > 0:
                        print(f"📊 First result parts:")
                        for i, part in enumerate(raw_results[0]):
                            print(f"   Part {i}: {type(part)} = {part}")
                
                return raw_results
                
            except Exception as e:
                print(f"❌ Debug error: {e}")
                import traceback
                traceback.print_exc()
                return None
        def get_file_content(repo_name, file_path):
            """
            Get the full content of a specific file from the vector database.
            
            Args:
                repo_name (str): Name of the repository collection
                file_path (str): Path of the file to retrieve
            
            Returns:
                str: File content or None if not found
            """
            vx = get_vector_client()
            
            try:
                collection = vx.get_collection(name=repo_name)
                
                # Search for the specific file by ID
                file_id = f"{repo_name}/{file_path}"
                
                # Use a simple query to find the exact file
                # Since we can't query by ID directly, we'll search and filter
                embeddings = get_embedding_model()
                dummy_query = embeddings.embed_query("content")  # Dummy query
                
                results = collection.query(
                    data=dummy_query,
                    limit=100,  # Get more results to find our file
                    include_metadata=True
                )
                
                # Find the specific file
                for result in results:
                    if result[0] == file_id:  # Match the ID
                        metadata = result[1] if len(result) > 1 else {}
                        return metadata.get('content', '')
                
                return None
                
            except Exception as e:
                print(f"Error retrieving file content: {e}")
                return None


        def search_with_content(repo_name, query, limit=5):
            """
            Search and return results with full content included.
            
            Args:
                repo_name (str): Name of the repository collection
                query (str): Search query
                limit (int): Maximum number of results to return
            
            Returns:
                list: Search results with full content
            """
            results = search_repo_collection(repo_name, query, limit)
            
            # Add full content to each result
            for result in results:
                metadata = result.get('metadata', {})
                file_path = metadata.get('path', '')
                
                if file_path:
                    full_content = get_file_content(repo_name, file_path)
                    if full_content:
                        result['full_content'] = full_content
            
            return results''',
                "utf8"
            )
        )

In [18]:
root_node = tree.root_node
root_node.start_point

Point(row=1, column=8)

In [20]:
# inspecting nodes in the tree
root_node = tree.root_node
assert root_node.type == "module"
assert root_node.start_point == (1, 8)
assert root_node.end_point == (279, 18)


In [23]:
print(root_node)

(module (import_statement name: (dotted_name (identifier))) (import_from_statement module_name: (dotted_name (identifier)) name: (dotted_name (identifier))) (import_from_statement module_name: (dotted_name (identifier)) name: (dotted_name (identifier))) (import_statement name: (dotted_name (identifier))) (import_statement name: (dotted_name (identifier))) (comment) (expression_statement (call function: (identifier) arguments: (argument_list))) (function_definition name: (identifier) parameters: (parameters) body: (block (expression_statement (string (string_start) (string_content) (string_end))) (return_statement (call function: (identifier) arguments: (argument_list (keyword_argument name: (identifier) value: (string (string_start) (string_content) (string_end)))))))) (function_definition name: (identifier) parameters: (parameters) body: (block (expression_statement (string (string_start) (string_content) (string_end))) (expression_statement (assignment left: (identifier) right: (call

## Langchain pre-built

In [24]:
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

In [29]:
# view seperators of swift for example
RecursiveCharacterTextSplitter.get_separators_for_language('swift')

['\nfunc ',
 '\nclass ',
 '\nstruct ',
 '\nenum ',
 '\nif ',
 '\nfor ',
 '\nwhile ',
 '\ndo ',
 '\nswitch ',
 '\ncase ',
 '\n\n',
 '\n',
 ' ',
 '']

In [30]:
# Supported languages
[e.value for e in Language]

['cpp',
 'go',
 'java',
 'kotlin',
 'js',
 'ts',
 'php',
 'proto',
 'python',
 'rst',
 'ruby',
 'rust',
 'scala',
 'swift',
 'markdown',
 'latex',
 'html',
 'sol',
 'csharp',
 'cobol',
 'c',
 'lua',
 'perl',
 'haskell',
 'elixir',
 'powershell',
 'visualbasic6']

In [None]:
PYTHON_CODE = """
def hello_world():
    print("Hello, World!")

# Call the function
hello_world()
"""

python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=50, chunk_overlap=0
)

python_docs = python_splitter.create_documents([PYTHON_CODE])
python_docs

[Document(metadata={}, page_content='def hello_world():\n    print("Hello, World!")'),
 Document(metadata={}, page_content='# Call the function\nhello_world()')]

In [46]:
import os

file_path = r"vector_utils.py"

# Read the file as text to pass to a function
with open(file_path, 'r', encoding='utf-8') as file:
    file_content = file.read()

python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=3000, chunk_overlap=300
)

python_docs = python_splitter.create_documents([file_content])
python_docs

[Document(metadata={}, page_content='import os\nfrom langchain_openai import OpenAIEmbeddings\nfrom dotenv import load_dotenv\nimport vecs\nimport json\n\n# Load environment variables\nload_dotenv()\n\n\ndef get_embedding_model():\n    """Initialize and return the OpenAI embedding model."""\n    return OpenAIEmbeddings(model="text-embedding-3-small")\n\n\ndef get_vector_client():\n    """Initialize and return the Supabase vector client."""\n    supabase_db_url = os.getenv("SUPABASE_DB_URL")\n    if not supabase_db_url:\n        raise ValueError("SUPABASE_DB_URL environment variable not set")\n    return vecs.create_client(supabase_db_url)\n\n\ndef store_repo_in_own_collection(repo_info, refresh=False):\n    """\n    Store repository information in a dedicated vector collection.\n    \n    Args:\n        repo_info (dict): Repository information containing files and metadata\n        refresh (bool): Whether to delete existing collection before creating new one\n    \n    Returns:\n      