In [1]:
from supabase import create_client
from langchain_community.embeddings import OpenAIEmbeddings
import os
from dotenv import load_dotenv

from financial_report_extraction import FinancialContentExtractor

# Load environment variables via python-dotenv before reading credentials
load_dotenv()

# Build the Supabase client from SUPABASE_URL / SUPABASE_KEY in the environment
url = os.environ.get("SUPABASE_URL")
key = os.environ.get("SUPABASE_KEY")
supabase_client = create_client(url, key)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def list_all_files(bucket: str, path: str, limit: int = 100, client=None) -> list:
    """List every entry under a storage bucket path using offset pagination.

    Pages are requested sorted by name (ascending) and concatenated until the
    service returns an empty page or a page shorter than ``limit``.

    Args:
        bucket: Name of the storage bucket to list.
        path: Folder/prefix inside the bucket.
        limit: Page size requested per call. Defaults to 100, the value the
            original implementation hard-coded.
        client: Object exposing ``.storage.from_(bucket).list(path, options)``.
            When omitted, the notebook-level ``supabase_client`` is used.

    Returns:
        A list with the entries of all pages, in the order they were returned.

    Raises:
        ValueError: If ``limit`` is not positive (the offset would never
            advance and the loop could not terminate).
    """
    if limit <= 0:
        raise ValueError("limit must be a positive integer")

    # Fall back to the notebook-global client only when none was injected.
    active_client = supabase_client if client is None else client
    bucket_api = active_client.storage.from_(bucket)

    all_files = []
    offset = 0

    while True:
        # Request the next page starting at the current offset
        page = bucket_api.list(path, {
            'limit': limit,
            'offset': offset,
            'sortBy': {'column': 'name', 'order': 'asc'}
        })

        if not page:
            break

        all_files.extend(page)

        # A short page means there is nothing left to fetch
        if len(page) < limit:
            break

        offset += limit

    return all_files

In [3]:
# Use the paginated helper instead of a single `.list('test_tables')` call,
# so that every page of entries under the prefix is collected.
files = list_all_files(bucket='pdf_chunks', path='test_tables')

In [4]:
# Sanity check: number of entries collected by the paginated listing
len(files)

2978

In [5]:
def _clean_tables_data(text):
    """
    Extract only sections containing markdown tables and skip sections with "no table" messages.
    """
    # Split text by the separator
    sections = text.split("---------------------------------------------------------------------------------------------------")
    
    # Skip phrases that indicate no tables
    skip_phrases = [
        "no tabular data",
        "cannot extract tabular",
        "does not contain any tabular",
        "no tables",
    ]
    
    cleaned_sections = []
    
    for section in sections:
        # Skip if section contains any of the skip phrases
        if any(phrase in section.lower() for phrase in skip_phrases):
            continue
            
        # Skip if section doesn't contain any markdown tables (checking for |)
        if '|' not in section:
            continue
        
        # Add non-empty cleaned section
        cleaned_section = section.strip()
        if cleaned_section:
            cleaned_sections.append(cleaned_section)
    
    return '\n\n' + '='*80 + '\n\n'.join(cleaned_sections)


In [6]:
# Spot-check: does the first listed entry have a .txt name?
# (indexing files[0] raises IndexError if the listing came back empty)
files[0]['name'].endswith('.txt')

True

In [7]:
for file in files:
    # Only entries whose name ends in .txt are processed
    if file['name'].endswith('.txt'):
        object_path = f'test_tables/{file["name"]}'

        # Fetch the stored bytes and decode them as UTF-8 text
        response = supabase_client.storage.from_('pdf_chunks').download(object_path)
        content = response.decode('utf-8')

        # Reduce the text to its table-bearing sections
        cleaned_content = _clean_tables_data(content)

        # Write the cleaned text back over the same object
        supabase_client.storage.from_('pdf_chunks').update(
            object_path,
            cleaned_content.encode('utf-8'),
            {'content-type': 'text/plain'},
        )
