In [1]:
import os
from pathlib import Path
from llmsherpa.readers import LayoutPDFReader
import glob
from llama_index.core import VectorStoreIndex, ServiceContext, StorageContext
from llama_index.core.schema import Document

from pinecone import Pinecone, ServerlessSpec
from pinecone.grpc import PineconeGRPC
from llama_index.vector_stores.pinecone import PineconeVectorStore

from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
Settings.embed_model = OpenAIEmbedding()

# PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_API_KEY = '5cd0703e-5be9-4830-9550-a1b504fd5bfc'
OPENAI_API_KEY   = os.getenv('OPENAI_API_KEY')

pc = PineconeGRPC(api_key=PINECONE_API_KEY)

# index_names = [idx['name'] for idx in pc.list_indexes()]

base_directory = Path('/mnt/c/Users/Gwool/Documents/scripts/mkt_chatbot/data')

llmsherpa_api_url = "http://localhost:5010/api/parseDocument?renderFormat=all&useNewIndent=true"
pdf_reader = LayoutPDFReader(llmsherpa_api_url)

tickers = ['MELI', 'TSLA']
years   = ['2023']

# files = glob.glob(str(base_directory / years[0]) + str('/*.pdf'))

  from tqdm.autonotebook import tqdm


In [19]:
# pinecone_index = pc.Index(index_name)
# vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
# storage_context = StorageContext.from_defaults(vector_store=vector_store)

# pc.create_index(
#     name=ticker,
#     dimension=1536,
#     metric="cosine",
#     spec=ServerlessSpec(
#         cloud='aws', 
#         region='us-east-1'
#     ) 
# )

In [8]:
def create_index(pc_object, idx_name: str, dimension: int = 1536, metric: str = "cosine"):
    """Create pinecone index"""

    pc_object.create_index(
        name=idx_name,
        dimension=dimension,
        metric=metric,
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        ) 
    )

#---------------------------------------------------------

def validate_index(idx_name: str, pinecone_api: str = PINECONE_API_KEY):
    """Validate index existance"""

    _pc = PineconeGRPC(api_key=pinecone_api)

    idx_names = [idx['name'] for idx in _pc.list_indexes()]

    if idx_name not in idx_names:
        create_index(
            pc_object = _pc,
            idx_name = idx_name
            )
        print(f'Index: {idx_name} - created succesfully')

    else:
        print(f'Index: {idx_name} - already exists')
        pass

In [21]:
for ticker in tickers:
    for year in years:
        file_path = str(base_directory / year / f'{ticker}-{year}-10K.pdf')

        ticker_idx = str(ticker.lower())

        validate_index(idx_name = ticker_idx)

        pinecone_index = pc.Index(ticker_idx)
        vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        
        index = VectorStoreIndex([], show_progress=False, storage_context=storage_context)
        
        doc = pdf_reader.read_pdf(file_path)

        for chunk in doc.chunks():
            index.insert(Document(text=chunk.to_context_text(), extra_info = {'description': f'Document about {ticker} 10-K {year} SEC report'}))

Index: meli - created succesfully


Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  1.24it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  4.92it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  4.54it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  5.72it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  4.27it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  7.41it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  5.25it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  5.27it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  4.27it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  4.66it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  5.59it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  7.33it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  7.41it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  6.89it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  5.22it/s]
Upserted vectors: 100%|██████████| 1/1 [

Index: tsla - created succesfully


Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  1.51it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  5.62it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  6.50it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  7.29it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  6.01it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  5.68it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  4.59it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  4.01it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  5.56it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  5.98it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  6.27it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  6.10it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  6.33it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  7.23it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  5.21it/s]
Upserted vectors: 100%|██████████| 1/1 [

chat - v0 - con divisor detablas y demas

In [1]:
import os
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.llms.openai import OpenAI
from llama_index.agent.openai import OpenAIAgent
from pinecone.grpc import PineconeGRPC
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core import VectorStoreIndex

PINECONE_API_KEY = '5cd0703e-5be9-4830-9550-a1b504fd5bfc'
OPENAI_API_KEY   = os.getenv('OPENAI_API_KEY')

  from tqdm.autonotebook import tqdm


In [2]:
tickers_dict = {
    'MELI': 'Mercado Libre',
    'TSLA': 'Tesla',
}

ticker = 'MELI'
ticker_idx = str(ticker.lower())
ticker_str = tickers_dict.get(ticker, '')

In [3]:
pc = PineconeGRPC(api_key=PINECONE_API_KEY)

pinecone_index = pc.Index(ticker_idx)
vector_store   = PineconeVectorStore(pinecone_index=pinecone_index)
vector_index   = VectorStoreIndex.from_vector_store(vector_store=vector_store)
retriever      = VectorIndexRetriever(index=vector_index, similarity_top_k=4)

query_engine  = RetrieverQueryEngine(retriever=retriever)

query_engine_tool = QueryEngineTool(
    query_engine=query_engine,
    metadata=ToolMetadata(
        name=f"{ticker}-{year}",
        description=f"Useful to answer questions about {ticker} - {ticker_str} 10 Q report for {year}",
    ),
)

llm = OpenAI(model="gpt-4o-mini", 
             temperature=0)

agent = OpenAIAgent.from_tools([query_engine_tool], 
                               llm=llm,
                               system_prompt=f"assume you are a financial expert about {ticker} - {ticker_str} performance. only response questions related to the ticker {ticker} ({ticker_str}). be strictly detailed in the responses using the context provided",
                               verbose=False, 
                               temperature=0)





In [4]:
async def get_response(question: str, agent = agent):
    response = await agent.astream_chat(question)
    full_response = ""

    async for token in response.async_response_gen():
        full_response += token
        yield full_response


# async for partial_response in get_response(question):
#     print(partial_response)


In [13]:
question = "Explain the MELI tax constrains in Argentina"

In [14]:
response = agent.chat(question)
print(response.response)

Mercado Libre (MELI) encounters several tax constraints in Argentina, which include:

1. **Foreign Currency Acquisition Tax**: The Argentine government imposes a tax on the acquisition of foreign currency through the official exchange market. This can affect MELI's ability to conduct international transactions and manage foreign currency exposure.

2. **Specific Tax Rates on Services and Goods**: Certain services acquired from abroad, as well as transportation services for the import and export of goods, are subject to specific tax rates. This can increase operational costs for MELI when dealing with international suppliers or logistics.

3. **Imported Goods Taxation**: Imported goods also face specific tax rates, which can impact the cost structure for MELI, especially if it relies on imported products for its marketplace.

4. **Knowledge-Based Economy Promotional Regime**: On a positive note, MELI can benefit from the knowledge-based economy promotional regime in Argentina. This regi

In [5]:
# !pip install llama-hub unstructured

In [58]:
import os
from pathlib import Path
import numpy as np
from openai import OpenAI
from langchain_core.documents.base import Document

base_directory = Path('/mnt/c/Users/Gwool/Documents/scripts/mkt_chatbot/data')
tickers = ['MELI', 'TSLA']
years   = ['2023']

tickers_dict = {
    'MELI': 'Mercado Libre',
    'TSLA': 'Tesla',
}

OPENAI_API_KEY   = os.getenv('OPENAI_API_KEY')
client = OpenAI()

from PyPDF2 import PdfReader
from langchain_community.document_loaders import PyPDFLoader
# from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# from docx import Document

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PdfReader(file)
        texts = [reader.pages[page_num].extract_text() for page_num in range(len(reader.pages))]
    return texts

def extract_text_from_docx(file_path):
    doc = Document(file_path)
    texts = []
    for para in doc.paragraphs:
        texts.append(para.text)
    return texts

def get_embeddings(texts, model = "text-embedding-ada-002", vector_size = 1536):
    if texts is None or len(texts) < 1:
        return np.zeros(vector_size).tolist()

    if isinstance(texts, list):
        texts = [text.replace("\n", '') for text in texts]
        embedding_vector = [client.embeddings.create(input = [text], model = model).data[0].embedding for text in texts]
    
    else:
        text = texts.replace("\n", '')
        embedding_vector = client.embeddings.create(input = [text], model = model).data[0].embedding
    
    return embedding_vector

#---------------------------------------------------

def get_embeddings(texts, model = "text-embedding-ada-002", vector_size = 1536):
    if texts is None:
        return np.zeros(vector_size).tolist()

    if isinstance(texts, list):
        pass
    else:
        texts = [texts]

    if any(isinstance(text, Document) for text in texts):
        texts_dict = []
        for text in texts:
            txt = text.page_content.replace("\n", '')
            mtd = text.metadata
            emb = client.embeddings.create(input = [txt], model = model).data[0].embedding
            dc  = { 'metadata': mtd, 
                    'page_content': txt,
                    'embedding': emb}
            texts_dict.append(dc)
        
        return texts_dict

    texts = [text.replace("\n", '') for text in texts]
    embedding_vector = [client.embeddings.create(input = [text], model = model).data[0].embedding for text in texts]

    if len(embedding_vector) == 1:
        return embedding_vector[0]
    else:
        return embedding_vector

#---------------------------------------------------

def process_pdf(file_path, metadata: dict = None):
# pip install -qU langchain-text-splitters
# 
    loader = PyPDFLoader(file_path)
    data = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=580, chunk_overlap=20)
    documents = text_splitter.split_documents(data)
    if metadata:
        for i in range(len(documents)):
            page_n = documents[i].metadata.get('page', None)
            # documents[i].metadata = metadata
            documents[i].metadata.update(metadata)
            documents[i].metadata['page'] = page_n

    return documents

#---------------------------------------------------

def chunker(seq, batch_size):
    return (seq[pos:pos + batch_size] for pos in range(0, len(seq), batch_size))

def upsert_embeddings_to_pinecone(index, texts: list):

    data = [
        (str(i), item['embedding'], {**item['metadata'], "page_content": item['page_content']})
        for i, item in enumerate(texts)
    ]

    async_results = [
        index.upsert(vectors=chunk, async_req=True)
        for chunk in chunker(data, batch_size=200)
    ]

    [async_result.result() for async_result in async_results]


In [6]:
# upsert_embeddings_to_pinecone(index,  embeddings, ['metadata'])

In [20]:
from pinecone import Pinecone, ServerlessSpec
from pinecone.grpc import PineconeGRPC
PINECONE_API_KEY = '5cd0703e-5be9-4830-9550-a1b504fd5bfc'

ticker     = 'MELI'
ticker_idx = 'meli-2'
year       = years[0]
ticker_str = tickers_dict.get(ticker, '')

# pc = Pinecone(api_key=PINECONE_API_KEY)
pc = PineconeGRPC(api_key=PINECONE_API_KEY)
validate_index(idx_name = ticker_idx, pinecone_api = PINECONE_API_KEY)

metadata = {
    'ticker': ticker,
    'year': year,
    'description': f"Text to answer questions about {ticker} - {ticker_str} 10 Q report for {year}",
}

Index: meli-2 - created succesfully


In [21]:
file_path = Path(base_directory / year / f'{ticker}-{year}-10K.pdf')

docs = process_pdf(file_path, metadata)

In [22]:
docs_emb = get_embeddings(docs)

In [25]:
pc = PineconeGRPC(api_key=PINECONE_API_KEY)
index = pc.Index(ticker_idx)

upsert_embeddings_to_pinecone(index=index, texts=docs_emb)

chat - v1 - PDF divider

In [28]:
import os
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.llms.openai import OpenAI
from llama_index.agent.openai import OpenAIAgent
from pinecone.grpc import PineconeGRPC
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core import VectorStoreIndex

PINECONE_API_KEY = '5cd0703e-5be9-4830-9550-a1b504fd5bfc'
OPENAI_API_KEY   = os.getenv('OPENAI_API_KEY')

In [10]:
# tickers_dict = {
#     'MELI': 'Mercado Libre',
#     'TSLA': 'Tesla',
# }

# ticker = 'MELI'

# # ticker_idx = str(ticker.lower())

# ticker_str = tickers_dict.get(ticker, '')

_ticker_idx = 'meli-2'

In [17]:
# pc = PineconeGRPC(api_key=PINECONE_API_KEY)

_pinecone_index = pc.Index(_ticker_idx)
_vector_store   = PineconeVectorStore(pinecone_index=_pinecone_index)
_vector_index   = VectorStoreIndex.from_vector_store(vector_store=_vector_store)
_retriever      = VectorIndexRetriever(index=_vector_index, similarity_top_k=4)

_query_engine  = RetrieverQueryEngine(retriever=_retriever)

_query_engine_tool = QueryEngineTool(
    query_engine=_query_engine,
    metadata=ToolMetadata(
        name="query_engine",
        description=f"Useful to answer questions about {ticker} - {ticker_str} 10 Q report ",
    ),
)

llm = OpenAI(model="gpt-4o-mini", 
             temperature=0)

_agent = OpenAIAgent.from_tools([_query_engine_tool], 
                               llm=llm,
                               system_prompt=f"assume you are a financial expert about {ticker} - {ticker_str} performance. only response questions related to the ticker {ticker} ({ticker_str}). be strictly detailed in the responses using the context provided",
                               verbose=False, 
                               temperature=0)


In [18]:
response = _agent.chat(question)
print(response.response)

I currently do not have specific information on the tax constraints for Mercado Libre (MELI) in Argentina. However, I can provide a general overview of the tax environment in Argentina that may affect companies like MELI.

Argentina has a complex tax system that includes various taxes at the national, provincial, and municipal levels. Key taxes that may impact Mercado Libre include:

1. **Value Added Tax (VAT)**: A significant tax on goods and services, which can affect the pricing and profitability of transactions conducted through the platform.

2. **Income Tax**: Companies in Argentina are subject to corporate income tax on their profits. The rate can vary, and there may be specific deductions or incentives available.

3. **Export Taxes**: As a company that may engage in cross-border transactions, MELI could be subject to export taxes, which can impact its international sales.

4. **Withholding Taxes**: Payments made to foreign entities may be subject to withholding taxes, affecting

In [80]:
# q = 'Since then, we have seen significant adoption of our platform and entire companies built on and around ourAPIs and services'

query_vector = get_embeddings(question)

In [81]:
index.query(
    vector = query_vector,
    top_k=4,
    # namespace = 'meli-2',
    include_metadata=True

)

{'matches': [{'id': '1203',
              'metadata': {'description': 'Text to answer questions about MELI '
                                          '- Mercado Libre 10 Q report for '
                                          '2023',
                           'page': 194.0,
                           'page_content': 'Table of ContentsMercadoLibre, '
                                           'Inc.Notes to Consolidated '
                                           'Financial StatementsNOTE 25. SHARE '
                                           'REPURCHASE PROGRAMOn February 21, '
                                           '2023, the Board authorized the '
                                           'Company to repurchase shares of '
                                           'the Company’s common stock, for an '
                                           'aggregateconsideration of up to '
                                           '$900 million  to expire on March '
                   

In [70]:
index.query(
    vector = query_vector,
    top_k=4,
    namespace = 'meli',
    include_metadata=False

)

{'matches': [], 'namespace': 'meli', 'usage': {'read_units': 1}}

In [77]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1207}},
 'total_vector_count': 1207}

In [None]:
OpenAIEmbeddings

In [84]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA

# chat completion llm
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name='gpt-4o',
    temperature=0.0
)
# conversational memory
conversational_memory = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=5,
    return_messages=True
)
# retrieval qa chain
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=_vector_store.as_retriever()
)

  llm = ChatOpenAI(


AttributeError: 'PineconeVectorStore' object has no attribute 'as_retriever'

In [85]:
_vector_store

PineconeVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=False, api_key=None, index_name=None, environment=None, namespace=None, insert_kwargs={}, add_sparse_vector=False, text_key='text', batch_size=100, remove_text_from_metadata=False)

In [88]:
from langchain.vectorstores import Pinecone

text_field = "text"

# switch back to normal index for langchain
index = pc.Index('meli-2')

vectorstore = Pinecone(
    index, embed.embed_query, text_field
)

NameError: name 'embed' is not defined

In [49]:
async def process(question: str):
    """Get the response from the API."""

    try:
        response = await agent.astream_chat(question)
        full_response = ""

        async for token in response.async_response_gen():
            full_response += token

            if chat.messages[-1]["role"] == "assistant":
                chat.messages[-1]["content"] = full_response
            else:
                chat.messages.append({"role": "assistant", "content": full_response})
            yield

    except Exception as e:
        yield rx.window_alert(f"There is an error in the server: {str(e)}. Try again later.")
        return

    if len(chat.messages) > 1:
        chat.messages = chat.messages[-2:]


In [8]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

Settings.embed_model = OpenAIEmbedding()


In [13]:
file_path = str(base_directory / year / f'{ticker}-{year}-10K.pdf')

ticker_idx = str(ticker.lower())

validate_index(idx_name = ticker_idx)

Index: tsla - already exists


In [None]:
file_path = str(base_directory / year / f'{ticker}-{year}-10K.pdf')

ticker_idx = str(ticker.lower())

validate_index(idx_name = ticker_idx)

pinecone_index  = pc.Index(ticker_idx)
vector_store    = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex([], show_progress=True, storage_context=storage_context)

doc = pdf_reader.read_pdf(file_path)

for chunk in doc.chunks():
    index.insert(Document(text=chunk.to_context_text(), extra_info = {'description': f'Document about {ticker} 10-K {year} SEC report'}))

In [67]:
file_path = str(base_directory / year / f'{ticker}-{year}-10K.pdf')

index = VectorStoreIndex([], show_progress=True, storage_context=storage_context, embed_model=embed_model)

doc = pdf_reader.read_pdf(file_path)

for chunk in doc.chunks():
    index.insert(Document(text=chunk.to_context_text(), extra_info = {'description': f'Document about {ticker} 10-K {year} SEC report'}))

Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 847.68it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  1.58it/s]
Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 1524.09it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  6.77it/s]
Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 1086.61it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  3.97it/s]
Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 637.14it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  7.40it/s]
Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 1305.82it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  6.63it/s]
Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 1525.76it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  5.23it/s]
Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 966.88it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  5.89it/s]
Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 1087.17it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00

In [66]:
# index.storage_context()

In [None]:
index = VectorStoreIndex.from_documents(
    all_docs,
    storage_context=storage_context
)

In [117]:
# VectorStoreIndex??

In [110]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
Settings.embed_model = OpenAIEmbedding()

In [124]:
file_path = '/mnt/c/Users/Gwool/Documents/Python Scripts/LLM/data/cv_data/general.pdf'

pinecone_index = pc.Index('cv-data-2')
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex([], show_progress=False, storage_context=storage_context)

doc = pdf_reader.read_pdf(file_path)

In [125]:
for chunk in doc.chunks():
    index.insert(Document(text=chunk.to_context_text(), extra_info = {"description": "file explaining general aspects from Gaston Woollands' experience"}))

Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  1.89it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  6.72it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  5.24it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  5.33it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  6.81it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  7.37it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  5.49it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  5.59it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  4.59it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  7.35it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  6.14it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  7.32it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  5.91it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  5.74it/s]
Upserted vectors: 100%|██████████| 1/1 [00:00<00:00,  3.03it/s]
Upserted vectors: 100%|██████████| 1/1 [

In [119]:
{"description": "file explaining general aspects from Gaston Woollands' experience"}

{'description': "file explaining general aspects from Gaston Woollands' experience"}

Test

In [27]:
# from IPython.core.display import display, HTML
# HTML(doc.tables()[8].to_html())

In [None]:
from llama_index.core import Document
from llama_index.core import VectorStoreIndex

index = VectorStoreIndex([])
for chunk in doc.chunks():
    index.insert(Document(text=chunk.to_context_text(), extra_info={}))
query_engine = index.as_query_engine()

In [14]:
file = files[1]

doc = pdf_reader.read_pdf(file)

In [37]:
chunk.to_text(include_children=True, recurse=True)

'tsla-20240630 https://www.sec.gov/Archives/edgar/data/1318605/00016282802403266'

In [16]:
for chunk in doc.chunks():
    print(chunk.section)

AttributeError: 'Paragraph' object has no attribute 'section'

In [32]:
chunk.sections()

[]

In [21]:
# pdf_reader.read_pdf(str(base_directory / year / f'{ticker}-{year}-10K.pdf'))

In [11]:
file_name = file.split()

'/mnt/c/Users/Gwool/Documents/scripts/mkt_chatbot/data/2023/TSLA-2023-10K.pdf'

In [116]:
[sec.title for sec in doc.sections()]

['Technological Proficiencies:', 'Professional Experience:']

In [38]:
[sec.title for sec in doc.sections()]

['TESLA, INC.',
 'FORM 10-Q FOR THE QUARTER ENDED JUNE 30, 2024',
 'INDEX',
 'Page',
 'PART I. FINANCIAL INFORMATION',
 'Forward-Looking Statements',
 'PART I. FINANCIAL INFORMATION ITEM 1. FINANCIAL STATEMENTS',
 'Tesla, Inc.',
 'Tesla, Inc.',
 'Tesla, Inc.',
 'Tesla, Inc.',
 'Tesla, Inc.',
 'Tesla, Inc. (unaudited)',
 'Note 1 – Overview & Summary of Significant Accounting Policies',
 'Overview',
 'Unaudited Interim Financial Statements',
 'Reclassifications',
 'Revenue Recognition',
 'Automotive Segment',
 'Automotive Sales',
 'Automotive Regulatory Credits',
 'Automotive Leasing Revenue',
 'Direct Sales-Type Leasing Program',
 'Energy Generation and Storage Segment',
 'Energy Generation and Storage Sales',
 'Income Taxes',
 'Net Income per Share of Common Stock Attributable to Common Stockholders',
 'Restricted Cash',
 'Accounts Receivable and Allowance for Doubtful Accounts',
 'Financing Receivables',
 'Concentration of Risk',
 'Credit Risk',
 'Supply Risk',
 'Warranties',
 'Recent

In [57]:
# for section in doc.sections():
#     if section.title == 'PART I':
#         break

# HTML(section.to_html(include_children=True, recurse=True))


In [94]:
# for section in doc.sections():
#     if section.tag == 'header':
#         break
#         # pass

# HTML(section.to_html(include_children=True, recurse=True))


In [90]:
for section in doc.sections():
    if section.tag == 'list_item':
        break

        HTML(section.to_html(include_children=True, recurse=True))


In [89]:
set([doc.tag for doc in doc.sections()])

{'header'}

In [78]:
HTML(section.to_html(include_children=True, recurse=True))

In [45]:
set([v.get('tag') for v in doc.json])

{'header', 'list_item', 'para', 'table'}

In [59]:
for v in doc.json:
    if v.get('tag') == 'table':
        break

In [60]:
v

{'bbox': [165.17, 401.04, 348.99, 416.02000000000004],
 'block_class': 'cls_3',
 'block_idx': 9,
 'left': 165.17,
 'level': 1,
 'name': 'TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934',
 'page_idx': 0,
 'table_rows': [{'block_idx': 9,
   'cells': [{'cell_value': 'Delaware'}, {'cell_value': '98-0212790'}],
   'type': 'table_header'},
  {'block_idx': 10,
   'cells': [{'cell_value': 'State or other jurisdiction of incorporation or organization'},
    {'cell_value': '(I.R.S. Employer Identification Number)'}],
   'type': 'table_data_row'}],
 'tag': 'table',
 'top': 401.04}

In [61]:
from IPython.core.display import HTML

HTML(v.to_html(include_children=True, recurse=True))

AttributeError: 'dict' object has no attribute 'to_html'

In [24]:
file = Path(base_directory / year / f"{ticker}.pdf")