In [None]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from IPython.display import Markdown, display

from dotenv import load_dotenv

load_dotenv('/workspaces/ml-learning/src/airtable-chatbot/.env', override=True)

import nest_asyncio
nest_asyncio.apply()

%load_ext autoreload
%autoreload 2

In [4]:
!which python

/workspaces/ml-learning/.venv/bin/python


https://llamahub.ai/l/airtable?from=all

Base ID: The path in the URL that begins with app  appfjm76R87oVG3BP  
Table ID: The path in the URL that begins with tbl tblkBhmgAFBRGmNRL  

https://support.airtable.com/docs/creating-personal-access-tokens  




In [None]:
!pip install pyairtable

In [30]:
"""Airtable reader."""
from typing import List

from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document, BaseNode, TextNode
from pyairtable import Table, Api, Base
import pandas as pd
import json


class CustomAirtableReader(BaseReader):
    """Airtable reader. Reads data from a table in a base.

    Args:
        api_key (str): Airtable API key.
    """

    def __init__(self, api_key: str) -> None:
        """Initialize Airtable reader."""

        self.api = Api(api_key)

    def load_data(self, base_id: str, table_id: str) -> List[Document]:
        """Load data from a table in a base

        Args:
            table_id (str): Table ID.
            base_id (str): Base ID.
        Returns:
            List[Document]: List of documents.
        """

        table = self.api.table(base_id, table_id)

        # table = Table(self.api_key, base_id, table_id)
        all_records = table.all()

        # Extract the 'fields' content from each element
        fields = [item['fields'] for item in all_records]

        documents = []
        for field in fields:
            # Copy the fields dictionary to extra_info
            extra_info = field.copy()

            # Keys to be removed
            keys_to_remove = ['What will you build']

            # Remove the keys from extra_info if they exist
            for key in keys_to_remove:
                extra_info.pop(key, None)  # The None argument ensures no error if the key doesn't exist

            # Now extra_info contains the fields data without the specified keys
            print(extra_info)

            formatted_string = json.dumps(field, indent=3)
            print(formatted_string)

            document = Document(text=formatted_string, extra_info=extra_info)
            documents.append(document)

        return documents

    def extract_nodes(self, base_id: str, table_id: str) -> List[BaseNode]:
        documents = self.load_data(base_id, table_id)

        nodes = [ TextNode(text=d.text, metadata=d.metadata) for d in documents]
        return nodes
    
    def get_airtable_df(self, base_id: str, table_id: str)-> pd.DataFrame:

        table = self.api.table(base_id, table_id)

        # table = Table(self.api_key, base_id, table_id)
        all_records = table.all()

        # Extract the 'fields' content from each element
        fields = [item['fields'] for item in all_records]

        # Create a DataFrame from the extracted 'fields'
        df = pd.DataFrame(fields)

        return df

In [31]:
from llama_index.core.readers.download import download_loader
import os

# AirtableReader = download_loader('AirtableReader')

AIRTABLE_TOKEN=os.environ['AIRTABLE_TOKEN']
AIRTABLE_BASE_ID=os.environ['AIRTABLE_BASE_ID']
AIRTABLE_TABLE_ID=os.environ['AIRTABLE_TABLE_ID']

reader = CustomAirtableReader(AIRTABLE_TOKEN)


In [32]:
documents = reader.load_data(table_id=AIRTABLE_TABLE_ID,base_id=AIRTABLE_BASE_ID)

{'Name': 'Mike Gardiner', 'Profile picture': [{'id': 'attgeBMA8bKI4rNg4', 'width': 4624, 'height': 3468, 'url': 'https://v5.airtableusercontent.com/v3/u/25/25/1707912000000/Hbc8wbDWwZyo6h6e_xHaiw/vEzBiZKX0rKTSNnGDv5ci5SygjhiiU67dAMtSLdsfawZF3OKg2T-8oEVZ5bJ-QEDZvBKhlP7DxQRPyocJ2wN7mwj87agzGXlxu_ptFI4jvg92q7AFE2bin3yxULbw5k7os88uEWBvADgERI_Ay3p1A/cQwWasDyu45mPn_ct1Hih46Gf7T_mlR5CUC9XcfJ01I', 'filename': '20230501_142648.jpg', 'size': 6286835, 'type': 'image/jpeg', 'thumbnails': {'small': {'url': 'https://v5.airtableusercontent.com/v3/u/25/25/1707912000000/4MGRedGwud0JyS7K9nXk9A/BD0bFmipETI1NtnIR3FbLqVhf0gWvFTQdTLw8LWZMlqGL3MNXb5ThZbdmM6nZcPooCKZrRiP6Xj982dpcUC2ocDLHIVO6P2wE_Dfb6638b0qX67qqPkKp6gedKYRgFYnmyTub1XHGeFG0xaBd3RPgw/mEOQUL0ltlPGbtvWf5j40tIDaoIC8Dsit45mKgyxJyw', 'width': 27, 'height': 36}, 'large': {'url': 'https://v5.airtableusercontent.com/v3/u/25/25/1707912000000/1KJWd0zVSNPuVAwsppX43Q/Q3WEcRbZypfGZRdNoTq1zUMx_1qxg3W9PhWodfLFIfZPyEtn4_5G9jVjh1dba8khI5pK4p5-F-_UY6OTunqCN70QuCM

In [27]:
df = reader.get_airtable_df(table_id=AIRTABLE_TABLE_ID,base_id=AIRTABLE_BASE_ID)

In [20]:
df.head(5)

Unnamed: 0,Name,What are your areas of expertise you have (select max 4 please),What will you build,Profile picture,What's the link to your LinkedIn?
0,Mike Gardiner,"[Backend software dev, Front end software dev]",🐝 AI coach for game development teams to enabl...,"[{'id': 'attgeBMA8bKI4rNg4', 'width': 4624, 'h...",https://www.linkedin.com/in/mrmikeg
1,Marina Ritchie,"[Go to market, Idea validating]",I provide advise to business owners on AI Stra...,"[{'id': 'att7mGbakEok4H6LX', 'width': 800, 'he...",https://www.linkedin.com/in/marinaritchie/
2,Cameron Bogatez,"[Product management, Designer]","AI Steve, an AI product consultant that genera...","[{'id': 'attLGo9ZydYXUPNaP', 'width': 302, 'he...",https://www.linkedin.com/in/cameron-bogatez123/
3,Atena Pegler,"[AI / ML specialist researcher, Designer, Prod...",Virtual CMO,"[{'id': 'attqgrwjILyrwyHwz', 'width': 2560, 'h...",https://www.linkedin.com/in/atenakouchaki
4,Sumit Saggar,"[Front end software dev, Product management, D...",An intelligent CRM system for founders and sol...,"[{'id': 'attx1IJRUSkjUBHua', 'width': 1836, 'h...",www.linkedin.com/in/sumitsaggar


In [38]:
from llama_index.core.indices import VectorStoreIndex

[autoreload of _pydevd_bundle failed: Traceback (most recent call last):
  File "/workspaces/ml-learning/.venv/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/workspaces/ml-learning/.venv/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 475, in superreload
    module = reload(module)
             ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/importlib/__init__.py", line 168, in reload
    raise ModuleNotFoundError(f"spec not found for the module {name!r}", name=name)
ModuleNotFoundError: spec not found for the module '_pydevd_bundle'
]
[autoreload of _pydev_bundle failed: Traceback (most recent call last):
  File "/workspaces/ml-learning/.venv/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/workspaces/ml-learning/.venv/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 4

In [34]:
from llama_index.core.indices import VectorStoreIndex
from llama_index.llms import OpenAI

nodes = reader.extract_nodes(base_id=AIRTABLE_BASE_ID,table_id=AIRTABLE_TABLE_ID)
index = VectorStoreIndex.build_index_from_nodes(nodes)


ImportError: cannot import name 'ChatMessage' from 'llama_index.core.llms' (unknown location)

In [24]:

from llama_index import ServiceContext
llm = OpenAI(model="gpt-4", temperature=0)
service_context = ServiceContext.from_defaults(
    llm=llm,
)

query_engine = index.as_query_engine(service_context=service_context)

response = query_engine.query('What is Chamira\'s project?')

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [25]:
response.response

"Chamira's project is called Minerva. It is aimed at revolutionizing the educational space by leveraging LLM technology through an education-first architecture. The goal is to improve upon the current system, which Chamira believes is ineffective due to the ratio of students to teachers in classrooms. The project is inspired by the effectiveness of Open-Domain Question Answering as revealed by ChatGPT, but aims to structure it in a way that can make a significant impact in education."

In [11]:
response.source_nodes[0].node

TextNode(id_='5b000a73-06a2-480e-b835-58582f8a50e2', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='ca4a5898-f09e-45c3-93ad-ed2d076fc889', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='0aacf5616a0bb588455d93d76345cca4380eff89d0ce59a0adfd3b238698234e'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='82c649be-99b5-4041-9181-895039e09f1f', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='6b1510515a9d3b93f4eb42c6da6a19014b219a804f42ff7e5b99bfbe5b86fb4e'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='80cec339-7588-462e-ac04-2f89be679fa2', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='2d711e7560c5d10d44b977f0ebbe83dbf61083dd6c82abf71dece4f9f8f55bd0')}, text='": \'https://www.linkedin.com/in/shaemckenna/\', \'What are your areas of expertise you have (select max 4 please)\': [\'AI Engineer\', \'Product management\', \'AI / ML special

In [21]:
from pprint import pprint
pprint(response.source_nodes[0].node.text)

('": \'https://www.linkedin.com/in/shaemckenna/\', \'What are your areas of '
 "expertise you have (select max 4 please)': ['AI Engineer', 'Product "
 "management', 'AI / ML specialist researcher'], 'What will you build': "
 '"I am planning on launching a startup in the edtech space. I always thought '
 'classrooms were a really ineffective way of teaching, one teacher for 25+ '
 "students just doesn't seem effective no matter how good a teacher may be. "
 'ChatGPT revealed the effectiveness of Open-Domain Question Answering '
 'however, it is still too unstructured to make a true dent in the educational '
 'space. This is where I see project Minerva coming into play. I aim to '
 'leverage LLM technology through an education-first architecture, to '
 'revolutionize how students learn around the world.')


In [6]:
"""Airtable reader."""
from typing import List

from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
import pandas as pd


class CustomAirtableReader(BaseReader):
    """Airtable reader. Reads data from a table in a base.

    Args:
        api_key (str): Airtable API key.
    """

    def __init__(self, api_key: str) -> None:
        """Initialize Airtable reader."""

        self.api_key = api_key

    def load_data(self, base_id: str, table_id: str) -> List[Document]:
        """Load data from a table in a base

        Args:
            table_id (str): Table ID.
            base_id (str): Base ID.
        Returns:
            List[Document]: List of documents.
        """
        from pyairtable import Table

        table = Table(self.api_key, base_id, table_id)
        all_records = table.all()
        return [Document(text=f"{all_records}", extra_info={})]
    
    def get_airtable_df(self, base_id: str, table_id: str) -> pd.DataFrame:
        from pyairtable import Table

        table = Table(self.api_key, base_id, table_id)
        all_records = table.all()
        df = pd.read_json(all_records)