In [1]:
# JSON Files (small data)

import json
import boto3

def get_embedding(bedrock, text):
    """Return the embedding the Titan text-embedding model produces for ``text``.

    ``bedrock`` is the client whose ``invoke_model`` method is called. The
    request body is the JSON document ``{"inputText": text}``; the value stored
    under ``'embedding'`` in the decoded response body is returned.
    """
    request_body = json.dumps({'inputText': text})
    response = bedrock.invoke_model(
        body=request_body,
        modelId='amazon.titan-embed-text-v1',
        accept='application/json',
        contentType='application/json',
    )
    # The response body is read in full and then decoded as JSON.
    payload = json.loads(response.get('body').read())
    return payload['embedding']

# main function
bedrock = boto3.client(
    service_name='bedrock-runtime'
)
# some random data
people = [
    'Albert Einstein', 'Isaac Newton', 'Stephen Hawking',
    'Galileo Galilei', 'Niels Bohr', 'Werner Heisenberg',
    'Marie Curie', 'Ernest Rutherford', 'Michael Faraday', 'Richard Feynman',
]
actions = [
    'plays basketball', 'teaches physics', 'sells sea shells',
    'collects tax', 'drives buses', 'researches into gravity',
    'manages a shop', 'supervises graduate students',
    'works as a support engineer', 'runs a bank',
]
places = [
    'London', 'Sydney', 'Los Angeles', 'San Francisco', 'Beijing',
    'Cape Town', 'Paris', 'Cairo', 'New Delhi', 'Seoul',
]
# create a data file: one JSON object per line, one line for every
# (person, action, place) combination
count = 10000  # id of the first record; incremented once per record written
with open('dataset.json', 'w') as outfile:
    for name in people:
        for action in actions:
            for place in places:
                text = f'{name} {action} in {place}.'
                # One embedding request per generated sentence.
                embedding = get_embedding(bedrock, text)
                # `count` is used directly as the record id so that the
                # builtin `id` is not shadowed in the kernel namespace.
                item = {'id': count, 'text': text, 'embedding': embedding}
                outfile.write(json.dumps(item) + '\n')
                count += 1
# Report success only after the `with` block has closed the file.
print('Dataset created.')




Dataset created.


In [3]:
# Searching the vector DB
import json
import boto3
import math
from datetime import datetime

def get_embedding(bedrock, text):
    """Embed ``text`` with ``amazon.titan-embed-text-v1`` via ``bedrock.invoke_model``.

    Returns the ``'embedding'`` entry of the JSON-decoded response body.
    """
    json_mime = 'application/json'
    request = {
        'body': json.dumps({'inputText': text}),
        'modelId': 'amazon.titan-embed-text-v1',
        'accept': json_mime,
        'contentType': json_mime,
    }
    raw_body = bedrock.invoke_model(**request).get('body').read()
    return json.loads(raw_body)['embedding']

def load_dataset(filename):
    """Load a JSON Lines file into a list.

    Args:
        filename: path of a text file holding one JSON document per line.

    Returns:
        A list with the decoded value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(filename) as file:
        # Skip blank / whitespace-only lines (e.g. a trailing empty line)
        # instead of letting json.loads fail on them.
        return [json.loads(line) for line in file if line.strip()]

def calculate_distance(v1, v2):
    """Return the Euclidean distance between points ``v1`` and ``v2`` (via ``math.dist``)."""
    return math.dist(v1, v2)
    
def search(dataset, embedding):
    """Brute-force nearest-neighbour lookup over ``dataset``.

    Every item receives a ``'distance'`` key holding the result of
    ``calculate_distance`` between its ``'embedding'`` and ``embedding``;
    ``dataset`` is then sorted in place by that key. The elapsed time of both
    phases is printed in milliseconds.

    Args:
        dataset: list of dicts, each with ``'embedding'`` and ``'text'`` keys.
            Modified in place (annotated with ``'distance'`` and reordered).
        embedding: the query vector.

    Returns:
        The ``'text'`` of the closest item, or ``None`` when ``dataset`` is
        empty.
    """
    t1 = datetime.now()
    for item in dataset:
        item['distance'] = calculate_distance(item['embedding'], embedding)
    t2 = datetime.now()
    dataset.sort(key=lambda entry: entry['distance'])
    t3 = datetime.now()

    ms1 = 1000 * (t2 - t1).total_seconds()
    ms2 = 1000 * (t3 - t2).total_seconds()
    print(str(ms1) + 'ms in calculating distances')
    print(str(ms2) + 'ms in sorting distances')

    # An empty dataset has no nearest neighbour; report that with None
    # instead of raising IndexError on dataset[0].
    if not dataset:
        return None
    return dataset[0]['text']

# main function: embed the query text, then find its nearest neighbour in the JSON dataset
bedrock = boto3.client(service_name='bedrock-runtime')
query = 'Lady Gaga purchased a necklace in Singapore.'
dataset = load_dataset('dataset.json')
embedding = get_embedding(bedrock, query)
result = search(dataset, embedding)
print(result)


36.317ms in calculating distances
0.501ms in sorting distances
Marie Curie sells sea shells in Los Angeles.


In [4]:
# pgvector: PostgreSQL has a pgvector extension for vector similarity search. We are using RDS for this exercise.
# Compared to the JSON-file approach, pgvector makes things easier by hiding the details of storage and of the search algorithm.
# All you need to do is send an SQL query to the database server.
# Identifier passed as SecretId to Secrets Manager's get_secret_value by get_secrets() in the cells below.
secret_name = 'bedrock-workshop-4cc1d9f0'


In [5]:
import json
import boto3
import psycopg2
from botocore.exceptions import ClientError

def get_secrets():
    """Fetch the secret named by the module-level ``secret_name`` and decode it.

    Returns:
        The secret's ``SecretString`` parsed from JSON.

    Raises:
        botocore.exceptions.ClientError: propagated unchanged when the
            ``get_secret_value`` call raises it.
    """
    client = boto3.client(service_name='secretsmanager')
    # A ClientError from this call is deliberately left to propagate to the
    # caller; catching it only to re-raise would add nothing.
    response = client.get_secret_value(SecretId=secret_name)
    return json.loads(response['SecretString'])
    
def load_dataset(filename):
    """Read ``filename`` as JSON Lines and return the decoded records as a list.

    Blank or whitespace-only lines are ignored; every other line must be a
    complete JSON document, otherwise ``json.JSONDecodeError`` is raised.
    """
    records = []
    with open(filename) as file:
        for line in file:
            if not line.strip():
                # Nothing to decode (e.g. a stray empty line at the end of the file).
                continue
            records.append(json.loads(line))
    return records
    
# main function
secrets = get_secrets()
conn = psycopg2.connect(
    host=secrets['db_hostname'],
    port=secrets['db_hostport'],
    user=secrets['db_username'],
    password=secrets['db_password'],
    database=secrets['db_database']
)
try:
    with conn.cursor() as cursor:
        # IF NOT EXISTS keeps this cell re-runnable: without it a second run
        # fails because the extension and the table already exist.
        cursor.execute('CREATE EXTENSION IF NOT EXISTS vector')
        cursor.execute('CREATE TABLE IF NOT EXISTS dataset (id SERIAL, content TEXT, embedding VECTOR(1536))')
    conn.commit()
finally:
    # Later cells open their own connection, so release this one here.
    conn.close()
print('Table created.')


Table created.


In [7]:
# Load Sample Data
def get_secrets():
    """Return the JSON-decoded ``SecretString`` of the secret ``secret_name``.

    ``secret_name`` is read from module scope and passed as ``SecretId`` to the
    Secrets Manager client's ``get_secret_value``. A
    ``botocore.exceptions.ClientError`` raised by that call reaches the caller
    as-is.
    """
    secrets_client = boto3.client(service_name='secretsmanager')
    # No handler around the call: an except clause that only re-raises the
    # same exception would be a no-op.
    secret_value = secrets_client.get_secret_value(SecretId=secret_name)
    secret_string = secret_value['SecretString']
    return json.loads(secret_string)
    
def load_dataset(filename):
    """Parse a JSON Lines file.

    Args:
        filename: path of the file to read; each non-blank line holds one
            JSON document.

    Returns:
        The decoded documents as a list, in file order. Lines that are empty
        or contain only whitespace are skipped.
    """
    with open(filename) as file:
        # Strip first so that whitespace-only lines become empty strings and
        # can be filtered out before json.loads sees them.
        stripped_lines = (line.strip() for line in file)
        return [json.loads(line) for line in stripped_lines if line]
    
# main function
secrets = get_secrets()
conn = psycopg2.connect(
    host=secrets['db_hostname'],
    port=secrets['db_hostport'],
    user=secrets['db_username'],
    password=secrets['db_password'],
    database=secrets['db_database']
)
try:
    with conn.cursor() as cursor:
        # populate the data into the database
        sql = 'INSERT INTO dataset (content, embedding) VALUES(%s, %s)'
        dataset = load_dataset('dataset.json')
        for item in dataset:
            cursor.execute(sql, (item['text'], item['embedding']))
    # Commit once, after every row has been inserted.
    conn.commit()
finally:
    # Always release the connection, including when an insert fails before
    # the commit above is reached.
    conn.close()
print('Loaded data into table.')


Loaded data into table.


In [8]:
# Perform a Search

from datetime import datetime

def get_secrets():
    """Look up ``secret_name`` in Secrets Manager and return its decoded contents.

    Returns:
        The result of ``json.loads`` on the response's ``'SecretString'``.

    Raises:
        botocore.exceptions.ClientError: passed through untouched when
            ``get_secret_value`` raises it.
    """
    # Errors from get_secret_value propagate directly; no wrapper is needed
    # just to re-raise them.
    reply = boto3.client(service_name='secretsmanager').get_secret_value(
        SecretId=secret_name
    )
    return json.loads(reply['SecretString'])

def get_embedding(bedrock, text):
    """Ask the Titan embedding model for the vector representation of ``text``.

    Args:
        bedrock: client object exposing ``invoke_model``.
        text: the string placed in the request's ``inputText`` field.

    Returns:
        The ``'embedding'`` entry of the JSON-decoded response body.
    """
    media_type = 'application/json'
    payload = json.dumps({'inputText': text})
    reply = bedrock.invoke_model(
        body=payload,
        modelId='amazon.titan-embed-text-v1',
        accept=media_type,
        contentType=media_type,
    )
    decoded = json.loads(reply.get('body').read())
    return decoded['embedding']

def search(bedrock, cursor, query, limit):
    """Return up to ``limit`` rows of ``dataset`` ordered by distance to ``query``.

    The query text is embedded with ``get_embedding``; the string form of that
    embedding is bound as a parameter of an ``ORDER BY embedding <-> %s``
    statement executed on ``cursor``.

    Returns:
        A list of the rows produced by the cursor, each holding ``id`` and
        ``content``.
    """
    query_vector = str(get_embedding(bedrock, query))
    cursor.execute(
        'SELECT id, content FROM dataset ORDER BY embedding <-> %s LIMIT %s',
        (query_vector, limit),
    )
    return [row for row in cursor]
    
# main function
bedrock = boto3.client(
    service_name='bedrock-runtime'
)
secrets = get_secrets()
conn = psycopg2.connect(
    host=secrets['db_hostname'],
    port=secrets['db_hostport'],
    user=secrets['db_username'],
    password=secrets['db_password'],
    database=secrets['db_database']
)
query = 'Lady Gaga purchased a necklace in Singapore.'
try:
    with conn.cursor() as cursor:
        result = search(bedrock, cursor, query, 1)
finally:
    # The search only reads data, so there is nothing to commit; just make
    # sure the connection is released even if the search fails.
    conn.close()
print(result)


[(623, 'Marie Curie sells sea shells in Los Angeles.')]
