In [5]:
#| default_exp clean
from hn_jobs_chat.var import postsTableName
from hn_jobs_chat.db import connectToDB


With our new tables created, we now need to create embeddings for each of the items.

First, let's add the embedding column to each of the tables.

In [5]:
from hn_jobs_chat import keys

# Names of every described key that is an array-typed field flagged for
# embedding; these are the fields processed by the later cells.
array_items = [
    entry['key']
    for entry in keys.described_keys
    if entry['type'] == 'array' and entry['embed']
]

print('array items: ' + str(array_items))


array items: ['more_company_info', 'job_requirements', 'job_soft_skills', 'tech_stack', 'additional_notes']


In [9]:
# Add an `embedding` column to each per-field table (one table per entry in
# `array_items`, named "<postsTableName>_<field>"). No new table is created
# here; the statement is a no-op for tables that already have the column.
from hn_jobs_chat.var import postsTableName
from hn_jobs_chat.db import connectToDB

print('creating arrays/embeddings for: ' + str(array_items))

conn, cursor = connectToDB()

try:
    for item in array_items:
        tableName = postsTableName + "_" + item

        # The table name is built from project configuration, not user input,
        # and identifiers cannot be passed as bound parameters, hence the
        # string concatenation.
        # vector(1536): presumably the pgvector column type -- the dimension
        # must match the length of the embeddings written into this column.
        query = """ALTER TABLE """+tableName+""" ADD COLUMN IF NOT EXISTS embedding vector(1536)"""
        cursor.execute(query)
        # Commit per table so earlier tables stay altered if a later one fails.
        conn.commit()
finally:
    # Always release the connection, even if an ALTER TABLE raises.
    conn.close()


creating arrays/embeddings for: ['more_company_info', 'job_requirements', 'job_soft_skills', 'tech_stack', 'additional_notes']


Now we can create the embeddings for each of the items.

In [11]:
# Populate the `embedding` column of every per-field table.
#
# For each row: reuse the embedding of any already-embedded row in the same
# table with identical text; otherwise request a new embedding from the OpenAI
# API. Each row is committed individually, so an interrupted run keeps the work
# done so far and a re-run reuses stored embeddings instead of calling the API.
from openai import OpenAI
client = OpenAI()

conn, cursor = connectToDB()

try:
    for item in array_items:
        print('processing field: ' + item)
        tableName = postsTableName + "_" + item

        # These statements depend only on the table name, so build them once
        # per table rather than once per row.
        search_query = """SELECT * FROM """ + tableName + """ WHERE item = %s AND embedding IS NOT NULL"""
        update_query = """UPDATE """ + tableName + """ SET embedding = %s::vector(1536) WHERE id = %s"""

        query = """SELECT * FROM """ + tableName

        cursor.execute(query)

        data = cursor.fetchall()

        for datum in data:
            # Positional access into `SELECT *`: column 0 is used as the row id
            # and column 2 as the text to embed.
            # NOTE(review): assumes the table's column order is stable, with
            # column 2 being `item` and column 3 being `embedding` -- confirm
            # against the table definition.
            item_id, text = datum[0], datum[2]

            print('processing: ' + str(item_id))

            # If an identical item has already been processed, just use that
            # embedding; otherwise, retrieve a new embedding.
            cursor.execute(search_query, (text,))
            search_results = cursor.fetchone()

            if search_results is not None:
                embedding = search_results[3]
            else:
                response = client.embeddings.create(
                    input=text,
                    model="text-embedding-3-small"
                )

                embedding = response.data[0].embedding

            cursor.execute(update_query, (embedding, item_id))
            conn.commit()
finally:
    # Always release the connection, even if the API call or a query fails
    # part-way through the run.
    conn.close()

processing field: more_company_info
processing: 1
processing: 2
processing: 3
processing: 4
processing: 5
processing: 6
processing: 7
processing: 8
processing: 9
processing: 10
processing: 11
processing: 12
processing: 13
processing: 14
processing: 15
processing: 16
processing: 17
processing: 18
processing: 19
processing: 20
processing: 21
processing: 22
processing: 23
processing: 24
processing: 25
processing: 26
processing: 27
processing: 28
processing: 29
processing: 30
processing: 31
processing: 32
processing: 33
processing: 34
processing: 35
processing: 36
processing: 37
processing: 38
processing: 39
processing: 40
processing: 41
processing: 42
processing: 43
processing: 44
processing: 45
processing: 46
processing: 47
processing: 48
processing: 49
processing: 50
processing: 51
processing: 52
processing: 53
processing: 54
processing: 55
processing: 56
processing: 57
processing: 58
processing: 59
processing: 60
processing: 61
processing: 62
processing: 63
processing: 64
processing: 6