In [1]:
import openai
import pandas as pd
import os

openai.api_key = os.environ.get('OPENAI_API_KEY')

In [5]:
def generate_embedding(text: str) -> list[float]:
	resp = openai.Embedding.create(
		input=[text], 
		model="text-embedding-ada-002")

	return resp["data"][0]["embedding"] 

In [49]:
res = openai.Embedding.create(
		input=['hello'], 
		model="text-embedding-ada-002")

In [50]:
res.keys()

dict_keys(['object', 'data', 'model', 'usage'])

In [26]:
thesis = 'climate transition'
embed_thesis = generate_embedding(thesis)
len(embed_thesis)

In [2]:
import pymongo

uri = os.environ.get('MONGO_URI')
client = pymongo.MongoClient(uri)
db = client.General
collection = db.metricMeta

# Try one

In [43]:
doc = collection.find_one({'title':{"$exists": True}})

In [38]:
# to get rid of the id. None refers to what it does when the key is not in the dict
doc_ = doc
doc_.pop("_id", None)

In [39]:
doc_

{'code': 'greenhouse_gas_emissions_avoided',
 'title': 'Greenhouse Gas Emissions Avoided',
 'description': 'Amount of greenhouse gas (GHG) emissions avoided by the organization during the reporting period.',
 'type': 'numeric',
 'measure': 'tCO2eq',
 'framework': ['PAIs'],
 'category': ['Climate']}

In [40]:
embeds = generate_embedding(str(doc_))

In [41]:
from openai.embeddings_utils import cosine_similarity

cosine_similarity(embeds, embed_thesis)

0.7681959046246072

In [44]:
collection.update_one({'_id': doc['_id']}, 
                       {"$set": {"embeddings": generate_embedding(str(doc_))}})

<pymongo.results.UpdateResult at 0x2380797e4d0>

In [45]:
collection.find_one()

{'_id': ObjectId('6449b282a2927cd3aa323dc4'),
 'code': 'greenhouse_gas_emissions_avoided',
 'title': 'Greenhouse Gas Emissions Avoided',
 'description': 'Amount of greenhouse gas (GHG) emissions avoided by the organization during the reporting period.',
 'type': 'numeric',
 'measure': 'tCO2eq',
 'framework': ['PAIs'],
 'category': ['Climate'],
 'embeddings': [-0.010822176933288574,
  -0.004700609482824802,
  -0.014088360592722893,
  -0.024365050718188286,
  -0.0007492017466574907,
  0.007461712695658207,
  0.008067809045314789,
  -0.015745021402835846,
  -0.022950828075408936,
  -0.017482496798038483,
  0.013125341385602951,
  -0.0018738461658358574,
  -0.01722658984363079,
  0.006707460153847933,
  -0.017751872539520264,
  0.006976836360991001,
  0.022236982360482216,
  -0.024930741637945175,
  0.001830072607845068,
  -0.012977184727787971,
  0.003247663378715515,
  0.012243134900927544,
  -0.038897883147001266,
  0.005855558905750513,
  0.0002792670566122979,
  0.017980841919779778,


In [48]:
# delete the field by updating: note the filter stage, then the $unset operator 
collection.update_one({'_id': doc['_id']}, 
                       {"$unset": {"embeddings": ''}})

<pymongo.results.UpdateResult at 0x2381557ea10>

# Write ALL

In [4]:
collection.find_one({'embeddings':{"$exists": True}})

{'_id': ObjectId('6449b282a2927cd3aa323dc4'),
 'code': 'greenhouse_gas_emissions_avoided',
 'title': 'Greenhouse Gas Emissions Avoided',
 'description': 'Amount of greenhouse gas (GHG) emissions avoided by the organization during the reporting period.',
 'type': 'numeric',
 'measure': 'tCO2eq',
 'framework': ['PAIs'],
 'category': ['Climate'],
 'embeddings': [-0.005473030265420675,
  0.008236774243414402,
  -0.009074065834283829,
  -0.01825704798102379,
  -0.011123049072921276,
  0.005282427184283733,
  0.01825704798102379,
  -0.009768405929207802,
  -0.018325120210647583,
  -0.006841287482529879,
  0.015438846312463284,
  -0.011048168875277042,
  -0.008754124864935875,
  0.0023229746147990227,
  -0.027637440711259842,
  0.012436848133802414,
  0.021565372124314308,
  -0.025663336738944054,
  -6.147798558231443e-05,
  -0.012831668369472027,
  0.0024421014823019505,
  0.005010136868804693,
  -0.03779385983943939,
  0.010558046400547028,
  -0.002896485384553671,
  0.015125712379813194,
 

In [6]:
for doc in collection.find({'embeddings':{"$exists": False}}):
	collection.update_one({'_id': doc['_id']},
                       {"$set": {"embeddings": generate_embedding(str(doc))}})

# Search with score

In [7]:
query = 'water usage in cities'

results = collection.aggregate([
    {
        '$search': {
            "index": "SemanticSearch",
            "knnBeta": {
                "vector": generate_embedding(query),
                "k": 4,
                "path": "embeddings"
                }
        }
    },
 {
    "$project": {
      "_id": 0,
      "title": 1,
      "description": 1,
      'type': 1,
      'measure': 1,
      'framework': 1,
      'category': 1,
      "score": { '$meta': "searchScore" }
    }
  }
])

In [32]:
results.alive # results.close

AttributeError: 'list' object has no attribute 'alive'

In [8]:
results = list(results)

In [9]:
results[0].keys()

dict_keys(['title', 'description', 'category', 'framework', 'score'])

In [10]:
results

[{'title': 'Water Consumed: Municipal',
  'description': "Volume of water drawn from municipal water sources used for the organization's operations during the reporting period.",
  'category': 'Water',
  'framework': ['IRIS+', 'SDG'],
  'score': 0.8927170634269714},
 {'title': 'Public Water Point Coverage',
  'description': 'Number of individuals within a coverage region (defined by a municipality, district, or designated service area) who have primary water access through a public tap as of the end of the reporting period.',
  'category': 'Water',
  'framework': ['IRIS+', 'SDG'],
  'score': 0.8884526491165161},
 {'title': 'Water Consumed: Wastewater',
  'description': "Volume of wastewater used for the organization's operations during the reporting period.",
  'category': 'Water',
  'framework': ['IRIS+', 'SDG'],
  'score': 0.8865753412246704},
 {'title': 'Water Consumption of Product',
  'description': 'Volume of water used (consumed) over the lifetime of a product or system operated

# alternative search operations

In [38]:
list(db.metricMeta.find({'title': {'$regex': query, '$options': 'i'}}))

[{'_id': ObjectId('6449b2faa2927cd3aa323dcb'),
  'code': 'ghg_emissions_scope_1',
  'title': 'GHG Emissions Scope 1',
  'description': '',
  'type': 'numeric',
  'measure': 'tCO2eq',
  'framework': ['PAIs', 'TCFD'],
  'category': ['Climate'],
  'embeddings': [0.01085751410573721,
   0.010137338191270828,
   -0.006995997857302427,
   -0.02041185274720192,
   -0.020658770576119423,
   0.009602350182831287,
   0.011323913931846619,
   0.006457580719143152,
   -0.01595361903309822,
   -0.020494159311056137,
   0.0061969454400241375,
   -0.0018450230127200484,
   -0.012922019697725773,
   -0.008669550530612469,
   -0.028669873252511024,
   0.01447211392223835,
   0.019245853647589684,
   -0.04022013023495674,
   -0.0022857023868709803,
   -0.009993302635848522,
   0.00589172774925828,
   -0.005274433642625809,
   -0.020905688405036926,
   0.0187794528901577,
   -0.004056992940604687,
   0.009225115180015564,
   0.025734297931194305,
   -0.01614566519856453,
   -0.003415693063288927,
   -0.0

In [34]:
query = "scope 1"

more operators: [here in the mongodb docs](https://www.mongodb.com/docs/manual/reference/operator/query/)


- $regex = match, see:  
    ```py
    list(db.metricMeta.find({'title': {'$regex': query, '$options': 'i'}}))
    ```
- $text = text search (faster but only on an indexed field)
- $type = filters on a type
- $rename = Renames a field.
- $set = Sets the value of a field in a document.
- $unset = Removes the specified field from a document.
- $push = Adds an item to an array.
    ```py
    copilot_log.update_one(
         {"_id": st.session_state.chat_id},
        {"$push": {"messages": message}})
    ```


$search only works in aggregate, if there's an index. 

$match is the same, but only if there's an identical match

In [43]:
results_filter = db.metricMeta.aggregate([{'$match': {'title': query }}])
list(results_filter)

<pymongo.command_cursor.CommandCursor at 0x2708d35fe80>

# To get stats on db size

In [5]:
db.command("dbstats")

{'db': 'General',
 'collections': 9,
 'views': 0,
 'objects': 256,
 'avgObjSize': 2482.86328125,
 'dataSize': 635613.0,
 'storageSize': 770048.0,
 'indexes': 9,
 'indexSize': 331776.0,
 'totalSize': 1101824.0,
 'scaleFactor': 1.0,
 'fsUsedSize': 1995612160.0,
 'fsTotalSize': 10726932480.0,
 'ok': 1.0,
 '$clusterTime': {'clusterTime': Timestamp(1691827267, 1),
  'signature': {'hash': b'\x8c9\x83\xb8\xff\xc8\x96\xfaR\x16h}\x1f\x0c\x94\xc2\x94\x08D\x9c',
   'keyId': 7225379746408824839}},
 'operationTime': Timestamp(1691827267, 1)}

In [20]:
db.command("collstats", db.copilot_log.name)

{'ns': 'General.copilot_log',
 'size': 3445,
 'count': 2,
 'avgObjSize': 1722,
 'numOrphanDocs': 0,
 'storageSize': 36864,
 'freeStorageSize': 16384,
 'capped': True,
 'max': 0,
 'maxSize': 104857600,
 'wiredTiger': {'metadata': {'formatVersion': 1},
  'creationString': 'access_pattern_hint=none,allocation_size=4KB,app_metadata=(formatVersion=1),assert=(commit_timestamp=none,durable_timestamp=none,read_timestamp=none,write_timestamp=off),block_allocation=best,block_compressor=snappy,cache_resident=false,checksum=on,colgroups=,collator=,columns=,dictionary=0,encryption=(keyid=,name=),exclusive=false,extractor=,format=btree,huffman_key=,huffman_value=,ignore_in_memory_cache_size=false,immutable=false,import=(compare_timestamp=oldest_timestamp,enabled=false,file_metadata=,metadata_file=,repair=false),internal_item_max=0,internal_key_max=0,internal_key_truncate=true,internal_page_max=4KB,key_format=q,key_gap=10,leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=64MB,log=(enab

In [22]:
client.close()

## To format the results in the objective helper

In [23]:
import openai
import streamlit as st

openai.api_key = st.secrets["OPENAI_API_KEY"]

kwargs = {'model':"gpt-4",
            'messages':[
                {"role": "system", "content": 'you are an expert at feet'},
                {"role": "user", "content": 'what are toes'}]}

call = openai.ChatCompletion.create(**kwargs)       
call

<OpenAIObject chat.completion id=chatcmpl-7mesNX74fvnen8sfkWpmN3LnEwFcf at 0x18acd893dd0> JSON: {
  "id": "chatcmpl-7mesNX74fvnen8sfkWpmN3LnEwFcf",
  "object": "chat.completion",
  "created": 1691831227,
  "model": "gpt-4-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "Toes are digits located at the end of the feet, used in humans for balance, walking, running, and other activities. Each foot typically has five toes: the first one, called the big toe or hallux, is usually larger than the rest and plays a significant role in maintaining balance. The remaining four toes, from inner to outer, are the second toe, third toe, fourth toe, and the smallest one is the fifth toe or \"little toe\". Each toe consists of jointed bones, called phalanges, surrounded by soft tissues, skin, and toenails. The toes also contain muscles, tendons, ligaments, blood vessels, and nerves. They play crucial roles in foot strength and coordination

In [27]:
content = call.choices[0].message.content

In [30]:
content = "1. Each toe consists of jointed bones, called phalanges, surrounded by soft tissues, skin, and toenails. 2. The toes also contain muscles, tendons, ligaments, blood vessels, and nerves. 3. They play crucial roles in foot strength and coordination."

In [39]:
import re

splits = re.split('(\d\.)', content)
[num + text for num, text in zip(splits[1::2], splits[2::2])]

['1. Each toe consists of jointed bones, called phalanges, surrounded by soft tissues, skin, and toenails. ',
 '2. The toes also contain muscles, tendons, ligaments, blood vessels, and nerves. ',
 '3. They play crucial roles in foot strength and coordination.']

In [44]:
splits

['',
 '1.',
 ' Each toe consists of jointed bones, called phalanges, surrounded by soft tissues, skin, and toenails. ',
 '2.',
 ' The toes also contain muscles, tendons, ligaments, blood vessels, and nerves. ',
 '3.',
 ' They play crucial roles in foot strength and coordination.']

In [48]:
[s.strip() for s in splits if s]

['1.',
 'Each toe consists of jointed bones, called phalanges, surrounded by soft tissues, skin, and toenails.',
 '2.',
 'The toes also contain muscles, tendons, ligaments, blood vessels, and nerves.',
 '3.',
 'They play crucial roles in foot strength and coordination.']

In [49]:
metric_list = [{'db': 'General',
 'collections': 9,
 'views': 0.254,
 'objects': 256},
{'db': 'General',
 'collections': 9,
 'views': 0,
 'objects': 256}]

In [54]:
''.join([str(i['views']) for i in metric_list])

'00'

In [57]:
for i in metric_list:
    f"{i['views']:.2f}"

In [61]:
for n,m in enumerate(metric_list):
    print(f"{n+1} + {i['views']:.2f}")

1 + 0.00
2 + 0.00
