# Notebook for creating local vectorscore DBs from collected data

## Paths

In [1]:
# optimism governance docs
docs_path = "../../data/001-initial-dataset-governance-docs/file.txt"

# optimism governance forum
forum_path = "../../data/002-governance-forum-202406014/dataset/_out.jsonl"

## Embeddings and vectorstores that are going to be tested

In [2]:
# we are going to test the open ai api for embeddings
embedding_models = ["text-embedding-3-small", "text-embedding-3-large", "text-embedding-ada-002"]

# we are going to use faiss
vectorstores = ['faiss']

## Imports

In [3]:
from langchain_openai import OpenAIEmbeddings
# openai api key
openai_api_key = input("Enter the OpenAI API key: ")

if 'faiss' in vectorstores:
    from langchain_community.vectorstores import FAISS

# langchain documents handling
from langchain_core.documents.base import Document
from langchain_text_splitters import MarkdownHeaderTextSplitter

import re, tiktoken, json
import pandas as pd


## Load and clean documentation

In [4]:
# load the documentation
with open(docs_path, "r") as f:
    docs_read = f.read()

# split "==> " and " <==" (to get the name of the files)
docs_read = re.split(r"==> | <==", docs_read)


docs = []
path = []
for d in docs_read:
    if not "\n" in d:
        # if it's a path
        path = d.split("/")
    else:
        # if it's a text document
        docs.append({
            "path": "/".join(path[:-1]),
            "document_name": path[-1],
            "content": d
        })

# remove entries where content is just whitespace or '\n' 
docs = [d for d in docs if d['content'].strip() != '']



In [5]:
# get repeated content entries
repeated_contents = []
repeated_entries = []
for i, d in enumerate(docs):
    for j, d2 in enumerate(docs):
        if i != j and d['content'] == d2['content']:
            if not d in repeated_entries:
                if not d['content'] in repeated_contents:
                    repeated_contents.append(d['content'])
                    repeated_entries.append(d2)
                else:
                    repeated_entries.append(d)
                    if not d2 in repeated_entries:
                        repeated_entries.append(d2)
            if d['path'] != d2['path']:
                print(f"Repeated content found in PATHS {d['path']} and {d2['path']}")
            else:
                print(f"Repeated content found in {d['path']}: DOCUMENTS {d['document_name']} and {d2['document_name']}")

# remove the repeated entries
docs = [d for d in docs if d not in repeated_entries]

Repeated content found in developers/bedrock: DOCUMENTS README.md and explainer.md
Repeated content found in developers/bedrock: DOCUMENTS bedrock.md and differences.md
Repeated content found in developers/bedrock: DOCUMENTS bedrock.md and how-is-bedrock-different.md
Repeated content found in developers/bedrock: DOCUMENTS differences.md and bedrock.md
Repeated content found in developers/bedrock: DOCUMENTS differences.md and how-is-bedrock-different.md
Repeated content found in developers/bedrock: DOCUMENTS explainer.md and README.md
Repeated content found in developers/bedrock: DOCUMENTS how-is-bedrock-different.md and bedrock.md
Repeated content found in developers/bedrock: DOCUMENTS how-is-bedrock-different.md and differences.md
Repeated content found in identity: DOCUMENTS README.md and intro.md
Repeated content found in identity: DOCUMENTS intro.md and README.md


In [6]:
full_docs = [
    Document(
        page_content = d['content'],
        metadata = {
            "path": d['path'],
            "document_name": d['document_name']
        }
    ) for d in docs
]

In [7]:
# split the markdown file into sections
headers_to_split_on = [
    ("##", "header 2"),
    ("###", "header 3"),
    ("####", "header 4"),
    ("#####", "header 5"),
    ("######", "header 6")
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# incorporate the doc metadata into the fragments
fragments_docs = []
for d in docs:
    f = markdown_splitter.split_text(d['content'])
    for fragment in f:
        fragment.metadata['path'] = d['path']
        fragment.metadata['document_name'] = d['document_name']
        fragments_docs.append(fragment)

## Load and clean the forum

In [20]:
def load_forum_posts(file_path):
    with open(file_path, 'r') as file:
        boards = {}
        threads = {}
        posts = {}
        for line in file:
            data_line = json.loads(line)
            type_line = data_line['type']
            try:
                id = data_line['item']['data']['id']
                match type_line:
                    case 'board':
                        boards[id] = {
                            'name': data_line['item']['data']['name'],
                            #"created_at": data_line['item']['data']['created_at'],
                            }
                    case 'thread':
                        threads[id] = {
                            'title': data_line['item']['data']['title'],
                            'category_id' : data_line['item']['data']['category_id'],
                            "created_at": data_line['item']['data']['created_at'],
                            "views": data_line['item']['data']['views'],
                            "like_count": data_line['item']['data']['like_count'],
                            }
                    case 'post':
                        posts[id] = {
                            #"cooked": data_line['item']['data']['cooked'],
                            #"url": data_line['item']['data']['url'],
                            #"link_counts": data_line['item']['data']['link_counts'],
                            "created_at": data_line['item']['data']['created_at'],
                            "username": data_line['item']['data']['username'],
                            "score": data_line['item']['data']['score'],
                            "readers_count": data_line['item']['data']['readers_count'],
                            "moderator": data_line['item']['data']['moderator'],
                            "admin": data_line['item']['data']['admin'],
                            "staff": data_line['item']['data']['staff'],
                            "trust_level": data_line['item']['data']['trust_level'],
                            "content": data_line['item']['content'],
                            "creation_time": data_line['item']['creation_time'],
                            "path": data_line['item']['path'],
                            "download_time": data_line['download_time'],
                        }
                    case _:
                        print(f"Unknown type: {type_line}")
            except:
                #print(f"Error processing line: {line}")
                None

    for id_post in posts:
        path = posts[id_post]['path']

        try:
            id_board = int(path[0])
            posts[id_post]['board_name'] = boards[id_board]['name']
            posts[id_post]['board_id'] = id_board
        except:
            posts[id_post]['board_name'] = None
            #print(f"Error processing board for post {id_post}")
        
        try:
            id_thread = int(path[1])
            posts[id_post]['thread_title'] = threads[id_thread]['title']
            posts[id_post]['thread_id'] = id_thread
        except:
            posts[id_post]['thread_title'] = None
            #print(f"Error processing thread for post {id_post}")

    return posts

posts_forum = [
    Document(
        page_content = d['content'],
        metadata = {
            'board_name': d['board_name'],
            'thread_title': d['thread_title'],
            'creation_time': d['creation_time'],
            'username': d['username'],
            'moderator': d['moderator'],
            'admin': d['admin'],
            'staff': d['staff'],
            'trust_level': d['trust_level'],
            'id': ".".join(d['path']) + '.' + str(id)
        }
    ) for id, d in load_forum_posts(forum_path).items()
]

In [22]:
posts_forum

[Document(page_content='<p><a href="https://calendar.google.com/calendar/u/0/r?cid=Y19mbm10Z3VoNm5vbzZxZ2JuaTJncGVyaWQ0a0Bncm91cC5jYWxlbmRhci5nb29nbGUuY29t">Governance Calendar</a></p>', metadata={'board_name': 'Get  Started 🌱', 'thread_title': 'How to Stay up to Date', 'creation_time': '2023-06-16T11:17:47.837000+00:00', 'username': 'system', 'moderator': True, 'admin': True, 'staff': True, 'trust_level': 4, 'id': '67.6124.26479'}),
 Document(page_content='', metadata={'board_name': 'Get  Started 🌱', 'thread_title': 'How to Stay up to Date', 'creation_time': '2023-06-16T11:17:56.495000+00:00', 'username': 'lavande', 'moderator': True, 'admin': True, 'staff': True, 'trust_level': 4, 'id': '67.6124.26480'}),
 Document(page_content='<p>Hello to everyone good to met you all<br>\nThat am in to always up date me for issue concern<br>\nThank you<br>\nFrom marcus safea Brima Togba</p>', metadata={'board_name': 'Get  Started 🌱', 'thread_title': 'How to Stay up to Date', 'creation_time': '2024-

## Create the embeddings and store vectors

In [23]:
data_sources = {
    "full_docs": full_docs,
    "fragments_docs": fragments_docs,
    "posts_forum": posts_forum
}

In [24]:
for model_embeddings in embedding_models:
        embeddings = OpenAIEmbeddings(model=model_embeddings, openai_api_key=openai_api_key)
        for store in vectorstores:
            for name, d in data_sources.items():
                if store == 'faiss':
                    db = FAISS.from_documents(d, embeddings)
                    db.save_local(f"dbs/{name}_db/faiss/{model_embeddings}")

In [25]:
def num_tokens_from_string(string: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding("cl100k_base")
    num_tokens = len(encoding.encode(string))
    return num_tokens

# cost per 1M tokens, from https://openai.com/api/pricing/
cost_rate = {
    "text-embedding-3-small": 0.02,
    "text-embedding-3-large": 0.13,
    "text-embedding-ada-002": 0.10
}

costs = []
# calculate the cost of the embeddings
for model in embedding_models:
    for name, data in data_sources.items():
        cost = {
            "embedding model" : model,
            "data source" : name,
            "num tokens" : sum([num_tokens_from_string(d.page_content) for d in data]),
            "cost ($)" : sum([num_tokens_from_string(d.page_content) for d in data]) * cost_rate[model]/1e6
        }
        costs.append(cost)

pd.DataFrame(costs)

Unnamed: 0,embedding model,data source,num tokens,cost ($)
0,text-embedding-3-small,full_docs,118749,0.002375
1,text-embedding-3-small,fragments_docs,114388,0.002288
2,text-embedding-3-small,posts_forum,7286582,0.145732
3,text-embedding-3-large,full_docs,118749,0.015437
4,text-embedding-3-large,fragments_docs,114388,0.01487
5,text-embedding-3-large,posts_forum,7286582,0.947256
6,text-embedding-ada-002,full_docs,118749,0.011875
7,text-embedding-ada-002,fragments_docs,114388,0.011439
8,text-embedding-ada-002,posts_forum,7286582,0.728658
