# Generating a Q&A dataset from our fragments dataset using the OpenAI API

In [64]:
# extracted from optimism governance docs
file_path = "../../../data/002-governance-forum-202406014/dataset/_out.jsonl"

# openai api key
# Read with getpass instead of input() so the secret is not echoed back into
# the terminal or stored in the notebook's cell output.
from getpass import getpass

openai_api_key = getpass("Enter the OpenAI API key: ")

In [65]:
# OpenAI model identifiers, kept in one place so they are easy to swap.
model_chat = "gpt-4o"  # chat model handed to ChatOpenAI below
model_embeddings = "text-embedding-3-small"  # embedding model (not referenced in the cells shown here)

In [66]:
# imports
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

from langchain_text_splitters import RecursiveJsonSplitter
import pandas as pd
import json
import matplotlib.pyplot as plt

In [67]:
# Chat model that writes the question/answer pairs.
llm = ChatOpenAI(
    # which model to call, and with which credentials
    model=model_chat,
    api_key=openai_api_key,
    # temperature 0 asks for the least random sampling
    temperature=0,
    # retry a failed request at most twice
    max_retries=2,
    # no explicit cap set here -- presumably the library defaults apply (TODO confirm)
    max_tokens=None,
    timeout=None,
)

In [68]:
# ask the chat for creating questions and answers
def answer_template():
    """Return the prompt text used to ask the model for Q&A pairs.

    The text has three parts separated by blank lines: the instructions, a
    ``<fragment>`` block holding the single ``{context}`` placeholder, and the
    expected ``"question", "answer"`` output format. It ends with a newline.
    """
    # Plain string literals are used on purpose: nothing is interpolated here,
    # ``{context}`` must reach the prompt template untouched.
    instructions = (
        "From the following fragment from Optimism Governance Documentation, create some general questions (maximum 3, it is not a problem to do less) a non-specialist user could ask (the user didn't have any access to the fragment) and give a correct answer for each question that could be given by someone who had read the fragment. "
        "Avoid questions that are too silly or not about optimism, that are not well answered by the fragment or that could be done just by someone who knows the documentation already. "
        "Try to avoid repetitive questions, and ask only things that could really be asked by an user (it is ok to do just one or even zero questions if there are not many important stuff in the fragment). "
        "Make answers complete and simple to understand. "
        "If there is no question to be asked, just leave it empty."
    )

    # the space after the placeholder is part of the original prompt text
    fragment_block = "<fragment>\n{context} \n</fragment>"

    answer_format = (
        "Questions and answers should be in the following format:\n"
        '"question1", "answer1"\n'
        '"question2", "answer2"\n'
        "...\n"
    )

    return "\n\n".join((instructions, fragment_block, answer_format))

# Turn the template text into a chat prompt; `{context}` is its only input variable.
prompt = ChatPromptTemplate.from_template(answer_template())

# Pipe the prompt into the model: chain.invoke({"context": ...}) fills the prompt and passes it to `llm`.
chain = prompt | llm

In [80]:
def load_forum_posts(file_path):
    with open(file_path, 'r') as file:
        boards = {}
        threads = {}
        posts = {}
        for line in file:
            data_line = json.loads(line)
            type_line = data_line['type']
            try:
                id = data_line['item']['data']['id']
                match type_line:
                    case 'board':
                        boards[id] = {
                            'name': data_line['item']['data']['name'],
                            #"created_at": data_line['item']['data']['created_at'],
                            }
                    case 'thread':
                        threads[id] = {
                            'title': data_line['item']['data']['title'],
                            'category_id' : data_line['item']['data']['category_id'],
                            "created_at": data_line['item']['data']['created_at'],
                            "views": data_line['item']['data']['views'],
                            "like_count": data_line['item']['data']['like_count'],
                            }
                    case 'post':
                        posts[id] = {
                            #"cooked": data_line['item']['data']['cooked'],
                            #"url": data_line['item']['data']['url'],
                            #"link_counts": data_line['item']['data']['link_counts'],
                            "created_at": data_line['item']['data']['created_at'],
                            "username": data_line['item']['data']['username'],
                            "score": data_line['item']['data']['score'],
                            "readers_count": data_line['item']['data']['readers_count'],
                            "moderator": data_line['item']['data']['moderator'],
                            "admin": data_line['item']['data']['admin'],
                            "staff": data_line['item']['data']['staff'],
                            "trust_level": data_line['item']['data']['trust_level'],
                            "content": data_line['item']['content'],
                            "creation_time": data_line['item']['creation_time'],
                            "path": data_line['item']['path'],
                            "download_time": data_line['download_time'],
                        }
                    case _:
                        print(f"Unknown type: {type_line}")
            except:
                #print(f"Error processing line: {line}")
                None

    for id_post in posts:
        path = posts[id_post]['path']

        try:
            id_board = int(path[0])
            posts[id_post]['board_name'] = boards[id_board]['name']
            posts[id_post]['board_id'] = id_board
        except:
            posts[id_post]['board_name'] = None
            #print(f"Error processing board for post {id_post}")
        
        try:
            id_thread = int(path[1])
            posts[id_post]['thread_title'] = threads[id_thread]['title']
            posts[id_post]['thread_id'] = id_thread
        except:
            posts[id_post]['thread_title'] = None
            #print(f"Error processing thread for post {id_post}")

    return posts

# Load every post (dict keyed by post id) from the forum dump.
posts = load_forum_posts(file_path)
# DataFrame(dict of dicts) puts the post ids in the columns; transpose so each post is one row.
posts_df = pd.DataFrame(posts).T
# Display the table.
posts_df

Unnamed: 0,created_at,username,score,readers_count,moderator,admin,staff,trust_level,content,creation_time,path,download_time,board_name,board_id,thread_title,thread_id
26479,2023-06-16T11:17:47.837Z,system,804.0,198,True,True,True,4,"<p><a href=""https://calendar.google.com/calend...",2023-06-16T11:17:47.837000+00:00,"[67, 6124]",2024-06-14T08:42:52.228729+00:00,Get Started 🌱,67,How to Stay up to Date,6124
26480,2023-06-16T11:17:56.495Z,lavande,53.6,196,True,True,True,4,,2023-06-16T11:17:56.495000+00:00,"[67, 6124]",2024-06-14T08:42:52.229289+00:00,Get Started 🌱,67,How to Stay up to Date,6124
35364,2024-03-24T20:07:47.816Z,1311832119,46.0,58,False,False,False,1,<p>Hello to everyone good to met you all<br>\n...,2024-03-24T20:07:47.816000+00:00,"[67, 6124]",2024-06-14T08:42:52.229688+00:00,Get Started 🌱,67,How to Stay up to Date,6124
26472,2023-06-16T10:29:31.456Z,system,185.2,100,True,True,True,4,"<h2><a name=""httpsgovoptimismiothow-to-get-a-g...",2023-06-16T10:29:31.456000+00:00,"[67, 6120]",2024-06-14T08:42:52.574981+00:00,Get Started 🌱,67,How to Navigate the Forum,6120
26475,2023-06-16T10:31:55.060Z,lavande,19.8,73,True,True,True,4,,2023-06-16T10:31:55.060000+00:00,"[67, 6120]",2024-06-14T08:42:52.576548+00:00,Get Started 🌱,67,How to Navigate the Forum,6120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,2022-05-03T17:55:42.099Z,system,439.4,121,True,True,True,4,<p>Submit and discuss proposals for Phase 0 of...,2022-05-03T17:55:42.099000+00:00,"[63, 39, 210]",2024-06-14T08:54:07.599862+00:00,ARCHIVED & OLD Missions,63,,
366,2022-05-03T18:42:03.344Z,bobby,79.2,120,True,True,True,4,,2022-05-03T18:42:03.344000+00:00,"[63, 39, 210]",2024-06-14T08:54:07.601708+00:00,ARCHIVED & OLD Missions,63,,
25274,2023-04-13T19:40:44.583Z,system,50.2,25,True,True,True,4,<p>The entire community aligns around Collecti...,2023-04-13T19:40:44.583000+00:00,"[63, 64, 5878]",2024-06-14T08:54:09.364076+00:00,ARCHIVED & OLD Missions,63,,
25275,2023-04-13T19:41:00.893Z,system,6.0,4,True,True,True,4,<p>An Alliance is a group of people (new or pr...,2023-04-13T19:41:00.893000+00:00,"[63, 65, 5879]",2024-06-14T08:54:10.281502+00:00,ARCHIVED & OLD Missions,63,,


In [81]:
# Keep only posts that have some content, then drop every row that still
# has a missing value in any column.
posts_df = posts_df.loc[posts_df['content'].ne('')].dropna()

posts_df

Unnamed: 0,created_at,username,score,readers_count,moderator,admin,staff,trust_level,content,creation_time,path,download_time,board_name,board_id,thread_title,thread_id
26479,2023-06-16T11:17:47.837Z,system,804.0,198,True,True,True,4,"<p><a href=""https://calendar.google.com/calend...",2023-06-16T11:17:47.837000+00:00,"[67, 6124]",2024-06-14T08:42:52.228729+00:00,Get Started 🌱,67,How to Stay up to Date,6124
35364,2024-03-24T20:07:47.816Z,1311832119,46.0,58,False,False,False,1,<p>Hello to everyone good to met you all<br>\n...,2024-03-24T20:07:47.816000+00:00,"[67, 6124]",2024-06-14T08:42:52.229688+00:00,Get Started 🌱,67,How to Stay up to Date,6124
26472,2023-06-16T10:29:31.456Z,system,185.2,100,True,True,True,4,"<h2><a name=""httpsgovoptimismiothow-to-get-a-g...",2023-06-16T10:29:31.456000+00:00,"[67, 6120]",2024-06-14T08:42:52.574981+00:00,Get Started 🌱,67,How to Navigate the Forum,6120
36160,2024-04-30T03:15:08.371Z,sikkha84,3.4,16,False,False,False,0,<p>This is a good stating point for a newbie t...,2024-04-30T03:15:08.371000+00:00,"[67, 6120]",2024-06-14T08:42:52.577280+00:00,Get Started 🌱,67,How to Navigate the Forum,6120
26468,2023-06-16T10:08:09.297Z,system,188.4,94,True,True,True,4,<p>Welcome to the Optimism Collective governan...,2023-06-16T10:08:09.297000+00:00,"[67, 6118]",2024-06-14T08:42:52.902138+00:00,Get Started 🌱,67,About the Optimism Collective,6118
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2374,2022-05-26T02:11:09.213Z,cb0x,1493.0,114,False,False,False,1,<p><strong>Project Name:</strong> Aelin Protoc...,2022-05-26T02:11:09.213000+00:00,"[63, 1157]",2024-06-14T08:54:02.647224+00:00,ARCHIVED & OLD Missions,63,[GF: Phase 0 Proposal] Aelin Protocol,1157
3087,2022-05-27T06:31:10.275Z,Justin,22.6,62,False,False,False,2,<p>What i like about this proposal is that the...,2022-05-27T06:31:10.275000+00:00,"[63, 1157]",2024-06-14T08:54:02.648847+00:00,ARCHIVED & OLD Missions,63,[GF: Phase 0 Proposal] Aelin Protocol,1157
3846,2022-05-29T02:50:35.607Z,kryos.eth,10.0,49,False,False,False,2,"<p>TBH Aelin I’m checking out your site, and w...",2022-05-29T02:50:35.607000+00:00,"[63, 1157]",2024-06-14T08:54:02.649315+00:00,ARCHIVED & OLD Missions,63,[GF: Phase 0 Proposal] Aelin Protocol,1157
365,2022-05-03T18:39:36.920Z,system,1139.2,195,True,True,True,4,<blockquote>\n<p>We will stop accepting Phase ...,2022-05-03T18:39:36.920000+00:00,"[63, 215]",2024-06-14T08:54:02.994421+00:00,ARCHIVED & OLD Missions,63,Governance Fund Phase 0: How to Create a Proposal,215


In [82]:
# select the posts with score > 100 (the rows are kept in `better_entries`, not counted;
# the variable is not used again in the cells shown here)
better_entries = posts_df[posts_df['score'] > 100]

# number of distinct threads among the remaining posts
posts_df['thread_id'].nunique()

1271

In [29]:
# Ask the model for Q&A pairs on every fragment and stream them to a CSV-like
# text file, one `"fragment", "question", "answer"` line per pair.
# NOTE(review): `documents` is not defined in any cell visible here; it is
# expected to be a sequence of objects exposing `.page_content` -- confirm
# where it is built before re-running this cell.
csv_location = "first_test_dataset.csv"
out_csv_str = '"fragment", "question", "answer"'

# The file is opened once and flushed after every fragment, so a failure
# part-way through (e.g. an API error) still leaves the finished rows on disk.
with open(csv_location, 'w', encoding='utf-8') as file:
    file.write(out_csv_str)

    for frag, doc in enumerate(documents):
        response = chain.invoke(
            {
                "context": doc.page_content
            }
        )

        # One `"question", "answer"` pair per non-blank line of the reply;
        # blank lines are dropped so they cannot produce empty `"N", ` rows.
        qa_lines = [line.strip() for line in response.content.split("\n") if line.strip()]

        # Prefix every pair with the index of the fragment it came from.
        response_str = "".join(f'\n"{frag}", {qa}' for qa in qa_lines)

        # Keep the in-memory copy in sync: it is parsed later by clean_str.
        out_csv_str += response_str

        print(response_str)
        file.write(response_str)
        file.flush()


"0", "How can I get support for my project on Optimism?", "To get support for your project on Optimism, you should first fill out the 'connect with Optimism' form. After that, you can access various resources such as metrics, a community on Discord, and exclusive Telegram channels for builders."
"0", "Where can I find the Optimism community to discuss my project?", "You can find the Optimism community on their Discord server and also join exclusive Telegram channels for builders. Links to these resources are provided after you fill out the 'connect with Optimism' form."
"0", "What should I do before deploying my project on OP Mainnet?", "Before deploying your project on OP Mainnet, you should fill out the 'connect with Optimism' form. This will give you access to important resources and support from the Optimism community."

"1", "Where can I find tutorials for developing on Optimism?", "You can find tutorials for developing on Optimism on their GitHub page at https://github.com/ether

In [75]:
def clean_str(out_csv_str, fragments=None,
              new_csv_location="first_test_dataset_with_fragments.csv"):
    """Parse the raw generated Q&A text into a DataFrame and save it as CSV.

    The text is expected to hold one record per line in the form
    ``"fragment", "question", "answer"``, the first well-formed line being the
    header. Only lines with exactly six double quotes are considered, which
    rules out truncated lines and fields containing stray quotes. Six-quote
    lines that still do not split into exactly three quoted fields are printed
    and skipped instead of breaking the DataFrame construction.

    Args:
        out_csv_str: Raw text produced by the generation loop.
        fragments: Sequence indexed by fragment number whose items expose
            ``page_content``. Defaults to the notebook-level ``documents``.
        new_csv_location: Path of the CSV file to write.

    Returns:
        A DataFrame with the header's columns (all strings) plus a
        ``fragment_text`` column holding the source text of each fragment.

    Raises:
        ValueError: If no well-formed line (not even a header) is found.
    """
    if fragments is None:
        # backward-compatible default: the fragment list defined in the notebook
        fragments = documents

    rows = []
    for raw_line in out_csv_str.split("\n"):
        line = raw_line.strip()

        # three quoted fields <=> exactly six double quotes
        if line.count('"') != 6:
            continue

        # drop the outer quotes, then split on the `", "` separators
        parts = line[1:-1].split('", "')
        well_formed = (
            line.startswith('"') and line.endswith('"') and len(parts) == 3
        )
        if not well_formed:
            # report the odd line rather than let it corrupt the table
            print(parts)
            continue

        rows.append(parts)

    if not rows:
        raise ValueError(
            'no well-formed line found; expected at least the header '
            '"fragment", "question", "answer"'
        )

    # first line is the header
    header, *records = rows
    lines_out = pd.DataFrame(records, columns=header)

    # add the text of the fragment each question was generated from
    lines_out["fragment_text"] = lines_out["fragment"].apply(
        lambda x: fragments[int(x)].page_content
    )

    # to new csv
    lines_out.to_csv(new_csv_location, index=False)

    return lines_out

# Parse the generated text, attach the fragment texts and write the final CSV.
out_brut = clean_str(out_csv_str)
# Display the resulting table.
out_brut

Unnamed: 0,fragment,question,answer,fragment_text
0,0,How can I get support for my project on Optimism?,"To get support for your project on Optimism, y...",---\ntitle: How do I get project support (mark...
1,0,Where can I find the Optimism community to dis...,You can find the Optimism community on their D...,---\ntitle: How do I get project support (mark...
2,0,What should I do before deploying my project o...,"Before deploying your project on OP Mainnet, y...",---\ntitle: How do I get project support (mark...
3,1,Where can I find tutorials for developing on O...,You can find tutorials for developing on Optim...,Steps to take if you would like developer supp...
4,1,How can I get immediate developer support for ...,"For immediate developer support, you can first...",Steps to take if you would like developer supp...
...,...,...,...,...
1870,760,Are there any premium options available for OP...,"Yes, QuickNode provides an option to upgrade t...",[QuickNode](https://www.quicknode.com/) offers...
1871,760,Do I need to manage the infrastructure if I us...,"No, QuickNode manages the complex infrastructu...",[QuickNode](https://www.quicknode.com/) offers...
1872,761,Which networks does Optimism support?,Optimism supports the following networks: OP M...,- OP Mainnet\n- OP Goerli\n- OP Sepolia
1873,762,Who provides ERC-4337 account abstraction infr...,Stackup provides ERC-4337 account abstraction ...,[Stackup](https://www.stackup.sh/) provides ER...


In [89]:
# Display the cleaned Q&A table again.
out_brut

Unnamed: 0,fragment,question,answer,fragment_text
0,0,How can I get support for my project on Optimism?,"To get support for your project on Optimism, y...",---\ntitle: How do I get project support (mark...
1,0,Where can I find the Optimism community to dis...,You can find the Optimism community on their D...,---\ntitle: How do I get project support (mark...
2,0,What should I do before deploying my project o...,"Before deploying your project on OP Mainnet, y...",---\ntitle: How do I get project support (mark...
3,1,Where can I find tutorials for developing on O...,You can find tutorials for developing on Optim...,Steps to take if you would like developer supp...
4,1,How can I get immediate developer support for ...,"For immediate developer support, you can first...",Steps to take if you would like developer supp...
...,...,...,...,...
1870,760,Are there any premium options available for OP...,"Yes, QuickNode provides an option to upgrade t...",[QuickNode](https://www.quicknode.com/) offers...
1871,760,Do I need to manage the infrastructure if I us...,"No, QuickNode manages the complex infrastructu...",[QuickNode](https://www.quicknode.com/) offers...
1872,761,Which networks does Optimism support?,Optimism supports the following networks: OP M...,- OP Mainnet\n- OP Goerli\n- OP Sepolia
1873,762,Who provides ERC-4337 account abstraction infr...,Stackup provides ERC-4337 account abstraction ...,[Stackup](https://www.stackup.sh/) provides ER...
