In [118]:
import os
import pandas as pd
from tqdm.auto import tqdm
import json
import json_repair
from typing import Dict, List, Optional

import tiktoken

from langchain.docstore.document import Document
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings

from langchain.text_splitter import RecursiveCharacterTextSplitter

In [136]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Take the VseGPT key from the environment so the secret never has to be pasted
# into (and saved with) the notebook. Falls back to an empty string when the
# variable is not set.
VSE_GPT_API_KEY = os.environ.get("VSE_GPT_API_KEY", "")
# Exported under the OPENAI_API_KEY name as well.
os.environ["OPENAI_API_KEY"] = VSE_GPT_API_KEY

# Data

In [84]:
# Load both paragraph tables in one pass: advanced articles first, beginner second.
adv_df, beg_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/Users/alfa/Code/financial_assistant/data/interim/alfa_invest_advanced_paragraphs.csv",
        "/Users/alfa/Code/financial_assistant/data/interim/alfa_invest_begginer_paragraphs.csv",
    )
)

In [85]:
adv_df.info(), beg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   paragraph_id  29 non-null     object
 1   article_id    29 non-null     object
 2   heading       29 non-null     object
 3   text          29 non-null     object
dtypes: object(4)
memory usage: 1.0+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   paragraph_id  38 non-null     object
 1   article_id    38 non-null     object
 2   heading       38 non-null     object
 3   text          38 non-null     object
dtypes: object(4)
memory usage: 1.3+ KB


(None, None)

In [5]:
adv_df.head()

Unnamed: 0,chunk_id,article_id,heading,text
0,AIA_0000_0001,AIA_0000,# **–ò–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–µ –∏ —Ä–æ—Å—Å–∏–π—Å–∫–∏–µ –∞–∫—Ü–∏–∏**,"**–ê–∫—Ü–∏—è** ‚Äî —ç—Ç–æ —Ü–µ–Ω–Ω–∞—è –±—É–º–∞–≥–∞, –ø–æ–∑–≤–æ–ª—è—é—â–∞—è –∏–Ω–≤..."
1,AIA_0000_0002,AIA_0000,# **–õ–∏—Å—Ç–∏–Ω–≥ —Ü–µ–Ω–Ω—ã—Ö –±—É–º–∞–≥**,"–ö–æ–º–ø–∞–Ω–∏–∏, —á—å–∏ –∞–∫—Ü–∏–∏ –¥–æ—Å—Ç—É–ø–Ω—ã –Ω–µ–æ–≥—Ä–∞–Ω–∏—á–µ–Ω–Ω–æ–º—É –∫..."
2,AIA_0000_0003,AIA_0000,"# **–ê–∫—Ü–∏–∏, –≤–∫–ª—é—á—ë–Ω–Ω—ã–µ –≤ –∏–Ω–¥–µ–∫—Å**",**–ò–Ω–¥–µ–∫—Å—ã** ‚Äî —ç—Ç–æ –≤–∞–∂–Ω—ã–µ –∏–Ω–¥–∏–∫–∞—Ç–æ—Ä—ã —Å–∏—Ç—É–∞—Ü–∏–∏ –Ω...
3,AIA_0000_0004,AIA_0000,"# **–†–∏—Å–∫–∏, —Å–≤—è–∑–∞–Ω–Ω—ã–µ —Å –ø–æ–∫—É–ø–∫–æ–π –∞–∫—Ü–∏–π –∏–Ω–æ—Å—Ç—Ä–∞–Ω...",–ò–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–µ –∫–æ–º–ø–∞–Ω–∏–∏ –∏ –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–µ –±–∏—Ä–∂–∏ –ø–æ–¥—á–∏...
4,AIA_0001_0005,AIA_0001,–û–±–ª–∏–≥–∞—Ü–∏–∏ —Å –Ω–∏–∑–∫–∏–º –∫—Ä–µ–¥–∏—Ç–Ω—ã–º —Ä–µ–π—Ç–∏–Ω–≥–æ–º,"**–û–±–ª–∏–≥–∞—Ü–∏—è** ‚Äî —ç—Ç–æ –¥–æ–ª–≥–æ–≤–∞—è —Ü–µ–Ω–Ω–∞—è –±—É–º–∞–≥–∞, –æ–±..."


In [6]:
beg_df.head()

Unnamed: 0,chunk_id,article_id,heading,text
0,AIB_0000_0001,AIB_0000,–ß—Ç–æ —Ç–∞–∫–æ–µ –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏?,"–î–µ–Ω—å–≥–∏ –Ω—É–∂–Ω—ã, —á—Ç–æ–±—ã –∏—Ö —Ç—Ä–∞—Ç–∏—Ç—å. –≠—Ç–æ –ø–æ–Ω—è—Ç–Ω–æ. –ù..."
1,AIB_0000_0002,AIB_0000,## **–ß—Ç–æ —ç—Ç–æ –∑–∞ —á–∏—Å–ª–∞ –∏ –ø–æ—á–µ–º—É –≤–¥—Ä—É–≥ —Ç–∞–∫–æ–π —Ä–µ–∑...,"üîê **–•—Ä–∞–Ω–∏–ª–∏ –¥–æ–º–∞** –ù–µ—Å–º–æ—Ç—Ä—è –Ω–∞ —Ç–æ, —á—Ç–æ –¥–µ–Ω—å–≥–∏ ..."
2,AIB_0001_0003,AIB_0001,–ì–¥–µ –∏ –≤–æ —á—Ç–æ –º–æ–∂–Ω–æ –∏–Ω–≤–µ—Å—Ç–∏—Ä–æ–≤–∞—Ç—å?,"–ü–ª–æ—â–∞–¥–∫–∞, –≥–¥–µ –∏–¥—ë—Ç —Ç–æ—Ä–≥–æ–≤–ª—è —Ü–µ–Ω–Ω—ã–º–∏ –±—É–º–∞–≥–∞–º–∏ ‚Äî..."
3,AIB_0001_0004,AIB_0001,# **–ö—Ç–æ —Ç–æ—Ä–≥—É–µ—Ç –Ω–∞ –±–∏—Ä–∂–µ?**,"–ü—Ä–æ–¥–∞–≤—Ü–∞–º–∏ –Ω–∞ –±–∏—Ä–∂–µ –º–æ–≥—É—Ç –±—ã—Ç—å —Å–∞–º–∏ –∫–æ–º–ø–∞–Ω–∏–∏, ..."
4,AIB_0001_0005,AIB_0001,### **–ß—Ç–æ –¥–µ–ª–∞–µ—Ç –±—Ä–æ–∫–µ—Ä:**,1. –û—Ç–∫—Ä—ã–≤–∞–µ—Ç –∏–Ω–≤–µ—Å—Ç–æ—Ä–∞–º [—Å—á–µ—Ç–∞](https://alfaba...


## Analyze text lengths

In [9]:
enc = tiktoken.get_encoding("cl100k_base")

In [39]:
def get_tokens_num(text: str) -> int:
    """Return how many tokens the module-level encoder `enc` produces for `text`."""
    return len(enc.encode(text))

In [40]:
# Add a per-row token count of the `text` column to both frames.
for frame in (adv_df, beg_df):
    frame['num_tokens'] = frame['text'].apply(get_tokens_num)

In [41]:
adv_df.num_tokens.describe(), beg_df.num_tokens.describe()

(count      29.000000
 mean      973.896552
 std       637.543127
 min       187.000000
 25%       577.000000
 50%       797.000000
 75%      1193.000000
 max      3493.000000
 Name: num_tokens, dtype: float64,
 count     38.000000
 mean     280.973684
 std      178.128027
 min       53.000000
 25%      139.000000
 50%      231.000000
 75%      376.000000
 max      768.000000
 Name: num_tokens, dtype: float64)

## Chunking

### Example

In [108]:
splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=16, length_function=get_tokens_num)

In [109]:
splitter.split_text(adv_df.iloc[5]['text'])

['–û–±–ª–∏–≥–∞—Ü–∏–∏ –º–æ–∂–Ω–æ –ø—Ä–∏–æ–±—Ä–µ—Å—Ç–∏ –Ω–∞–ø—Ä—è–º—É—é —É —ç–º–∏—Ç–µ–Ω—Ç–∞ –ø—Ä–∏ —Ä–∞–∑–º–µ—â–µ–Ω–∏–∏ –≤—ã–ø—É—Å–∫–∞ –∏–ª–∏ —É –¥—Ä—É–≥–∏—Ö –∏–Ω–≤–µ—Å—Ç–æ—Ä–æ–≤ –Ω–∞ —Ñ–æ–Ω–¥–æ–≤–æ–º —Ä—ã–Ω–∫–µ, –∏–ª–∏ –Ω–∞ –≤–Ω–µ–±–∏—Ä–∂–µ–≤–æ–º —Ä—ã–Ω–∫–µ.',
 '–ü—Ä–∏ –ø–æ–∫—É–ø–∫–µ –æ–±–ª–∏–≥–∞—Ü–∏–π –∏–Ω–≤–µ—Å—Ç–æ—Ä –∑–∞–ø–ª–∞—Ç–∏—Ç –∑–∞ –Ω–µ—ë –Ω–æ–º–∏–Ω–∞–ª—å–Ω—É—é —Å—Ç–æ–∏–º–æ—Å—Ç—å ‚Äî –µ—Å–ª–∏ –ø–æ–∫—É–ø–∞–µ—Ç –Ω–∞–ø—Ä—è–º—É—é —É —ç–º–∏—Ç–µ–Ω—Ç–∞ ‚Äî –∏–ª–∏ –±–∏—Ä–∂–µ–≤—É—é —Ü–µ–Ω—É, –µ—Å–ª–∏ –ø—Ä–∏–æ–±—Ä–µ—Ç–∞–µ—Ç –µ—ë –Ω–∞ –±–∏—Ä–∂–µ. –¶–µ–Ω—ã –æ–±–ª–∏–≥–∞—Ü–∏–π —É–∫–∞–∑—ã–≤–∞—é—Ç—Å—è –≤ –ø—Ä–æ—Ü–µ–Ω—Ç–∞—Ö –æ—Ç –Ω–æ–º–∏–Ω–∞–ª–∞, –∞ –æ–Ω –≤—Å–µ–≥–¥–∞ —Å–æ—Å—Ç–∞–≤–ª—è–µ—Ç 1000 –µ–¥–∏–Ω–∏—Ü –≤–∞–ª—é—Ç—ã –≤—ã–ø—É—Å–∫–∞. –ù–∞–ø—Ä–∏–º–µ—Ä, —Ü–µ–Ω–∞ —Ä—É–±–ª—ë–≤–æ–π –æ–±–ª–∏–≥–∞—Ü–∏–∏ 98,6% –æ–∑–Ω–∞—á–∞–µ—Ç, —á—Ç–æ –∑–∞ –±—É–º–∞–≥—É –Ω—É–∂–Ω–æ –∑–∞–ø–ª–∞—Ç–∏—Ç—å 98,6% –æ—Ç –Ω–æ–º–∏–Ω–∞–ª–∞, —Ç–æ –µ—Å—Ç—å 986 —Ä—É–±–ª–µ–π. –û–±–ª–∏–≥–∞—Ü–∏–∏ –º–æ–≥—É—Ç –≤—ã–ø—É—Å–∫–∞—Ç—å—Å—è –Ω–µ —Ç–æ

In [110]:
[get_tokens_num(c) for c in splitter.split_text(adv_df.iloc[5]['text'])]

[61, 252, 56, 255, 23, 107]

### Build all chunks

In [111]:
all_chunks = []

In [112]:
# Split every advanced-level paragraph with `splitter` and store each piece as a
# Document appended to `all_chunks`.
# `total` is passed explicitly: `iterrows()` is a generator without a length, so
# tqdm would otherwise render only a bare counter ("0it") with no progress or ETA.
for idx, row in tqdm(adv_df.iterrows(), total=len(adv_df)):
    chunks = splitter.split_text(row.text)
    all_chunks.extend(
        Document(
            page_content=chunk,
            metadata={
                # `chunk_id` is the position of the piece inside its own paragraph
                # (restarts at 0 for every row).
                "chunk_id" : chunk_id,
                "paragraph_id" : row.paragraph_id,
                "article_id" : row.article_id
            }
        )
        for chunk_id, chunk in enumerate(chunks)
    )

0it [00:00, ?it/s]

In [113]:
len(all_chunks)

155

In [114]:
# Same treatment for the beginner-level paragraphs: split each row's text and
# append one Document per piece to `all_chunks`.
for row_idx, paragraph in beg_df.iterrows():
    pieces = splitter.split_text(paragraph.text)
    all_chunks.extend(
        Document(
            page_content=piece,
            metadata={
                "chunk_id" : position,
                "paragraph_id" : paragraph.paragraph_id,
                "article_id" : paragraph.article_id
            }
        )
        for position, piece in enumerate(pieces)
    )

In [115]:
len(all_chunks)

222

In [116]:
all_chunks[100]

Document(page_content='–ï—Å–ª–∏ –ø–∞–¥–∞–µ—Ç —Ü–µ–Ω–∞ –±–∞–∑–æ–≤–æ–≥–æ –∞–∫—Ç–∏–≤–∞, –Ω–∞–ø—Ä–∏–º–µ—Ä –∞–∫—Ü–∏–π, –¥–µ—à–µ–≤–µ–µ—Ç –∏ —Ñ—å—é—á–µ—Ä—Å –Ω–∞ –Ω–∏—Ö. –ï–≥–æ –≤–ª–∞–¥–µ–ª–µ—Ü –Ω–µ—Å—ë—Ç —É–±—ã—Ç–æ–∫ –∏–∑-–∑–∞ —Å–ø–∏—Å–∞–Ω–∏—è –≤–∞—Ä–∏–∞—Ü–∏–æ–Ω–Ω–æ–π –º–∞—Ä–∂–∏ –∏ –∏–∑–º–µ–Ω–µ–Ω–∏—è –≥–∞—Ä–∞–Ω—Ç–∏–π–Ω–æ–≥–æ –æ–±–µ—Å–ø–µ—á–µ–Ω–∏—è. –ü—Ä–∏ –Ω–µ—Ö–≤–∞—Ç–∫–µ —Å—Ä–µ–¥—Å—Ç–≤ –±—Ä–æ–∫–µ—Ä –ø—Ä–µ–¥—É–ø—Ä–µ–¥–∏—Ç, —á—Ç–æ –Ω–∞–¥–æ –ø–æ–ø–æ–ª–Ω–∏—Ç—å —Å—á—ë—Ç. –ï—Å–ª–∏ —ç—Ç–æ–≥–æ –Ω–µ —Å–¥–µ–ª–∞—Ç—å, –¥–∞–∂–µ –ø—Ä–∏ –∑–∞–∫—Ä—ã—Ç–∏–∏ –ø–æ–∑–∏—Ü–∏–∏ –±–∞–ª–∞–Ω—Å —Å—á—ë—Ç–∞ –º–æ–∂–µ—Ç —Å—Ç–∞—Ç—å –æ—Ç—Ä–∏—Ü–∞—Ç–µ–ª—å–Ω—ã–º. –ï—Å–ª–∏ —Ü–µ–Ω–∞ –∞–∫—Ç–∏–≤–∞ —Ä–µ–∑–∫–æ —É–ø–∞–ª–∞, –∞ –Ω–∞ —Å—á—ë—Ç–µ –Ω–µ—Ç –¥–µ–Ω–µ–≥ –¥–ª—è –ø–æ–ø–æ–ª–Ω–µ–Ω–∏—è –≥–∞—Ä–∞–Ω—Ç–∏–π–Ω–æ–≥–æ –æ–±–µ—Å–ø–µ—á–µ–Ω–∏—è –∏–ª–∏ –æ–±—Ä–∞–∑–æ–≤–∞–ª–∞—Å—å –∑–∞–¥–æ–ª–∂–µ–Ω–Ω–æ—Å—Ç—å ‚Äî –±—Ä–æ–∫–µ—Ä –∏–º–µ–µ—Ç –ø—Ä–∞–≤–æ –ø—Ä–∏–Ω—É–¥–∏—Ç–µ–ª—å–Ω–æ –∑–∞–∫—Ä—ã—Ç—å –ø–æ–∑–∏—Ü–∏—é –ø–æ —Ñ—å—é—á–µ—Ä—Å—É.', metadata={'chunk_id': 5, 

# Vector database

In [96]:
embedding_model = OpenAIEmbeddings(
    model="text-embedding-3-small", 
    openai_api_base = "https://api.vsegpt.ru/v1/"
)

In [92]:
db = FAISS.from_documents(all_chunks, embedding_model)
db.save_local("/Users/alfa/Code/financial_assistant/data/processed/docs_db_index")




## Vector DB search

In [97]:
user_query = """
–õ–∏–∫–≤–∏–¥–Ω–æ—Å—Ç—å –∞–∫—Ü–∏–∏ —Ö–∞—Ä–∞–∫—Ç–µ—Ä–∏–∑—É–µ—Ç
–ê) –°–ø–æ—Å–æ–±–Ω–æ—Å—Ç—å –∏–Ω–≤–µ—Å—Ç–æ—Ä–∞ –ø—Ä–æ–¥–∞—Ç—å –∞–∫—Ü–∏—é —Å –º–∏–Ω–∏–º–∞–ª—å–Ω—ã–º–∏ –¥–ª—è –Ω–µ–≥–æ –ø–æ—Ç–µ—Ä—è–º–∏ –≤ –º–∏–Ω–∏–º–∞–ª—å–Ω—ã–π —Å—Ä–æ–∫. 
–ë) –†–∞–∑–Ω–∏—Ü—É —Ü–µ–Ω—ã —Ç–∞–∫–æ–π –∞–∫—Ü–∏–∏ –Ω–∞ —Ä–∞–∑–Ω—ã—Ö —Ç–æ—Ä–≥–æ–≤—ã—Ö –ø–ª–æ—â–∞–¥–∫–∞—Ö. 
–í) –í–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å –ø–æ–≥–∞—à–µ–Ω–∏—è –∞–∫—Ü–∏–∏ –∫–æ–º–ø–∞–Ω–∏–µ–π ‚Äì —ç–º–∏—Ç–µ–Ω—Ç–æ–º. 
–ì) –ù–∏ –æ–¥–∏–Ω –∏–∑ –æ—Ç–≤–µ—Ç–æ–≤ –Ω–µ —è–≤–ª—è–µ—Ç—Å—è –ø—Ä–∞–≤–∏–ª—å–Ω—ã–º.
"""

In [98]:
db.similarity_search(user_query, 3)



[Document(page_content='**–õ–∏–∫–≤–∏–¥–Ω–æ—Å—Ç—å** ‚Äî —ç—Ç–æ —ç–∫–æ–Ω–æ–º–∏—á–µ—Å–∫–∏–π —Ç–µ—Ä–º–∏–Ω, –ø–æ–∫–∞–∑—ã–≤–∞—é—â–∏–π –∫–∞–∫ –±—ã—Å—Ç—Ä–æ –∞–∫—Ç–∏–≤ (–∞–∫—Ü–∏–∏) –º–æ–∂–Ω–æ –ø—Ä–æ–¥–∞—Ç—å –ø–æ —Ü–µ–Ω–µ, –±–ª–∏–∑–∫–æ–π –∫ —Ä—ã–Ω–æ—á–Ω–æ–π, —Ç. –µ. —Å –º–∏–Ω–∏–º–∞–ª—å–Ω—ã–º–∏ –ø–æ—Ç–µ—Ä—è–º–∏ –¥–ª—è –∏–Ω–≤–µ—Å—Ç–æ—Ä–∞. –ï—Å–ª–∏ —Å–ø—Ä–æ—Å –Ω–∞ –ø–æ–∫—É–ø–∫—É –∏ –ø—Ä–æ–¥–∞–∂—É –∞–∫—Ç–∏–≤–∞ –µ—Å—Ç—å –≤—Å–µ–≥–¥–∞ –∏ —Å –Ω–∏–º –∑–∞–∫–ª—é—á–∞–µ—Ç—Å—è –º–Ω–æ–≥–æ —Å–¥–µ–ª–æ–∫, —Ç–∞–∫–æ–π –∞–∫—Ç–∏–≤ –Ω–∞–∑—ã–≤–∞—é—Ç –≤—ã—Å–æ–∫–æ–ª–∏–∫–≤–∏–¥–Ω—ã–º.\n**–í–æ–ª–∞—Ç–∏–ª—å–Ω–æ—Å—Ç—å** (–∞–Ω–≥–ª. volatility) ‚Äî —ç—Ç–æ—Ç —Å—Ç–∞—Ç–∏—Å—Ç–∏—á–µ—Å–∫–∏–π —Ç–µ—Ä–º–∏–Ω —Ö–∞—Ä–∞–∫—Ç–µ—Ä–∏–∑—É–µ—Ç –∏–∑–º–µ–Ω—á–∏–≤–æ—Å—Ç—å –ø–æ–∫–∞–∑–∞—Ç–µ–ª—è –≤ —Ç–µ—á–µ–Ω–∏–µ –æ–ø—Ä–µ–¥–µ–ª—ë–Ω–Ω–æ–≥–æ –≤—Ä–µ–º–µ–Ω–∏. –ü—Ä–∏–º–µ–Ω–∏—Ç–µ–ª—å–Ω–æ –∫ –∞–∫—Ü–∏—è–º –≥–æ–≤–æ—Ä—è—Ç, —á—Ç–æ –æ–Ω–∏ –≤–æ–ª–∞—Ç–∏–ª—å–Ω—ã, –µ—Å–ª–∏ –∏—Ö —Ü–µ–Ω–∞ –º–µ–Ω—è–µ—Ç—Å—è —Å–∏–ª—å–Ω–µ–µ –∏ –±—ã—Å—Ç—Ä–µ–µ, —á–µ–º —É –±–æ–ª—å—à–∏–Ω—Å—Ç–≤–∞ 

# RAG

In [None]:
from openai import OpenAI

# Read the VseGPT key from the environment instead of hard-coding it in the
# notebook; falls back to an empty string when the variable is not set.
VSE_GPT_API_KEY = os.environ.get("VSE_GPT_API_KEY", "")

# OpenAI-compatible client pointed at the VseGPT endpoint.
client = OpenAI(
    api_key=VSE_GPT_API_KEY,
    base_url="https://api.vsegpt.ru/v1",
)

## Questions

In [119]:
# The questions file holds non-ASCII text, so the encoding is pinned to UTF-8
# rather than left to the platform's locale default.
with open('/Users/alfa/Code/financial_assistant/data/interim/test_qual_investor/questions.json', 'r', encoding='utf-8') as f:
    questions = json.load(f)

In [120]:
questions[0]

{'id': '1.4',
 'question': '–õ–∏–∫–≤–∏–¥–Ω–æ—Å—Ç—å –∞–∫—Ü–∏–∏ —Ö–∞—Ä–∞–∫—Ç–µ—Ä–∏–∑—É–µ—Ç',
 'options': [{'letter': '–ê',
   'option_text': '–°–ø–æ—Å–æ–±–Ω–æ—Å—Ç—å –∏–Ω–≤–µ—Å—Ç–æ—Ä–∞ –ø—Ä–æ–¥–∞—Ç—å –∞–∫—Ü–∏—é —Å –º–∏–Ω–∏–º–∞–ª—å–Ω—ã–º–∏ –¥–ª—è –Ω–µ–≥–æ –ø–æ—Ç–µ—Ä—è–º–∏ –≤ –º–∏–Ω–∏–º–∞–ª—å–Ω—ã–π —Å—Ä–æ–∫.'},
  {'letter': '–ë',
   'option_text': '–†–∞–∑–Ω–∏—Ü—É —Ü–µ–Ω—ã —Ç–∞–∫–æ–π –∞–∫—Ü–∏–∏ –Ω–∞ —Ä–∞–∑–Ω—ã—Ö —Ç–æ—Ä–≥–æ–≤—ã—Ö –ø–ª–æ—â–∞–¥–∫–∞—Ö.'},
  {'letter': '–í',
   'option_text': '–í–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å –ø–æ–≥–∞—à–µ–Ω–∏—è –∞–∫—Ü–∏–∏ –∫–æ–º–ø–∞–Ω–∏–µ–π ‚Äì —ç–º–∏—Ç–µ–Ω—Ç–æ–º.'},
  {'letter': '–ì',
   'option_text': '–ù–∏ –æ–¥–∏–Ω –∏–∑ –æ—Ç–≤–µ—Ç–æ–≤ –Ω–µ —è–≤–ª—è–µ—Ç—Å—è –ø—Ä–∞–≤–∏–ª—å–Ω—ã–º.'}],
 'answer': '–ê',
 'chapter': 1}

In [121]:
# The chapter titles are non-ASCII text, so the encoding is pinned to UTF-8
# rather than left to the platform's locale default.
with open('/Users/alfa/Code/financial_assistant/data/interim/test_qual_investor/chapters.json', 'r', encoding='utf-8') as f:
    chapters = json.load(f)

In [122]:
chapters

{'1': '–ü–æ–∫—É–ø–∫–∞ –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã—Ö –∞–∫—Ü–∏–π',
 '2': '–ê–∫—Ü–∏–∏, –Ω–µ –≤–∫–ª—é—á–µ–Ω–Ω—ã–µ –≤ –∫–æ—Ç–∏—Ä–æ–≤–∞–ª—å–Ω—ã–µ —Å–ø–∏—Å–∫–∏',
 '3': '–î–æ–ø—É—Å–∫ –∫ –Ω–µ–æ–±–µ—Å–ø–µ—á–µ–Ω–Ω—ã–º —Å–¥–µ–ª–∫–∞–º (–º–∞—Ä–∂–∏–Ω–∞–ª—å–Ω–∞—è —Ç–æ—Ä–≥–æ–≤–ª—è)',
 '4': '–ó–∞–∫–ª—é—á–µ–Ω–∏–µ –¥–æ–≥–æ–≤–æ—Ä–æ–≤ –†–ï–ü–û',
 '5': '–û–ø—Ü–∏–æ–Ω—ã, —Ñ—å—é—á–µ—Ä—Å—ã, –ø—Ä–æ–∏–∑–≤–æ–¥–Ω—ã–µ —Ñ–∏–Ω–∞–Ω—Å–æ–≤—ã–µ –∏–Ω—Å—Ç—Ä—É–º–µ–Ω—Ç—ã',
 '6': '–°—Ç—Ä—É–∫—Ç—É—Ä–Ω—ã–µ –æ–±–ª–∏–≥–∞—Ü–∏–∏',
 '7': '–ü–∞–∏ –∑–∞–∫—Ä—ã—Ç—ã—Ö –ø–∞–µ–≤—ã—Ö –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–æ–Ω–Ω—ã—Ö —Ñ–æ–Ω–¥–æ–≤ (–ó–ü–ò–§)',
 '8': '–û–±–ª–∏–≥–∞—Ü–∏–∏ —Ä–æ—Å—Å–∏–π—Å–∫–∏—Ö —ç–º–∏—Ç–µ–Ω—Ç–æ–≤, –∫–æ—Ç–æ—Ä—ã–º –Ω–µ –ø—Ä–∏—Å–≤–æ–µ–Ω —Ä–µ–π—Ç–∏–Ω–≥ –∏–ª–∏ –æ–Ω –Ω–∏–∂–µ —É—Ä–æ–≤–Ω—è',
 '9': '–û–±–ª–∏–≥–∞—Ü–∏–∏ –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã—Ö —ç–º–∏—Ç–µ–Ω—Ç–æ–≤ –≤ –≤–∞–ª—é—Ç–µ (–µ–≤—Ä–æ–æ–±–ª–∏–≥–∞—Ü–∏–∏) –∫–æ—Ç–æ—Ä—ã–º –Ω–µ –ø—Ä–∏—Å–≤–æ–µ–Ω —Ä–µ–π—Ç–∏–Ω–≥ –∏–ª–∏ –æ–Ω –Ω–∏–∂–µ –Ω—É–∂–Ω–æ–≥–æ —É—Ä–æ–≤–Ω—è',
 '10': '–û–±–ª–∏–≥–∞—Ü–∏–∏ —Å–æ —Å—Ç—Ä—É–∫—Ç—É—Ä–Ω

## Prompt

In [132]:
# Pin the encoding so reading the prompt does not depend on the platform locale.
with open("/Users/alfa/Code/financial_assistant/artifacts/prompts/system_v1.md", "r", encoding="utf-8") as f:
    system_prompt = f.read()

print(system_prompt)

You are qualified financial and investment assistant. 
Provide helpful answers to any question.  
Stricly follow user instructions.  


In [123]:
# Pin the encoding so reading the template does not depend on the platform locale.
with open("/Users/alfa/Code/financial_assistant/artifacts/prompts/rag_v1.md", "r", encoding="utf-8") as f:
    prompt_template = f.read()

In [124]:
print(prompt_template)

### Instructions ###

Answer the multiple-choice ###Question### about Russian invest market based on ###Context###.
Follow ###Asnwer Format###.


### Answer Format ###
{{
    "reasoning" : "provide your brief (1-2 sentences) reasoning here",
    "answer": "–ë" # one of the first 4 cyrillyc letters: "–ê", "–ë", "–í" or "–ì"
}}

### Context ###
{context}

### Question ###
{question}

{option_1}
{option_2}
{option_3}
{option_4}




## Evaluation

In [138]:
def get_rag_response(
    question_dict : Dict, 
    prompt_template: str, 
    system_prompt: str, 
    db : FAISS,
    model="openai/gpt-4o-2024-11-20",
    sampling_params: Optional[Dict] = None,
    top_k: int = 3,
):
    """Answer one multiple-choice question with retrieval-augmented generation.

    The question text plus all of its options form the similarity-search query
    against ``db``. The retrieved chunks are numbered starting from 0, joined
    with newlines and substituted into the ``context`` field of
    ``prompt_template``. The resulting prompt is sent through the module-level
    ``client`` to the chat completions endpoint.

    Args:
        question_dict: Mapping with a ``"question"`` string and an ``"options"``
            list whose items carry ``"letter"`` and ``"option_text"`` keys.
            All options go into the retrieval query, but only the first four
            are placed into the prompt; fewer than four raises ``IndexError``.
        prompt_template: ``str.format`` template with the fields ``context``,
            ``question`` and ``option_1`` .. ``option_4``.
        system_prompt: Content of the system message.
        db: Vector store queried via ``similarity_search``.
        model: Model identifier passed to the API.
        sampling_params: Extra keyword arguments forwarded to
            ``client.chat.completions.create``. ``None`` (the default) selects
            ``max_tokens=512, temperature=0.5, top_p=0.9``.
        top_k: Number of chunks to retrieve for the context.

    Returns:
        The response object returned by ``client.chat.completions.create``,
        unmodified.
    """
    # Built per call so no dict instance is shared between calls through the
    # function's default arguments.
    if sampling_params is None:
        sampling_params = {
            "max_tokens": 512,
            "temperature": 0.5,
            "top_p": 0.9
        }

    options = [
        opt["letter"] + '. ' + opt["option_text"]
        for opt in question_dict["options"]
    ]

    # The options are part of the retrieval query, not just the bare question.
    question_with_options_text = '\n'.join([question_dict['question']] + options)

    top_docs = db.similarity_search(question_with_options_text, top_k)
    top_docs_texts = [f"{i}. {doc.page_content}" for i, doc in enumerate(top_docs)]
    context = '\n'.join(top_docs_texts)

    prompt = prompt_template.format(
        context=context,
        question=question_dict['question'], 
        option_1=options[0],
        option_2=options[1],
        option_3=options[2], 
        option_4=options[3],
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
        # Trailing assistant turn containing only "{" -- presumably meant to
        # nudge the model into replying with a JSON object.
        {"role": "assistant", "content": "{"}
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        **sampling_params
    )

    return response

# GPT-4o-mini

In [139]:
# Run every question through the RAG pipeline with GPT-4o-mini.
# `responses` keeps the raw API responses; `results` gets one row per question.
results = []
responses = []
for question_dict in tqdm(questions):
    response = get_rag_response(
        question_dict,
        prompt_template,
        system_prompt,
        db,
        model="openai/gpt-4o-mini"
    )
    responses.append(response)
    try:
        response_text =  response.choices[0].message.content
        # Restore the opening brace when the reply does not start with one, so
        # the text can be parsed as a JSON object.
        if '{' not in response_text[:5]:
            response_text = '{' + response_text
        response_dict = json_repair.loads(response_text)

        llm_fields = {
            "llm_answer" : response_dict["answer"],
            "llm_reasoning" : response_dict["reasoning"]
        }
    except Exception as e:
        # Keep the row attributable: an empty dict here would lose the question's
        # id/chapter/answer and hide the failure. A None answer never equals the
        # reference answer, so the question is scored as incorrect, and the
        # error is preserved for inspection.
        llm_fields = {
            "llm_answer" : None,
            "llm_reasoning" : f"<unparsed response: {e!r}>"
        }
    results.append(question_dict | llm_fields)

  0%|          | 0/44 [00:00<?, ?it/s]



In [140]:
res_df = pd.DataFrame(results).set_index("id")
res_df['correct'] = (res_df.answer == res_df.llm_answer).astype(int)
res_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44 entries, 1.4 to 11.7
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   question       44 non-null     object
 1   options        44 non-null     object
 2   answer         44 non-null     object
 3   chapter        44 non-null     int64 
 4   llm_answer     44 non-null     object
 5   llm_reasoning  44 non-null     object
 6   correct        44 non-null     int64 
dtypes: int64(2), object(5)
memory usage: 2.8+ KB


In [141]:
res_df.head()

Unnamed: 0_level_0,question,options,answer,chapter,llm_answer,llm_reasoning,correct
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1.4,–õ–∏–∫–≤–∏–¥–Ω–æ—Å—Ç—å –∞–∫—Ü–∏–∏ —Ö–∞—Ä–∞–∫—Ç–µ—Ä–∏–∑—É–µ—Ç,"[{'letter': '–ê', 'option_text': '–°–ø–æ—Å–æ–±–Ω–æ—Å—Ç—å –∏...",–ê,1,–ê,"–õ–∏–∫–≤–∏–¥–Ω–æ—Å—Ç—å –∞–∫—Ü–∏–∏ –æ–ø—Ä–µ–¥–µ–ª—è–µ—Ç, –Ω–∞—Å–∫–æ–ª—å–∫–æ –±—ã—Å—Ç—Ä–æ...",1
1.5,–ß—Ç–æ –∏–∑ –ø–µ—Ä–µ—á–∏—Å–ª–µ–Ω–Ω–æ–≥–æ –Ω–µ —è–≤–ª—è–µ—Ç—Å—è —Ä–∏—Å–∫–æ–º –ø–æ –ø—Ä...,"[{'letter': '–ê', 'option_text': '–†–∏—Å–∫ –∏–∑–º–µ–Ω–µ–Ω–∏...",–ê,1,–ê,–ò–∑–º–µ–Ω–µ–Ω–∏–µ —Å—É–≤–µ—Ä–µ–Ω–Ω–æ–≥–æ —Ä–µ–π—Ç–∏–Ω–≥–∞ –†–æ—Å—Å–∏–∏ –Ω–µ –≤–ª–∏—è–µ...,1
1.6,"–í —Ñ–æ–Ω–¥–æ–≤—ã–π –∏–Ω–¥–µ–∫—Å, —Ä–∞—Å—Å—á–∏—Ç—ã–≤–∞–µ–º—ã–π –±–∏—Ä–∂–µ–π, –≤–∫–ª—é...","[{'letter': '–ê', 'option_text': '–í—Å–µ –∞–∫—Ü–∏–∏, –¥–æ...",–ë,1,–ë,"–ò–Ω–¥–µ–∫—Å –≤–∫–ª—é—á–∞–µ—Ç –∞–∫—Ü–∏–∏ —Ç–æ–ª—å–∫–æ —Ç–µ—Ö —ç–º–∏—Ç–µ–Ω—Ç–æ–≤, –∫–æ...",1
1.7,"–í —Å–ª—É—á–∞–µ, –µ—Å–ª–∏ –í—ã –∫—É–ø–∏–ª–∏ –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—É—é –∞–∫—Ü–∏—é –∑–∞ ...","[{'letter': '–ê', 'option_text': '500 —Ä—É–±–ª–µ–π.'}...",–í,1,–í,–ü—Ä–∏ –ø—Ä–æ–¥–∞–∂–µ –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω–æ–π –∞–∫—Ü–∏–∏ –¥–æ—Ö–æ–¥ –≤ —Ä—É–±–ª—è—Ö —Ä...,1
2.4,–í—ã –ø–æ–ª—É—á–∏–ª–∏ —É–±—ã—Ç–∫–∏ –æ—Ç —Å–æ–≤–µ—Ä—à–µ–Ω–∏—è —Å–¥–µ–ª–æ–∫ —Å –∞–∫—Ü–∏...,"[{'letter': '–ê', 'option_text': '–ù–µ—Ç, –Ω–µ –≤–æ–∑–º–µ...",–ê,2,–ê,–í –∫–æ–Ω—Ç–µ–∫—Å—Ç–µ —Ç–æ—Ä–≥–æ–≤–ª–∏ –∞–∫—Ü–∏—è–º–∏ —Å –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ–º ...,1


In [142]:
res_df.correct.sum(), res_df.correct.mean()

(42, 0.9545454545454546)

In [143]:
res_df.groupby("chapter")['correct'].mean()

chapter
1     1.00
2     1.00
3     1.00
4     1.00
5     1.00
6     1.00
7     1.00
8     1.00
9     1.00
10    0.75
11    0.75
Name: correct, dtype: float64

In [144]:
res_df.to_csv('/Users/alfa/Code/financial_assistant/data/results/gpt4o-mini_rag.csv')

# GPT-4o

In [145]:
# Run every question through the RAG pipeline with the default model of
# `get_rag_response`.
# `responses` keeps the raw API responses; `results` gets one row per question.
results = []
responses = []
for question_dict in tqdm(questions):
    response = get_rag_response(
        question_dict,
        prompt_template,
        system_prompt,
        db,
    )
    responses.append(response)
    try:
        response_text =  response.choices[0].message.content
        # Restore the opening brace when the reply does not start with one, so
        # the text can be parsed as a JSON object.
        if '{' not in response_text[:5]:
            response_text = '{' + response_text
        response_dict = json_repair.loads(response_text)

        llm_fields = {
            "llm_answer" : response_dict["answer"],
            "llm_reasoning" : response_dict["reasoning"]
        }
    except Exception as e:
        # Keep the row attributable: an empty dict here would lose the question's
        # id/chapter/answer and hide the failure. A None answer never equals the
        # reference answer, so the question is scored as incorrect, and the
        # error is preserved for inspection.
        llm_fields = {
            "llm_answer" : None,
            "llm_reasoning" : f"<unparsed response: {e!r}>"
        }
    results.append(question_dict | llm_fields)

  0%|          | 0/44 [00:00<?, ?it/s]



In [146]:
res_df = pd.DataFrame(results).set_index("id")
res_df['correct'] = (res_df.answer == res_df.llm_answer).astype(int)
res_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44 entries, 1.4 to 11.7
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   question       44 non-null     object
 1   options        44 non-null     object
 2   answer         44 non-null     object
 3   chapter        44 non-null     int64 
 4   llm_answer     44 non-null     object
 5   llm_reasoning  44 non-null     object
 6   correct        44 non-null     int64 
dtypes: int64(2), object(5)
memory usage: 2.8+ KB


In [147]:
res_df.head()

Unnamed: 0_level_0,question,options,answer,chapter,llm_answer,llm_reasoning,correct
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1.4,–õ–∏–∫–≤–∏–¥–Ω–æ—Å—Ç—å –∞–∫—Ü–∏–∏ —Ö–∞—Ä–∞–∫—Ç–µ—Ä–∏–∑—É–µ—Ç,"[{'letter': '–ê', 'option_text': '–°–ø–æ—Å–æ–±–Ω–æ—Å—Ç—å –∏...",–ê,1,–ê,–õ–∏–∫–≤–∏–¥–Ω–æ—Å—Ç—å –∞–∫—Ü–∏–∏ –æ–ø—Ä–µ–¥–µ–ª—è–µ—Ç—Å—è –µ—ë —Å–ø–æ—Å–æ–±–Ω–æ—Å—Ç—å—é...,1
1.5,–ß—Ç–æ –∏–∑ –ø–µ—Ä–µ—á–∏—Å–ª–µ–Ω–Ω–æ–≥–æ –Ω–µ —è–≤–ª—è–µ—Ç—Å—è —Ä–∏—Å–∫–æ–º –ø–æ –ø—Ä...,"[{'letter': '–ê', 'option_text': '–†–∏—Å–∫ –∏–∑–º–µ–Ω–µ–Ω–∏...",–ê,1,–ê,–†–∏—Å–∫ –∏–∑–º–µ–Ω–µ–Ω–∏—è —Å—É–≤–µ—Ä–µ–Ω–Ω–æ–≥–æ —Ä–µ–π—Ç–∏–Ω–≥–∞ –†–æ—Å—Å–∏–π—Å–∫–æ–π...,1
1.6,"–í —Ñ–æ–Ω–¥–æ–≤—ã–π –∏–Ω–¥–µ–∫—Å, —Ä–∞—Å—Å—á–∏—Ç—ã–≤–∞–µ–º—ã–π –±–∏—Ä–∂–µ–π, –≤–∫–ª—é...","[{'letter': '–ê', 'option_text': '–í—Å–µ –∞–∫—Ü–∏–∏, –¥–æ...",–ë,1,–ë,"–°–æ–≥–ª–∞—Å–Ω–æ –∫–æ–Ω—Ç–µ–∫—Å—Ç—É, –∞–∫—Ü–∏–∏ –≤–∫–ª—é—á–∞—é—Ç—Å—è –≤ —Ñ–æ–Ω–¥–æ–≤—ã...",1
1.7,"–í —Å–ª—É—á–∞–µ, –µ—Å–ª–∏ –í—ã –∫—É–ø–∏–ª–∏ –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—É—é –∞–∫—Ü–∏—é –∑–∞ ...","[{'letter': '–ê', 'option_text': '500 —Ä—É–±–ª–µ–π.'}...",–í,1,–í,–ü—Ä–∏–±—ã–ª—å –æ—Ç –ø—Ä–æ–¥–∞–∂–∏ –∞–∫—Ü–∏–∏ –≤ –¥–æ–ª–ª–∞—Ä–∞—Ö —Å–æ—Å—Ç–∞–≤–ª—è–µ—Ç...,1
2.4,–í—ã –ø–æ–ª—É—á–∏–ª–∏ —É–±—ã—Ç–∫–∏ –æ—Ç —Å–æ–≤–µ—Ä—à–µ–Ω–∏—è —Å–¥–µ–ª–æ–∫ —Å –∞–∫—Ü–∏...,"[{'letter': '–ê', 'option_text': '–ù–µ—Ç, –Ω–µ –≤–æ–∑–º–µ...",–ê,2,–ê,"–í –∫–æ–Ω—Ç–µ–∫—Å—Ç–µ —É–∫–∞–∑–∞–Ω–æ, —á—Ç–æ —É–±—ã—Ç–∫–∏ –æ—Ç —Å–¥–µ–ª–æ–∫ —Å –∞–∫...",1


In [148]:
res_df.correct.sum(), res_df.correct.mean()

(42, 0.9545454545454546)

In [149]:
res_df.groupby("chapter")['correct'].mean()

chapter
1     1.00
2     1.00
3     1.00
4     1.00
5     1.00
6     1.00
7     1.00
8     1.00
9     1.00
10    0.75
11    0.75
Name: correct, dtype: float64

In [150]:
res_df.to_csv('/Users/alfa/Code/financial_assistant/data/results/gpt4o_rag.csv')