In [1]:
import json
from typing import List, Optional, Tuple

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer



Dataset loading and embedding dimensions

In [2]:
# Load the FinRAD glossary, keep the four columns used below, drop rows that
# have no definition, and add a "term: definition" string per row for embedding.
df = (
    pd.read_csv("hf://datasets/sohomghosh/FinRAD_Financial_Readability_Assessment_Dataset/FinRAD_13K_terms_definitions_labels.csv")
    .loc[:, ['terms', 'definitions', 'source', 'assigned_readability']]
    .dropna(subset=['definitions'])
    .assign(combined=lambda frame: frame['terms'] + ": " + frame['definitions'])
)

df.head(10)

Unnamed: 0,terms,definitions,source,assigned_readability,combined
0,adjusted present value (apv),net present value of an asset if financed sole...,prin,0,adjusted present value (apv): net present valu...
1,agency costs,"costs that arise when an agent (e.g., a manage...",prin,0,agency costs: costs that arise when an agent (...
2,annual percentage rate (apr),"the interest rate per period (e.g., per month)...",prin,0,annual percentage rate (apr): the interest rat...
3,annuity,investment that produces a level stream of cas...,prin,0,annuity: investment that produces a level stre...
4,annuity due,annuity whose payments occur at the start of e...,prin,0,annuity due: annuity whose payments occur at t...
5,annuity factor,present value of $1 paid for each of t periods.,prin,0,annuity factor: present value of $1 paid for e...
6,apr,annual percentage rate.,prin,0,apr: annual percentage rate.
7,apt,arbitrage pricing theory.,prin,0,apt: arbitrage pricing theory.
8,apv,adjusted present value.,prin,0,apv: adjusted present value.
9,arbitrage,purchase of one security and simultaneous sale...,prin,0,arbitrage: purchase of one security and simult...


In [3]:
# Sentence encoder used for the corpus here and for the queries further down.
model: SentenceTransformer = SentenceTransformer('all-MiniLM-L6-v2')

# One "term: definition" string per row, encoded in a single call and cast to
# float32 before it is handed to the FAISS index.
definitions = df['combined'].tolist()
embeddings = np.array(
    model.encode(definitions, show_progress_bar=True)
).astype("float32")

embeddings[0]

Batches:   0%|          | 0/410 [00:00<?, ?it/s]

array([ 9.32512339e-03,  4.77343388e-02, -1.37930121e-02, -2.46056337e-02,
       -1.89344753e-02,  1.02830835e-01,  3.40325944e-02,  2.60587465e-02,
        9.15180519e-02,  7.34160393e-02,  4.50099446e-02,  4.72288132e-02,
        4.13446268e-03, -2.56536296e-03, -4.11397517e-02, -2.92041674e-02,
        2.73405574e-02, -1.63149834e-02,  2.31581945e-02,  7.56385550e-02,
        3.66294980e-02,  1.20683550e-03,  6.10514954e-02, -6.50032470e-03,
        6.39065504e-02,  1.38741760e-02, -6.69310661e-03,  1.25044793e-01,
        1.02823697e-01, -4.42968607e-02,  2.55347714e-02,  6.50557503e-02,
        5.39314598e-02, -3.86615656e-02, -3.67391892e-02,  2.73482408e-02,
       -6.86454549e-02,  3.33623439e-02, -5.03214151e-02,  3.59417722e-02,
       -3.71708386e-02, -2.01442931e-02,  6.27672253e-03, -1.37663847e-02,
        2.59798765e-02, -2.79499870e-02,  1.01291630e-02,  1.77095551e-02,
       -4.32795845e-02,  2.39488613e-02,  6.69370145e-02, -1.00054881e-02,
       -1.27501115e-02, -

In [4]:
# Show the embedding matrix dimensions: (number of texts, embedding width).
embeddings.shape

(13112, 384)

In [5]:
# Size the FAISS index to the width of one embedding vector (second axis of
# the embedding matrix).
embedding_dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dimension)

In [6]:
# Add every corpus embedding to the index, then report how many vectors it holds.
index.add(embeddings)
print(f"Number of embeddings indexed: {index.ntotal}")

Number of embeddings indexed: 13112


In [7]:
# Embed the query with the same model as the corpus. encode() receives a
# one-element list, and the result is cast to float32 like the corpus embeddings.
query = "What is the meaning of annuity in general?"
query_embedding = model.encode([query]).astype("float32")

query_embedding

array([[-1.52819240e-02,  8.45330954e-02, -2.94869039e-02,
         8.71534832e-03, -2.58617979e-02, -2.20344635e-03,
         8.29834193e-02,  4.10373546e-02,  8.34113657e-02,
         8.78860429e-03,  2.71564536e-02,  1.36507698e-03,
        -4.78146374e-02, -2.92081553e-02, -3.65527086e-02,
        -7.86439888e-03, -3.76233570e-02,  3.21027450e-02,
         3.54461893e-02,  9.42598209e-02,  4.52506505e-02,
        -8.95656832e-03, -6.27751425e-02,  1.90205872e-02,
         9.56232622e-02,  9.76262987e-03, -2.67772563e-03,
         3.98072274e-03, -3.59938890e-02,  3.94146629e-02,
         3.27179994e-04,  8.54069144e-02,  4.43855822e-02,
        -2.96239611e-02,  5.75619098e-03,  3.81189957e-02,
        -3.12366448e-02,  7.95420110e-02, -3.89854535e-02,
         4.90897931e-02, -1.39438380e-02, -2.50134338e-03,
         8.12505260e-02, -4.02755179e-02,  1.46453986e-02,
         2.39973571e-02, -4.09620181e-02, -1.21002588e-02,
        -1.71546452e-02,  3.91449891e-02,  1.60279695e-0

In [8]:
# Ask the index for the k stored vectors closest to the query. `indices` holds
# positions in the order the vectors were added; `distances` holds the matching
# distances. Both have one row per query.
k = 2
distances, indices = index.search(query_embedding, k)

indices

array([[9363,    3]])

In [9]:
# Distances returned by the search above, one per returned index.
distances

array([[0.40726367, 0.45648456]], dtype=float32)

In [10]:
def decode_indices(indices, texts):
    """
    Map positions back to their text entries.

    `indices` is any iterable of integer positions and `texts` is the
    sequence they point into. The result is a new list holding
    `texts[position]` for each position, in the order given.
    """
    decoded = []
    for position in indices:
        decoded.append(texts[position])
    return decoded

In [11]:
# Turn the hit positions of the first (only) query back into text and print
# each hit next to its distance, numbered from 1.
decoded_results = decode_indices(indices[0], df.combined.tolist())
print("\nDecoded similarity search results:")
for rank, (text, distance) in enumerate(zip(decoded_results, distances[0]), start=1):
    print(f"{rank}. [{distance}] {text}")


Decoded similarity search results:
1. [0.4072636663913727] ANNUITY: An INVESTMENT CONTRACT, generally purchased from an INSURER through single or multiple tax-deferred CAPITAL contributions, that guarantees fixed or variable payments to an ANNUITANT starting at some future date, and lasting for a stated period of time. See also CERTAIN , PERPETUITY, PRESENT VALUE, FUTURE VALUE, LIFE .
2. [0.4564845561981201] annuity: investment that produces a level stream of cash flows for a limited number of periods.


In [None]:
from typing import List

class Retriever:
    """Similarity search over a fixed list of texts.

    Every entry of ``knowledge`` is embedded once with the
    ``all-MiniLM-L6-v2`` sentence-transformer and added to a FAISS
    ``IndexFlatL2``. :meth:`search` embeds a query with the same model and
    returns the matching entries together with their distances.

    Attributes:
        knowledge: the texts, in the order they were indexed.
        top_k: default number of results per query.
        distances: distances of the most recent search (None before any search).
        indices: positions in ``knowledge`` of the most recent hits
            (None before any search).
    """

    def __init__(self, knowledge: List[str], k: int = 3):
        """Embed ``knowledge`` and build the index.

        Args:
            knowledge: texts to make searchable; must not be empty.
            k: default number of results returned by :meth:`search`.

        Raises:
            ValueError: if ``k`` is not positive or ``knowledge`` is empty.
        """
        if k <= 0:
            raise ValueError("Give a positive top k parameter.")
        if len(knowledge) == 0:
            # Without this guard the failure surfaces later as an opaque
            # error when the embedding width is read from an empty array.
            raise ValueError("Give at least one knowledge entry to index.")

        self.knowledge = knowledge
        self.top_k = k
        self.type = "float32"
        self.model: SentenceTransformer = SentenceTransformer('all-MiniLM-L6-v2')
        self.embeddings = np.array(
            self.model.encode(knowledge)
        ).astype(self.type)
        self.index = faiss.IndexFlatL2(self.embeddings.shape[1])
        self.index.add(self.embeddings)
        # Results of the latest search; both exist from construction on so
        # callers can read them without risking an AttributeError.
        self.distances = None
        self.indices = None

    def search(self, query: str, top_k: Optional[int] = None) -> Tuple[List[str], np.ndarray]:
        """Search the query in the index by similarity.

        Args:
            query: free-text query to embed and look up.
            top_k: number of results wanted; ``None`` (or 0, kept for
                backward compatibility) falls back to the constructor default.

        Returns:
            ``(texts, distances)``: the matching knowledge entries and a
            NumPy array with their distances, in the order the index
            returned them. Fewer than ``top_k`` items are returned when the
            index cannot supply that many.

        Raises:
            ValueError: if ``top_k`` is negative.
        """
        if not top_k:
            top_k = self.top_k
        elif top_k < 0:
            raise ValueError("Give a positive top k parameter.")

        query_embedding = self.model.encode([query]).astype(self.type)
        distances, indices = self.index.search(query_embedding, top_k)

        # FAISS fills missing results with index -1 when fewer than top_k
        # vectors are available. Used as a Python list index, -1 would silently
        # return the LAST knowledge entry, so those slots are dropped.
        found = indices[0] >= 0
        self.distances = distances[0][found]
        self.indices = indices[0][found]

        return [self.knowledge[i] for i in self.indices], self.distances
        

In [13]:
# Build a retriever over every "term: definition" string, with 5 results per
# query by default. The constructor encodes the whole list.
pipeline = Retriever(df.combined.tolist(), 5)

In [14]:
# Dedicated names are used here on purpose: at notebook level, assignment and
# loop targets are globals, and reusing `k`, `top_k` or `distances` would
# overwrite the values earlier cells rely on when they are re-run.
results, result_distances = pipeline.search("What is the meaning of arbitrage in general?", top_k=5)

for text, dist in zip(results, result_distances):
    print(f"distance: [{dist:.4f}]: {text}")

distance: [0.3871]: arbitrage: purchase of one security and simultaneous sale of another to give a risk-free profit. often used loosely to describe the taking of offsetting positions in related securities, e.g., at the time of a takeover bid.
distance: [0.4583]: Arbitrage: The purchase of a good or asset in one market for immediate resale in another market in order  t from a price discrepancy.
distance: [0.4683]: ARBITRAGEUR: [FR] An individual or institution engaged in ARBITRAGE transactions. See also ARB.
distance: [0.5816]: Arbitrageur: An individual engaging in arbitrage.
distance: [0.5833]: Tax arbitrage: Creating FINANCIAL INSTRUMENTS or transactions that allow the parties involved to exploit loopholes in or differences between their tax exposures, so that all involved pay less tax.


In [15]:
# List the columns currently present on df.
df.columns

Index(['terms', 'definitions', 'source', 'assigned_readability', 'combined'], dtype='object')

In [16]:
from collections import Counter
import re

# Turn each glossary term into a lower-case phrase that can be searched for:
#   * non-string terms are skipped, because re.sub only accepts strings
#     (only `definitions` was checked for missing values when df was built);
#   * every run of non-word characters becomes ONE space instead of being
#     deleted, so "annuity due" stays "annuity due" rather than collapsing
#     into "annuitydue", which could never match running text;
#   * phrases are lower-cased and de-duplicated (first occurrence wins) so
#     case variants of the same term are not counted twice later on.
words_to_count = [
    re.sub(r'\W+', ' ', term).strip().lower()
    for term in df.terms.tolist()
    if isinstance(term, str)
]
# Drop very short phrases (fewer than 3 characters).
words_to_count = list(dict.fromkeys(w for w in words_to_count if len(w) >= 3))

In [17]:
import logging
from typing import Optional
from pathlib import Path

def load_if_scraped(company_id: str) -> Optional[pd.DataFrame]:
    '''
    Load the locally stored transcripts for one company.

    Looks for ``../data/<company_id>.csv`` relative to the working directory
    and parses it as a tab-separated, double-quoted CSV. Returns the parsed
    DataFrame, or None when that file does not exist.
    '''
    csv_path = Path('..', 'data', f'{company_id}.csv')
    if not csv_path.exists():
        logging.debug('no local transcripts found')
        return None

    transcripts = pd.read_csv(
        csv_path,
        sep='\t',
        quotechar='"',
        quoting=1,  # numeric value of csv.QUOTE_ALL
        doublequote=True,
        escapechar='\\',
    )
    logging.info('successfully loaded local transcripts')
    return transcripts

company_id = '312932093'
df_company = load_if_scraped(company_id)
if df_company is None:
    # The loader returns None when there is no local file; stop here with an
    # actionable message instead of an AttributeError on the next line.
    raise FileNotFoundError(f"No local transcripts found for company_id={company_id!r}")

# Lower-case every transcript once. Missing values are dropped first so that
# lower-casing never meets a non-string cell.
search_in = df_company.full_text.dropna().astype(str).str.lower().tolist()

# One long lower-case string so that each phrase needs a single substring count.
full_text = ' '.join(search_in)

# Count each phrase once, keyed by its lower-case form. Otherwise case
# variants of the same term (e.g. "ANNUITY" and "annuity") would each report
# the same occurrences and inflate the cumulative total.
# NOTE(review): str.count matches substrings, so a short phrase is also
# counted inside longer words. Confirm that this is intended.
phrases = list(dict.fromkeys(phrase.lower() for phrase in words_to_count))
counts = Counter({phrase: full_text.count(phrase) for phrase in phrases})

# Sort counts in descending order
sorted_counts = counts.most_common()

# Write to file (explicit encoding so the output does not depend on the platform default)
with open("phrase_counts.txt", "w", encoding="utf-8") as f:
    for phrase, count in sorted_counts:
        f.write(f"{phrase}: {count}\n")

    total = sum(counts.values())
    f.write(f"\nTotal cumulative count: {total}\n")