In [2]:
import pandas as pd

# Lookup table: each position across the four parallel lists pairs a raw
# business term (and its description) with the preferred term/description.
data1 = dict(
    business_term=['var', 'ROI', 'equity', 'profit'],
    business_description=[
        'var in finance',
        'ROI for projects',
        'ownership in a company',
        'net income after expenses',
    ],
    preferred_business_term=['ValueAtRisk', 'ReturnOnInvestment', 'EquityShare', 'NetProfit'],
    preferred_business_description=[
        'measure of the risk of loss for investments',
        'ratio between net profit and cost of investment',
        'value that represents part ownership in a corporation',
        'the amount of money that remains after all business expenses have been deducted from gross income',
    ],
)

df1 = pd.DataFrame(data=data1)

# Abbreviation table: short form -> spelled-out form.
data2 = dict(
    abbreviation=['var', 'ROI'],
    full_form=['value at risk', 'return on investment'],
)

df2 = pd.DataFrame(data=data2)


In [3]:
# Map every abbreviation in df2 to its spelled-out form; read by preprocess().
abbreviation_dict = {
    short_form: spelled_out
    for short_form, spelled_out in zip(df2['abbreviation'], df2['full_form'])
}

def preprocess(text, abbreviations=None):
    """Expand known abbreviations and normalise ``text`` before embedding.

    Processing steps:

    1. Every *whole-word* occurrence of an abbreviation is replaced by its
       full form. Matching is case-insensitive and done in a single pass, so
       an abbreviation embedded in a longer word (``var`` in ``variable``)
       is left alone and an expansion is never expanded a second time.
    2. The text is split on whitespace and punctuation attached to either end
       of a token is stripped (``"project?"`` becomes ``"project"``).
    3. Tokens that are still not purely alphabetic are dropped; the remaining
       tokens are lower-cased and joined with single spaces.

    Parameters
    ----------
    text : str
        Raw input text. ``None`` or an empty string yields ``''``.
    abbreviations : dict[str, str], optional
        Mapping of abbreviation to full form. When omitted, the module-level
        ``abbreviation_dict`` is used.

    Returns
    -------
    str
        The normalised, space-separated token string.
    """
    import re
    import string

    if abbreviations is None:
        abbreviations = abbreviation_dict
    if not text:
        return ''

    # Key by lower-cased abbreviation so the case-insensitive match can be
    # mapped back to its expansion; skip empty / non-string keys.
    expansions = {
        abb.lower(): full
        for abb, full in abbreviations.items()
        if isinstance(abb, str) and abb
    }
    if expansions:
        # Longest alternatives first so overlapping abbreviations prefer the
        # longer match. Lookarounds act as word boundaries that also work for
        # abbreviations ending in non-word characters.
        alternatives = '|'.join(
            re.escape(abb) for abb in sorted(expansions, key=len, reverse=True)
        )
        pattern = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)', re.IGNORECASE)
        text = pattern.sub(
            lambda match: expansions.get(match.group(0).lower(), match.group(0)),
            text,
        )

    stripped = (token.strip(string.punctuation) for token in text.split())
    return ' '.join(token.lower() for token in stripped if token.isalpha())


In [4]:
!python -m spacy download en_core_web_md

2023-08-26 18:14:15.550765: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting en-core-web-md==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.6.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [5]:
import spacy

# Medium English spaCy pipeline, loaded once and shared by get_embedding().
nlp = spacy.load('en_core_web_md')

def get_embedding(text):
    """Run ``text`` through the shared ``nlp`` pipeline and return the
    ``.vector`` attribute of the resulting document."""
    doc = nlp(text)
    return doc.vector


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

def get_most_similar(user_input, df, top_k=4):
    """Rank preferred business terms by similarity to a free-text query.

    The query and every ``preferred_business_description`` in ``df`` are
    passed through ``preprocess`` and ``get_embedding``; candidates are then
    ordered by cosine similarity to the query, highest first.

    Parameters
    ----------
    user_input : str
        Free-text query.
    df : pandas.DataFrame
        Must contain the columns ``preferred_business_term`` and
        ``preferred_business_description``.
    top_k : int or None, default 4
        Maximum number of terms to return. ``None`` returns every term.

    Returns
    -------
    list
        Up to ``top_k`` values from ``preferred_business_term``, best match
        first. Empty when ``df`` has no rows or ``top_k`` is not positive.
    """
    # Nothing to rank: return early instead of handing an empty matrix to
    # cosine_similarity.
    if len(df) == 0 or (top_k is not None and top_k <= 0):
        return []

    user_embedding = get_embedding(preprocess(user_input))
    term_embeddings = [
        get_embedding(preprocess(description))
        for description in df['preferred_business_description']
    ]

    # One batched call; row 0 holds the query's similarity to each candidate.
    scores = cosine_similarity([user_embedding], term_embeddings)[0]

    ranked = sorted(
        zip(df['preferred_business_term'], scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [term for term, _ in ranked[:top_k]]


In [7]:
# Free-text query ranked against the preferred descriptions held in df1.
user_input = "What is the return for my investment project?"
top_terms = get_most_similar(user_input=user_input, df=df1)
top_terms

['NetProfit', 'EquityShare', 'ValueAtRisk', 'ReturnOnInvestment']

In [8]:
def get_most_similar_v2(business_term, business_description, df, top_k=4):
    """Rank preferred business terms using both a term and its description.

    The query text is ``business_term + " " + business_description``. Each
    row of ``df`` is represented by its *raw* ``business_term`` and
    ``business_description`` (not the preferred description), combined the
    same way. Both sides go through ``preprocess`` and ``get_embedding`` and
    are compared by cosine similarity.

    Parameters
    ----------
    business_term : str
        Term supplied by the user.
    business_description : str
        Description supplied by the user.
    df : pandas.DataFrame
        Must contain the columns ``business_term``, ``business_description``
        and ``preferred_business_term``.
    top_k : int or None, default 4
        Maximum number of terms to return. ``None`` returns every term.

    Returns
    -------
    list
        Up to ``top_k`` values from ``preferred_business_term``, best match
        first. Empty when ``df`` has no rows or ``top_k`` is not positive.
    """
    # Nothing to rank: return early instead of handing an empty matrix to
    # cosine_similarity.
    if len(df) == 0 or (top_k is not None and top_k <= 0):
        return []

    user_input = business_term + " " + business_description
    user_embedding = get_embedding(preprocess(user_input))

    # Candidate text mirrors the query layout: "<term> <description>".
    candidate_embeddings = [
        get_embedding(preprocess(term + " " + description))
        for term, description in zip(df['business_term'], df['business_description'])
    ]

    # One batched call; row 0 holds the query's similarity to each candidate.
    scores = cosine_similarity([user_embedding], candidate_embeddings)[0]

    ranked = sorted(
        zip(df['preferred_business_term'], scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [term for term, _ in ranked[:top_k]]

# Smoke test: query with both a business term and a business description.
user_business_term, user_business_description = "ROI", "ratio of net gain in the project"
top_terms_v2 = get_most_similar_v2(
    business_term=user_business_term,
    business_description=user_business_description,
    df=df1,
)
top_terms_v2


['ReturnOnInvestment', 'EquityShare', 'ValueAtRisk', 'NetProfit']

In [9]:
# Echo the raw dictionary that df1 was built from.
data1

{'business_term': ['var', 'ROI', 'equity', 'profit'],
 'business_description': ['var in finance',
  'ROI for projects',
  'ownership in a company',
  'net income after expenses'],
 'preferred_business_term': ['ValueAtRisk',
  'ReturnOnInvestment',
  'EquityShare',
  'NetProfit'],
 'preferred_business_description': ['measure of the risk of loss for investments',
  'ratio between net profit and cost of investment',
  'value that represents part ownership in a corporation',
  'the amount of money that remains after all business expenses have been deducted from gross income']}

In [10]:


# Raw business terms together with their descriptions.
data_terms = dict(
    business_term=['var', 'ROI', 'equity', 'profit'],
    business_description=[
        'var in finance',
        'ROI for projects',
        'ownership in a company',
        'net income after expenses',
    ],
)
df_terms = pd.DataFrame(data=data_terms)

# Preferred (canonical) terms together with their descriptions.
data_preferred_terms = dict(
    preferred_business_term=['ValueAtRisk', 'ReturnOnInvestment', 'EquityShare', 'NetProfit'],
    preferred_business_description=[
        'measure of the risk of loss for investments',
        'ratio between net profit and cost of investment',
        'value that represents part ownership in a corporation',
        'the amount of money that remains after all business expenses have been deducted from gross income',
    ],
)
df_preferred_terms = pd.DataFrame(data=data_preferred_terms)

# Abbreviations and their spelled-out forms.
data_abbreviations = dict(
    abbreviation=['var', 'ROI'],
    full_form=['value at risk', 'return on investment'],
)
df_abbreviations = pd.DataFrame(data=data_abbreviations)

# abbreviation -> full form lookup, read by preprocess().
abbreviation_dict = {
    short_form: spelled_out
    for short_form, spelled_out in zip(
        df_abbreviations['abbreviation'], df_abbreviations['full_form']
    )
}

def preprocess(text, abbreviations=None):
    """Expand known abbreviations and normalise ``text`` before embedding.

    Processing steps:

    1. Every *whole-word* occurrence of an abbreviation is replaced by its
       full form. Matching is case-insensitive and done in a single pass, so
       an abbreviation embedded in a longer word (``var`` in ``variable``)
       is left alone and an expansion is never expanded a second time.
    2. The text is split on whitespace and punctuation attached to either end
       of a token is stripped (``"project?"`` becomes ``"project"``).
    3. Tokens that are still not purely alphabetic are dropped; the remaining
       tokens are lower-cased and joined with single spaces.

    Parameters
    ----------
    text : str
        Raw input text. ``None`` or an empty string yields ``''``.
    abbreviations : dict[str, str], optional
        Mapping of abbreviation to full form. When omitted, the module-level
        ``abbreviation_dict`` is used.

    Returns
    -------
    str
        The normalised, space-separated token string.
    """
    import re
    import string

    if abbreviations is None:
        abbreviations = abbreviation_dict
    if not text:
        return ''

    # Key by lower-cased abbreviation so the case-insensitive match can be
    # mapped back to its expansion; skip empty / non-string keys.
    expansions = {
        abb.lower(): full
        for abb, full in abbreviations.items()
        if isinstance(abb, str) and abb
    }
    if expansions:
        # Longest alternatives first so overlapping abbreviations prefer the
        # longer match. Lookarounds act as word boundaries.
        alternatives = '|'.join(
            re.escape(abb) for abb in sorted(expansions, key=len, reverse=True)
        )
        pattern = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)', re.IGNORECASE)
        text = pattern.sub(
            lambda match: expansions.get(match.group(0).lower(), match.group(0)),
            text,
        )

    stripped = (token.strip(string.punctuation) for token in text.split())
    return ' '.join(token.lower() for token in stripped if token.isalpha())

# Embed every preferred description once up front so that each query only has
# to embed the user's own text.
precomputed_embeddings = list(
    map(
        lambda description: get_embedding(preprocess(description)),
        df_preferred_terms['preferred_business_description'],
    )
)

def get_most_similar_optimized_v2(business_term, business_description, top_k=4):
    """Rank preferred business terms against a (term, description) query
    using the module-level ``precomputed_embeddings``.

    The query text is ``business_term + " " + business_description``, passed
    through ``preprocess`` and ``get_embedding``. It is compared by cosine
    similarity with ``precomputed_embeddings``, whose i-th entry is expected
    to belong to the i-th row of ``df_preferred_terms``.

    Parameters
    ----------
    business_term : str
        Term supplied by the user.
    business_description : str
        Description supplied by the user.
    top_k : int or None, default 4
        Maximum number of terms to return. ``None`` returns every term.

    Returns
    -------
    list
        Up to ``top_k`` values from ``preferred_business_term``, best match
        first. Empty when there are no candidates or ``top_k`` is not
        positive.

    Raises
    ------
    ValueError
        If ``precomputed_embeddings`` and ``df_preferred_terms`` differ in
        length, e.g. because the dataframe was redefined without re-running
        the precompute step. Pairing them anyway would silently truncate or
        misalign the ranking.
    """
    if top_k is not None and top_k <= 0:
        return []

    preferred_terms = list(df_preferred_terms['preferred_business_term'])
    if len(preferred_terms) != len(precomputed_embeddings):
        raise ValueError(
            "precomputed_embeddings is out of sync with df_preferred_terms; "
            "re-run the precompute step"
        )
    if not preferred_terms:
        return []

    user_input = business_term + " " + business_description
    user_embedding = get_embedding(preprocess(user_input))

    # One batched call; row 0 holds the query's similarity to each candidate.
    similarities = cosine_similarity([user_embedding], precomputed_embeddings)[0]

    ranked = sorted(
        zip(preferred_terms, similarities),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [term for term, _ in ranked[:top_k]]

# Smoke test for the precomputed-embedding variant.
user_business_term, user_business_description = "ROI", "ratio of net gain in the project"
top_terms_optimized_v2 = get_most_similar_optimized_v2(
    business_term=user_business_term,
    business_description=user_business_description,
)
top_terms_optimized_v2


['ValueAtRisk', 'ReturnOnInvestment', 'NetProfit', 'EquityShare']

In [3]:
# !pip install transformers
# !pip install torch
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [5]:
import numpy as np
import pandas as pd
import faiss
from transformers import BertTokenizer, BertModel
import torch

# Three mock tables: raw terms, preferred terms, and abbreviations.

# Raw business terms together with their descriptions.
data_terms = dict(
    business_term=['var', 'ROI', 'equity', 'profit'],
    business_description=[
        'var in finance',
        'ROI for projects',
        'ownership in a company',
        'net income after expenses',
    ],
)
df_terms = pd.DataFrame(data=data_terms)

# Preferred terms; the final entry is a nonsense row.
data_preferred_terms = dict(
    preferred_business_term=['ValueAtRisk', 'ReturnOnInvestment', 'EquityShare', 'NetProfit', 'aojsdsd'],
    preferred_business_description=[
        'measure of the risk of loss for investments',
        'ratio between net profit and cost of investment',
        'value that represents part ownership in a corporation',
        'the amount of money that remains after all business expenses have been deducted from gross income',
        'oeijsdfds',
    ],
)
df_preferred_terms = pd.DataFrame(data=data_preferred_terms)

# Abbreviations and their spelled-out forms.
data_abbreviations = dict(
    abbreviation=['var', 'ROI'],
    full_form=['value at risk', 'return on investment'],
)
df_abbreviations = pd.DataFrame(data=data_abbreviations)

# abbreviation -> full form lookup used for abbreviation expansion.
abbreviation_dict = {
    short_form: spelled_out
    for short_form, spelled_out in zip(
        df_abbreviations['abbreviation'], df_abbreviations['full_form']
    )
}

def preprocess(text, abbreviations=None):
    """Expand known abbreviations in ``text`` and leave the rest untouched.

    Only *whole-word* occurrences are replaced (case-insensitively), and all
    replacements happen in one pass. An abbreviation embedded in a longer
    word (``var`` in ``variable``) is therefore not rewritten, and an
    expansion is never expanded a second time. No tokenisation or
    lower-casing is done here.

    Parameters
    ----------
    text : str
        Raw input text. ``None`` or an empty string yields ``''``.
    abbreviations : dict[str, str], optional
        Mapping of abbreviation to full form. When omitted, the module-level
        ``abbreviation_dict`` is used.

    Returns
    -------
    str
        ``text`` with abbreviations replaced by their full forms.
    """
    import re

    if abbreviations is None:
        abbreviations = abbreviation_dict
    if not text:
        return ''

    # Key by lower-cased abbreviation so the case-insensitive match can be
    # mapped back to its expansion; skip empty / non-string keys.
    expansions = {
        abb.lower(): full
        for abb, full in abbreviations.items()
        if isinstance(abb, str) and abb
    }
    if not expansions:
        return text

    # Longest alternatives first so overlapping abbreviations prefer the
    # longer match. Lookarounds act as word boundaries.
    alternatives = '|'.join(
        re.escape(abb) for abb in sorted(expansions, key=len, reverse=True)
    )
    pattern = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)', re.IGNORECASE)
    return pattern.sub(
        lambda match: expansions.get(match.group(0).lower(), match.group(0)),
        text,
    )

# Load the tokenizer and the encoder from the same pretrained checkpoint so
# that token ids line up with the model's vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained(
    'bert-base-uncased',
    return_dict=True,
)

def get_bert_embedding(text):
    """Encode ``text`` with the module-level ``tokenizer`` and ``model`` and
    return the model's ``pooler_output`` converted to a NumPy array.

    Input is truncated to at most 128 tokens.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        model_output = model(**encoded)
    pooled = model_output['pooler_output']
    return pooled.numpy()

# Build one vector per preferred term: the element-wise average of the
# embedding of the term itself and the embedding of its description.
embeddings = []
for preferred_term, preferred_description in zip(
    df_preferred_terms['preferred_business_term'],
    df_preferred_terms['preferred_business_description'],
):
    term_embedding = get_bert_embedding(preprocess(preferred_term))
    desc_embedding = get_bert_embedding(preprocess(preferred_description))
    combined_embedding = (term_embedding + desc_embedding) / 2.0
    # Drop the leading singleton axis so each entry stacks as one matrix row.
    embeddings.append(combined_embedding.squeeze())

# Stack the per-term vectors into a 2-D matrix, one row per preferred term.
embedding_matrix = np.vstack(embeddings)

# Flat L2 Faiss index sized to the embedding width, filled with all rows
# (cast to float32 before adding).
index = faiss.IndexFlatL2(embedding_matrix.shape[-1])
index.add(embedding_matrix.astype('float32'))

def enhanced_search(business_term, business_description, top_k=4):
    """Return the preferred business terms nearest to a (term, description)
    query in the module-level Faiss ``index``.

    The query vector is the element-wise average of the BERT embeddings of
    the preprocessed term and the preprocessed description, mirroring how
    the indexed vectors were built.

    Parameters
    ----------
    business_term : str
        Term supplied by the user.
    business_description : str
        Description supplied by the user.
    top_k : int, default 4
        Maximum number of terms to return. Values larger than the number of
        indexed vectors are clamped.

    Returns
    -------
    list
        Up to ``top_k`` values from ``preferred_business_term`` in the order
        returned by ``index.search``. Empty when ``top_k`` is not positive or
        the index is empty.
    """
    if top_k <= 0:
        return []

    # Faiss pads missing neighbours with label -1, which ``iloc`` would
    # silently resolve to the *last* row. Never request more neighbours than
    # the index holds.
    top_k = min(top_k, index.ntotal)
    if top_k == 0:
        return []

    term_embedding = get_bert_embedding(preprocess(business_term))
    desc_embedding = get_bert_embedding(preprocess(business_description))
    query_embedding = ((term_embedding + desc_embedding) / 2.0).astype('float32')

    _, neighbour_ids = index.search(query_embedding, top_k)
    preferred_terms = df_preferred_terms['preferred_business_term']
    # Keep the >= 0 filter as a second line of defence against padded labels.
    return [preferred_terms.iloc[int(i)] for i in neighbour_ids[0] if i >= 0]

# Smoke test for the BERT + Faiss search.
user_business_term, user_business_description = "ROI", "ratio of net gain in the project"
top_preferred_terms = enhanced_search(
    business_term=user_business_term,
    business_description=user_business_description,
)
top_preferred_terms


['NetProfit', 'EquityShare', 'ValueAtRisk', 'ReturnOnInvestment']

In [None]:
# NOTE(review): unexecuted cell containing only a list literal. It is identical
# to the displayed result of get_most_similar_optimized_v2 earlier in the
# notebook and looks like a pasted copy kept for comparison - confirm and remove.
['ValueAtRisk', 'ReturnOnInvestment', 'NetProfit', 'EquityShare']
