In [2]:
import re
import string
import sys

import pandas as pd

# DataFrame 1: every raw business term (with its description) next to the
# preferred term and preferred description it maps to.
data1 = dict(
    business_term=['var', 'ROI', 'equity', 'profit'],
    business_description=[
        'var in finance',
        'ROI for projects',
        'ownership in a company',
        'net income after expenses',
    ],
    preferred_business_term=['ValueAtRisk', 'ReturnOnInvestment', 'EquityShare', 'NetProfit'],
    preferred_business_description=[
        'measure of the risk of loss for investments',
        'ratio between net profit and cost of investment',
        'value that represents part ownership in a corporation',
        'the amount of money that remains after all business expenses have been deducted from gross income',
    ],
)

# One row per term; column order follows the key order of data1.
df1 = pd.DataFrame(data=data1)

# DataFrame 2: abbreviations and the phrases they stand for
data2 = dict(
    abbreviation=['var', 'ROI'],
    full_form=['value at risk', 'return on investment'],
)

df2 = pd.DataFrame(data=data2)


In [3]:
# Lookup table: abbreviation -> full form, taken row by row from df2.
abbreviation_dict = {
    short: expanded
    for short, expanded in zip(df2['abbreviation'], df2['full_form'])
}

def preprocess(text, abbreviations=None):
    """Normalize ``text`` before it is embedded.

    Steps, in order:

    1. Replace every abbreviation with its full form. An abbreviation is only
       replaced when it stands on its own, i.e. is not directly preceded or
       followed by a word character, so 'var' no longer rewrites 'variable'.
       Matching is case-sensitive. All replacements happen in a single pass,
       so text inserted by one expansion is never expanded again.
    2. Split on whitespace and strip ASCII punctuation from both ends of each
       token, so that 'project?' is kept as 'project' instead of being dropped.
    3. Keep only purely alphabetic tokens and lower-case them.

    Parameters
    ----------
    text : str
        Raw input text.
    abbreviations : dict or None, optional
        Mapping of abbreviation -> full form. When None (the default) the
        notebook-level ``abbreviation_dict`` is used.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; '' if none survive.
    """
    if abbreviations is None:
        abbreviations = abbreviation_dict

    # Empty keys are skipped: an empty alternative would match at every position.
    # Longest first, so a short abbreviation cannot win over a longer one that
    # starts with it (e.g. 'P' vs 'P&L').
    candidates = sorted((abb for abb in abbreviations if abb), key=len, reverse=True)
    if candidates:
        # Lookarounds rather than \b so abbreviations that begin or end with a
        # non-word character still match as whole units.
        pattern = re.compile(
            r'(?<!\w)(?:' + '|'.join(re.escape(abb) for abb in candidates) + r')(?!\w)'
        )
        # A callable replacement keeps backslashes in a full form literal.
        text = pattern.sub(lambda match: abbreviations[match.group(0)], text)

    # Only leading/trailing punctuation is removed; tokens with inner
    # punctuation or digits (e.g. 'e-mail', '3d') are still filtered out.
    stripped = (token.strip(string.punctuation) for token in text.split())
    return ' '.join(token.lower() for token in stripped if token.isalpha())


In [5]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.6.0
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_md')


In [6]:
import spacy

# Medium English spaCy model, loaded once and shared by every call below
nlp = spacy.load('en_core_web_md')

def get_embedding(text):
    """Run ``text`` through ``nlp`` and return the resulting document's ``.vector``."""
    doc = nlp(text)
    return doc.vector


In [7]:
from sklearn.metrics.pairwise import cosine_similarity

def get_most_similar(user_input, df, top_n=4):
    """Rank preferred business terms by similarity to a free-text query.

    The query and each row's ``preferred_business_description`` are passed
    through ``preprocess`` and ``get_embedding``; rows are then ordered by the
    cosine similarity between the two embeddings, highest first.

    Parameters
    ----------
    user_input : str
        Free-text query.
    df : pandas.DataFrame
        Must provide the columns 'preferred_business_term' and
        'preferred_business_description'.
    top_n : int or None, default 4
        Maximum number of terms to return; None returns every row.

    Returns
    -------
    list
        ``preferred_business_term`` values, best match first. Rows with equal
        similarity keep their order in ``df``. Empty when ``df`` has no rows.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n is not None and top_n < 0:
        raise ValueError("top_n must be non-negative or None")

    # The query embedding does not depend on the row, so compute it once.
    user_embedding = get_embedding(preprocess(user_input))

    scored_terms = []
    for term, description in zip(df['preferred_business_term'],
                                 df['preferred_business_description']):
        term_embedding = get_embedding(preprocess(description))
        # cosine_similarity works on 2-D inputs and returns a 1x1 matrix here.
        similarity = cosine_similarity([user_embedding], [term_embedding])[0][0]
        scored_terms.append((term, similarity))

    # sorted() is stable, so ties stay in dataframe order even with reverse=True.
    ranked = sorted(scored_terms, key=lambda pair: pair[1], reverse=True)
    return [term for term, _ in ranked[:top_n]]


In [8]:
# Rank the preferred terms in df1 against a free-text question
user_input = "What is the return for my investment project?"
top_terms = get_most_similar(user_input=user_input, df=df1)
top_terms

['NetProfit', 'EquityShare', 'ValueAtRisk', 'ReturnOnInvestment']

In [9]:
def get_most_similar_v2(business_term, business_description, df, top_n=4):
    """Rank preferred business terms for a (term, description) pair.

    The query is ``business_term + " " + business_description``. It is compared
    with each row's raw ``business_term`` and ``business_description`` joined
    the same way (not with the preferred columns); both sides go through
    ``preprocess`` and ``get_embedding`` and are scored by cosine similarity.
    The row's ``preferred_business_term`` is what gets returned.

    Parameters
    ----------
    business_term : str
        Term supplied by the user.
    business_description : str
        Description supplied by the user.
    df : pandas.DataFrame
        Must provide the columns 'business_term', 'business_description' and
        'preferred_business_term'.
    top_n : int or None, default 4
        Maximum number of terms to return; None returns every row.

    Returns
    -------
    list
        ``preferred_business_term`` values, best match first. Rows with equal
        similarity keep their order in ``df``. Empty when ``df`` has no rows.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n is not None and top_n < 0:
        raise ValueError("top_n must be non-negative or None")

    # Combine the user's term and description and embed them once.
    user_input = business_term + " " + business_description
    user_embedding = get_embedding(preprocess(user_input))

    scored_terms = []
    for raw_term, raw_description, preferred_term in zip(df['business_term'],
                                                         df['business_description'],
                                                         df['preferred_business_term']):
        # Combine the row's raw term and description in the same way as the query.
        combined_description = raw_term + " " + raw_description
        term_embedding = get_embedding(preprocess(combined_description))
        # cosine_similarity works on 2-D inputs and returns a 1x1 matrix here.
        similarity = cosine_similarity([user_embedding], [term_embedding])[0][0]
        scored_terms.append((preferred_term, similarity))

    # sorted() is stable, so ties stay in dataframe order even with reverse=True.
    ranked = sorted(scored_terms, key=lambda pair: pair[1], reverse=True)
    return [term for term, _ in ranked[:top_n]]

# Try the two-field variant: the user supplies a term and a description
user_business_term = "ROI"
user_business_description = "ratio of net gain in the project"
top_terms_v2 = get_most_similar_v2(
    business_term=user_business_term,
    business_description=user_business_description,
    df=df1,
)
top_terms_v2


['ReturnOnInvestment', 'EquityShare', 'ValueAtRisk', 'NetProfit']

In [10]:
# Display the raw dictionary that df1 was built from.
data1

{'business_term': ['var', 'ROI', 'equity', 'profit'],
 'business_description': ['var in finance',
  'ROI for projects',
  'ownership in a company',
  'net income after expenses'],
 'preferred_business_term': ['ValueAtRisk',
  'ReturnOnInvestment',
  'EquityShare',
  'NetProfit'],
 'preferred_business_description': ['measure of the risk of loss for investments',
  'ratio between net profit and cost of investment',
  'value that represents part ownership in a corporation',
  'the amount of money that remains after all business expenses have been deducted from gross income']}