In [13]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
dataset = pd.read_csv('/content/creative_mock_financial_dataset.csv')


In [15]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [16]:
from torch.utils.data import DataLoader
import numpy as np

# Function to vectorize a batch of texts
def bert_vectorize_batch(texts, batch_size=32):
    vectorized_texts = []
    dataloader = DataLoader(texts, batch_size=batch_size, shuffle=False)

    for batch in dataloader:
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=512)
        outputs = model(**inputs)
        batch_vectors = outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()
        vectorized_texts.extend(batch_vectors)

    return np.array(vectorized_texts)

# Applying the function to the dataset in batches
batch_size = 32  # You can adjust the batch size based on your system's capabilities
business_term_vectors = bert_vectorize_batch(dataset['Business Term Description'].tolist(), batch_size)
preferred_term_vectors = bert_vectorize_batch(dataset['Preferred Term Description'].tolist(), batch_size)

# Adding the vectors to the dataset
dataset['Business Term Vector'] = list(business_term_vectors)
dataset['Preferred Term Vector'] = list(preferred_term_vectors)


In [17]:
def calculate_similarity(vec1, vec2):
    return cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0][0]


In [18]:
dataset['Similarity Score'] = dataset.apply(lambda x: calculate_similarity(x['Business Term Vector'], x['Preferred Term Vector']), axis=1)


In [19]:
# threshold = 0.8
# outliers = dataset[dataset['Similarity Score'] < threshold]

# # Displaying potential outliers
# outliers
# Calculate the 25th percentile of the similarity scores as the threshold
threshold = dataset['Similarity Score'].quantile(0.25)

# Identifying outliers
outliers = dataset[dataset['Similarity Score'] < threshold]
outliers

Unnamed: 0,Business Term ID,Business Term Name,Business Term Description,Preferred Business Term,Preferred Term Description,Business Term Vector,Preferred Term Vector,Similarity Score
0,BTID_0001,High-Value Customer ID,Description of high-value customer id,Market Analysis,Description of market analysis,"[0.06376215, -0.17287911, -0.059349936, 0.0746...","[-0.19709682, -0.27392277, -0.5515117, -0.1885...",0.715076
1,BTID_0002,Budget Allocation Summary,Description of budget allocation summary,Investment Strategies,Description of investment strategies,"[-0.15764685, -0.21334161, -0.147244, 0.044736...","[-0.0749434, 0.0049636485, -0.38076004, -0.087...",0.719111
6,BTID_0007,Vendor Payment System,Description of vendor payment system,Market Analysis,Description of market analysis,"[-0.11818923, -0.21713245, 0.093217045, 0.0894...","[-0.19709682, -0.27392277, -0.5515117, -0.1885...",0.75058
17,BTID_0018,Budget Allocation Summary,Description of budget allocation summary,Market Analysis,Description of market analysis,"[-0.15764685, -0.21334161, -0.147244, 0.044736...","[-0.19709682, -0.27392277, -0.5515117, -0.1885...",0.708618
18,BTID_0019,Equity Investment Tracker,Description of equity investment tracker,Regulatory Affairs,Description of regulatory affairs,"[0.05248444, -0.31040645, 0.11933964, -0.11970...","[0.3134871, 0.057133112, -0.47340408, 0.099212...",0.719977
19,BTID_0020,Equity Investment Tracker,Description of equity investment tracker,Regulatory Affairs,Description of regulatory affairs,"[0.05248444, -0.31040645, 0.11933964, -0.11970...","[0.3134871, 0.057133112, -0.47340408, 0.099212...",0.719977
20,BTID_0021,Budget Allocation Summary,Description of budget allocation summary,Client Relations,Description of client relations,"[-0.15764685, -0.21334161, -0.147244, 0.044736...","[-0.23902147, 0.1319948, -0.29529908, -0.08826...",0.712706
25,BTID_0026,High-Value Customer ID,Description of high-value customer id,Market Analysis,Description of market analysis,"[0.06376215, -0.17287911, -0.059349936, 0.0746...","[-0.19709682, -0.27392277, -0.5515117, -0.1885...",0.715076
29,BTID_0030,Vendor Payment System,Description of vendor payment system,Market Analysis,Description of market analysis,"[-0.11818923, -0.21713245, 0.093217045, 0.0894...","[-0.19709682, -0.27392277, -0.5515117, -0.1885...",0.75058
37,BTID_0038,Vendor Payment System,Description of vendor payment system,Regulatory Affairs,Description of regulatory affairs,"[-0.11818923, -0.21713245, 0.093217045, 0.0894...","[0.3134871, 0.057133112, -0.47340408, 0.099212...",0.70175


In [20]:
outliers.shape

(24, 8)