In [6]:
from utils import open_json, clean_key, save_json, normalize_text
# Keep only the catalog entries whose 'category' field equals 'tractor'.
lad_catalog_data = [
    entry
    for entry in open_json('lad_catalog_data.json')
    if entry['category'] == 'tractor'
]
# The second catalog is loaded unfiltered.
tractordata_catalog = open_json('tractordata_catalog.json')



In [7]:
# using nltk
from nltk.tokenize import word_tokenize
import nltk
from fuzzywuzzy import fuzz
from utils import open_json, clean_key, save_json, normalize_text
# Fetch the 'punkt_tab' NLTK resource at cell run time
# (presumably the tokenizer data used by word_tokenize — TODO confirm).
nltk.download('punkt_tab')
def nltk_fuzzy_similarity(a: str, b: str) -> float:
    """Score how alike two strings are with a token-sort fuzzy ratio.

    Both inputs are passed through ``normalize_text`` (imported from
    ``utils``; its exact normalization is not visible here) and compared
    with ``fuzz.token_sort_ratio``. The ratio is divided by 100 before it
    is returned.

    Args:
        a: First string to compare.
        b: Second string to compare.

    Returns:
        ``fuzz.token_sort_ratio`` of the normalized strings, divided by 100.
    """
    # NOTE: a previous revision also ran word_tokenize on both strings but
    # never used the resulting token lists; that dead work was removed. The
    # returned score is computed exactly as before.
    normalized_a = normalize_text(a)
    normalized_b = normalize_text(b)

    return fuzz.token_sort_ratio(normalized_a, normalized_b) / 100

# Sample pair: the same two words in a different order and letter case.
lad_model, tractordata_model = "MULTI 4120", "4120 Multi"

# Score the pair and report it as a percentage.
similarity = nltk_fuzzy_similarity(a=lad_model, b=tractordata_model)
print(f"Similarity: {similarity * 100:.2f} %")

Similarity: 100.00 %


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Lietotajs\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [8]:
# using sentence transformers
from sentence_transformers import SentenceTransformer, util
# Instantiate the SentenceTransformer identified by 'all-MiniLM-L6-v2'.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Sample pair: the same two words in a different order and letter case.
lad_model, tractordata_model = "MULTI 4120", "4120 Multi"

def sentence_transformer_similarity(model: SentenceTransformer, a: str, b: str) -> float:
    """Return the cosine similarity between the embeddings of two strings.

    Each string is first passed through ``normalize_text`` and then encoded
    separately with ``model.encode``; the two embeddings are compared with
    ``util.cos_sim`` and the single resulting value is returned via ``.item()``.

    Args:
        model: Object whose ``encode`` method produces the embeddings.
        a: First string to compare.
        b: Second string to compare.

    Returns:
        The scalar produced by ``util.cos_sim`` for the two embeddings.
    """
    # Normalize both inputs first, then encode them one at a time.
    cleaned = [normalize_text(text) for text in (a, b)]
    vector_a, vector_b = [model.encode(text) for text in cleaned]

    cosine = util.cos_sim(vector_a, vector_b)
    return cosine.item()
# Score the sample pair with the embedding-based measure and report it as a percentage.
similarity = sentence_transformer_similarity(
    model=model,
    a=lad_model,
    b=tractordata_model,
)
print(f"Similarity: {similarity * 100:.2f} %")

Similarity: 96.49 %


In [9]:
# using rapidfuzz
from rapidfuzz import fuzz

# Sample pair: the same two words in a different order and letter case.
lad_model, tractordata_model = "MULTI 4120", "4120 Multi"
def rapidfuzz_similarity(a: str, b: str) -> float:
    """Score two strings with ``fuzz.token_sort_ratio`` after normalization.

    Args:
        a: First string to compare.
        b: Second string to compare.

    Returns:
        The token-sort ratio of the ``normalize_text``-processed inputs,
        divided by 100.
    """
    left, right = normalize_text(a), normalize_text(b)
    ratio = fuzz.token_sort_ratio(left, right)
    return ratio / 100

# Score the sample pair with the token-sort ratio and report it as a percentage.
similarity = rapidfuzz_similarity(a=lad_model, b=tractordata_model)
print(f"Similarity: {similarity * 100:.2f} %")

Similarity: 100.00 %


In [14]:
# combined sentence transformer & rapid fuzz matching
from rapidfuzz import fuzz
from sentence_transformers import SentenceTransformer, util
# Instantiate the SentenceTransformer identified by 'all-MiniLM-L6-v2';
# this object is passed to the similarity helpers called later in this cell.
model = SentenceTransformer('all-MiniLM-L6-v2')

def rapidfuzz_similarity(a: str, b: str) -> float:
    """Score two strings with ``fuzz.token_sort_ratio`` after normalization.

    Args:
        a: First string to compare.
        b: Second string to compare.

    Returns:
        The token-sort ratio of the ``normalize_text``-processed inputs,
        divided by 100.
    """
    left, right = normalize_text(a), normalize_text(b)
    ratio = fuzz.token_sort_ratio(left, right)
    return ratio / 100

def sentence_transformer_similarity(model: SentenceTransformer, a: str, b: str) -> float:
    """Return the cosine similarity between the embeddings of two strings.

    Each string is first passed through ``normalize_text`` and then encoded
    separately with ``model.encode``; the two embeddings are compared with
    ``util.cos_sim`` and the single resulting value is returned via ``.item()``.

    Args:
        model: Object whose ``encode`` method produces the embeddings.
        a: First string to compare.
        b: Second string to compare.

    Returns:
        The scalar produced by ``util.cos_sim`` for the two embeddings.
    """
    # Normalize both inputs first, then encode them one at a time.
    cleaned = [normalize_text(text) for text in (a, b)]
    vector_a, vector_b = [model.encode(text) for text in cleaned]

    cosine = util.cos_sim(vector_a, vector_b)
    return cosine.item()

# fuzzy and sentence matching
def sentence_transformer_fuzzy_similarity(
    model: SentenceTransformer,
    a: str,
    b: str,
    fuzzy_weight: float = 0.7,
    sentence_weight: float = 0.3,
) -> float:
    """Blend the fuzzy token-sort score with the embedding cosine similarity.

    The result is ``fuzzy_score * fuzzy_weight + sentence_similarity *
    sentence_weight``, where the two scores come from
    ``rapidfuzz_similarity(a, b)`` and
    ``sentence_transformer_similarity(model, a, b)`` respectively.

    Args:
        model: Embedding model forwarded to ``sentence_transformer_similarity``.
        a: First string to compare.
        b: Second string to compare.
        fuzzy_weight: Multiplier applied to the fuzzy score. Defaults to 0.7,
            the value that used to be hard-coded.
        sentence_weight: Multiplier applied to the embedding similarity.
            Defaults to 0.3, the value that used to be hard-coded.

    Returns:
        The weighted sum of the two scores. The weights are not validated or
        renormalized; callers choosing custom weights are responsible for
        keeping them on a sensible scale.
    """
    fuzzy_score = rapidfuzz_similarity(a, b)
    sentence_similarity = sentence_transformer_similarity(model, a, b)
    # Two independent weights (rather than deriving one as 1 - other) keep the
    # default arithmetic identical to the original 0.7 / 0.3 expression.
    return (fuzzy_score * fuzzy_weight) + (sentence_similarity * sentence_weight)

# Sample pair: the same two words in a different order and letter case.
lad_model, tractordata_model = "MULTI 4120", "4120 Multi"

# Score the pair with the combined measure and report it as a percentage.
similarity = sentence_transformer_fuzzy_similarity(
    model=model,
    a=lad_model,
    b=tractordata_model,
)
print(f"Similarity: {similarity * 100:.2f} %")

Similarity: 98.95 %
