In [30]:
from utils import open_json, clean_key, save_json, normalize_text
# Keep only the 'tractor' entries of the LAD catalog; the TractorData catalog is used as loaded.
lad_catalog_data = [
    entry
    for entry in open_json('lad_catalog_data.json')
    if entry['category'] == 'tractor'
]
tractordata_catalog = open_json('tractordata_catalog.json')

# Model names of the entries whose manufacturer_key is 'steyr', one list per catalog.
lad_models = [
    entry['model']
    for entry in lad_catalog_data
    if entry['manufacturer_key'] == 'steyr'
]
tractordata_models = [
    entry['model']
    for entry in tractordata_catalog
    if entry['manufacturer_key'] == 'steyr'
]


In [31]:
# using nltk
from nltk.tokenize import word_tokenize
import nltk
from fuzzywuzzy import fuzz
from utils import open_json, clean_key, save_json, normalize_text
# quiet=True suppresses the downloader's log, which otherwise writes the local
# nltk_data directory path into the cell output on every run.
nltk.download('punkt_tab', quiet=True)
def nltk_fuzzy_similarity(a: str, b: str) -> float:
    """Score two strings by fuzzy token-sort ratio over their NLTK word tokens.

    Each input is passed through ``normalize_text`` and split with NLTK's
    ``word_tokenize``. The tokens are re-joined with single spaces so that the
    tokenizer's boundaries are the ones ``fuzz.token_sort_ratio`` sorts and
    compares. (Previously the tokens were computed and then discarded, so the
    NLTK step had no influence on the score.)

    Args:
        a: First string to compare.
        b: Second string to compare.

    Returns:
        The value of ``fuzz.token_sort_ratio`` divided by 100.
    """
    tokens_a = word_tokenize(normalize_text(a))
    tokens_b = word_tokenize(normalize_text(b))

    return fuzz.token_sort_ratio(" ".join(tokens_a), " ".join(tokens_b)) / 100

# Sample pair: the same two tokens in a different order and letter case.
lad_model, tractordata_model = "MULTI 4120", "4120 Multi"

similarity = nltk_fuzzy_similarity(a=lad_model, b=tractordata_model)
print(f"Similarity: {similarity * 100:.2f} %")

Similarity: 100.00 %


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Lietotajs\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [32]:
# using sentence transformers
from sentence_transformers import SentenceTransformer, util
# Sample pair first, then the embedding model that will score it.
lad_model, tractordata_model = "MULTI 4120", "4120 Multi"

model = SentenceTransformer('all-MiniLM-L6-v2')

def sentence_transformer_similarity(model: SentenceTransformer, a: str, b: str) -> float:
    """Return the cosine similarity between the embeddings of two strings.

    Args:
        model: Model whose ``encode`` method produces the embeddings.
        a: First string to compare.
        b: Second string to compare.

    Returns:
        The single value of ``util.cos_sim`` for the two embeddings, as a
        Python scalar.
    """
    # Normalize both inputs before encoding either one.
    text_a, text_b = normalize_text(a), normalize_text(b)
    embeddings = [model.encode(text) for text in (text_a, text_b)]
    return util.cos_sim(*embeddings).item()
# Score the sample pair with the embedding model and report it as a percentage.
similarity = sentence_transformer_similarity(
    model=model,
    a=lad_model,
    b=tractordata_model,
)
print(f"Similarity: {similarity * 100:.2f} %")

Similarity: 96.49 %


In [33]:
# rapid fuzz
from rapidfuzz import fuzz

# Sample pair: the same two tokens in a different order and letter case.
lad_model, tractordata_model = "MULTI 4120", "4120 Multi"
def rapidfuzz_similarity(a: str, b: str) -> float:
    """Return the token-sort ratio of two normalized strings, divided by 100.

    Args:
        a: First string to compare.
        b: Second string to compare.

    Returns:
        ``fuzz.token_sort_ratio`` of the ``normalize_text`` outputs, divided
        by 100.
    """
    left, right = normalize_text(a), normalize_text(b)
    ratio_percent = fuzz.token_sort_ratio(left, right)
    return ratio_percent / 100

# Score the sample pair and report it as a percentage.
similarity = rapidfuzz_similarity(a=lad_model, b=tractordata_model)
print(f"Similarity: {similarity * 100:.2f} %")

Similarity: 100.00 %


In [34]:
# combined sentence transformer & rapid fuzz matching
from rapidfuzz import fuzz
from sentence_transformers import SentenceTransformer, util
# Instantiate the 'all-MiniLM-L6-v2' model; it is passed to the similarity helpers called later in this cell.
model = SentenceTransformer('all-MiniLM-L6-v2')

def rapidfuzz_similarity(a: str, b: str) -> float:
    """Return the token-sort ratio of two normalized strings, divided by 100.

    NOTE: this repeats the definition from the earlier "rapid fuzz" cell;
    running this cell rebinds the name to this copy.

    Args:
        a: First string to compare.
        b: Second string to compare.

    Returns:
        ``fuzz.token_sort_ratio`` of the ``normalize_text`` outputs, divided
        by 100.
    """
    left, right = normalize_text(a), normalize_text(b)
    ratio_percent = fuzz.token_sort_ratio(left, right)
    return ratio_percent / 100

def sentence_transformer_similarity(model: SentenceTransformer, a: str, b: str) -> float:
    """Return the cosine similarity between the embeddings of two strings.

    NOTE: this repeats the definition from the earlier "sentence transformers"
    cell; running this cell rebinds the name to this copy.

    Args:
        model: Model whose ``encode`` method produces the embeddings.
        a: First string to compare.
        b: Second string to compare.

    Returns:
        The single value of ``util.cos_sim`` for the two embeddings, as a
        Python scalar.
    """
    # Normalize both inputs before encoding either one.
    text_a, text_b = normalize_text(a), normalize_text(b)
    embeddings = [model.encode(text) for text in (text_a, text_b)]
    return util.cos_sim(*embeddings).item()

# fuzzy and sentence matching
def sentence_transformer_fuzzy_similarity(
    model: SentenceTransformer,
    a: str,
    b: str,
    fuzzy_weight: float = 0.6,
    sentence_weight: float = 0.4,
) -> "tuple[float, float, float]":
    """Blend the fuzzy token-sort score with the embedding cosine similarity.

    Args:
        model: Model forwarded to ``sentence_transformer_similarity``.
        a: First string to compare.
        b: Second string to compare.
        fuzzy_weight: Multiplier applied to the fuzzy score (default 0.6).
        sentence_weight: Multiplier applied to the embedding similarity
            (default 0.4).

    Returns:
        A 3-tuple ``(combined, fuzzy_score, sentence_similarity)`` where
        ``combined`` is the weighted sum of the other two values.
    """
    fuzzy_score = rapidfuzz_similarity(a, b)
    sentence_similarity = sentence_transformer_similarity(model, a, b)
    combined = (fuzzy_score * fuzzy_weight) + (sentence_similarity * sentence_weight)
    return combined, fuzzy_score, sentence_similarity

# Sample pair: the same two tokens in a different order and letter case.
lad_model, tractordata_model = "MULTI 4120", "4120 Multi"

# The helper returns (combined, fuzzy, sentence); only the combined score is printed.
similarity, fuzzy, sentence_similarity = sentence_transformer_fuzzy_similarity(
    model,
    lad_model,
    tractordata_model,
)
print(f"Similarity: {similarity * 100:.2f} %")

Similarity: 98.24 %


In [35]:
from rapidfuzz import fuzz
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')

# For each LAD model name, scan every TractorData model name and keep the
# candidate with the highest combined score. The comparison is strict, so the
# first candidate wins on ties.
for lmodel in lad_models:
    best_match = (-1, -1, -1, '')
    for tmodel in tractordata_models:
        scores = sentence_transformer_fuzzy_similarity(model, lmodel, tmodel)
        similarity, fuzzy, sentence_similarity = scores
        if similarity > best_match[0]:
            # Stored layout: (combined, fuzzy, sentence, candidate name).
            best_match = (*scores, tmodel)
    print(f"'{lmodel}' best match '{best_match[3]}' with similarity {best_match[0]} (fuzzy: {best_match[1]}, sentence: {best_match[2]})")

'MULTI 4120' best match '4120 Multi' with similarity 0.9824487268924713 (fuzzy: 1.0, sentence: 0.9648974537849426)
'MULTI 4120' best match '4120 Multi' with similarity 0.9824487268924713 (fuzzy: 1.0, sentence: 0.9648974537849426)
'KOMPAKT 4100' best match '4100 Multi' with similarity 0.6419656330888921 (fuzzy: 0.6363636363636364, sentence: 0.647567629814148)
'KOMPAKT 4080' best match '4100 Multi' with similarity 0.4719266959212043 (fuzzy: 0.5454545454545454, sentence: 0.39839884638786316)
'PROFI 6150' best match '6140 Profi' with similarity 0.8886162459850311 (fuzzy: 0.9, sentence: 0.8772324919700623)
'PROFI 6150 CVT' best match '4120 Profi CVT' with similarity 0.82660254410335 (fuzzy: 0.8571428571428572, sentence: 0.7960622310638428)
'IMPULS 6165 CVT' best match '6150 CVT' with similarity 0.7168168853158536 (fuzzy: 0.6086956521739131, sentence: 0.8249381184577942)
'IMPULS 6175 CVT' best match '6175 CVT' with similarity 0.7614860767903535 (fuzzy: 0.6956521739130435, sentence: 0.8273199