In [1]:
from utils import open_json, clean_key, save_json, normalize_text
lad_catalog_data = list(filter(lambda i: i['category'] == 'tractor', open_json('lad_catalog_data.json')))
tractordata_catalog = open_json('tractordata_catalog.json')



In [15]:
from sentence_transformers import SentenceTransformer, util
from utils import open_json, clean_key, save_json, normalize_text
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
# "manufacturer": "NEW HOLLAND",
# "model": "T 7.190 CLASSIC",
# Create combined strings for both lists
def to_full_name(item):
    return f"{item['manufacturer']} {item['model']}"

lad_catalog_data_strings = list(set([normalize_text(item['manufacturer']) for item in lad_catalog_data]))
tractordata_catalog_strings = list(set([normalize_text(item['manufacturer']) for item in tractordata_catalog]))

common_manufacturers = []
for lad_manufacturer in lad_catalog_data_strings:
    for tractordata_manufacturer in tractordata_catalog_strings:
        if lad_manufacturer == tractordata_manufacturer and tractordata_manufacturer not in common_manufacturers:
            common_manufacturers.append(tractordata_manufacturer)
missing_lad_manufacturers = []
missing_tractordata_manufacturers = []
for manufacturer in lad_catalog_data_strings:
    if manufacturer not in common_manufacturers:
        missing_lad_manufacturers.append(manufacturer)
for manufacturer in tractordata_catalog_strings:
    if manufacturer not in common_manufacturers:
        missing_tractordata_manufacturers.append(manufacturer)

print("lad_missing: ", missing_lad_manufacturers)
print("tractor_data_missing: ", missing_tractordata_manufacturers)



lad_missing:  ['lovol']
tractor_data_missing:  ['baldwin', 'fox river tractor company', 'marshall', 'merlin', 'solis', 'big bud', 'ursus', 'ford', 'denning', 'bukh', 'lanz', 'suzue', 'jacobsen', 'ivel agricultural motors', 'rumely', 'zanello', 'grahambradley', 'stihl', 'agripower', 'apollo', 'yto', 'wagner', 'square turn', 'jackson', 'minneapolis', 'international harvester', 'hefty', 'muirhill', 'barreiros', 'imr rakovica', 'haas', 'custom', 'carraro', 'caseih', 'oliver', 'knudson', 'versatile', 'toro', 'cockshutt', 'erkunt', 'farmall', 'ransomes', 'satoh', 'earthforce', 'vst', 'holder', 'müller', 'simpson jumbo', 'fiathesston', 'volvo', 'pioneer tractor', 'county', 'taishanamerica', 'dutra', 'hitachi', 'ferguson', 'eagle manufacturing', 'cbt', 'schramm', 'valmet', 'coop', 'cletrac', 'waltanna', 'waterloo manufacturing company', 'shibaura', 'tafe', 'ag king', 'gibson', 'daedong', 'turnersimplicity', 'eicher', 'cod tractor company', 'acremaster', 'ranch hand', 'friday', 'xtz', 'bobcat',

In [16]:
lad_catalog_by_manufacturers = {}
tractor_data_catalog_by_manufacturers = {}
for manufacturer in common_manufacturers:
    if manufacturer not in lad_catalog_by_manufacturers:
        lad_catalog_by_manufacturers[manufacturer] = []
    if manufacturer not in tractor_data_catalog_by_manufacturers:
        tractor_data_catalog_by_manufacturers[manufacturer] = []
    for item in lad_catalog_data:
        if normalize_text(item['manufacturer']) == manufacturer:
            lad_catalog_by_manufacturers[manufacturer].append(item)
    for item in tractordata_catalog:
        if normalize_text(item['manufacturer']) == manufacturer:
            tractor_data_catalog_by_manufacturers[manufacturer].append(item)

In [17]:
# for manufacturer in common_manufacturers:
#     # Compute embeddings for both model lists
#     strings_lad = [normalize_text(item['model']) for item in lad_catalog_by_manufacturers[manufacturer]]
#     strings_tractordata = [normalize_text(item['model']) for item in tractor_data_catalog_by_manufacturers[manufacturer]]
#     embeddings_lad_catalog_data = model.encode(strings_lad)
#     embeddings_tractordata_catalog = model.encode(strings_tractordata)

#     # Find matches between the two lists
#     matches = util.pytorch_cos_sim(embeddings_lad_catalog_data, embeddings_tractordata_catalog)
#     threshold = 0.4  # Similarity threshold
#     combined_results = []

#     for i, row in enumerate(matches):
#         max_similarity = -1
#         model = ''
#         for j, similarity in enumerate(row):
#             if similarity > max_similarity:
#                 max_similarity = max(similarity, max_similarity)
#                 model = strings_tractordata[j]
#         print(f"{max_similarity} {strings_lad[i]} -> {model}")
for manufacturer in common_manufacturers:
    # Compute normalized model strings for both sources
    strings_lad = [normalize_text(item['model']) for item in lad_catalog_by_manufacturers[manufacturer]]
    strings_tractordata = [normalize_text(item['model']) for item in tractor_data_catalog_by_manufacturers[manufacturer]]

    # Compute embeddings
    embeddings_lad_catalog_data = model.encode(strings_lad)
    embeddings_tractordata_catalog = model.encode(strings_tractordata)

    # Find matches using cosine similarity
    matches = util.pytorch_cos_sim(embeddings_lad_catalog_data, embeddings_tractordata_catalog)
    threshold = 0.4  # Similarity threshold
    combined_results = []

    for i, row in enumerate(matches):
        # Find the best match for the current LAD model
        max_similarity = -1
        best_match_model = None
        for j, similarity in enumerate(row):
            if similarity > max_similarity:
                max_similarity = similarity
                best_match_model = strings_tractordata[j]
        
        # Append results if similarity is above threshold
        if max_similarity > threshold:
            combined_results.append({
                "lad_model": strings_lad[i],
                "matched_model": best_match_model,
                "similarity_score": max_similarity.item()
            })

    # Display matches for the current manufacturer
    print(f"Matches for manufacturer: {manufacturer}")
    for result in combined_results:
        print(f"{result['similarity_score']:.2f} | {result['lad_model']} -> {result['matched_model']}")
    # # Extract best matches and combine specifications
    # threshold = 0.98  # Similarity threshold
    # combined_results = []

    # for i, row in enumerate(matches):
    #     for j, similarity in enumerate(row):
    #         if similarity > threshold:
    #             combined_entry = {
    #                 "lad_manufacturer": lad_catalog_data[i]['manufacturer'],
    #                 "lad_model": lad_catalog_data[i]['model'],
    #                 "tractordata_manufacturer": tractordata_catalog[j]['manufacturer'],
    #                 "tractordata_model": tractordata_catalog[j]['model'],
    #                 "specifications_lad": {**lad_catalog_data[i]['specification']},
    #                 "specifications_tractordata": {**tractordata_catalog[j]['specification']},
    #                 "similarity_score": similarity.item()
    #             }
    #             combined_results.append(combined_entry)

    # print(len(combined_results))
    break

Matches for manufacturer: hinomoto
0.55 | hm475 -> md200


In [None]:
filter()

[{'lad_manufacturer': 'FENDT',
  'lad_model': '512 VARIO',
  'tractordata_manufacturer': 'Fendt',
  'tractordata_model': '512 Vario',
  'specifications_lad': {'powertrain': '4x4',
   'engine_power_kw': 92.4668,
   'engine_cylinders': 4,
   'hydrolic_pump_flow': 158.0,
   'lift_capacity': 8045.0},
  'specifications_tractordata': {'engine_power_kw': 96.0,
   'fuel_capacity': 297.9,
   'powertrain': '4x4',
   'hydrolic_pump_flow': 74.9,
   'lift_capacity': 7780.0,
   'front_lift_capacity': 3420.0,
   'wheelbase': 2.56,
   'weight': 6050.0},
  'similarity_score': 0.9999998211860657},
 {'lad_manufacturer': 'KUBOTA',
  'lad_model': 'M5112',
  'tractordata_manufacturer': 'Kubota',
  'tractordata_model': 'M5112',
  'specifications_lad': {'powertrain': '4x4',
   'engine_power_kw': 84.2641,
   'engine_cylinders': 4,
   'hydrolic_pump_flow': 89.0,
   'lift_capacity': 4100.0},
  'specifications_tractordata': {'engine_power_kw': 84.6,
   'fuel_capacity': 104.8,
   'powertrain': '4x4'},
  'similarit