In [29]:
import pandas as pd
import json

# File paths
model_output_file = "/home/hermann/ditto/output/matches.jsonl"  # Update with your Ditto output file path
ground_truth_file = "../ground_truth.csv"  # Update with your ground truth file path


columns = [
    "c1.company_id", "c1.company_name", "c1.industry", "c1.sector", "c1.categories",
    "c1.company_status", "c1.company_type", "c1.address", "c1.postal_code", "c1.city", "c1.country", "c1.state",
    "c1.foundation_year", "c1.registration_date", "c1.website", "c1.rank", "c1.market_cap_or_valuation",
    "c1.number_of_employees", "c1.ceo", "c1.assets", "c1.profit_or_net_income", "c1.revenue", 
    "c1.share_price", "c1.change_1_day", "c1.change_1_year", "c1.debts", "c1.phone", "c1.notes_or_description",
    "c2.company_id", "c2.company_name", "c2.industry", "c2.sector", "c2.categories",
    "c2.company_status", "c2.company_type", "c2.address", "c2.postal_code", "c2.city", "c2.country", "c2.state",
    "c2.foundation_year", "c2.registration_date", "c2.website", "c2.rank", "c2.market_cap_or_valuation",
    "c2.number_of_employees", "c2.ceo", "c2.assets", "c2.profit_or_net_income", "c2.revenue", 
    "c2.share_price", "c2.change_1_day", "c2.change_1_year", "c2.debts", "c2.phone", "c2.notes_or_description",
    "similarity_score", "is_match"
]


In [30]:
def process_jsonl(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            record = json.loads(line)
            left = record.get("left", {})
            right = record.get("right", {})
            
            row = {
                "c1.company_id": left.get("company_id", ""),
                "c1.company_name": left.get("company_name", ""),
                "c1.industry": left.get("industry", ""),
                "c1.sector": left.get("sector", ""),
                "c1.categories": left.get("categories", ""),
                "c1.company_status": left.get("company_status", ""),
                "c1.company_type": left.get("company_type", ""),
                "c1.address": left.get("address", ""),
                "c1.postal_code": left.get("postal_code", ""),
                "c1.city": left.get("city", ""),
                "c1.country": left.get("country", ""),
                "c1.state": left.get("state", ""),
                "c1.foundation_year": left.get("foundation_year", ""),
                "c1.registration_date": left.get("registration_date", ""),
                "c1.website": left.get("website", ""),
                "c1.rank": left.get("rank", ""),
                "c1.market_cap_or_valuation": left.get("market_cap_or_valuation", ""),
                "c1.number_of_employees": left.get("number_of_employees", ""),
                "c1.ceo": left.get("ceo", ""),
                "c1.assets": left.get("assets", ""),
                "c1.profit_or_net_income": left.get("profit_or_net_income", ""),
                "c1.revenue": left.get("revenue", ""),
                "c1.share_price": left.get("share_price", ""),
                "c1.change_1_day": left.get("change_1_day", ""),
                "c1.change_1_year": left.get("change_1_year", ""),
                "c1.debts": left.get("debts", ""),
                "c1.phone": left.get("phone", ""),
                "c1.notes_or_description": left.get("notes_or_description", ""),
                "c2.company_id": right.get("company_id", ""),
                "c2.company_name": right.get("company_name", ""),
                "c2.industry": right.get("industry", ""),
                "c2.sector": right.get("sector", ""),
                "c2.categories": right.get("categories", ""),
                "c2.company_status": right.get("company_status", ""),
                "c2.company_type": right.get("company_type", ""),
                "c2.address": right.get("address", ""),
                "c2.postal_code": right.get("postal_code", ""),
                "c2.city": right.get("city", ""),
                "c2.country": right.get("country", ""),
                "c2.state": right.get("state", ""),
                "c2.foundation_year": right.get("foundation_year", ""),
                "c2.registration_date": right.get("registration_date", ""),
                "c2.website": right.get("website", ""),
                "c2.rank": right.get("rank", ""),
                "c2.market_cap_or_valuation": right.get("market_cap_or_valuation", ""),
                "c2.number_of_employees": right.get("number_of_employees", ""),
                "c2.ceo": right.get("ceo", ""),
                "c2.assets": right.get("assets", ""),
                "c2.profit_or_net_income": right.get("profit_or_net_income", ""),
                "c2.revenue": right.get("revenue", ""),
                "c2.share_price": right.get("share_price", ""),
                "c2.change_1_day": right.get("change_1_day", ""),
                "c2.change_1_year": right.get("change_1_year", ""),
                "c2.debts": right.get("debts", ""),
                "c2.phone": right.get("phone", ""),
                "c2.notes_or_description": right.get("notes_or_description", ""),
                "match_confidence": record.get("match_confidence", ""),
                "is_match": record.get("match", "")
            }
            data.append(row)
    
    df = pd.DataFrame(data, columns=columns)
    return df

In [31]:
matches_df = process_jsonl(model_output_file)
display(matches_df.columns)

Index(['c1.company_id', 'c1.company_name', 'c1.industry', 'c1.sector',
       'c1.categories', 'c1.company_status', 'c1.company_type', 'c1.address',
       'c1.postal_code', 'c1.city', 'c1.country', 'c1.state',
       'c1.foundation_year', 'c1.registration_date', 'c1.website', 'c1.rank',
       'c1.market_cap_or_valuation', 'c1.number_of_employees', 'c1.ceo',
       'c1.assets', 'c1.profit_or_net_income', 'c1.revenue', 'c1.share_price',
       'c1.change_1_day', 'c1.change_1_year', 'c1.debts', 'c1.phone',
       'c1.notes_or_description', 'c2.company_id', 'c2.company_name',
       'c2.industry', 'c2.sector', 'c2.categories', 'c2.company_status',
       'c2.company_type', 'c2.address', 'c2.postal_code', 'c2.city',
       'c2.country', 'c2.state', 'c2.foundation_year', 'c2.registration_date',
       'c2.website', 'c2.rank', 'c2.market_cap_or_valuation',
       'c2.number_of_employees', 'c2.ceo', 'c2.assets',
       'c2.profit_or_net_income', 'c2.revenue', 'c2.share_price',
       'c2.cha

In [32]:
ground_truth_df = pd.read_csv(ground_truth_file)
display(ground_truth_df.columns)

Index(['c1.company_id', 'c1.company_name', 'c1.industry', 'c1.sector',
       'c1.categories', 'c1.company_status', 'c1.company_type', 'c1.address',
       'c1.postal_code', 'c1.city', 'c1.country', 'c1.state',
       'c1.foundation_year', 'c1.registration_date', 'c1.website', 'c1.rank',
       'c1.market_cap_or_valuation', 'c1.number_of_employees', 'c1.ceo',
       'c1.assets', 'c1.profit_or_net_income', 'c1.revenue', 'c1.share_price',
       'c1.change_1_day', 'c1.change_1_year', 'c1.debts', 'c1.phone',
       'c1.notes_or_description', 'c2.company_id', 'c2.company_name',
       'c2.industry', 'c2.sector', 'c2.categories', 'c2.company_status',
       'c2.company_type', 'c2.address', 'c2.postal_code', 'c2.city',
       'c2.country', 'c2.state', 'c2.foundation_year', 'c2.registration_date',
       'c2.website', 'c2.rank', 'c2.market_cap_or_valuation',
       'c2.number_of_employees', 'c2.ceo', 'c2.assets',
       'c2.profit_or_net_income', 'c2.revenue', 'c2.share_price',
       'c2.cha

In [41]:
def keep_only_intersection(matches_df, ground_truth_df):
    # Identify the columns that should be used for comparison (excluding match_confidence, match, similarity_score, is_match)
    common_columns = [col for col in matches_df.columns if col not in ["match_confidence", "match", "similarity_score", "is_match"]]

    # Perform an intersection by merging on common columns
    intersection_df = pd.merge(matches_df, ground_truth_df, on=common_columns, how="inner")

    return intersection_df

In [None]:
filtered_df = keep_only_intersection(matches_df, ground_truth_df)

print(f"Ground truth size: {len(ground_truth_df)}")
print(f"Match size: {len(matches_df)}")
print(f"Filtered size: {len(filtered_df)}")

display(filtered_df.head())

Ground truth size: 504
Match size: 15030
Filtered size: 15030


Unnamed: 0,c1.company_id,c1.company_name,c1.industry,c1.sector,c1.categories,c1.company_status,c1.company_type,c1.address,c1.postal_code,c1.city,...,c2.profit_or_net_income,c2.revenue,c2.share_price,c2.change_1_day,c2.change_1_year,c2.debts,c2.phone,c2.notes_or_description,similarity_score,is_match
0,c11c4decba8742b8853058c68875e0b6,1-800-flowers.com,,,"['Consumer Cyclical', 'Retail']",,,,,,...,,,,,,,,headquarter,,1
1,1804,1-800-flowers.com,,,,,,2 jericho plz ste 200,117531681.0,jericho,...,,$2.2b,,,,,1 516 2376000,,,1
2,e33d30e1d309429198c1f23eafcb88db,10x genomics,,,"['Genomics', 'Biotech']",,,,,,...,-58 million usd,490 million usd,,,,201 million usd,,,,1
3,2feb10d62d3545c585a116727108a8d5,11,,,"['Telecommunication', 'Internet']",,,,,,...,,,,,,,,,,1
4,,11 bit studios,technology,software,,,,,,,...,,19.27,,,,,,,,1
