In [24]:
# This code combines all of the three gold standards into one big file

import pandas as pd

# Load the three label buckets (perfect matches, corner cases, non matches)
# for every dataset pair. Paths are relative to the working directory.

# EA / TM
perfect_matches_ea_tm, corner_cases_ea_tm, non_matches_ea_tm = map(pd.read_csv, (
    "ea_tm/perfect_matches_ea_tm.csv",
    "ea_tm/corner_cases_ea_tm.csv",
    "ea_tm/non_matches_ea_tm.csv",
))

# FM / EA (v2 files)
perfect_matches_fm_ea, corner_cases_fm_ea, non_matches_fm_ea = map(pd.read_csv, (
    "fm_ea/perfect_matches_fm_ea_v2.csv",
    "fm_ea/corner_cases_fm_ea_v2.csv",
    "fm_ea/non_matches_fm_ea_v2.csv",
))

# FM / TM (v2 files; note the file names spell the pair as "tm_fm")
perfect_matches_fm_tm, corner_cases_fm_tm, non_matches_fm_tm = map(pd.read_csv, (
    "fm_tm/perfect_matches_tm_fm_v2.csv",
    "fm_tm/corner_cases_tm_fm_v2.csv",
    "fm_tm/non_matches_tm_fm_v2.csv",
))

# Stack each bucket across the three pairs (original row indices are kept).
perfect_matches = pd.concat([perfect_matches_ea_tm, perfect_matches_fm_ea, perfect_matches_fm_tm])
corner_cases = pd.concat([corner_cases_ea_tm, corner_cases_fm_ea, corner_cases_fm_tm])
non_matches = pd.concat([non_matches_ea_tm, non_matches_fm_ea, non_matches_fm_tm])

# The combined gold standard: all buckets of all pairs in one frame.
gold_standard = pd.concat([perfect_matches, corner_cases, non_matches])

# Number of rows drawn from each bucket when building the per-pair gold standards.
number_extract_perfect_matches = 100
number_extract_corner_cases = 200
number_extract_non_matches = 200

# Every draw uses random_state=42, so re-running the cell yields the same rows.

# EA / TM
gold_standard_ea_tm = pd.concat([
    perfect_matches_ea_tm.sample(number_extract_perfect_matches, random_state=42),
    corner_cases_ea_tm.sample(number_extract_corner_cases, random_state=42),
    non_matches_ea_tm.sample(number_extract_non_matches, random_state=42),
])

# FM / EA
gold_standard_fm_ea = pd.concat([
    perfect_matches_fm_ea.sample(number_extract_perfect_matches, random_state=42),
    corner_cases_fm_ea.sample(number_extract_corner_cases, random_state=42),
    non_matches_fm_ea.sample(number_extract_non_matches, random_state=42),
])

# FM / TM
# The corner cases are taken in full here (no sampling) -- presumably because
# this bucket holds fewer rows than number_extract_corner_cases; TODO confirm.
gold_standard_fm_tm = pd.concat([
    perfect_matches_fm_tm.sample(number_extract_perfect_matches, random_state=42),
    corner_cases_fm_tm,
    non_matches_fm_tm.sample(number_extract_non_matches, random_state=42),
])

In [20]:
# Row count of the FM / TM non-matches bucket.
non_matches_fm_tm.shape[0]

200

In [26]:
# Write each per-pair gold standard to disk, keeping only the columns at
# positions 0, 1 and -1 and dropping the index.
for _sampled_frame, _csv_path in (
    (gold_standard_ea_tm, "gold_standard_ea_tm.csv"),
    (gold_standard_fm_ea, "gold_standard_fm_ea.csv"),
    (gold_standard_fm_tm, "gold_standard_fm_tm.csv"),
):
    _sampled_frame.iloc[:, [0, 1, -1]].to_csv(_csv_path, index=False)

In [2]:
def print_information(df, name):
    """Print the row count and the match / non-match split of ``df``.

    Rows whose ``match`` value compares equal to ``True`` are counted as
    matches; every other row is counted as a non-match. Percentages are
    rounded to two decimals. An empty frame reports ``0.0%`` for both
    groups instead of raising ``ZeroDivisionError``.

    Args:
        df (pd.DataFrame): Frame containing a ``match`` column.
        name (str): Label shown in the header line of the report.
    """
    total_rows = len(df)
    # Vectorised count; int() keeps the printed value a plain Python integer.
    overall_number_matches = int((df['match'] == True).sum())
    overall_number_nonmatches = total_rows - overall_number_matches

    def _percentage(count):
        # Guard against dividing by zero when the frame has no rows.
        if total_rows == 0:
            return 0.0
        return round(count / total_rows * 100, 2)

    print("-"*5, name, "-"*5)
    print(f"Number of rows: {total_rows}")
    print(f"Number of matches: {overall_number_matches}, {_percentage(overall_number_matches)}%")
    print(f"Number of non-matches: {overall_number_nonmatches}, {_percentage(overall_number_nonmatches)}%")
    print()

In [5]:
# Persist the combined gold standard. Only the columns at positions 0 and 1
# (the keys) and the last column (the match label) are written.
gold_standard_keys_and_label = gold_standard.iloc[:, [0, 1, -1]]
gold_standard_keys_and_label.to_csv("gold_standard.csv", index=False)

In [4]:
# Report the combined gold standard first, then every bucket of every pair,
# in the same order as they were loaded.
for _frame, _title in (
    (gold_standard, "Gold Standard"),
    (perfect_matches_ea_tm, "Perfect Matches EA and TM"),
    (corner_cases_ea_tm, "Corner Cases EA and TM"),
    (non_matches_ea_tm, "Non Matches EA and TM"),
    (perfect_matches_fm_ea, "Perfect Matches FM and EA"),
    (corner_cases_fm_ea, "Corner Cases FM and EA"),
    (non_matches_fm_ea, "Non Matches FM and EA"),
    (perfect_matches_fm_tm, "Perfect Matches FM and TM"),
    (corner_cases_fm_tm, "Corner Cases FM and TM"),
    (non_matches_fm_tm, "Non Matches FM and TM"),
):
    print_information(_frame, _title)

----- Gold Standard -----
Number of rows: 2820
Number of matches: 1703, 60.39%
Number of non-matches: 1117, 39.61%

----- Perfect Matches EA and TM -----
Number of rows: 158
Number of matches: 158, 100.0%
Number of non-matches: 0, 0.0%

----- Corner Cases EA and TM -----
Number of rows: 666
Number of matches: 527, 79.13%
Number of non-matches: 139, 20.87%

----- Non Matches EA and TM -----
Number of rows: 216
Number of matches: 0, 0.0%
Number of non-matches: 216, 100.0%

----- Perfect Matches FM and EA -----
Number of rows: 128
Number of matches: 128, 100.0%
Number of non-matches: 0, 0.0%

----- Corner Cases FM and EA -----
Number of rows: 356
Number of matches: 264, 74.16%
Number of non-matches: 92, 25.84%

----- Non Matches FM and EA -----
Number of rows: 396
Number of matches: 0, 0.0%
Number of non-matches: 396, 100.0%

----- Perfect Matches FM and TM -----
Number of rows: 544
Number of matches: 544, 100.0%
Number of non-matches: 0, 0.0%

----- Corner Cases FM and TM -----
Number of

# Create Train-Test-Splits

In [14]:
from sklearn.model_selection import train_test_split

def create_goldstandard_train_test_split(df, test_size=0.2, min_train_rows=500):
    """Split gold-standard rows into a training and a testing frame.

    The ``min_train_rows`` check is advisory only: when ``min_train_rows``
    is not smaller than the total number of rows in ``df`` a message is
    printed, and the split is carried out regardless. The threshold is
    compared against ``len(df)``, not against the size of the resulting
    training frame.

    Args:
        df (pd.DataFrame): The gold-standard rows to split.
        test_size (float): Forwarded to ``train_test_split`` as ``test_size``.
        min_train_rows (int): Threshold compared against ``len(df)``.

    Returns:
        tuple: ``(train, test)`` as produced by ``train_test_split`` with
        ``random_state=42``.
    """
    if len(df) <= min_train_rows:
        print(f"The minimum number of rows in the training dataset should be less than the total number of rows. Dataset has {len(df)} rows. Minimum number of rows in the training dataset is {min_train_rows}.")

    # Fixed seed so repeated runs produce the same partition.
    train_part, test_part = train_test_split(df, test_size=test_size, random_state=42)
    return train_part, test_part


def create_goldstandard_balance_1_2_2(perfect_matches:pd.DataFrame, corner_cases:pd.DataFrame, non_matches:pd.DataFrame) -> pd.DataFrame:
    """Sample the three buckets into a gold standard with an exact 1:2:2 ratio.

    The result contains ``p`` perfect matches, ``2 * p`` corner cases and
    ``2 * p`` non matches, where ``p`` is the largest count all three inputs
    can supply. Integer arithmetic is used throughout, so the ratio also
    holds when the limiting bucket has an odd number of rows (one row of
    that bucket is then left out).

    The three chosen sample sizes are printed as a progress hint.

    Args:
        perfect_matches (pd.DataFrame): Candidate perfect-match rows.
        corner_cases (pd.DataFrame): Candidate corner-case rows.
        non_matches (pd.DataFrame): Candidate non-match rows.

    Returns:
        pd.DataFrame: Sampled perfect matches, corner cases and non matches
        concatenated in that order; original row indices are kept.
    """
    # Corner cases and non matches must have equal size, so the smaller of
    # the two bounds both.
    paired_limit = min(len(corner_cases), len(non_matches))

    # p is bounded by the available perfect matches and by half the paired limit.
    num_perfect_matches = min(len(perfect_matches), paired_limit // 2)
    num_corner_cases = 2 * num_perfect_matches
    num_non_matches = 2 * num_perfect_matches
    print(num_perfect_matches, num_corner_cases, num_non_matches)

    # Same seed for every bucket so the selection is reproducible.
    return pd.concat([
        perfect_matches.sample(n=num_perfect_matches, random_state=42),
        corner_cases.sample(n=num_corner_cases, random_state=42),
        non_matches.sample(n=num_non_matches, random_state=42),
    ])

In [15]:
gold_standards = [
    'ea_tm',
    'fm_ea',
    'fm_tm'
]


def _read_bucket_preferring_v2(pair_name: str, bucket: str) -> pd.DataFrame:
    """Load one bucket CSV of a dataset pair, preferring the ``_v2`` file.

    Only a missing ``_v2`` file triggers the fallback to the unversioned
    file; any other failure (for example a malformed CSV) is raised so it is
    not silently replaced by older data.

    Args:
        pair_name (str): Dataset pair, used as folder name and file suffix.
        bucket (str): File-name prefix of the bucket, e.g. ``"corner_cases"``.

    Returns:
        pd.DataFrame: The parsed CSV.
    """
    try:
        return pd.read_csv(f"{pair_name}/{bucket}_{pair_name}_v2.csv")
    except FileNotFoundError:
        return pd.read_csv(f"{pair_name}/{bucket}_{pair_name}.csv")


# NOTE(review): the first cell reads the fm_tm v2 files as
# "fm_tm/<bucket>_tm_fm_v2.csv", while the pattern above looks for
# "fm_tm/<bucket>_fm_tm_v2.csv". For 'fm_tm' this loop therefore appears to
# fall back to the unversioned files -- confirm that this is intended.

# The loop variable is deliberately not called `gold_standard`, so that the
# combined `gold_standard` DataFrame built earlier in the notebook is not
# overwritten by a string.
for pair_name in gold_standards:
    df_perfect_matches = _read_bucket_preferring_v2(pair_name, "perfect_matches")
    df_corner_cases = _read_bucket_preferring_v2(pair_name, "corner_cases")
    df_non_matches = _read_bucket_preferring_v2(pair_name, "non_matches")

    df_balanced = create_goldstandard_balance_1_2_2(df_perfect_matches, df_corner_cases, df_non_matches)
    train, test = create_goldstandard_train_test_split(df_balanced)
    train.to_csv(f"gold_standard_{pair_name}_train.csv", index=False)
    test.to_csv(f"gold_standard_{pair_name}_test.csv", index=False)

108.0 216 216
128 256 256
92.0 184 184
The minimum number of rows in the training dataset should be less than the total number of rows. Dataset has 460 rows. Minimum number of rows in the training dataset is 500.
