In [None]:
import os

from typing import List

import pandas as pd
import numpy as np

from matching.utils.pair_generation import naive_pair_generation
from matching.utils.db.extract import extract_labelled_samples
from matching.utils.db.features import get_relevant_features

import matching.config as conf

# Project settings object; `config.drop_foreign_keys` is consulted further down
# to decide whether the foreign-key columns are removed from `data`.
config = conf.Settings()

# Data Loading


In [None]:
from matching.utils.db.dump import dump_database

# Dump the database to "data/db.csv" using connection details taken from the
# environment (the next cell re-reads that path, so this presumably writes the
# CSV there -- TODO confirm against dump_database).
#
# The username used to be read from NEO4J_URI (copy-paste slip), which sent the
# connection URI as the login name. NEO4J_USERNAME is the conventional variable
# name -- adjust if your environment exports it under a different one.
data = dump_database(
    path="data/db.csv",
    uri=os.getenv("NEO4J_URI"),
    username=os.getenv("NEO4J_USERNAME"),
    password=os.getenv("NEO4J_PASSWORD"),
)

In [None]:
# Read the dumped CSV (its first column is used as the index) and pass the
# frame straight through get_relevant_features.
data = pd.read_csv("data/db.csv", index_col=0).pipe(get_relevant_features)

## Embedding Generation

Only run this code if you do not have an embeddings database available to you.


In [None]:
from matching.embed.openai import OpenAIEmbedder
from matching.utils.db.features import get_relevant_features

# Optional step: feed the feature frame to the OpenAI embedder, which is bound
# to the "openai_small" collection.
# NOTE(review): this read omits index_col=0, unlike the data-loading cell, and
# rebinds `data` -- confirm whether the differing index matters downstream.
data = get_relevant_features(pd.read_csv("./data/db.csv"))

embedder = OpenAIEmbedder(collection_name="openai_small")
embedder.batch_embed(data)

# Data augmentation prior to pair generation


# Labelled Sample extraction

This cell iterates over the dataset and extracts all pairs for which foreign keys such as `munichNumber` or `wccpNumber` are present. We assume a clean-to-clean mapping between each foreign key and the object it refers to.


In [None]:
# Labelled samples are cached on disk: extract and store them on the first run,
# read them back from the CSV on every later run.
found_samples_path = "data/found_samples.csv"

if not os.path.exists(found_samples_path):
    found_samples = extract_labelled_samples(data)
    found_samples.to_csv(found_samples_path, index=False)
else:
    found_samples = pd.read_csv(found_samples_path)

found_samples.head()

In [None]:
# Optionally remove the foreign-key columns from the feature frame once the
# labelled samples have been extracted from them.
#
# The frame is rebound instead of mutated in place, and errors="ignore" skips
# columns that are already gone, so re-running this cell no longer raises
# KeyError on the second execution.
if config.drop_foreign_keys:
    data = data.drop(
        columns=[
            "munichNumber",
            "marburgNumber",
            "errNumber",
            "linzNumber",
            "wccpNumber",
        ],
        errors="ignore",
    )

# Pair generation


In [None]:
# Build the positive pairs: for every labelled sample, look up the records its
# URIs point to, generate pairs among them, and tag both the pairs and the
# records with the sample's index as `match_id`.
#
# Frames are collected in lists and concatenated once after the loop; calling
# pd.concat on a growing frame inside the loop re-copies all rows each time.
pair_frames: List[pd.DataFrame] = []
# Seeded with an empty "match_id" frame so that column stays first in
# `available_data`, exactly as with the previous incremental concatenation.
available_frames: List[pd.DataFrame] = [pd.DataFrame(columns=["match_id"])]

for index, row in found_samples.iterrows():
    record_frames: List[pd.DataFrame] = []

    for col in found_samples.columns:
        value = row[col]
        # Skip absent references: NaN/None/pd.NA, plus the literal string "nan"
        # that the original string-based check also treated as missing.
        if pd.isna(value) or str(value) == "nan":
            continue

        # Single lookup, reused for both the diagnostic and the result.
        records = data[data["uri"] == value]
        if len(records) == 0:
            print("No match found for", value)
            print(row)
        record_frames.append(records)

    # With no usable reference at all, fall back to an empty frame (as before).
    matched_data: pd.DataFrame = (
        pd.concat(record_frames) if record_frames else pd.DataFrame()
    )

    pairs = naive_pair_generation(matched_data)
    pairs["match_id"] = index
    pair_frames.append(pairs)

    matched_data["match_id"] = int(index)
    available_frames.append(matched_data)

# pd.concat raises on an empty list, so keep the empty-frame result explicit.
matches = (
    pd.concat(pair_frames).reset_index(drop=True) if pair_frames else pd.DataFrame()
)
available_data = pd.concat(available_frames)
matches

## Add negative samples


In [None]:
from matching.generation.neighbourhood import Neighbourhood
from matching.generation.pair_generator import PairGenerator


# Pair generator over the records collected in `available_data`.
pair_generator = PairGenerator(data=available_data)

# Neighbourhood lookup over the full frame, tied to the "openai_small"
# collection and configured with two neighbours.
neighbourhood = Neighbourhood(data=data, collection_name="openai_small", num_neighbours=2)

# Attach the neighbourhood to the generator.
pair_generator.neighbourhood_generator = neighbourhood

In [None]:
# Draw one negative pair per positive pair: combine one side of the match with
# a neighbour of the other side and pick one such combination at random.
#
# NOTE(review): nothing here excludes the true partner from the neighbourhood;
# if find_neighbourhood can return it, a real match would be labelled 0 --
# confirm against the Neighbourhood implementation.
np.random.seed(0)

# Selected rows are gathered and concatenated once after the loop instead of
# growing `negative_samples` with pd.concat on every iteration.
negative_rows: List[pd.DataFrame] = []

for _, row in matches.iterrows():
    left = row[[col for col in row.index if col.startswith("1_")]]
    right = row[[col for col in row.index if col.startswith("2_")]]

    # The lookup is given the record with the "1_"/"2_" prefix stripped.
    neighbourhood_left = neighbourhood.find_neighbourhood(left.rename(lambda x: x[2:]))
    neighbourhood_right = neighbourhood.find_neighbourhood(
        right.rename(lambda x: x[2:])
    )

    # Left record paired with each neighbour of the right record ...
    candidates = [
        pd.concat([left, neighbour.rename(lambda x: "2_" + x)])
        for _, neighbour in neighbourhood_right.iterrows()
    ]
    # ... followed by each neighbour of the left record paired with the right.
    candidates += [
        pd.concat([neighbour.rename(lambda x: "1_" + x), right])
        for _, neighbour in neighbourhood_left.iterrows()
    ]

    if not candidates:
        # np.random.randint(0, 0) raises ValueError; a pair without any
        # neighbour simply contributes no negative sample.
        continue

    chosen = candidates[np.random.randint(0, len(candidates))]
    negative_rows.append(chosen.to_frame().T)

# pd.concat raises on an empty list, so keep the empty-frame result explicit.
negative_samples = pd.concat(negative_rows) if negative_rows else pd.DataFrame()
negative_samples["label"] = 0

# Neighbour records may carry a match_id column; it must not reach the output.
negative_samples = negative_samples.loc[
    :, ~negative_samples.columns.str.contains("match_id")
].copy()

negative_samples.head()

In [None]:
# Merge positives and negatives, shuffle, drop the match_id bookkeeping column
# and write the result (plus a de-duplicated variant) to disk.
#
# The shuffle is seeded explicitly: without random_state, DataFrame.sample
# draws from the global NumPy RNG, so the written file depended on whatever
# state earlier cells left behind and changed when this cell was re-run alone.
labelled_data = (
    pd.concat([matches, negative_samples])
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)
labelled_data = labelled_data.loc[:, ~labelled_data.columns.str.contains("match_id")]
labelled_data.to_csv("data/labelled_data.csv", index=False)

labelled_data.drop_duplicates().to_csv("data/labelled_data_dropped.csv", index=False)

In [None]:
# Report how many positive and negative pairs were produced.
for caption, frame in (
    ("Positive samples: ", matches),
    ("Negative samples: ", negative_samples),
):
    print(caption, len(frame))