In [80]:
# Find Incremental Matches Demo

In [2]:
% pip install -r requirements.txt

UsageError: Line magic function `%` not found.


In [3]:
# Standard Library
import csv
import string
import random
from typing import List, Dict

# Third Party Library
import attr
from attrs_mate import AttrsClass
from pathlib_mate import Path

from boto_session_manager import BotoSesManager
from s3pathlib import S3Path, context

import pandas as pd
import awswrangler as wr

from faker import Faker
from rich import print as rprint

# Define Some Utility Functions

First, we need to define some utility functions to keep our code clean.

In [22]:
def add_noise_to_text(text: str, n_noise: int) -> str:
    """
    Randomly add noise character to string.

    Example::

        >>> add_noise_to_text("1234567890", n_noise=3)
        123d56e890
    """
    length = len(text)
    if n_noise > length:
        raise ValueError
    chars = list(text)
    for _ in range(n_noise):
        chars[random.randint(1, length) - 1] = random.choice(string.ascii_lowercase)
    return "".join(chars)


def rand_phone_number() -> str:
    """
    Generate random phone number.

    Example::

        123-456-7890
    """
    numbers = [str(random.randint(0, 9)) for _ in range(10)]
    return "".join(
        numbers[:3]
        + [
            "-",
        ]
        + numbers[3:6]
        + [
            "-",
        ]
        + numbers[6:]
    )


def add_noise_to_phone(phone: str, n_noise: int) -> str:
    """
    Example::

        >>> add_noise_to_phone("111-222-3333", n_noise=1)
        111-222-3353
    """
    chars = list(phone)
    positions = [0, 1, 2, 4, 5, 6, 8, 9, 10]
    for ind in random.sample(positions, n_noise):
        chars[ind] = random.choice(string.digits)
    return "".join(chars)


fake = Faker(locale="en-US")


@attr.define
class Person(AttrsClass):
    """
    A variation of a true person.
    """
    firstname: str = attr.ib()
    lastname: str = attr.ib()
    phone: str = attr.ib()


@attr.define
class TruePerson(AttrsClass):
    """
    represent a concrete human person. he/she could have multiple
    name, phone.
    """
    id: int = attr.ib()
    firstname_list: str = attr.ib()
    lastname_list: str = attr.ib()
    phone_list: str = attr.ib()

    @classmethod
    def random(cls, id: int) -> 'TruePerson':
        firstname = fake.first_name()
        lastname = fake.last_name()
        phone = rand_phone_number()

        firstname_list = [
            firstname,
        ]
        lastname_list = [
            lastname,
        ]
        phone_list = [
            phone,
        ]
        for _ in range(2):
            firstname_list.append(add_noise_to_text(firstname, random.randint(1, 2)))
            lastname_list.append(add_noise_to_text(lastname, random.randint(1, 2)))
            phone_list.append(add_noise_to_phone(phone, random.randint(0, 1)))

        return cls(
            id=id,
            firstname_list=firstname_list,
            lastname_list=lastname_list,
            phone_list=phone_list,
        )

    def to_person(self) -> 'Person':
        return Person(
            firstname=random.choice(self.firstname_list),
            lastname=random.choice(self.lastname_list),
            phone=random.choice(self.phone_list),
        )

In [23]:
print("------ Sample TruePerson ------")
t_person = TruePerson.random(id=1)
rprint(t_person.to_dict())

------ Sample TruePerson ------


In [24]:
print("------ Sample Person ------")
person = t_person.to_person()
rprint(person.to_dict())

------ Sample Person ------


## Define Your Test Environment Configuration

We need to define the AWS Credential, the AWS S3 bucket we use to store the data and the Glue Catalog database / table name.

In [55]:
# Define AWS boto3 credentials for session
bsm = BotoSesManager(profile_name="aws_data_lab_sanhe_us_east_2")
context.attach_boto_session(boto_ses=bsm.boto_ses)

# Where you want to store your test data locally
dir_here = Path.cwd()

# Where you want to store your test data on S3
s3path_prefix = S3Path.from_s3_uri(
    "s3://aws-data-lab-sanhe-for-everything-us-east-2/poc/2022-05-18-glue-find-matches/find-incr-matches/"
).to_dir()

# Clear all existing data
for p in dir_here.select_by_ext(".csv"):
    p.remove_if_exists()
s3path_prefix.delete_if_exists()

path_all_csv = Path(dir_here, "all.csv")
path_records_csv = Path(dir_here, "records.csv")
path_labels_csv = Path(dir_here, "labels.csv")
path_tests_csv = Path(dir_here, "tests.csv")
path_predicts_csv = Path(dir_here, "predicts.csv")
path_compares_csv = Path(dir_here, "compares.csv")

s3path_records = S3Path(s3path_prefix, "records")
s3path_labels = S3Path(s3path_prefix, "labels")
s3path_tests = S3Path(s3path_prefix, "tests")
s3path_predicts = S3Path(s3path_prefix, "predicts")

# Glue Catalog config
db_name = "learn_glue_find_matches"
tb_name_records = "records"
tb_name_labels = "labels"
tb_name_tests = "tests"
tb_name_predicts = "predicts"

# pandas.to_csv default keyword arguments
quoting = csv.QUOTE_NONNUMERIC
pd_to_csv_kwargs = dict(
    sep=",",
    index=False,
    header=True,
    quoting=quoting,
)
# awswrangler.s3.to_csv default keyword arguments
wr_to_csv_kwargs = dict(
    sep=",",
    index=False,
    header=True,
    quoting=quoting,
)

## Define Your Test Dataset Configuration

We need to define some statistics information about our test dataset

- ``n_label_set_id``: based on [official document](https://docs.aws.amazon.com/glue/latest/dg/machine-learning.html), the training dataset should be split into "chunks". Each chunk is a "labeling_set". Within each "labeling_set", you could have many records (no more than 300, it helps the ML training fast and efficient), and those records can be clustered into different "group". Each group should have a "label". Records belongs to the same "group" considered as a "match". With in each "labeling_set", you should have some "match group" that has many records, and also have some "non match group" that has only one record, which indicate that this record doesn't match any of the others. **This config defines the total number of labeling set you want to generate**.
- ``n_label_per_set``: The number of records in each labeling set.
- ``n_sample_list``: records in each labeling set will be divided into different group, this config defines the number of records for each group.


# Generate Training Data

In [73]:
columns_all = "labeling_set_id,label,id,tid,firstname,lastname,phone".split(",")
columns_train = "id,firstname,lastname,phone".split(",")
columns_label = "labeling_set_id,label,id,firstname,lastname,phone".split(",")


def generate_dataset(
    start_id: int,
    start_tid: int,
    n_labeling_set_id: int,
    hist: List[int],
) -> pd.DataFrame:
    """
    :param start_id: 全局唯一的 Id 的初始值. 例如生成 1000 条数据, 起始值是 1,
        那么就会 Id 就是 1 ~ 1000.
    :param start_tid: True Person Id 的初始值. 数据集包含 100 个 TruePerson,
        起始值是 1, 那么就会 Tid 就是 1 ~ 100.
    :param n_labeling_set_id: 整个数据集分成多少个 labeling set. 对于 training
        而言, 每个 labeling set 是一个独立的数据集, 不同 labeling set 之间的
        数据不会被互相匹配.
    :param hist: 在每个 labeling set 内, 属于同一个 True Person 数据的频次统计.
        例如 [10, 6, 3, 1] 就代表一个 labeling set 内, 有 4 个 True Person,
        其中 10 条数据都是同一个人的数据的变种. 其中 6, 3, 1 条数据都是 (下略).
    """
    # global unique identifier for each records
    id = start_id - 1

    # the True Person id, if multiple records has the same tid,
    # they should considered as a match
    # We can use this value to validate the ML predict
    tid = start_tid - 1

    rows = list()
    for labeling_set_id in range(1, 1 + n_labeling_set_id):
        for label, n_sample in enumerate(hist, start=1):
            tid += 1
            true_person = TruePerson.random(tid)
            for _ in range(n_sample):
                id += 1
                person = true_person.to_person()
                row = dict(
                    labeling_set_id=f"LabelingSetId-{labeling_set_id}",
                    label=f"Label-{label}",
                    id=f"PersonId-{str(id).zfill(5)}",
                    tid=f"TrueId-{tid}",
                    firstname=person.firstname,
                    lastname=person.lastname,
                    phone=person.phone,
                )
                rows.append(row)

    df = pd.DataFrame(
        rows,
        columns=columns_all,
    )
    return df

In [74]:
df_all_for_train = generate_dataset(
    start_id=1,
    start_tid=1,
    n_labeling_set_id=1000,
    hist=[20, 10, 6, 3, 1],
)

In [75]:
print("------ training dataset with label ------")
print(f"{df_all_for_train.shape[0]} rows, {df_all_for_train.shape[1]} columns")
df_all_for_train.head(40)

------ training dataset with label ------
40000 rows, 7 columns


Unnamed: 0,labeling_set_id,label,id,tid,firstname,lastname,phone
0,LabelingSetId-1,Label-1,PersonId-00001,TrueId-1,Christopher,rjott,675-862-4727
1,LabelingSetId-1,Label-1,PersonId-00002,TrueId-1,Chhistophnr,Scott,675-832-4727
2,LabelingSetId-1,Label-1,PersonId-00003,TrueId-1,Chhistophnr,Scoyt,675-862-4727
3,LabelingSetId-1,Label-1,PersonId-00004,TrueId-1,Chhistophnr,Scoyt,675-832-4727
4,LabelingSetId-1,Label-1,PersonId-00005,TrueId-1,Christopher,Scott,675-832-4727
5,LabelingSetId-1,Label-1,PersonId-00006,TrueId-1,Christopher,rjott,675-832-4727
6,LabelingSetId-1,Label-1,PersonId-00007,TrueId-1,Chriztorher,Scoyt,675-862-4727
7,LabelingSetId-1,Label-1,PersonId-00008,TrueId-1,Chhistophnr,Scott,675-832-4727
8,LabelingSetId-1,Label-1,PersonId-00009,TrueId-1,Christopher,Scoyt,675-832-4727
9,LabelingSetId-1,Label-1,PersonId-00010,TrueId-1,Christopher,rjott,675-832-4727


In [76]:
df_train = df_all_for_train[columns_train]
df_label = df_all_for_train[columns_label]

In [77]:
print("------ training dataset WITHOUT label ------")
print(f"{df_train.shape[0]} rows, {df_train.shape[1]} columns")
df_train.head(40)

------ training dataset WITHOUT label ------
40000 rows, 4 columns


Unnamed: 0,id,firstname,lastname,phone
0,PersonId-00001,Christopher,rjott,675-862-4727
1,PersonId-00002,Chhistophnr,Scott,675-832-4727
2,PersonId-00003,Chhistophnr,Scoyt,675-862-4727
3,PersonId-00004,Chhistophnr,Scoyt,675-832-4727
4,PersonId-00005,Christopher,Scott,675-832-4727
5,PersonId-00006,Christopher,rjott,675-832-4727
6,PersonId-00007,Chriztorher,Scoyt,675-862-4727
7,PersonId-00008,Chhistophnr,Scott,675-832-4727
8,PersonId-00009,Christopher,Scoyt,675-832-4727
9,PersonId-00010,Christopher,rjott,675-832-4727


In [78]:
print("------ training dataset label ONLY  ------")
print(f"{df_label.shape[0]} rows, {df_label.shape[1]} columns")
df_label.head(40)

------ training dataset label ONLY  ------
40000 rows, 6 columns


Unnamed: 0,labeling_set_id,label,id,firstname,lastname,phone
0,LabelingSetId-1,Label-1,PersonId-00001,Christopher,rjott,675-862-4727
1,LabelingSetId-1,Label-1,PersonId-00002,Chhistophnr,Scott,675-832-4727
2,LabelingSetId-1,Label-1,PersonId-00003,Chhistophnr,Scoyt,675-862-4727
3,LabelingSetId-1,Label-1,PersonId-00004,Chhistophnr,Scoyt,675-832-4727
4,LabelingSetId-1,Label-1,PersonId-00005,Christopher,Scott,675-832-4727
5,LabelingSetId-1,Label-1,PersonId-00006,Christopher,rjott,675-832-4727
6,LabelingSetId-1,Label-1,PersonId-00007,Chriztorher,Scoyt,675-862-4727
7,LabelingSetId-1,Label-1,PersonId-00008,Chhistophnr,Scott,675-832-4727
8,LabelingSetId-1,Label-1,PersonId-00009,Christopher,Scoyt,675-832-4727
9,LabelingSetId-1,Label-1,PersonId-00010,Christopher,rjott,675-832-4727


In [79]:
# Dump to local and s3
path_train_csv = Path(dir_here, "01-train.csv")
df_train.to_csv(path_train_csv, **pd_to_csv_kwargs)

path_label_csv = Path(dir_here, "02-label.csv")
df_label.to_csv(path_label_csv, **pd_to_csv_kwargs)

s3path_train_csv = S3Path(s3path_prefix, "01-train", "1.csv")
wr.s3.to_csv(df_train, path=s3path_train_csv.uri, **wr_to_csv_kwargs)
print(f"created {s3path_train_csv.uri}")
print(f"  preview at {s3path_train_csv.uri}")

s3path_label_csv = S3Path(s3path_prefix, "02-label", "1.csv")
wr.s3.to_csv(df_label, path=s3path_label_csv.uri, **wr_to_csv_kwargs)
print(f"created {s3path_label_csv.uri}")
print(f"  preview at {s3path_label_csv.uri}")

created s3://aws-data-lab-sanhe-for-everything-us-east-2/poc/2022-05-18-glue-find-matches/find-incr-matches/01-train/1.csv
  preview at s3://aws-data-lab-sanhe-for-everything-us-east-2/poc/2022-05-18-glue-find-matches/find-incr-matches/01-train/1.csv
created s3://aws-data-lab-sanhe-for-everything-us-east-2/poc/2022-05-18-glue-find-matches/find-incr-matches/02-label/1.csv
  preview at s3://aws-data-lab-sanhe-for-everything-us-east-2/poc/2022-05-18-glue-find-matches/find-incr-matches/02-label/1.csv


# Genereate Test Data

Now, let's take a look at the dataset.

In [111]:
df_all_for_test = generate_dataset(
    start_id=40001,
    start_tid=1,
    n_labeling_set_id=1000,
    hist=[20, 10, 6, 3, 1],
).drop(columns=["labeling_set_id", "label"])

df_initial_match = df_all_for_test.sample(frac=0.7)[columns_train]
df_initial_match = df_initial_match.sort_values(by="id")
df_incremental_match = df_all_for_test.loc[df_all_for_test.index.difference(df_initial_match.index),columns_train]

我们把测试数据分成两份, 一份 70% 作为 initial match, 另一份 30% 作为 incremental match. 这两份中必然有一些数据是属于同一个 TruePerson 的. initial match 之后很多数据会被标上 ``match_id``, 我们希望用 incremental match 之后, 属于同一个 TruePerson 的数据能跟 initial match 中标记的 ``match_id`` 对应上.

我们先简单的预览一下 initial 和 incremental 两份数据. 可以看出 initial 和 incremental 分别有 28k 和 12k 条数据, 总和能跟前面的 ``1000 * sum([20, 10, 6, 3, 1]) = 40k`` 对应上. 并且互相之间没有重叠.

In [112]:
df_initial_match[["id",]]

Unnamed: 0,id
0,PersonId-40001
1,PersonId-40002
2,PersonId-40003
5,PersonId-40006
6,PersonId-40007
...,...
39995,PersonId-79996
39996,PersonId-79997
39997,PersonId-79998
39998,PersonId-79999


In [113]:
df_incremental_match[["id",]]


Unnamed: 0,id
3,PersonId-40004
4,PersonId-40005
11,PersonId-40012
12,PersonId-40013
17,PersonId-40018
...,...
39978,PersonId-79979
39979,PersonId-79980
39980,PersonId-79981
39981,PersonId-79982


然后我们将 initial 和 incremental 都 load 到 S3 中, 这里要把我们用来验证的 label 信息去掉. 并且把原始数据在本地做一个备份, 以便之后验证.

In [114]:
path_test_csv = Path(dir_here, "03-test.csv")
df_all_for_test.to_csv(path_test_csv, **pd_to_csv_kwargs)

path_initial_csv = Path(dir_here, "04-initial.csv")
df_initial_match.to_csv(path_initial_csv, **pd_to_csv_kwargs)

path_incremental_csv = Path(dir_here, "05-incremental.csv")
df_incremental_match.to_csv(path_incremental_csv, **pd_to_csv_kwargs)

s3path_initial_csv = S3Path(s3path_prefix, "04-initial", "1.csv")
wr.s3.to_csv(df_initial_match, path=s3path_initial_csv.uri, **wr_to_csv_kwargs)
print(f"created {s3path_initial_csv.uri}")
print(f"  preview at {s3path_initial_csv.uri}")

s3path_incremental_csv = S3Path(s3path_prefix, "05-incremental", "1.csv")
wr.s3.to_csv(df_incremental_match, path=s3path_incremental_csv.uri, **wr_to_csv_kwargs)
print(f"created {s3path_incremental_csv.uri}")
print(f"  preview at {s3path_incremental_csv.uri}")

created s3://aws-data-lab-sanhe-for-everything-us-east-2/poc/2022-05-18-glue-find-matches/find-incr-matches/04-initial/1.csv
  preview at s3://aws-data-lab-sanhe-for-everything-us-east-2/poc/2022-05-18-glue-find-matches/find-incr-matches/04-initial/1.csv
created s3://aws-data-lab-sanhe-for-everything-us-east-2/poc/2022-05-18-glue-find-matches/find-incr-matches/05-incremental/1.csv
  preview at s3://aws-data-lab-sanhe-for-everything-us-east-2/poc/2022-05-18-glue-find-matches/find-incr-matches/05-incremental/1.csv


In [115]:
def read_match_results(s3path_prefix: S3Path) -> pd.DataFrame:
    """
    Merge and download the predict output data from S3 to Local.
    The Glue ML job usually use multiple worker and dump the results
    to many small files in parallel,
    """
    df_list = list()
    for s3path in S3Path(s3path_prefix).iter_objects():
        with s3path.open("r") as f:
            df = pd.read_csv(f)
            df_list.append(df)

    # concatenate all data frame
    df_match_results = pd.concat(df_list, axis=0)

    # re order by ID
    df_match_results = df_match_results.sort_values(by="id")
    return df_match_results



In [117]:
df_initial_match_results = read_match_results(S3Path(s3path_prefix, "06-match-results").to_dir())

In [118]:
print("------ initial match results ------")
print(f"{df_initial_match_results.shape[0]} rows, {df_initial_match_results.shape[1]} columns")
df_initial_match_results.head(20)

------ initial match results ------
28000 rows, 6 columns


Unnamed: 0,id,firstname,lastname,phone,match_id,match_confidence_score
1,PersonId-40001,Jyfemy,Martynez,083-618-7494,0,1.0
9,PersonId-40002,Jeqemy,Mvrtinez,083-618-7494,0,1.0
3,PersonId-40003,Jeremy,Martinez,083-618-7294,0,1.0
7,PersonId-40006,Jyfemy,Martinez,083-618-7494,0,1.0
6,PersonId-40007,Jeremy,Martynez,083-618-7494,0,1.0
8,PersonId-40008,Jyfemy,Martinez,083-618-7494,0,1.0
2,PersonId-40009,Jeremy,Martynez,083-618-7494,0,1.0
5,PersonId-40010,Jeremy,Mvrtinez,083-618-7294,0,1.0
11,PersonId-40011,Jeqemy,Mvrtinez,083-618-7494,0,1.0
4,PersonId-40014,Jeremy,Martinez,083-618-7494,0,1.0


In [119]:
print("------ incremental match test data ------")
print(f"{df_incremental_match.shape[0]} rows, {df_incremental_match.shape[1]} columns")
df_incremental_match.head(20)


------ incremental match test data ------
12000 rows, 4 columns


Unnamed: 0,id,firstname,lastname,phone
3,PersonId-40004,Jeqemy,Martynez,083-618-7494
4,PersonId-40005,Jeqemy,Martinez,083-618-7494
11,PersonId-40012,Jeremy,Martynez,083-618-7494
12,PersonId-40013,Jyfemy,Mvrtinez,083-618-7494
17,PersonId-40018,Jeremy,Martinez,083-618-7494
18,PersonId-40019,Jeqemy,Mvrtinez,083-618-7294
19,PersonId-40020,Jyfemy,Martinez,083-618-7494
24,PersonId-40025,Kazis,Heraandez,698-082-5508
25,PersonId-40026,Kazis,Hernandez,698-082-5508
30,PersonId-40031,Nicholar,Cuay,548-319-0208


In [120]:
df_initial_match.index.intersection(df_incremental_match.index)

Int64Index([], dtype='int64')

In [138]:
def download_the_predicts():
    """
    Merge and download the predict output data from S3 to Local.
    The Glue ML job usually use multiple worker and dump the results
    to many small files in parallel,
    """
    df_list = list()
    for s3path in S3Path(s3path_prefix, "predict").iter_objects():
        with s3path.open("r") as f:
            df = pd.read_csv(f)
            df_list.append(df)
    df_predict = pd.concat(df_list, axis=0)
    # re order by ID to ensure the order
    df_predict = df_predict.sort_values(by="id")
    df_predict.to_csv(
        path_predicts_csv,
        sep=",",
        index=False,
    )


download_the_predicts()

In [139]:
df_predict = pd.read_csv(path_predicts_csv)

In [140]:
print("------ Predict Results ------")
print(f"{df_predict.shape[0]} rows, {df_predict.shape[1]} columns")
df_predict.head(10)

------ Predict Results ------
20000 rows, 6 columns


Unnamed: 0,id,firstname,lastname,phone,match_id,match_confidence_score
0,PersonId-00001,John,aadden,672-615-3608,0,1.0
1,PersonId-00002,Jjcn,aadden,642-615-3608,0,1.0
2,PersonId-00003,Joln,aadden,602-615-3608,0,1.0
3,PersonId-00004,Jjcn,Mbddvn,672-615-3608,0,1.0
4,PersonId-00005,Jjcn,aadden,672-615-3608,0,1.0
5,PersonId-00006,Jjcn,aadden,642-615-3608,0,1.0
6,PersonId-00007,Jjcn,Madden,602-615-3608,0,1.0
7,PersonId-00008,John,Mbddvn,672-615-3608,0,1.0
8,PersonId-00009,John,Mbddvn,672-615-3608,0,1.0
9,PersonId-00010,Jjcn,Mbddvn,602-615-3608,0,1.0


In [141]:
# merge the predicted match id and the true person id
# so we can compare visually
df_compare = df_predict.copy()
df_compare["tid"] = df_all["tid"]
df_compare = df_compare["id,firstname,lastname,phone,tid,match_id,match_confidence_score".split(",")]
df_compare.to_csv(
    path_compares_csv,
    sep=",",
    index=False,
)

In [143]:
print("You can visually check the 'tid' and 'match_id' columns")
df_compare.head(50)

You can visually check the 'tid' and 'match_id' columns


Unnamed: 0,id,firstname,lastname,phone,tid,match_id,match_confidence_score
0,PersonId-00001,John,aadden,672-615-3608,TrueId-1,0,1.0
1,PersonId-00002,Jjcn,aadden,642-615-3608,TrueId-1,0,1.0
2,PersonId-00003,Joln,aadden,602-615-3608,TrueId-1,0,1.0
3,PersonId-00004,Jjcn,Mbddvn,672-615-3608,TrueId-1,0,1.0
4,PersonId-00005,Jjcn,aadden,672-615-3608,TrueId-1,0,1.0
5,PersonId-00006,Jjcn,aadden,642-615-3608,TrueId-1,0,1.0
6,PersonId-00007,Jjcn,Madden,602-615-3608,TrueId-1,0,1.0
7,PersonId-00008,John,Mbddvn,672-615-3608,TrueId-1,0,1.0
8,PersonId-00009,John,Mbddvn,672-615-3608,TrueId-1,0,1.0
9,PersonId-00010,Jjcn,Mbddvn,602-615-3608,TrueId-1,0,1.0
