In [1]:
from datetime import datetime

from pyspark.sql import functions as f
from pyspark.sql.types import DoubleType, LongType, StringType, StructField, StructType, TimestampType

from lib.entity_resolution import EntityResolution, MatchState
from thetaray.api.context import init_context
from thetaray.api.dataset import dataset_functions
from thetaray.api.graph import publish_edges, publish_nodes, read_nodes

# Spark setting passed to the platform context: do not ignore null fields
# when Spark generates JSON.
spark_conf = {"spark.sql.jsonGenerator.ignoreNullFields": False}

# The execution date is pinned to a fixed timestamp for this run.
context = init_context(
    spark_conf=spark_conf,
    execution_date=datetime(2025, 6, 12, 10, 4, 25),
)
spark = context.get_spark_session()

2025-06-12 10:56:10,848:INFO:thetaray.common.logging:start loading solution.....[ load_risks=True , solution_path=/thetaray/git/solutions/domains , settings_path=/thetaray/git/solutions/settings ]
2025-06-12 10:56:11,000:INFO:thetaray.common.logging:load_risks took: 0.0562131404876709
2025-06-12 10:56:11,738:INFO:thetaray.common.logging:=== Started updating schema ===
2025-06-12 10:56:11,790:INFO:thetaray.common.logging:=== Started updating schema on Postgres ===
2025-06-12 10:56:15,695:INFO:thetaray.common.logging:found 43 tables in solution public schema
2025-06-12 10:56:15,704:INFO:thetaray.common.logging:demo_ret_smb_ef
2025-06-12 10:56:15,713:INFO:thetaray.common.logging:found 43 tables in solution public schema
2025-06-12 10:56:15,714:INFO:thetaray.common.logging:demo_ret_indiv_ef
2025-06-12 10:56:15,718:INFO:thetaray.common.logging:=== Finished updating schema ===
2025-06-12 10:56:16,434:INFO:thetaray.common.logging:=== Started updating schema for Datasets on Minio ===
Setting d

In [2]:
# Identifier of the graph that entity resolution runs against and that
# nodes are read from / published to further down.
graph = "public"

## Prepare / normalize data

In [3]:
# Load the "account" dataset and reshape it for matching:
#   * add a lower-cased copy of the name as "normalized_name",
#   * rename the key/date columns to "id" / "effective_date",
#   * keep only the columns of interest and rename the properties to their
#     short codes (NM, AD, CT).
accounts = (
    dataset_functions.read(context, "account")
    .drop("tr_timestamp")
    .withColumn("normalized_name", f.lower(f.col("name")))
    .withColumnRenamed("account_id", "id")
    .withColumnRenamed("date", "effective_date")
    .select("id", "name", "address", "normalized_name", "country", "effective_date")
    .withColumnRenamed("name", "NM")
    .withColumnRenamed("address", "AD")
    .withColumnRenamed("country", "CT")
)

accounts.show()

[Stage 3:>                                                          (0 + 1) / 1]

+-----+--------------------+--------------------+--------------------+---+-------------------+
|   id|                  NM|                  AD|     normalized_name| CT|     effective_date|
+-----+--------------------+--------------------+--------------------+---+-------------------+
|10018|    Jamylah BASHIOUM|215 S WILLOW ST S...|    jamylah bashioum| US|1995-07-23 00:00:00|
| 1002|       Naloni WIESER|32 NE WILDROSE DR...|       naloni wieser| FI|1996-09-17 00:00:00|
|10022|Caelynn PETERSCHMIDT|263 NORTH RIDGE S...|caelynn peterschmidt| IR|1997-05-07 00:00:00|
|10049|    Dimitrios KLEINE|241 Rd 11.9 NW St...|    dimitrios kleine| BY|1996-03-28 00:00:00|
| 1011|  Denisha EIDEMILLER|280 167TH CT NE W...|  denisha eidemiller| PL|1994-12-17 00:00:00|
|10148|      Lauren REINCKE|183 SE 184TH ST H...|      lauren reincke| SO|1997-01-13 00:00:00|
|10233|        Niyla GEBBIA|100 N Forest Blvd...|        niyla gebbia| CD|1996-02-21 00:00:00|
| 1026|    Kathlynn FICARRO|47 SH-12 GRADNVIE...| 

                                                                                

In [4]:
# Number of rows in the prepared accounts frame.
accounts.count()

4500

## Run matching

In [5]:
# Build the resolver for the target graph and run matching on the prepared
# accounts; the result is handled as a pandas DataFrame in the cells below.
er = EntityResolution(graph_id=graph, context=context)
res_pdf = er.resolve(accounts)

2025-06-12 10:48:14,099:INFO:thetaray.common.logging:Started computing minhashes for 1 chunks with size 50000 each
2025-06-12 10:48:25,351:INFO:thetaray.common.logging:Start computing minhashes for chunk 0
2025-06-12 10:48:25,659:INFO:thetaray.common.logging:Computing minhashes for chunk 0 is done. Output saved to /tmp/minhashes_0
2025-06-12 10:48:26,356:INFO:thetaray.common.logging:Finished computing minhashes
2025-06-12 10:48:26,364:INFO:thetaray.common.logging:Loading persisted MinHashes into in memory LSH structure
2025-06-12 10:48:26,441:INFO:thetaray.common.logging:Performing parallel queries over the LHS structure from 1 file in 8 processes
2025-06-12 10:48:37,433:INFO:thetaray.common.logging:Performing query on file /tmp/minhashes_0
2025-06-12 10:48:37,492:INFO:thetaray.common.logging:Output saved to file /tmp/candidates_0
2025-06-12 10:48:38,125:INFO:thetaray.common.logging:Finding matches started. Processing 1 files with 8 workers
2025-06-12 10:48:48,980:INFO:thetaray.common.

In [6]:
# Names of the helper "normalized" columns declared in the entity-resolution
# matching settings; entries with a missing or empty "normalized" value are
# skipped.
normalized_cols = [
    field["normalized"]
    for field in context.solution.eresolution_settings.matching_fields_parameters
    if field.get("normalized")
]
# Rebind instead of dropping in place so this cell can be re-run on the same
# kernel: an in-place drop raises KeyError on the second run because the
# columns have already been removed.
res_pdf = res_pdf.drop(columns=normalized_cols, errors="ignore")

In [7]:
# Display the matching result after the normalized helper columns were dropped.
res_pdf

Unnamed: 0,source,clique,mean_score,clique_size,NM,AD,CT,effective_date
198,1911,008c4551-89dd-49ef-b917-dd6cd873249a,0.984375,2,Dash CHIZ,139 BENEDICT ST REDMOND AL 98854,FI,1994-08-21
199,8225,008c4551-89dd-49ef-b917-dd6cd873249a,0.984375,2,Dash CHIZ,139 BENEDICT ST REDMOND AL 98853,CD,1996-12-07
306,2578,00c6bcbb-d510-4387-947a-c8da249a8dbb,0.986111,2,Dashel ALDERMAN,298 E CENTRAL AVE SUNNYSIDE MI 98058,SG,1996-03-25
307,8327,00c6bcbb-d510-4387-947a-c8da249a8dbb,0.986111,2,Dashel ALDERMAN,298 E CENTRAL AVE SUNNYSIDE MI 98059,CD,1995-08-08
61,1956,00cd74c6-201d-4a94-b0aa-e9532971ae3b,0.980769,2,Noach DRAPALA,65 POLO RD Deming MS 98102,CD,1997-05-23
...,...,...,...,...,...,...,...,...
188,1823,fbfb15a1-5667-4789-ba56-be440c640fbd,0.988372,2,Kacelyn MACMILLEN,220 HILLCREST DR/WALNUT AVE Quincy LA 98039,SK,1996-09-28
216,2009,fc0d8392-cc3d-4dd5-af29-9de16425374d,0.982143,2,Yasmina SHIRREFFS,273 W EDISON Warden IA 81630,UZ,1994-04-14
217,2128,fc0d8392-cc3d-4dd5-af29-9de16425374d,0.982143,2,Yasmina SHIRREFFS,273 W EDISON Warden IA 81631,FK,1993-12-10
312,260,fe666a94-1025-4691-87ec-c13645509699,0.984848,2,Brek WEDAN,232 N HERON DR NEWCASTLE UT 98245,ES,1995-10-14


In [None]:
# Explicit Spark schema for the matching result so column types do not depend
# on pandas dtype inference. Fields are nullable (the default).
schema = (
    StructType()
    .add("source", StringType())
    .add("clique", StringType())
    .add("mean_score", DoubleType())
    .add("clique_size", LongType())
    .add("NM", StringType())
    .add("AD", StringType())
    .add("CT", StringType())
    .add("effective_date", TimestampType())
)

# Convert the pandas matching result into a Spark DataFrame.
res = spark.createDataFrame(res_pdf, schema=schema)

## Publish parties. New parties are always republished so that the party status can be provided
#### Make sure the dataframe includes all party properties and that existing parties are not republished
#### It is expected to contain every property that matching was performed on, so the user in IC will be able to select them from the party
#### consolidation, updated_on and updated_by are technical properties that should be present

In [None]:
# Party ("PR") nodes read from the graph; used further down to filter out
# parties that are already there.
existing_parties = read_nodes(
    context=context,
    graph_identifier=graph,
    type="PR",
)

In [None]:
# Build one party node per distinct clique. The clique id is exposed both as
# the "PI" property and as the node "id"; the node is stamped with the
# context's execution date.
parties_nodes_df = (
    res.select("clique")
    .dropDuplicates(["clique"])
    .withColumnRenamed("clique", "PI")
    .withColumn("effective_date", f.lit(context.execution_date))
    .withColumn("id", f.col("PI"))
    # matching properties: present on the node but left empty (typed nulls)
    .withColumn("NM", f.lit(None).cast("string"))
    .withColumn("AD", f.lit(None).cast("string"))
    .withColumn("CT", f.lit(None).cast("string"))
    # technical columns
    .withColumn("CN", f.lit(1).cast("long"))
    .withColumn("UON", f.lit(None).cast("timestamp"))
    .withColumn("UBY", f.lit(None).cast("string"))
    # keep only parties whose id is not among the existing party nodes
    .join(existing_parties, on="id", how="left_anti")
)

parties_nodes_df.printSchema()

In [None]:
# Preview the party nodes that are about to be published.
parties_nodes_df.show()

In [None]:
# Publish the new party nodes as type "PR" into the target graph.
publish_nodes(
    context=context,
    graph_identifier=graph,
    node_type="PR",
    nodes_df=parties_nodes_df,
)

## Publish accounts. Avoid republishing existing accounts and make sure all properties are published; if the country is missing, fill it with nulls

In [None]:
# Account ("AC") nodes read from the graph; used further down to filter out
# accounts that are already there. Uses the shared `graph` identifier rather
# than a hard-coded name so accounts are read from the same graph as parties.
existing_accounts = read_nodes(context=context, graph_identifier=graph, type="AC")

In [None]:
# Build account nodes from the prepared accounts: stamp them with the
# context's execution date, keep only accounts whose id is not among the
# existing account nodes, and expose the id as the "AN" property.
accounts_nodes_df = (
    accounts
    .withColumn("effective_date", f.to_timestamp(f.lit(context.execution_date)))
    .join(existing_accounts, on="id", how="left_anti")
    .withColumn("AN", f.col("id"))
)
accounts_nodes_df.printSchema()

In [None]:
# Preview the account nodes that are about to be published.
accounts_nodes_df.show()

In [None]:
# Publish the new account nodes as type "AC". The shared `graph` identifier
# is used instead of a hard-coded name so accounts land in the same graph as
# the party nodes.
publish_nodes(
    context=context,
    nodes_df=accounts_nodes_df,
    graph_identifier=graph,
    node_type="AC",
)

## Publish party-to-node edges. Only new edges are ever published, because the matcher returns only matches that do not yet exist in the graph
#### Consider choosing the state based on the auto-confirmation threshold

In [None]:
# Auto-confirmation threshold from the entity-resolution settings; the value
# -1 is excluded from auto-confirmation by the condition below.
threshold_col = f.lit(context.solution.eresolution_settings.auto_confirmation_threshold)

# An edge is auto-confirmed when the threshold is not -1 and the score
# reaches it; every other edge stays a candidate.
is_auto_confirmed = (threshold_col != -1) & (f.col("SC") >= threshold_col)

# One edge per matched row: from the clique (source_node) to the matched
# record (target_node), carrying the mean score as "SC" and the state as "ST".
edges_df = (
    res.select("mean_score", "clique", "source")
    .withColumnRenamed("source", "id")
    .withColumn("effective_date", f.to_timestamp(f.lit(context.execution_date)))
    .withColumnRenamed("clique", "source_node")
    .withColumn("target_node", f.col("id"))
    .withColumnRenamed("mean_score", "SC")
    .withColumn(
        "ST",
        f.when(is_auto_confirmed, f.lit(MatchState.AUTO_CONFIRM.value))
        .otherwise(f.lit(MatchState.CANDIDATE.value))
        .cast("long"),
    )
)
edges_df.printSchema()

In [None]:
# Preview the edges that are about to be published.
edges_df.show()

In [None]:
# Publish the party -> account edges as type "ER". The shared `graph`
# identifier is used instead of a hard-coded name so edges land in the same
# graph as the party nodes they reference.
publish_edges(
    context=context,
    edges_df=edges_df,
    graph_identifier=graph,
    edge_type="ER",
    source_node_type="PR",
    target_node_type="AC",
)

In [None]:
# Close the platform context created at the top of the notebook.
context.close()