In [0]:
from pyspark.sql import functions as fn, types as T
from pyspark.sql import Window

In [0]:
# Sample path steps: each tuple is one step taken by a person.
# A previous_reference of None marks the first step of a path, and
# true_path_id is the ground-truth grouping used to check the results below.
data_1 = [
  {
    'person': person,
    'true_path_id': true_path_id,
    'previous_reference': previous_reference,
    'reference': reference,
    'time_step': time_step,
  }
  for person, true_path_id, previous_reference, reference, time_step in [
    # jack: two consecutive paths
    ('jack', 'abc', None, 'a', 1),
    ('jack', 'abc', 'a', 'b', 2),
    ('jack', 'abc', 'b', 'c', 3),
    ('jack', 'jkl', None, 'j', 14),
    ('jack', 'jkl', 'j', 'k', 15),
    ('jack', 'jkl', 'k', 'l', 16),
    # jill: two consecutive paths
    ('jill', 'pqr', None, 'p', 1),
    ('jill', 'pqr', 'p', 'q', 2),
    ('jill', 'pqr', 'q', 'r', 3),
    ('jill', 'xyz', None, 'x', 24),
    ('jill', 'xyz', 'x', 'y', 25),
    ('jill', 'xyz', 'y', 'z', 26),
  ]
]

# turn the list of record dicts into a Spark dataframe and show it
df_1 = spark.createDataFrame(data_1)
display(df_1)

person,previous_reference,reference,time_step,true_path_id
jack,,a,1,abc
jack,a,b,2,abc
jack,b,c,3,abc
jack,,j,14,jkl
jack,j,k,15,jkl
jack,k,l,16,jkl
jill,,p,1,pqr
jill,p,q,2,pqr
jill,q,r,3,pqr
jill,,x,24,xyz


# If each person had a single path this would be simple
We could simply add a path UUID as below, or even just use the person column as the ID.
However, as we can see, in this case the assigned values do not match the pattern of the true_path_id column.

In [0]:
# one window per person, with that person's rows in chronological order
window = (
  Window
  .partitionBy('person')
  .orderBy('time_step')
)

df_1_simple = (
  df_1
  # uuid cannot be used in window functions, so give every row its own uuid first
  .withColumn('uuid', fn.expr('uuid()'))
  # then reuse the uuid of the first row in each person's window as the path id
  .withColumn('path_id', fn.first('uuid').over(window))
)

display(df_1_simple)

person,previous_reference,reference,time_step,true_path_id,uuid,path_id
jack,,a,1,abc,875cd687-8eff-465c-ac86-f45b9876143f,875cd687-8eff-465c-ac86-f45b9876143f
jack,a,b,2,abc,bca493d9-f7f2-41c2-a8d0-e3ae4757a576,875cd687-8eff-465c-ac86-f45b9876143f
jack,b,c,3,abc,79aeb33b-7d92-4628-a010-22caf19cf098,875cd687-8eff-465c-ac86-f45b9876143f
jack,,j,14,jkl,f7812ff1-d4fd-477a-aadb-79f2dcc94eeb,875cd687-8eff-465c-ac86-f45b9876143f
jack,j,k,15,jkl,8a23ce0d-e90c-480b-a165-f0921aa07170,875cd687-8eff-465c-ac86-f45b9876143f
jack,k,l,16,jkl,742187f1-8bc2-4f7e-845f-972a327a3406,875cd687-8eff-465c-ac86-f45b9876143f
jill,,p,1,pqr,9ea851cf-0859-4055-9ac4-0cb9fd11b6c9,9ea851cf-0859-4055-9ac4-0cb9fd11b6c9
jill,p,q,2,pqr,ad424d64-1a63-4211-bae9-903b8e41144d,9ea851cf-0859-4055-9ac4-0cb9fd11b6c9
jill,q,r,3,pqr,a06d53ed-4f87-4377-8b07-0e2234b8f5ac,9ea851cf-0859-4055-9ac4-0cb9fd11b6c9
jill,,x,24,xyz,bfd5c113-c6a1-48e6-b129-9eb52ebb306f,9ea851cf-0859-4055-9ac4-0cb9fd11b6c9


# Where people can have multiple paths things need a little more thought
Let us suppose that paths do not overlap in time. In that case we can use knowledge of the preceding step in the path to correctly join steps into paths. To do this we take 3 steps:
- We use a lagged window to push information from the previous timestep for each person into the next timestep
- We compare this preceding information with the current information to check for a match, assigning 1 for a NON-MATCH and 0 for a MATCH
- We can then sum across all prior events for each person to give equally numbered groups that should be turned into paths

In practice this looks like this. Notice that the newly assigned path_id values correspond in grouping to the true_path_id values.

Note that there are a couple of assumptions here:
- each person can only have one path at a time
- each step is aware of some useful indicator about the previous step (here the previous_reference)

In [0]:
# each person's steps in chronological order; used to look back one step
person_window = Window.partitionBy(
  'person',
).orderBy(
  'time_step',
)

# same partitioning and ordering, with an explicit frame from the first row
# up to and including the current row, so a sum over it is a running total
cumulative_window = Window.partitionBy(
  'person',
).orderBy(
  'time_step',
).rowsBetween(
  Window.unboundedPreceding, Window.currentRow
)

# rows of one numbered path group for one person, in chronological order
preceding_window = Window.partitionBy(
  'person',
  'path_group',
).orderBy(
  'time_step',
)

df_1_separate_paths = df_1.withColumn(
  # get information from the preceding step
  'preceding_reference', fn.lag('reference').over(person_window)
).withColumn(
  # and compare to the current step
  # when either side is null (e.g. the first step of a path has no
  # previous_reference) the comparison is not true, so the row falls through
  # to otherwise() and is flagged as the start of a new path
  'not_same_as_preceding', fn.when(
    fn.col('previous_reference') == fn.col('preceding_reference'),
    # be careful to use 0 for matches
    0
  ).otherwise(
    # and 1 for non-matches
    1
  )
).withColumn(
  # the running count of non-matches goes up by one at every path start,
  # so all steps of one path share the same number
  'path_group', fn.sum('not_same_as_preceding').over(cumulative_window)
).withColumn(
  # assign uuids if we want to have unique rather than numbered path groups
  'uuid', fn.expr('uuid()')
).withColumn(
  # by taking the first uuid from each path group
  'path_id', fn.first('uuid').over(preceding_window)
)

# show the frame built in this cell (df_1_lagged is not defined in this notebook)
display(df_1_separate_paths)

person,previous_reference,reference,time_step,true_path_id,uuid,preceding_reference,not_same_as_preceding,path_group,path_id
jack,,a,1,abc,91b6fa90-d9f7-4db5-96f9-0c6f8e404ff0,,1,1,91b6fa90-d9f7-4db5-96f9-0c6f8e404ff0
jack,a,b,2,abc,9c9b3acd-2ddf-4c5c-9091-e305a27e95cc,a,0,1,91b6fa90-d9f7-4db5-96f9-0c6f8e404ff0
jack,b,c,3,abc,801cb175-20b6-4d53-a4d9-7c8becef86a5,b,0,1,91b6fa90-d9f7-4db5-96f9-0c6f8e404ff0
jack,,j,14,jkl,cf84937d-cc9c-4684-92a2-a6596b5b678c,c,1,2,cf84937d-cc9c-4684-92a2-a6596b5b678c
jack,j,k,15,jkl,e1a45f1f-a314-4d36-8f34-1489e20ff5de,j,0,2,cf84937d-cc9c-4684-92a2-a6596b5b678c
jack,k,l,16,jkl,977446e4-48e8-46c0-8d93-a37320afbca2,k,0,2,cf84937d-cc9c-4684-92a2-a6596b5b678c
jill,,p,1,pqr,fd474918-f821-4c4f-afba-8249c03902f8,,1,1,fd474918-f821-4c4f-afba-8249c03902f8
jill,p,q,2,pqr,d53f4833-7208-43da-b4e6-5829f0b08aab,p,0,1,fd474918-f821-4c4f-afba-8249c03902f8
jill,q,r,3,pqr,1c6808cf-7bc5-4cb2-b215-e178bdb3f7b5,q,0,1,fd474918-f821-4c4f-afba-8249c03902f8
jill,,x,24,xyz,e6c34ff7-3ff8-43c1-a733-d804f80362ee,r,1,2,e6c34ff7-3ff8-43c1-a733-d804f80362ee


Comparing the true_path_id (which would be missing from the data and is what we are trying to proxy for) with the path_id, we can see this method has correctly assigned path identities.