In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,udf
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.types import ArrayType, StringType

import pseudopeople as pseudo

# Start (or reuse) the Spark session for this notebook, with off-heap memory
# enabled and sized at 10g.
print("Loading Spark")
spark = (
    SparkSession.builder
    .appName("Record Linkage")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "10g")
    .getOrCreate()
)


Loading Spark


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/01 21:38:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# Load the two tab-separated sample files (first row is the header) into
# Spark DataFrames.
# NOTE(review): the message says "Generating", but this cell only reads
# existing files; the absolute paths are specific to one machine.
print("Generating Pseudopeople Data")
src_1 = '/home/nachiket/RLA_CL_EXTRACT/data/pse/pse_sample4.1.1'
src_2 = '/home/nachiket/RLA_CL_EXTRACT/data/pse/pse_sample4.1.2'
df_1, df_2 = (
    spark.read.csv(sample_path, header=True, sep='\t')
    for sample_path in (src_1, src_2)
)

Generating Pseudopeople Data


In [3]:
# Preview the first sample: first sorted by name/age/street so near-duplicate
# records sit next to each other, then in the original file order.
df_1.sort(col("last_name"),col("first_name"),col("middle_initial"),col("age"),col("street_name")).show(5)
# DataFrame.show() prints and returns None, so its result must not be assigned
# back to df_1 -- doing so would replace the DataFrame with None.
df_1.show(5)

+------------+----------+--------------+---------+---+-------------+-------------+-------------+------------+----------+-----+-------+----------------------------+------+--------------+
| simulant_id|first_name|middle_initial|last_name|age|date_of_birth|street_number|  street_name| unit_number|      city|state|zipcode|relation_to_reference_person|   sex|race_ethnicity|
+------------+----------+--------------+---------+---+-------------+-------------+-------------+------------+----------+-----+-------+----------------------------+------+--------------+
|9402_1006023|   Allisom|             M|        A|  0|   05/04/2019|         2415|    dale st s|office # 308|providence|   RI|  02833|            Biological child|Female|         White|
|9402_1006023|   Allison|             M|        A|  0|   05/04/2019|         2415|    dale st s|office # 308|providence|   RI|  02833|            Biological child|Female|         White|
|1299_1016030|    Alyssa|             M|        A| 18|   01/16/2002|  

In [4]:
# Preview the first five rows of the second sample.
df_2.show(n=5)

+-----------+----------+--------------+---------+---+-------------+-------------+------------+-----------+------------+-----+-------+----------------------------+------+--------------+
|simulant_id|first_name|middle_initial|last_name|age|date_of_birth|street_number| street_name|unit_number|        city|state|zipcode|relation_to_reference_person|   sex|race_ethnicity|
+-----------+----------+--------------+---------+---+-------------+-------------+------------+-----------+------------+-----+-------+----------------------------+------+--------------+
|3298_700403|    Joshua|             H|     Nava| 20|   04/21/1999|          nan|    pansy rd|        nan|  portsmouth|   RI|  02852|        Noninstitutionali...|  Male|         White|
|4943_978644|   William|             C|  Acevedo| 50|   03/18/1970|         1710|flying a trl|        nan|         nan|   RI|  02864|        Noninstitutionali...|  Male|        Latino|
|  734_60972|     Kenny|             J|   Gatian| 32|   06/14/1988|        

In [5]:

# Drop rows that are identical on every column except the first one
# (simulant_id in the preview above), keeping one row per distinct record.
# NOTE(review): because the id is ignored, two different simulants with
# identical attributes collapse into one row -- confirm this is intended.
df_2_new = df_2.drop_duplicates(subset=df_2.columns[1:])

# Show the first 100 de-duplicated rows ordered by simulant_id.
df_2_new.orderBy("simulant_id").show(n=100)





+------------+-----------+--------------+--------------------+---+-------------+-------------+--------------------+-----------+----------------+-----+-------+----------------------------+------+--------------------+
| simulant_id| first_name|middle_initial|           last_name|age|date_of_birth|street_number|         street_name|unit_number|            city|state|zipcode|relation_to_reference_person|   sex|      race_ethnicity|
+------------+-----------+--------------+--------------------+---+-------------+-------------+--------------------+-----------+----------------+-----+-------+----------------------------+------+--------------------+
|1007_1001056|       Judy|             C|           Zimmerman| 31|   10/14/1988|         1957|            ctrl ave|        nan|        scituate|   RI|  02895|              Other relative|Female|               Asian|
|1007_1011208|      Jilan|             E|              Potter|  0|   08/28/2019|          177|     st clements ave|        nan|       pa

                                                                                

In [None]:

k = 3  # default k-mer length (number of characters per substring)


def generate_k_mer(str_d, size=None):
    """Split a string into its overlapping k-mers.

    Args:
        str_d: The string to split. ``None`` is accepted so that null column
            values do not crash the UDF that wraps this function.
        size: Length of each k-mer. When omitted, the module-level ``k`` is
            used (looked up at call time).

    Returns:
        A list of every contiguous substring of length ``size``, in left-to-
        right order. A string no longer than ``size`` is returned whole as a
        single-element list. ``None`` yields an empty list.
    """
    if size is None:
        size = k

    # A null value reaches a Python UDF as None; len(None) would raise
    # TypeError and fail the whole job, so treat it as "no k-mers".
    if str_d is None:
        return []

    if len(str_d) <= size:
        return [str_d]

    return [str_d[i:i + size] for i in range(len(str_d) - size + 1)]


# Expose the k-mer splitter to Spark as a UDF returning an array of strings.
kmer_udf = udf(generate_k_mer, ArrayType(StringType()))

# Attach the k-mers of each record's last_name as a "kmers" column.
df_2_new = df_2_new.withColumn("kmers", kmer_udf(df_2_new["last_name"]))

# Preview the first ten rows with the new column.
df_2_new.show(n=10)



+------------+----------+--------------+---------+---+-------------+-------------+-------------------+-----------+------------+-----+-------+----------------------------+------+--------------+--------------------+
| simulant_id|first_name|middle_initial|last_name|age|date_of_birth|street_number|        street_name|unit_number|        city|state|zipcode|relation_to_reference_person|   sex|race_ethnicity|               kmers|
+------------+----------+--------------+---------+---+-------------+-------------+-------------------+-----------+------------+-----+-------+----------------------------+------+--------------+--------------------+
|  6760_48339|         A|             M|  Lindsey|  4|   12/31/2015|          604|           c.r. 655|        nan|     warwick|   RI|  02914|              Other relative|  Male|           nan|[Lin, ind, nds, d...|
|  7016_43372|         A|             M| Jauregui| 17|   01/11/2003|         1509|          timber dr|        nan|    westerly|   RI|  02907|   

In [None]:
## Blocking Code

from pyspark.sql.types import MapType,IntegerType

## TODO for the blocking step:
##   1. Add an index to the dataframe.
##   2. Decide the value datatype of the MapType declared below.
##   3. Walk through all the k-mers and do the blocking.

# Schema for a map from a string key to an array of integers, where neither
# the map values nor the array elements may be null.
map_type_non_nullable = MapType(
    keyType=StringType(),
    valueType=ArrayType(elementType=IntegerType(), containsNull=False),
    valueContainsNull=False,
)


