## Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import recordlinkage

## Importing datasets

In [2]:
facility=pd.read_csv(r"C:\Users\DATA-JOHN\Desktop\Hackathon\synthetic_facility_v3.csv")
facility.head()

Unnamed: 0,recnr,firstname,lastname,petname,dob,sex,nationalid,patientid,visitdate
0,2,Fatuma,,Zaina,24-08-2017 00:00,2,N_ID_5000,2069,10-09-2018
1,3,Gloria,Rashida,,11-07-1993 00:00,2,N_ID_11861,2079,14-12-2022
2,4,Ali,Hakram,Igomu,17-05-2014 00:00,1,N_ID_11864,2080,09-06-2023
3,5,Nakalema,,Nkwanga,27-02-2026 00:00,2,N_ID_11867,2081,07-02-2019
4,6,Asuman,Sempa,Aguti,02-03-2002 00:00,1,N_ID_11870,2082,18-08-2020


In [3]:
hdss=pd.read_csv(r"C:\Users\DATA-JOHN\Desktop\Hackathon\synthetic_hdss_v3.csv")
hdss.head()

Unnamed: 0,recnr,firstname,lastname,petname,dob,sex,nationalid,hdssid,hdsshhid
0,1,Zaina,Hanifa,Ula,22-09-1930 00:00,2,,I20001,HH100001
1,2,Godfrey,Maganda,Mukama,15-07-1934 00:00,1,,I20002,HH100002
2,3,Kasim,Ngobi,Galabuzi,03-03-1983 00:00,1,,I20003,HH100003
3,4,Esther,,Inara,30-07-1968 00:00,2,,I20004,HH100004
4,5,Sumaya,Swabula,,13-12-1930 00:00,2,,I20005,HH100005


## Make record pairs

In [4]:
indexer = recordlinkage.Index()
indexer.full()
pairs = indexer.index(facility, hdss)



In [5]:
print(len(facility), len(hdss), len(pairs))

2902 4115 11941730


## Using Blocking to reduce the number of pairs

In [6]:
indexer = recordlinkage.Index()
indexer.block("firstname")
candidate_links = indexer.index(facility, hdss)
len(candidate_links)

34093

## Compare records

In [7]:
compare_cl = recordlinkage.Compare()
compare_cl.exact("firstname", "firstname", label="first_name")
compare_cl.string(
    "lastname", "lastname", method="jarowinkler", threshold=0.85, label="last_name"
)
compare_cl.exact("petname", "petname", label="pet_name")
compare_cl.exact("dob", "dob", label="date_of_birth")
compare_cl.exact("sex", "sex", label="sex")
compare_cl.string("nationalid", "nationalid", threshold=0.85, label="national_id")
features = compare_cl.compute(candidate_links, facility, hdss)

In [8]:
features

Unnamed: 0,Unnamed: 1,first_name,last_name,pet_name,date_of_birth,sex,national_id
0,68,1,0.0,0,0,1,0.0
0,202,1,0.0,0,0,1,0.0
0,588,1,0.0,0,0,1,0.0
0,742,1,0.0,0,0,1,0.0
0,808,1,0.0,0,0,1,0.0
...,...,...,...,...,...,...,...
2857,3588,1,0.0,0,0,1,0.0
2864,777,1,1.0,0,0,1,0.0
2873,1191,1,1.0,0,0,1,0.0
2878,1626,1,1.0,0,0,1,0.0


In [10]:
features.sum(axis=1).value_counts().sort_index(ascending=False)

5.0      587
4.0      629
3.0     1966
2.0    30156
1.0      755
dtype: int64

In [11]:
features[features.sum(axis=1) > 3]

Unnamed: 0,Unnamed: 1,first_name,last_name,pet_name,date_of_birth,sex,national_id
275,2524,1,1.0,0,1,1,0.0
568,3170,1,1.0,0,1,1,0.0
629,1370,1,1.0,1,0,1,0.0
778,202,1,1.0,1,0,1,0.0
919,1702,1,1.0,1,1,1,0.0
...,...,...,...,...,...,...,...
2759,2390,1,1.0,1,0,1,0.0
2762,2620,1,1.0,1,1,1,0.0
2768,2778,1,1.0,0,1,1,0.0
2788,2432,1,1.0,1,1,1,0.0


In [12]:
potential_matches = features[features.sum(axis=1) > 3].reset_index()
potential_matches['Score'] = potential_matches.loc[:, 'first_name':'national_id'].sum(axis=1)

In [13]:
potential_matches

Unnamed: 0,level_0,level_1,first_name,last_name,pet_name,date_of_birth,sex,national_id,Score
0,275,2524,1,1.0,0,1,1,0.0,4.0
1,568,3170,1,1.0,0,1,1,0.0,4.0
2,629,1370,1,1.0,1,0,1,0.0,4.0
3,778,202,1,1.0,1,0,1,0.0,4.0
4,919,1702,1,1.0,1,1,1,0.0,5.0
...,...,...,...,...,...,...,...,...,...
1211,2759,2390,1,1.0,1,0,1,0.0,4.0
1212,2762,2620,1,1.0,1,1,1,0.0,5.0
1213,2768,2778,1,1.0,0,1,1,0.0,4.0
1214,2788,2432,1,1.0,1,1,1,0.0,5.0


## likelihood of matching

In [14]:
facility.loc[275,:]

recnr                      277
firstname               Fatuma
lastname                Shadia
petname                    NaN
dob           23-05-2006 00:00
sex                          2
nationalid           N_ID_5789
patientid                 2347
visitdate           30-05-2023
Name: 275, dtype: object

In [15]:
hdss.loc[2524,:]

recnr                     2525
firstname               Fatuma
lastname                Shadia
petname                    NaN
dob           23-05-2006 00:00
sex                          2
nationalid                 NaN
hdssid                  I22525
hdsshhid              HH102525
Name: 2524, dtype: object

In [16]:
facility['facility_Lookup'] = facility[[
    'firstname', 'lastname', 'dob', 'sex','nationalid','patientid','visitdate'
]].apply(lambda x: '_'.join(str(i) for i in x), axis=1)

hdss['hdss_Lookup'] = hdss[[
    'firstname', 'lastname', 'dob','sex','nationalid','hdssid','hdsshhid'
]].apply(lambda x: '_'.join(str(i) for i in x), axis=1)

facility_lookup = facility[['facility_Lookup']].reset_index()
hdss_lookup = hdss[['hdss_Lookup']].reset_index()

In [17]:
facility_lookup

Unnamed: 0,index,facility_Lookup
0,0,Fatuma_nan_24-08-2017 00:00_2_N_ID_5000_2069_1...
1,1,Gloria_Rashida_11-07-1993 00:00_2_N_ID_11861_2...
2,2,Ali_Hakram_17-05-2014 00:00_1_N_ID_11864_2080_...
3,3,Nakalema_nan_27-02-2026 00:00_2_N_ID_11867_208...
4,4,Asuman_Sempa_02-03-2002 00:00_1_N_ID_11870_208...
...,...,...
2897,2897,Madina_Nakagolo_28-07-1955 00:00_2_N_ID_13676_...
2898,2898,Namulondo_Namugabwe_14-09-1933 00:00_2_N_ID_13...
2899,2899,Ziriya_Kauma_11-11-1963 00:00_2_N_ID_13682_496...
2900,2900,Faizo_Buyinza_09-06-1956 00:00_1_N_ID_13685_49...


In [18]:
facility_merge = potential_matches.merge(facility_lookup, how='left', left_index=True, right_index=True)


In [19]:
facility_merge

Unnamed: 0,level_0,level_1,first_name,last_name,pet_name,date_of_birth,sex,national_id,Score,index,facility_Lookup
0,275,2524,1,1.0,0,1,1,0.0,4.0,0,Fatuma_nan_24-08-2017 00:00_2_N_ID_5000_2069_1...
1,568,3170,1,1.0,0,1,1,0.0,4.0,1,Gloria_Rashida_11-07-1993 00:00_2_N_ID_11861_2...
2,629,1370,1,1.0,1,0,1,0.0,4.0,2,Ali_Hakram_17-05-2014 00:00_1_N_ID_11864_2080_...
3,778,202,1,1.0,1,0,1,0.0,4.0,3,Nakalema_nan_27-02-2026 00:00_2_N_ID_11867_208...
4,919,1702,1,1.0,1,1,1,0.0,5.0,4,Asuman_Sempa_02-03-2002 00:00_1_N_ID_11870_208...
...,...,...,...,...,...,...,...,...,...,...,...
1211,2759,2390,1,1.0,1,0,1,0.0,4.0,1211,Ntono_Nakakande_09-09-1987 00:00_2_N_ID_8597_3...
1212,2762,2620,1,1.0,1,1,1,0.0,5.0,1212,Hadija_Namuzu_02-12-1989 00:00_2_N_ID_8600_328...
1213,2768,2778,1,1.0,0,1,1,0.0,4.0,1213,Retisha_Naigaga_17-03-2011 00:00_2_N_ID_8603_3...
1214,2788,2432,1,1.0,1,1,1,0.0,5.0,1214,Najirah_Nankabirwa_27-04-2015 00:00_2_N_ID_860...


In [20]:
hdss_lookup

Unnamed: 0,index,hdss_Lookup
0,0,Zaina_Hanifa_22-09-1930 00:00_2_nan_I20001_HH1...
1,1,Godfrey_Maganda_15-07-1934 00:00_1_nan_I20002_...
2,2,Kasim_Ngobi_03-03-1983 00:00_1_nan_I20003_HH10...
3,3,Esther_nan_30-07-1968 00:00_2_nan_I20004_HH100004
4,4,Sumaya_Swabula_13-12-1930 00:00_2_nan_I20005_H...
...,...,...
4110,4110,Sharita_Nabirye_18-03-1981 00:00_2_nan_I24111_...
4111,4111,Tenywa_Kapiso_28-06-1998 00:00_1_nan_I24112_HH...
4112,4112,Mariam_Babirye_03-04-1986 00:00_2_nan_I24113_H...
4113,4113,Nangobi_Tracy_19-12-2001 00:00_2_nan_I24114_HH...


In [21]:
hdss_merge = potential_matches.merge(hdss_lookup, how='left', left_index=True, right_index=True)

In [22]:
hdss_merge

Unnamed: 0,level_0,level_1,first_name,last_name,pet_name,date_of_birth,sex,national_id,Score,index,hdss_Lookup
0,275,2524,1,1.0,0,1,1,0.0,4.0,0,Zaina_Hanifa_22-09-1930 00:00_2_nan_I20001_HH1...
1,568,3170,1,1.0,0,1,1,0.0,4.0,1,Godfrey_Maganda_15-07-1934 00:00_1_nan_I20002_...
2,629,1370,1,1.0,1,0,1,0.0,4.0,2,Kasim_Ngobi_03-03-1983 00:00_1_nan_I20003_HH10...
3,778,202,1,1.0,1,0,1,0.0,4.0,3,Esther_nan_30-07-1968 00:00_2_nan_I20004_HH100004
4,919,1702,1,1.0,1,1,1,0.0,5.0,4,Sumaya_Swabula_13-12-1930 00:00_2_nan_I20005_H...
...,...,...,...,...,...,...,...,...,...,...,...
1211,2759,2390,1,1.0,1,0,1,0.0,4.0,1211,Nasabu_Mutesi_27-05-1981 00:00_2_nan_I21212_HH...
1212,2762,2620,1,1.0,1,1,1,0.0,5.0,1212,Viola_Shamira_22-10-2015 00:00_2_nan_I21213_HH...
1213,2768,2778,1,1.0,0,1,1,0.0,4.0,1213,Fauza_Nakagolo_04-08-2020 00:00_2_nan_I21214_H...
1214,2788,2432,1,1.0,1,1,1,0.0,5.0,1214,Faima_Nkwanga_10-11-1992 00:00_2_nan_I21215_HH...
