In [1]:
import pandas as pd
import py_stringmatching as sm

In [2]:
sunshine = pd.read_csv('../Data/Outputs_Cleanup/physicians_info_dedup.csv',dtype=str)

In [3]:
part_d = pd.read_csv('../Data/Outputs_Cleanup/prescriber_dedup.csv',dtype=str)

In [4]:
def processType(x):
    s = x.split('|')
    return s[-1]

In [5]:
def processData(df):
    cols = df.columns.to_list()
    for col in cols:
        df[col] = df[col].astype(str)
        df[col] = df[col].apply(lambda x: x.lower().strip())
    df['type'] = df['type'].apply(lambda x: processType(x))
    df['combined'] = df['fname'] + ' ' + df['lname'] + ' ' + df['city'] + ' ' + df['type']
    return df

In [6]:
sunshine = processData(sunshine)

In [7]:
part_d = processData(part_d)

There are duplicates where the same person has two ids in both dataset.<br>
The solution is to hash the combined column to create a new unique id.<br>
Then we will create lookup tables for the new ID and the hash.<br>

In [8]:
sunshine['hashID'] = sunshine['combined'].apply(lambda x: hash(x))
part_d['hashID'] = part_d['combined'].apply(lambda x: hash(x))

Make the lookup tables

In [9]:
sunshine.rename(columns={'id':'Physician_Profile_ID'})[['hashID','Physician_Profile_ID']].to_csv('../Data/Outputs_Cleanup/sunshineNewID.csv',index=False)
part_d.rename(columns={'id':'Prscrbr_NPI'})[['hashID','Prscrbr_NPI']].to_csv('../Data/Outputs_Cleanup/partDNewID.csv',index=False)

In [10]:
sunshine.drop_duplicates(subset='hashID',inplace=True)
part_d.drop_duplicates(subset='hashID',inplace=True)

In [11]:
sunshine_org = sunshine.copy()

In [12]:
part_d_org = part_d.copy()

Here we will do a naive matching. Based on just the 'hashID' column. If there is an exact match we will accept this.

In [13]:
merged_part_d = part_d.merge(right=sunshine,on='hashID',how='left')
merged_part_d

Unnamed: 0,id_x,fname_x,lname_x,type_x,city_x,combined_x,hashID,id_y,fname_y,lname_y,type_y,city_y,combined_y
0,1326172685,john,janikowski,family practice,apo ap,john janikowski apo ap family practice,-5087808358276745535,,,,,,
1,1083012520,satvir,dhaliwal,dentist,acampo,satvir dhaliwal acampo dentist,643611855345337541,,,,,,
2,1730178724,cheryl,leonard,nurse practitioner,acton,cheryl leonard acton nurse practitioner,2552305003076301146,,,,,,
3,1194118836,debbie,welch,nurse practitioner,acton,debbie welch acton nurse practitioner,6835368002882478069,,,,,,
4,1427275114,jeffrey,hempel,dentist,acton,jeffrey hempel acton dentist,2347854178743044497,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
102642,1790885259,vicente,arano,family practice,yucca valley,vicente arano yucca valley family practice,7676442175793933461,,,,,,
102643,1851495550,xu shao,huang,optometry,yucca valley,xu shao huang yucca valley optometry,8986341279889783381,,,,,,
102644,1851816888,yizhou,zheng,pharmacist,yucca valley,yizhou zheng yucca valley pharmacist,5772377211478474367,,,,,,
102645,1770672719,yoonho,chang,dentist,yucca valley,yoonho chang yucca valley dentist,5100369588910286926,,,,,,


Now we will remove the matches and set them aside. We will do a more careful inspection of the remaining unmatched ones.

In [14]:
#Setting aside matches, keeping pairs together
matches = merged_part_d.drop(merged_part_d[(merged_part_d['city_x'].isnull()==True) | (merged_part_d['city_y'].isnull()==True)].index)
col = matches.columns.to_list()
col.remove('hashID')
matches.drop(columns = col,inplace=True)
matches['hashID2'] = matches['hashID']
matches.rename(columns={'hashID':'part_d_Hash','hashID2':'sunshine_Hash'},inplace=True)

In [15]:
part_d.drop(part_d[part_d['hashID'].isin(matches['part_d_Hash'].to_list())].index,inplace=True)
sunshine.drop(sunshine[sunshine['hashID'].isin(matches['sunshine_Hash'].to_list())].index,inplace=True)

In [16]:
del merged_part_d

In [17]:
#Create a new column in sunshine that is the combined column whitespace tokenized    
#Create a new column in sunshine that is the combined column unigram tokenized
#Create a new column in part_d that is the combined column whitespace tokenized
#Create a new column in part_d that is the combined column unigram tokenized
whitespace_tokenizer = sm.WhitespaceTokenizer()
unigram_tokenizer = sm.QgramTokenizer(qval=2)
sunshine['whitespace_tok'] = sunshine['combined'].apply(lambda x: whitespace_tokenizer.tokenize(x))
sunshine['Qgram_tok'] = sunshine['combined'].apply(lambda x: unigram_tokenizer.tokenize(x))
part_d['whitespace_tok'] = part_d['combined'].apply(lambda x: whitespace_tokenizer.tokenize(x))
part_d['Qgram_tok'] = part_d['combined'].apply(lambda x: unigram_tokenizer.tokenize(x))

In [18]:
part_d

Unnamed: 0,id,fname,lname,type,city,combined,hashID,whitespace_tok,Qgram_tok
0,1326172685,john,janikowski,family practice,apo ap,john janikowski apo ap family practice,-5087808358276745535,"[john, janikowski, apo, ap, family, practice]","[#j, jo, oh, hn, n , j, ja, an, ni, ik, ko, o..."
1,1083012520,satvir,dhaliwal,dentist,acampo,satvir dhaliwal acampo dentist,643611855345337541,"[satvir, dhaliwal, acampo, dentist]","[#s, sa, at, tv, vi, ir, r , d, dh, ha, al, l..."
2,1730178724,cheryl,leonard,nurse practitioner,acton,cheryl leonard acton nurse practitioner,2552305003076301146,"[cheryl, leonard, acton, nurse, practitioner]","[#c, ch, he, er, ry, yl, l , l, le, eo, on, n..."
3,1194118836,debbie,welch,nurse practitioner,acton,debbie welch acton nurse practitioner,6835368002882478069,"[debbie, welch, acton, nurse, practitioner]","[#d, de, eb, bb, bi, ie, e , w, we, el, lc, c..."
4,1427275114,jeffrey,hempel,dentist,acton,jeffrey hempel acton dentist,2347854178743044497,"[jeffrey, hempel, acton, dentist]","[#j, je, ef, ff, fr, re, ey, y , h, he, em, m..."
...,...,...,...,...,...,...,...,...,...
102678,1790885259,vicente,arano,family practice,yucca valley,vicente arano yucca valley family practice,7676442175793933461,"[vicente, arano, yucca, valley, family, practice]","[#v, vi, ic, ce, en, nt, te, e , a, ar, ra, a..."
102679,1851495550,xu shao,huang,optometry,yucca valley,xu shao huang yucca valley optometry,8986341279889783381,"[xu, shao, huang, yucca, valley, optometry]","[#x, xu, u , s, sh, ha, ao, o , h, hu, ua, a..."
102680,1851816888,yizhou,zheng,pharmacist,yucca valley,yizhou zheng yucca valley pharmacist,5772377211478474367,"[yizhou, zheng, yucca, valley, pharmacist]","[#y, yi, iz, zh, ho, ou, u , z, zh, he, en, n..."
102681,1770672719,yoonho,chang,dentist,yucca valley,yoonho chang yucca valley dentist,5100369588910286926,"[yoonho, chang, yucca, valley, dentist]","[#y, yo, oo, on, nh, ho, o , c, ch, ha, an, n..."


In [19]:
sunshine

Unnamed: 0,id,fname,lname,type,city,combined,hashID,whitespace_tok,Qgram_tok
0,360308,maheshkumar,vyas,allergy & immunology,anaheim,maheshkumar vyas anaheim allergy & immunology,950771680717603933,"[maheshkumar, vyas, anaheim, allergy, &, immun...","[#m, ma, ah, he, es, sh, hk, ku, um, ma, ar, r..."
1,1236637,nataliya,kushnir,allergy & immunology,berkeley,nataliya kushnir berkeley allergy & immunology,5156923147507326745,"[nataliya, kushnir, berkeley, allergy, &, immu...","[#n, na, at, ta, al, li, iy, ya, a , k, ku, u..."
2,264541,alan,khadavi,allergy & immunology,beverly hills,alan khadavi beverly hills allergy & immunology,739111513759428767,"[alan, khadavi, beverly, hills, allergy, &, im...","[#a, al, la, an, n , k, kh, ha, ad, da, av, v..."
3,122649,joseph,chun,allergy & immunology,buena park,joseph chun buena park allergy & immunology,7794950148132110112,"[joseph, chun, buena, park, allergy, &, immuno...","[#j, jo, os, se, ep, ph, h , c, ch, hu, un, n..."
4,210594,binita,mandal,allergy & immunology,carmichael,binita mandal carmichael allergy & immunology,-3101263856835332465,"[binita, mandal, carmichael, allergy, &, immun...","[#b, bi, in, ni, it, ta, a , m, ma, an, nd, d..."
...,...,...,...,...,...,...,...,...,...
69137,706131,robert,nisson,,cameron park,robert nisson cameron park nan,-8759410643471435429,"[robert, nisson, cameron, park, nan]","[#r, ro, ob, be, er, rt, t , n, ni, is, ss, s..."
69138,272029,joseph,cerni,,huntington beach,joseph cerni huntington beach nan,6821547776827399224,"[joseph, cerni, huntington, beach, nan]","[#j, jo, os, se, ep, ph, h , c, ce, er, rn, n..."
69139,89364,michael,miyasaki,,sacramento,michael miyasaki sacramento nan,-4842060767044964599,"[michael, miyasaki, sacramento, nan]","[#m, mi, ic, ch, ha, ae, el, l , m, mi, iy, y..."
69140,206670,stephanie,walton,,sacramento,stephanie walton sacramento nan,-824853984260158172,"[stephanie, walton, sacramento, nan]","[#s, st, te, ep, ph, ha, an, ni, ie, e , w, w..."


In [20]:
#We will find matches for the sunshine data in the part_d data
#This is because the sunshine data is the smaller set
#Here is the procedure:    
    #For each row in sunshine
        #Filter the city in part_d
        #Perform Jaccard on the whitespace token set between the remainder and the row
        #Filter out for scores higher than .75
        #If the remaining dataframe is 0
            #Return None
        #Else
            #Grab the highest matching one as the match
            #Save the match into sunshine_match dataframe and part_d_match dataframe
        #Remove the row from both part_d_noDups and sunshine_noDups

In [21]:
sim_measure = sm.OverlapCoefficient()

In [22]:
matched = []

In [23]:
def find_match(sunshine_row):
    #filter part_d for the last name:
    df_partD = part_d[part_d['lname']==sunshine_row['lname']].copy()
    df_partD.drop(df_partD[df_partD['hashID'].isin(matched)].index,inplace=True)
    if len(df_partD) == 0:
        #Return None
        return None
    #Perform similarity on the token set between the remainder and the row
    df_partD['score'] = df_partD['Qgram_tok'].apply(lambda x: sim_measure.get_sim_score(x, sunshine_row['Qgram_tok']))
    #Filter out for scores higher than .75
    df_partD.drop(df_partD[df_partD['score']<.8].index,inplace=True)
    #If the remaining dataframe is 0
    if len(df_partD) == 0:
        #Return None
        return None
    #Else if the remaining dataframe is 1
    elif len(df_partD) == 1:
        #Return the remaining row's hash_id
        match = df_partD.iloc[0]['hashID']
        matched.append(match)
        return match
    else:
        #Sort the remaining rows by score
        df_partD.sort_values(by=['score'],axis=0,ascending=False,ignore_index=True,inplace=True)
        #Return the highest one
        match = df_partD.iloc[0]['hashID']
        matched.append(match)
        return match

In [24]:
from tqdm import tqdm
import itertools
import multiprocess as mp
import collections

In [25]:
%%time
tqdm.pandas(desc="Progress")
sunshine['match'] = sunshine.progress_apply(lambda x: find_match(x),axis=1)

  from pandas import Panel
Progress: 100%|█████████████████████████| 53480/53480 [08:34<00:00, 103.90it/s]

Wall time: 8min 34s





In [26]:
sunshine[sunshine['match'].isnull()==False]

Unnamed: 0,id,fname,lname,type,city,combined,hashID,whitespace_tok,Qgram_tok,match
0,360308,maheshkumar,vyas,allergy & immunology,anaheim,maheshkumar vyas anaheim allergy & immunology,950771680717603933,"[maheshkumar, vyas, anaheim, allergy, &, immun...","[#m, ma, ah, he, es, sh, hk, ku, um, ma, ar, r...",4.677985e+18
1,1236637,nataliya,kushnir,allergy & immunology,berkeley,nataliya kushnir berkeley allergy & immunology,5156923147507326745,"[nataliya, kushnir, berkeley, allergy, &, immu...","[#n, na, at, ta, al, li, iy, ya, a , k, ku, u...",4.894086e+18
2,264541,alan,khadavi,allergy & immunology,beverly hills,alan khadavi beverly hills allergy & immunology,739111513759428767,"[alan, khadavi, beverly, hills, allergy, &, im...","[#a, al, la, an, n , k, kh, ha, ad, da, av, v...",-3.468648e+18
4,210594,binita,mandal,allergy & immunology,carmichael,binita mandal carmichael allergy & immunology,-3101263856835332465,"[binita, mandal, carmichael, allergy, &, immun...","[#b, bi, in, ni, it, ta, a , m, ma, an, nd, d...",4.717534e+18
6,278244,denis,yoshii,allergy & immunology,costa mesa,denis yoshii costa mesa allergy & immunology,-5714655725987488865,"[denis, yoshii, costa, mesa, allergy, &, immun...","[#d, de, en, ni, is, s , y, yo, os, sh, hi, i...",7.714554e+18
...,...,...,...,...,...,...,...,...,...,...
69124,601749,victor,krall,primary podiatric medicine,torrance,victor krall torrance primary podiatric medicine,934684939386281221,"[victor, krall, torrance, primary, podiatric, ...","[#v, vi, ic, ct, to, or, r , k, kr, ra, al, l...",4.864518e+17
69125,354996,michael,cosenza,primary podiatric medicine,ukiah,michael cosenza ukiah primary podiatric medicine,4984737393828319095,"[michael, cosenza, ukiah, primary, podiatric, ...","[#m, mi, ic, ch, ha, ae, el, l , c, co, os, s...",-3.495028e+18
69126,114594,douglas,taylor,primary podiatric medicine,walnut creek,douglas taylor walnut creek primary podiatric ...,-731028777500946032,"[douglas, taylor, walnut, creek, primary, podi...","[#d, do, ou, ug, gl, la, as, s , t, ta, ay, y...",1.064676e+18
69128,819823,raymond,bautista,sports medicine,long beach,raymond bautista long beach sports medicine,-8286556233800964530,"[raymond, bautista, long, beach, sports, medic...","[#r, ra, ay, ym, mo, on, nd, d , b, ba, au, u...",-5.223144e+18


In [27]:
other_matches = sunshine[sunshine['match'].isnull()==False][['match','hashID']].copy()
other_matches.rename(columns={'match':'part_d_Hash','hashID':'sunshine_Hash'},inplace=True)

In [28]:
matches = pd.concat([matches,other_matches],ignore_index=True)

In [29]:
len(part_d)

87015

In [30]:
len(sunshine)

53480

In [31]:
part_d.drop(part_d[part_d['hashID'].isin(matches['part_d_Hash'].to_list())].index,inplace=True)
sunshine.drop(sunshine[sunshine['hashID'].isin(matches['sunshine_Hash'].to_list())].index,inplace=True)

In [32]:
len(part_d)

86921

In [33]:
len(sunshine)

36214

In [34]:
sunshine[['id','fname','lname','type','city']].to_csv('../Data/Outputs_Cleanup/sunshine_no_matches.csv',index=False)
part_d[['id','fname','lname','type','city']].to_csv('../Data/Outputs_Cleanup/part_d_no_matches.csv',index=False)

In [35]:
matches.to_csv('../Data/Outputs_Cleanup/matches.csv',index=False)

In [36]:
matches.shape

(32898, 2)

In [37]:
matches_org = matches.copy()

In [60]:
matches = matches_org.copy()

In [61]:
matches['part_d_Hash'] = matches['part_d_Hash'].astype('int64')

In [62]:
type(matches['part_d_Hash'].iloc[0])

numpy.int64

In [64]:
matches = matches.merge(right=part_d_org,right_on='hashID',left_on='part_d_Hash',how='left')
#matches.rename(columns={'match':'part_d_Hash','hashID':'sunshine_Hash'},inplace=True)
matches.head()

Unnamed: 0,part_d_Hash,sunshine_Hash,id,fname,lname,type,city,combined,hashID
0,7402850750187914240,7402850750187914100,,,,,,,
1,8712277499010128896,8712277499010128640,,,,,,,
2,1982864612118341632,1982864612118341538,,,,,,,
3,5775354003505987584,5775354003505987514,,,,,,,
4,5623180525988655104,5623180525988654985,,,,,,,


In [46]:
matches.shape

(32898, 9)

In [49]:
s = matches['part_d_Hash'].iloc[0]

In [50]:
part_d_org[part_d_org['hashID']==s]

Unnamed: 0,id,fname,lname,type,city,combined,hashID
9,1669817615,aaron,glucksman,internal medicine,agoura hills,aaron glucksman agoura hills internal medicine,7402850750187914100


In [66]:
sunshine_org[sunshine_org['hashID']==s]

Unnamed: 0,id,fname,lname,type,city,combined,hashID
13215,5692526,aaron,glucksman,internal medicine,agoura hills,aaron glucksman agoura hills internal medicine,7402850750187914100
