In [None]:
import pandas as pd
import py_stringmatching as sm

In [None]:
# Read the cleaned Sunshine physician table with every column as text and
# replace missing values with empty strings.
sunshine = pd.read_csv(
    '../Data/Outputs_Cleanup/Sunshine/physicians_info.csv',
    dtype=str,
).fillna('')

In [None]:
def add_suffix(lname, suffix):
    """Join ``lname`` and ``suffix`` with a single space and strip the result.

    Stripping removes the dangling separator when either part is empty, as
    well as any whitespace at the outer edges of the joined text.
    """
    return ' '.join((lname, suffix)).strip()

# Fold each row's name suffix into its last name.
sunshine['Physician_Last_Name'] = sunshine[['Physician_Last_Name', 'Physician_Name_Suffix']].apply(
    lambda parts: add_suffix(parts.iloc[0], parts.iloc[1]),
    axis=1,
)

In [None]:
# Rename the Sunshine columns to the short names used for matching, then
# discard the columns that are not used for matching.
sunshine = (
    sunshine
    .rename(columns={
        'Physician_Profile_ID': 'id',
        'Physician_First_Name': 'fname',
        'Physician_Specialty': 'type',
        'Physician_Last_Name': 'lname',
        'Recipient_City': 'city',
    })
    .drop(columns=[
        'Physician_Middle_Name',
        'Physician_Name_Suffix',
        'Recipient_State',
        'Physician_Primary_Type',
        'Physician_License_State_code1',
        'Physician_License_State_code2',
        'Physician_License_State_code3',
        'Physician_License_State_code4',
        'Physician_License_State_code5',
        'Recipient_Zip_Code',
    ])
)

In [None]:
# Read the Part D prescriber table as text, blank out missing values, and
# rename its columns to the same short names used for the Sunshine table.
part_d = (
    pd.read_csv('../Data/Outputs_Cleanup/Part_d/prescriber_information.csv', dtype=str)
    .fillna('')
    .rename(columns={
        'Prscrbr_NPI': 'id',
        'Prscrbr_Last_Org_Name': 'lname',
        'Prscrbr_First_Name': 'fname',
        'Prscrbr_City': 'city',
        'Prscrbr_Type': 'type',
    })
)

Cities were deduplicated by hand for this first run. In the future, however, we will need to build an automated city deduplicator.

In [None]:
# Replace both frames with the contents of these CSV files, reading every
# column as text.
sunshine, part_d = (
    pd.read_csv(csv_name, dtype=str)
    for csv_name in ('CMS_Sunshine_Physicians.csv', 'Medicare_PartD_Physicians.csv')
)

In [None]:
def processType(x):
    """Return the text after the last '|' in ``x``.

    If ``x`` contains no '|', the whole string is returned unchanged.
    """
    _, _, last_part = x.rpartition('|')
    return last_part

In [None]:
def processData(df):
    """Normalise every column to lower-case, stripped text and build the match key.

    The frame is modified in place and also returned.

    Steps:
      * every column is converted to text, lower-cased and stripped;
        missing values become empty strings rather than the literal
        text 'nan' that a bare ``astype(str)`` would produce;
      * ``type`` is reduced to the part after its last '|' (the same rule
        as ``processType``);
      * ``combined`` is built as "fname lname city type", space separated.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'fname', 'lname', 'city' and 'type'.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with normalised columns and a new
        'combined' column.
    """
    for col in df.columns.to_list():
        # fillna must come before astype(str), otherwise NaN turns into 'nan'.
        df[col] = df[col].fillna('').astype(str).str.lower().str.strip()
    # Keep only the last '|'-separated segment of the type.
    df['type'] = df['type'].str.rsplit('|', n=1).str[-1]
    df['combined'] = df['fname'] + ' ' + df['lname'] + ' ' + df['city'] + ' ' + df['type']
    return df

In [None]:
# Normalise both frames and add their 'combined' key column.
sunshine, part_d = processData(sunshine), processData(part_d)

In [None]:
# For each distinct 'combined' string, gather all ids that share it into a list.
sunshine_combined_id = (
    sunshine['id']
    .groupby(sunshine['combined'])
    .apply(list)
    .reset_index(name='id')
)
part_d_id = (
    part_d['id']
    .groupby(part_d['combined'])
    .apply(list)
    .reset_index(name='id')
)

In [None]:
# Remove the per-row id column from both frames.
sunshine = sunshine.drop(columns='id')
part_d = part_d.drop(columns='id')

In [None]:
# Attach each frame's own per-key id lists back onto it.
sunshine = sunshine.merge(right=sunshine_combined_id, how='left', on='combined')
# part_d must be joined with part_d_id (its NPI lists). Joining it with
# sunshine_combined_id would give part_d rows Sunshine profile ids, or NaN
# where the key does not exist in Sunshine.
part_d = part_d.merge(right=part_d_id, how='left', on='combined')

In [None]:
# The per-key id lookup frames are not used after the merge.
del sunshine_combined_id, part_d_id

There are duplicates in both datasets where the same person appears under two different IDs.<br>
The solution is to use the combined column to create a new unique id.<br>
Then we will create lookup tables for the new ID and the combined string.<br>

Make the lookup tables

In [None]:
# sunshine.rename(columns={'id':'Physician_Profile_ID'})[['combined','Physician_Profile_ID']].to_csv('../Data/Physicians_Deduplication/Outputs/CMS_Sunshine_Physicians_NewID.csv',index=False)
# part_d.rename(columns={'id':'Prscrbr_NPI'})[['combined','Prscrbr_NPI']].to_csv('../Data/Physicians_Deduplication/Outputs/Medicare_PartD_Physicians_NewID.csv',index=False)

In [None]:
# Keep only the first row for each distinct 'combined' string.
sunshine = sunshine.drop_duplicates(subset='combined')
part_d = part_d.drop_duplicates(subset='combined')

In [None]:
# Snapshot both deduplicated frames; later cells drop rows from the working
# frames and look ids up in these copies.
sunshine_org, part_d_org = sunshine.copy(), part_d.copy()

Here we do a naive matching based only on the 'combined' column: if two records have exactly the same combined string, we accept them as a match.

In [None]:
# Left-join Sunshine onto Part D by exact 'combined' string; overlapping
# column names get pandas' default _x (part_d) / _y (sunshine) suffixes.
merged_part_d = pd.merge(part_d, sunshine, how='left', on='combined')

Now we will remove the matches and set them aside. We will do a more careful inspection of the remaining unmatched ones.

In [None]:
# Set the exact matches aside, keeping each pair together.
# A merged row is a match when both its part_d city (city_x) and its
# sunshine city (city_y) are present.
has_both_sides = merged_part_d['city_x'].notnull() & merged_part_d['city_y'].notnull()
exact_keys = merged_part_d.loc[has_both_sides, 'combined']
# For an exact match the same combined string identifies both records.
matches = pd.DataFrame({'part_d': exact_keys, 'sunshine': exact_keys})
matches.head()

In [None]:
# Remove the exactly matched records from both working frames.
part_d = part_d.loc[~part_d['combined'].isin(matches['part_d'])].copy()
sunshine = sunshine.loc[~sunshine['combined'].isin(matches['sunshine'])].copy()

In [None]:
# The merged frame is not used again after the exact matches were extracted.
del merged_part_d

In [None]:
# Add two tokenized views of the 'combined' column to both frames:
#   whitespace_tok - 'combined' split by the whitespace tokenizer
#   Qgram_tok      - 'combined' split by the q-gram tokenizer
# NOTE: the q-gram tokenizer is built with qval=2, so despite its variable
# name it is not a unigram tokenizer.
whitespace_tokenizer = sm.WhitespaceTokenizer()
unigram_tokenizer = sm.QgramTokenizer(qval=2)
sunshine['whitespace_tok'] = sunshine['combined'].apply(whitespace_tokenizer.tokenize)
sunshine['Qgram_tok'] = sunshine['combined'].apply(unigram_tokenizer.tokenize)
part_d['whitespace_tok'] = part_d['combined'].apply(whitespace_tokenizer.tokenize)
part_d['Qgram_tok'] = part_d['combined'].apply(unigram_tokenizer.tokenize)

In [None]:
#We will find matches for the sunshine data in the part_d data
#This is because the sunshine data is the smaller set
#Here is the procedure:
    #For each row in sunshine
        #Filter part_d down to rows with the same last name that have not already been matched
        #Compute the similarity score (sim_measure) on the q-gram tokens between each remaining row and the sunshine row
        #Keep only the rows scoring at least .8
        #If no rows remain
            #Return None
        #Else
            #Grab the highest scoring one as the match
            #Record the match in the matched list and return its combined string
        #Matched rows are removed from both part_d and sunshine after all rows have been processed

In [None]:
# Similarity measure used by find_match to score pairs of q-gram token lists.
sim_measure = sm.OverlapCoefficient()

In [None]:
# Combined strings of part_d rows already returned as a match; find_match
# appends to this list and excludes its entries from later candidates.
matched = []

In [None]:
def find_match(sunshine_row, threshold=0.8):
    """Find the best not-yet-matched part_d record for one sunshine row.

    Candidates are the rows of the module-level ``part_d`` frame with the
    same 'lname' as ``sunshine_row`` whose 'combined' string is not already
    in the module-level ``matched`` list. Each candidate is scored with
    ``sim_measure.get_sim_score`` on the 'Qgram_tok' token lists, and the
    highest-scoring candidate is accepted if its score is at least
    ``threshold``. On ties the first candidate in ``part_d`` order wins.

    Parameters
    ----------
    sunshine_row : pandas.Series
        A row providing 'lname' and 'Qgram_tok'.
    threshold : float, default 0.8
        Minimum similarity score for a candidate to be accepted.

    Returns
    -------
    str or None
        The 'combined' string of the matched part_d row, or None when no
        candidate qualifies.

    Side effects
    ------------
    Appends the returned 'combined' string to ``matched`` so that the same
    part_d record is not returned for a later row.
    """
    # Restrict part_d to rows with the same last name ...
    same_lname = part_d['lname'] == sunshine_row['lname']
    candidates = part_d.loc[same_lname, ['combined', 'Qgram_tok']]
    # ... and discard those already claimed by an earlier match.
    candidates = candidates[~candidates['combined'].isin(matched)]
    if candidates.empty:
        return None

    # Score every remaining candidate against this row's q-gram tokens.
    row_tokens = sunshine_row['Qgram_tok']
    scores = candidates['Qgram_tok'].apply(
        lambda tokens: sim_measure.get_sim_score(tokens, row_tokens)
    )

    # argmax returns the first maximum, so ties are resolved deterministically.
    best_pos = int(scores.to_numpy(dtype=float).argmax())
    if scores.iloc[best_pos] < threshold:
        # Even the best candidate is below the threshold, so none qualify.
        return None

    match = candidates['combined'].iloc[best_pos]
    matched.append(match)
    return match

In [None]:
from tqdm import tqdm

In [None]:
%%time
# Register tqdm's progress_apply on pandas objects, then look up a match for
# every sunshine row while showing a progress bar.
tqdm.pandas(desc="Progress")
sunshine['match'] = sunshine.progress_apply(find_match, axis=1)

In [None]:
# Collect the sunshine rows for which find_match returned a part_d record,
# as (part_d combined string, sunshine combined string) pairs.
found_match = sunshine['match'].notnull()
other_matches = (
    sunshine.loc[found_match, ['match', 'combined']]
    .rename(columns={'match': 'part_d', 'combined': 'sunshine'})
)

In [None]:
# Append the matches found by find_match to the exact matches, renumbering the index.
matches = pd.concat([matches,other_matches],ignore_index=True)

In [None]:
# Remove every matched record from both working frames, leaving only the
# records that have no counterpart.
part_d = part_d.loc[~part_d['combined'].isin(matches['part_d'])].copy()
sunshine = sunshine.loc[~sunshine['combined'].isin(matches['sunshine'])].copy()

In [None]:
# Write the unmatched records of each dataset to its own CSV file.
for frame, out_path in (
    (sunshine, '../Data/Physicians_Deduplication/Outputs/CMS_Sunshine_Physicians_no_matches.csv'),
    (part_d, '../Data/Physicians_Deduplication/Outputs/Medicare_PartD_Physicians_no_matches.csv'),
):
    frame[['id', 'fname', 'lname', 'type', 'city']].to_csv(out_path, index=False)

In [None]:
# Save the matched pairs of combined strings (columns 'part_d' and 'sunshine').
matches.to_csv('../Data/Physicians_Deduplication/Outputs/Physician_Matches.csv',index=False)

In [None]:
# The token columns (and sunshine's 'match' column) are not used after matching.
sunshine = sunshine.drop(columns=['whitespace_tok', 'Qgram_tok', 'match'])
part_d = part_d.drop(columns=['whitespace_tok', 'Qgram_tok'])

In [None]:
# Attach the Part D details and ids to every match, looked up by the
# part_d combined string.
matches = matches.merge(right=part_d_org, how='left', left_on='part_d', right_on='combined')
matches = matches.rename(columns={'id': 'Prscrbr_NPI'}).drop(columns=['combined'])

# Attach the Sunshine ids, looked up by the *sunshine* combined string.
# For matches found by find_match the two combined strings differ, so looking
# the Sunshine id up by the part_d string would leave it missing.
sunshine_ids = sunshine_org[['combined', 'id']].rename(
    columns={'combined': 'sunshine', 'id': 'Physician_Profile_ID'}
)
matches = matches.merge(right=sunshine_ids, how='left', on='sunshine')

# The output 'combined' column carries the part_d combined string.
matches = matches.rename(columns={'part_d': 'combined'})
matches = matches[['combined','Prscrbr_NPI','Physician_Profile_ID','fname','lname','type','city']]

In [None]:
# Give the unmatched Sunshine records the output layout: their id becomes
# Physician_Profile_ID and Prscrbr_NPI is left as an empty string.
sunshine = (
    sunshine
    .rename(columns={'id': 'Physician_Profile_ID'})
    .assign(Prscrbr_NPI='')
    .loc[:, ['combined', 'Prscrbr_NPI', 'Physician_Profile_ID', 'fname', 'lname', 'type', 'city']]
)

In [None]:
# Give the unmatched Part D records the output layout: their id becomes
# Prscrbr_NPI and Physician_Profile_ID is left as an empty string.
part_d = (
    part_d
    .rename(columns={'id': 'Prscrbr_NPI'})
    .assign(Physician_Profile_ID='')
    .loc[:, ['combined', 'Prscrbr_NPI', 'Physician_Profile_ID', 'fname', 'lname', 'type', 'city']]
)

In [None]:
# Stack matches, unmatched Sunshine and unmatched Part D records (in that
# order), drop the matching key, and expose the new row number as 'id'.
final_output = (
    pd.concat([matches, sunshine, part_d], ignore_index=True)
    .drop(columns=['combined'])
    .reset_index()
    .rename(columns={'index': 'id'})
)
# Save the combined physician table as both CSV and pickle.
final_output.to_csv('../Data/Physicians_Deduplication/Outputs/Physicians.csv', index=False)
final_output.to_pickle('../Data/Physicians_Deduplication/Outputs/Physicians.pkl')