# Core Problem
### Identify duplicate or matching individuals across CSVs with messy data

# In [34]:
import pandas as pd
from rapidfuzz import fuzz
import string

# Load the matching schema first so the declared string columns can be forced
# to text while the data files are parsed.
schema = pd.read_csv('colsofinterest.csv')

# Columns to match on: the first schema column holds the column names.
# Reading it positionally (instead of unpacking every row into exactly two
# values) keeps this working if the schema file gains extra columns.
col_list = schema.iloc[:, 0].tolist()

# Subset of the match columns whose data_type is 'string'; these are the
# columns normalized below.
string_cols = schema.loc[schema['data_type'] == 'string'].iloc[:, 0].tolist()

# Parse the declared string columns as text. Without this, a column holding
# only digits (or only empty cells) is inferred as numeric, which drops
# leading zeros and makes the `.str` accessor below raise.
string_dtypes = {col: str for col in string_cols}
df1 = pd.read_csv("df1.csv", dtype=string_dtypes)
df2 = pd.read_csv("df2.csv", dtype=string_dtypes)

# Data cleaning: lower-case and trim every string column in both frames so
# that letter case and surrounding whitespace do not prevent a match.
for frame in (df1, df2):
    for col in string_cols:
        frame[col] = frame[col].str.lower().str.strip()


#Create a new index type column so we can track its original location
def create_ind(df,tb_name):
    """Tag every row of ``df`` with its current index label and a source name.

    The frame is modified in place and the same object is returned.

    Args:
        df: DataFrame to annotate.
        tb_name: Value written to the ``source`` column of every row.

    Returns:
        ``df`` itself, now carrying an ``og_loc`` column (a copy of the
        index) and a ``source`` column.
    """
    tracking_values = {'og_loc': df.index, 'source': tb_name}
    for column_name, column_value in tracking_values.items():
        df[column_name] = column_value
    return df
    
# Tag each frame with its original row position and source name so matches
# can be traced back after the two frames are stacked.
df2 = create_ind(df2, 'df2')
df1 = create_ind(df1, 'df1')

# Stack both frames (match columns plus the tracking columns) so the blocking
# keys are computed over the combined population. The index is rebuilt to keep
# labels unique; the original positions are already stored in `og_loc`.
tracking_cols = ['og_loc', 'source']
comb_df = pd.concat(
    [df2[col_list + tracking_cols], df1[col_list + tracking_cols]],
    axis=0,
    ignore_index=True,
)


def _block_key(frame, columns, width, sep='\x1f'):
    """Build a blocking key from the first ``width`` characters of each column.

    Values are converted to text first so non-string match columns do not
    break the ``.str`` accessor; missing values contribute an empty prefix.
    The prefixes are joined with ``sep`` because plain concatenation lets
    different field splits collide (e.g. 'ab' + 'cde' and 'abc' + 'de').

    Args:
        frame: DataFrame holding the columns to key on.
        columns: Column names, in the order they are combined.
        width: Number of leading characters taken from each value.
        sep: Separator placed between the per-column prefixes.

    Returns:
        An object-dtype Series of key strings aligned with ``frame.index``.
    """
    prefixes = [
        frame[column].astype('string').str[:width].fillna('').astype(object)
        for column in columns
    ]
    if not prefixes:
        return pd.Series('', index=frame.index, dtype=object)
    key = prefixes[0]
    for prefix in prefixes[1:]:
        key = key + sep + prefix
    return key


# Heuristic-based duplicate detection (blocking).
# Three keys of increasing strictness: 2-, 3- and 4-character prefixes of
# every match column.
block_key_widths = {'block_key_1': 2, 'block_key_2': 3, 'block_key_3': 4}
for key_name, width in block_key_widths.items():
    comb_df[key_name] = _block_key(comb_df, col_list, width)

# Duplicate flags: keep=False marks every row whose key occurs more than once
# (not just the repeats) with 1.
for flag_number, key_name in enumerate(block_key_widths, start=1):
    comb_df[f'dup_flag{flag_number}'] = (
        comb_df[key_name].duplicated(keep=False).astype(int)
    )
comb_df = comb_df.sort_values(by=['dup_flag3','dup_flag2','dup_flag1'], ascending = False)

# The following steps select columns through col_list and expect the tracking
# columns to be part of it.
col_list.extend(tracking_cols)


#Providing a score for the heu column which will hold the score 
def get_score(row):
    """Translate a row's duplicate flags into a heuristic score.

    The flags are checked in order and the score is decided by the first
    flag that is falsy.

    Args:
        row: Mapping-like object exposing ``dup_flag1``, ``dup_flag2`` and
            ``dup_flag3``.

    Returns:
        0 if ``dup_flag1`` is falsy, 33 if ``dup_flag2`` is falsy, 66 if
        ``dup_flag3`` is falsy, otherwise 100.
    """
    if not row['dup_flag1']:
        return 0
    if not row['dup_flag2']:
        return 33
    if not row['dup_flag3']:
        return 66
    return 100

# Score every row from its duplicate flags and store the result in `heu`.
comb_df['heu'] = comb_df.apply(get_score, axis=1)

# Keep only the columns listed in col_list, the blocking keys and the score;
# the dup_flag helper columns are dropped here.
block_key_cols = ['block_key_1', 'block_key_2', 'block_key_3']
comb_df = comb_df[col_list + block_key_cols + ['heu']]

# From here on col_list no longer carries the tracking columns, but it does
# carry the blocking keys (appended at the end).
for tracking_col in ('og_loc', 'source'):
    col_list.remove(tracking_col)
col_list.extend(block_key_cols)


# Split back into one frame per source so the two sides can be paired up.
from_df1 = comb_df['source'] == 'df1'
from_df2 = comb_df['source'] == 'df2'
df1 = comb_df[from_df1].copy()
df2 = comb_df[from_df2].copy()


# Candidate generation: pair each df1 row with every df2 row that shares a
# blocking key, so fuzzy matching only runs on plausible pairs instead of on
# every combination of rows.
pairs_1 = df1.merge(df2, on="block_key_1", how="inner")
pairs_2 = df1.merge(df2, on="block_key_2", how="inner")
pairs_3 = df1.merge(df2, on="block_key_3", how="inner")

# The same (df1 row, df2 row) pair can be produced by more than one key, and
# each merge names its key columns differently, so comparing all columns never
# collapses those repeats. De-duplicate on the original row locations instead
# (og_loc_x comes from df1, og_loc_y from df2).
candidates = pd.concat([pairs_1, pairs_2, pairs_3], ignore_index=True)
candidates = candidates.drop_duplicates(subset=["og_loc_x", "og_loc_y"])

# Rename the merge suffixes of the match columns to _1 (df1) / _2 (df2).
# The mapping is derived from col_list so it follows the schema instead of a
# fixed set of column names; the blocking keys in col_list are skipped.
block_key_names = {"block_key_1", "block_key_2", "block_key_3"}
match_cols = [col for col in col_list if col not in block_key_names]
candidates = candidates.rename(columns={
    f"{col}{old_suffix}": f"{col}{new_suffix}"
    for col in match_cols
    for old_suffix, new_suffix in (("_x", "_1"), ("_y", "_2"))
})


# Fuzzy matching over the candidate pairs.
# Drop the blocking keys from col_list so that only the match columns remain.
for key_col in ('block_key_1', 'block_key_2', 'block_key_3'):
    col_list.remove(key_col)


def _pair_score(left, right):
    """Return the partial-ratio similarity of two values, or 0 if either is missing."""
    if pd.isna(left) or pd.isna(right):
        return 0
    return fuzz.partial_ratio(str(left), str(right))


# One <column>_score per match column. Walking the two value columns directly
# avoids building a full row object for every pair once per column.
score_cols = []
for col in col_list:
    score_col = f"{col}_score"
    candidates[score_col] = [
        _pair_score(left, right)
        for left, right in zip(candidates[f"{col}_1"], candidates[f"{col}_2"])
    ]
    score_cols.append(score_col)


# Combine the scores. The score columns are tracked explicitly rather than
# looked up by name pattern, so unrelated columns whose names contain
# "_score" (including the two created below, on a re-run) are never averaged in.
candidates["final_score_fuzzy"] = candidates[score_cols].mean(axis=1)

# Final confidence: average of the fuzzy score and the mean heuristic score
# of the two rows in the pair.
candidates["confidence_match_score"] = (
    candidates["final_score_fuzzy"]
    + (candidates["heu_x"] + candidates["heu_y"]) / 2
) / 2


# Build the final table that is written to CSV: each match column side by
# side (_1 and _2), followed by the provenance columns and the combined score.
other_final = ['source_x','og_loc_x','source_y','og_loc_y','confidence_match_score']
output_cols = []
for col in col_list:
    output_cols.extend([f'{col}_1', f'{col}_2'])
output_cols.extend(other_final)

# drop_duplicates() returns a new frame rather than modifying in place, so the
# result has to be kept; otherwise repeated pairs stay in the output.
final_df = candidates[output_cols].drop_duplicates()


# Best matches first, then write the CSV.
final_df = final_df.sort_values('confidence_match_score', ascending=False).reset_index(drop=True)

final_df.to_csv('dedup_matches_final.csv', index=False)
print("Final matches table saved!")



# Output: Final matches table saved!
