# Core Problem
### Identify duplicate or matching individuals across CSVs with messy data

In [1]:
import pandas as pd
from rapidfuzz import fuzz
import string

# Load the two datasets that will be matched against each other.
df1 = pd.read_csv("general.csv")
df2 = pd.read_csv("political.csv")
# Schema file that drives the match: later cells read each row as a pair
# (column name, type) and filter on its 'data_type' column.
schema = pd.read_csv('colofinterest.csv')


In [2]:
# Select the columns of interest from the schema file.
# The column names live in the first schema column. They are read by position
# rather than by unpacking each row into two values, so a schema file with
# extra columns (notes, weights, ...) no longer raises on unpacking.
schema_col_names = schema.iloc[:, 0]

# Every column we will match on.
col_list = schema_col_names.tolist()

# Only the columns declared as 'string', so they can be text-normalised.
cond = schema['data_type'] == 'string'
string_cols = schema_col_names[cond].tolist()



In [3]:
# Data cleaning
# Normalise every schema-declared string column in both frames:
# lower-case first, then trim surrounding whitespace.
for frame in (df1, df2):
    for col in string_cols:
        frame[col] = frame[col].str.lower().str.strip()

In [4]:
def create_ind(df, tb_name):
    """Tag every row of ``df`` with where it came from.

    Adds two columns to ``df`` in place and returns that same frame:
    ``og_loc`` holds each row's current index label and ``source`` holds
    ``tb_name`` on every row, so the original location can still be tracked
    after frames are combined.
    """
    provenance = {'og_loc': df.index, 'source': tb_name}
    for column, value in provenance.items():
        df[column] = value
    return df
    
# Register the tracking columns so they are carried into the combined frame.
# Guarded so that re-running this cell does not append the names a second
# time; duplicate labels in col_list would select duplicated columns later.
for tracking_col in ('og_loc', 'source'):
    if tracking_col not in col_list:
        col_list.append(tracking_col)

# Tag each frame with its original row index and a source label.
df2 = create_ind(df2, 'df2')
df1 = create_ind(df1, 'df1')

In [5]:
# Stack both frames (df2 rows first, then df1), restricted to col_list,
# so scores can be computed across the two sources at once.
comb_df = pd.concat(
    [frame[col_list] for frame in (df2, df1)],
    axis=0,
)

In [6]:
# Heuristic-based duplicate detection (blocking)
#
# Each flag_N is a blocking key: the leading characters of every match column
# concatenated in col_list order (3, 4 and 5 characters for flag_1, flag_2 and
# flag_3 respectively). Missing values contribute an empty string.
#
# NOTE: loop-based implementation for clarity; will be vectorized later.

# Build the keys from the match columns only. A filtered copy is used instead
# of removing and re-appending entries on col_list, so an exception part-way
# through this cell cannot leave col_list without 'og_loc' / 'source'.
match_cols = [col for col in col_list if col not in ('og_loc', 'source')]

for flag_name, prefix_len in (('flag_1', 3), ('flag_2', 4), ('flag_3', 5)):
    comb_df[flag_name] = ''  # start with empty strings
    for col in match_cols:
        comb_df[flag_name] += comb_df[col].str[:prefix_len].fillna('')

In [7]:
# Create the duplicate flags.
# keep=False marks every member of a duplicated key as 1, not just the repeats.
for level in (1, 2, 3):
    key_is_shared = comb_df[f'flag_{level}'].duplicated(keep=False)
    comb_df[f'dup_flag{level}'] = key_is_shared.astype(int)

# Rows flagged at the strictest key come first.
comb_df = comb_df.sort_values(
    by=['dup_flag3', 'dup_flag2', 'dup_flag1'],
    ascending=False,
)


In [8]:
def get_score(row):
    """Turn a row's three duplicate flags into a heuristic score.

    The flags are checked in order ``dup_flag1`` -> ``dup_flag2`` ->
    ``dup_flag3`` and the first falsy one decides the result:
    0 if ``dup_flag1`` is unset, 33 if ``dup_flag2`` is unset,
    66 if ``dup_flag3`` is unset, and 100 when all three are set.
    """
    if not row['dup_flag1']:
        return 0
    if not row['dup_flag2']:
        return 33
    if not row['dup_flag3']:
        return 66
    return 100

# Score every row from its duplicate flags.
comb_df['heu_score'] = comb_df.apply(get_score, axis = 1)

# Keep only the columns listed in col_list plus the blocking keys and the
# score; this drops the intermediate dup_flag columns from comb_df.
col_list.extend(['flag_1', 'flag_2', 'flag_3', 'heu_score'])
comb_df = comb_df[col_list]

# Take the score and tracking columns back out of col_list
# (the flag_* names stay in it).
for dropped in ('heu_score', 'og_loc', 'source'):
    col_list.remove(dropped)

In [9]:
# Per flag_3 blocking key: how many rows share it (non-null og_loc count)
# and how many distinct sources those rows come from.
group_info = comb_df.groupby('flag_3').agg(
    n_rows=pd.NamedAgg(column='og_loc', aggfunc='count'),
    n_sources=pd.NamedAgg(column='source', aggfunc='nunique'),
)

In [10]:
# Display the per-key summary: n_rows and n_sources for each flag_3 value.
group_info

Unnamed: 0_level_0,n_rows,n_sources
flag_3,Unnamed: 1_level_1,Unnamed: 2_level_1
anthoharri1984-antho,1,1
ashletaylo1991-ashle,1,1
ashtaylo1991-ashle,1,1
chrismille1992-chris,2,2
dangarci1988-danie,1,1
daniegarci1988-danie,1,1
davidjacks1980-david,2,2
elizajones1990-eliza,1,1
emilydavis1989-em.da,2,2
hannalewis1997-hanna,2,2
