# Core Problem
### Identify duplicate or matching individuals across CSVs with messy data

In [23]:
import pandas as pd
import requests

df_g = pd.read_csv("general")
df_p = pd.read_csv("political")

In [24]:
df_g.head(3)

Unnamed: 0,fname,lname,dob,full_address,city,state,email
0,John,Smith,1985-04-12,123 Main St,Springfield,IL,john.smith@gmail.com
1,Elizabeth,Jones,1990-07-23,45 Oak Ave,Boston,MA,elizabeth.jones@yahoo.com
2,Mike,Johnson,1982-11-01,789 Pine Rd,Austin,TX,mike.j@gmail.com


In [25]:
df_p.head(3)

Unnamed: 0,fname,lname,dob,zipcode,city,state,political_affiliation,email
0,Jon,Smith,1985-04-12,62704,Springfield,IL,Independent,john.smith@gmail.com
1,Liz,Jones,1990-07-23,2116,Boston,MA,Democrat,elizabeth.jones@yahoo.com
2,Michael,Johnson,1982-11-01,78745,Austin,TX,Republican,mike.j@gmail.com


In [37]:
# Below is Data prep
for col in df_p.columns:            #looping through col
    for i in range(len(df_p[col])):  #Loop through each row in that col
        value = df_p.at[i,col]       # get the value
        if isinstance(value,str):    #check if its a string
            df_p.at[i,col] = value.lower().strip()  #make it lower and remoces white space


for col in df_g.columns:            #looping through col
    for i in range(len(df_g[col])):  #Loop through each row in that col
        value = df_g.at[i,col]       # get the value
        if isinstance(value,str):    #check if its a string
            df_g.at[i,col] = value.lower().strip() #make it lower and removes white space


In [35]:
df_g.head(3)

Unnamed: 0,fname,lname,dob,full_address,city,state,email
0,john,smith,1985-04-12,123 main st,springfield,il,john.smith@gmail.com
1,elizabeth,jones,1990-07-23,45 oak ave,boston,ma,elizabeth.jones@yahoo.com
2,mike,johnson,1982-11-01,789 pine rd,austin,tx,mike.j@gmail.com


In [36]:
df_p.head(3)

Unnamed: 0,fname,lname,dob,zipcode,city,state,political_affiliation,email
0,jon,smith,1985-04-12,62704,springfield,il,independent,john.smith@gmail.com
1,liz,jones,1990-07-23,2116,boston,ma,democrat,elizabeth.jones@yahoo.com
2,michael,johnson,1982-11-01,78745,austin,tx,republican,mike.j@gmail.com


In [48]:
#creating a index column so when we do union we can see original location
def index_og(df):
    df['og_loc'] = df.index
    return df

In [49]:
df_p = index_og(df_p)
df_g = index_og(df_g)

In [50]:
#have to add a part on how to handle nulls ? might not need to have to think about it

In [51]:
#Here have to add a part where they tell us what columns to select and we pull those columns
#no need to pull all the data since we will just refer to them by their 'og_loc' number

In [60]:
compr_df = pd.concat([df_g[['fname','lname','dob','state','og_loc']],df_p[['fname','lname','dob','state','og_loc']]], axis =0)
compr_df.head()

Unnamed: 0,fname,lname,dob,state,og_loc
0,john,smith,1985-04-12,il,0
1,elizabeth,jones,1990-07-23,ma,1
2,mike,johnson,1982-11-01,tx,2
3,sarah,williams,1995-02-19,co,3
4,robert,brown,1978-09-03,wa,4


In [69]:
compr_df['fname'].iloc[0][0:3]

'joh'

In [78]:
cols_to_use = ['fname', 'lname', 'dob', 'state']  # choose which columns to include

compr_df['combo_code1'] = ''  # start with empty strings
compr_df['combo_code2'] = ''  # start with empty strings
compr_df['combo_code3'] = ''  # start with empty strings

for col in cols_to_use:
    compr_df['combo_code1'] += compr_df[col].str[:3].fillna('')
    
for col in cols_to_use:
    compr_df['combo_code2'] += compr_df[col].str[:4].fillna('')

for col in cols_to_use:
    compr_df['combo_code3'] += compr_df[col].str[:5].fillna('')

In [79]:
compr_df.head(5)

Unnamed: 0,fname,lname,dob,state,og_loc,combo_code,combo_code1,combo_code2,combo_code3
0,john,smith,1985-04-12,il,0,,johsmi198il,johnsmit1985il,johnsmith1985-il
1,elizabeth,jones,1990-07-23,ma,1,,elijon199ma,elizjone1990ma,elizajones1990-ma
2,mike,johnson,1982-11-01,tx,2,,mikjoh198tx,mikejohn1982tx,mikejohns1982-tx
3,sarah,williams,1995-02-19,co,3,,sarwil199co,sarawill1995co,sarahwilli1995-co
4,robert,brown,1978-09-03,wa,4,,robbro197wa,robebrow1978wa,roberbrown1978-wa


In [80]:
compr_df['dup_flag1'] = compr_df['combo_code1'].duplicated(keep=False).astype(int)
compr_df['dup_flag2'] = compr_df['combo_code2'].duplicated(keep=False).astype(int)
compr_df['dup_flag3'] = compr_df['combo_code3'].duplicated(keep=False).astype(int)

In [91]:
compr_df.sort_values(by=['dup_flag3','dup_flag2','dup_flag1'], ascending = False)

Unnamed: 0,fname,lname,dob,state,og_loc,combo_code,combo_code1,combo_code2,combo_code3,dup_flag1,dup_flag2,dup_flag3
4,robert,brown,1978-09-03,wa,4,,robbro197wa,robebrow1978wa,roberbrown1978-wa,1,1,1
5,emily,davis,1989-12-25,or,5,,emidav198or,emildavi1989or,emilydavis1989-or,1,1,1
6,chris,miller,1992-06-30,oh,6,,chrmil199oh,chrimill1992oh,chrismille1992-oh,1,1,1
7,jessica,wilson,1986-01-14,fl,7,,jeswil198fl,jesswils1986fl,jessiwilso1986-fl,1,1,1
10,jake,anderson,1987-05-10,ca,10,,jakand198ca,jakeande1987ca,jakeander1987-ca,1,1,1
11,laura,thomas,1993-09-29,tx,11,,lautho199tx,laurthom1993tx,laurathoma1993-tx,1,1,1
12,david,jackson,1980-10-05,az,12,,davjac198az,davijack1980az,davidjacks1980-az,1,1,1
13,megan,white,1996-12-01,id,13,,megwhi199id,megawhit1996id,meganwhite1996-id,1,1,1
15,rachel,martin,1992-07-02,nv,15,,racmar199nv,rachmart1992nv,rachemarti1992-nv,1,1,1
16,william,thompson,1975-02-20,fl,16,,wiltho197fl,willthom1975fl,willithomp1975-fl,1,1,1
