# Core Problem
### Identify duplicate or matching individuals across CSVs with messy data

In [3]:
import pandas as pd
import requests

# Source files: one general-information file and one political-affiliation file
general_path = "general"
political_path = "political"

df_g = pd.read_csv(general_path)
df_p = pd.read_csv(political_path)

In [4]:
# Preview the first three rows of the "general" file
df_g.head(3)

Unnamed: 0,fname,lname,dob,full_address,city,state,email
0,John,Smith,1985-04-12,123 Main St,Springfield,IL,john.smith@gmail.com
1,Elizabeth,Jones,1990-07-23,45 Oak Ave,Boston,MA,elizabeth.jones@yahoo.com
2,Mike,Johnson,1982-11-01,789 Pine Rd,Austin,TX,mike.j@gmail.com


In [5]:
# Preview the first three rows of the "political" file
df_p.head(3)

Unnamed: 0,fname,lname,dob,zipcode,city,state,political_affiliation,email
0,Jon,Smith,1985-04-12,62704,Springfield,IL,Independent,john.smith@gmail.com
1,Liz,Jones,1990-07-23,2116,Boston,MA,Democrat,elizabeth.jones@yahoo.com
2,Michael,Johnson,1982-11-01,78745,Austin,TX,Republican,mike.j@gmail.com


In [6]:
# Below is Data prep
def normalize_text(df):
    """Lower-case and strip surrounding whitespace from every string cell of ``df``.

    Non-string cells (numbers, NaN, ...) are left exactly as they are.
    The columns of ``df`` are overwritten and the same frame is returned.
    """
    for col in df.columns:
        # Series.map visits each cell whatever the row labels are; a
        # range(len(df)) loop with .at[i, col] only works for labels 0..n-1.
        df[col] = df[col].map(
            lambda cell: cell.lower().strip() if isinstance(cell, str) else cell
        )
    return df


df_p = normalize_text(df_p)
df_g = normalize_text(df_g)


In [7]:
#creating a index column so when we do union we can see original location
def index_og(df):
    """Return a copy of ``df`` with an ``og_loc`` column holding each row's index label.

    The label is recorded so a row can still be traced back to its position
    in the source frame after frames are stacked together. The input frame
    is left untouched, so re-running the cell or reusing the original frame
    elsewhere is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to tag.

    Returns
    -------
    pandas.DataFrame
        New frame with the extra ``og_loc`` column (replaced if it already exists).
    """
    return df.assign(og_loc=df.index)
    
# Tag both frames with their row labels before they are combined
df_p, df_g = index_og(df_p), index_og(df_g)

In [8]:
# TODO: decide how null values should be handled (this may turn out to be unnecessary)

In [11]:
#Here have to add a part where they tell us what columns to select and we pull those columns
#no need to pull all the data since we will just refer to them by their 'og_loc' number

# Only columns that exist in both files can be compared; 'og_loc' is the
# bookkeeping column added earlier, not a match key.
matchable_cols = [c for c in df_g.columns if c in df_p.columns and c != 'og_loc']

col_to_use1 =[]
continue1 = True

while continue1:
    value = input("What col would you like to match on. One at a time then press enter").strip()
    if not value:
        pass  # blank entry: nothing to add
    elif value not in matchable_cols:
        # Catch typos here instead of failing later with a KeyError
        print(f"'{value}' is not a column in both files. Choose from: {matchable_cols}")
    elif value in col_to_use1:
        print(f"'{value}' is already selected")
    else:
        col_to_use1.append(value)
    value2 = input("Any more? Y or N").upper().strip()
    if value2 == 'N':
        if col_to_use1:
            continue1 = False
        else:
            # With no key columns every row would get the same empty code
            # and be flagged as a duplicate of every other row.
            print("Select at least one column before finishing")
#For the example we will use ['fname', 'lname', 'dob', 'state'] 

What col would you like to match on. One at a time then press enter fname
Any more? Y or N y
What col would you like to match on. One at a time then press enter lname
Any more? Y or N y
What col would you like to match on. One at a time then press enter dob
Any more? Y or N y
What col would you like to match on. One at a time then press enter state
Any more? Y or N n


In [12]:
# Stack both files using the columns the user selected plus the og_loc
# back-reference. dict.fromkeys drops repeated names but keeps entry order.
# NOTE(review): og_loc values appear to overlap between the two files, so a
# row's source file cannot be told from og_loc alone -- confirm whether a
# source marker is needed.
cols_for_compare = list(dict.fromkeys([*col_to_use1, 'og_loc']))
compr_df = pd.concat([df_g[cols_for_compare], df_p[cols_for_compare]], axis =0)

In [15]:
def build_combo_code(df, cols, prefix_len):
    """Build a per-row match key from the leading characters of several columns.

    For every row, the first ``prefix_len`` characters of each column in
    ``cols`` are concatenated in the given order. Missing values contribute
    an empty string; non-string values are converted with ``str`` first so
    numeric columns do not raise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to combine.
    cols : list of str
        Column names, in the order they should appear in the key.
    prefix_len : int
        Number of leading characters taken from each column.

    Returns
    -------
    pandas.Series
        One key string per row, sharing ``df``'s index.
    """
    code = pd.Series('', index=df.index, dtype=object)  # start with empty strings
    for col in cols:
        prefix = df[col].map(
            lambda cell: '' if pd.isna(cell) else str(cell)[:prefix_len]
        )
        code = code + prefix
    return code


# combo_code1/2/3 use 3, 4 and 5 leading characters: longer prefixes give stricter keys
for code_no, prefix_len in ((1, 3), (2, 4), (3, 5)):
    compr_df[f'combo_code{code_no}'] = build_combo_code(compr_df, col_to_use1, prefix_len)

In [16]:
# Spot-check the generated combo codes on the first five rows
compr_df.head(5)

Unnamed: 0,fname,lname,dob,state,og_loc,combo_code1,combo_code2,combo_code3
0,john,smith,1985-04-12,il,0,johsmi198il,johnsmit1985il,johnsmith1985-il
1,elizabeth,jones,1990-07-23,ma,1,elijon199ma,elizjone1990ma,elizajones1990-ma
2,mike,johnson,1982-11-01,tx,2,mikjoh198tx,mikejohn1982tx,mikejohns1982-tx
3,sarah,williams,1995-02-19,co,3,sarwil199co,sarawill1995co,sarahwilli1995-co
4,robert,brown,1978-09-03,wa,4,robbro197wa,robebrow1978wa,roberbrown1978-wa


In [17]:
# dup_flagN is 1 when the row's combo_codeN occurs more than once, else 0
for level in (1, 2, 3):
    is_repeated = compr_df[f'combo_code{level}'].duplicated(keep=False)
    compr_df[f'dup_flag{level}'] = is_repeated.astype(int)

In [18]:
# Rows flagged on the longest code (flag 3) come first, then flag 2, then flag 1
flag_priority = ['dup_flag3', 'dup_flag2', 'dup_flag1']
compr_df = compr_df.sort_values(by=flag_priority, ascending=False)

In [19]:
# Check the top of the frame after sorting by the duplicate flags
compr_df.head(2)

Unnamed: 0,fname,lname,dob,state,og_loc,combo_code1,combo_code2,combo_code3,dup_flag1,dup_flag2,dup_flag3
4,robert,brown,1978-09-03,wa,4,robbro197wa,robebrow1978wa,roberbrown1978-wa,1,1,1
5,emily,davis,1989-12-25,or,5,emidav198or,emildavi1989or,emilydavis1989-or,1,1,1


In [21]:
def get_score(row):
    """Turn a row's three duplicate flags into a 0/33/66/100 match score.

    The flags are checked in order: a falsy ``dup_flag1`` gives 0, a falsy
    ``dup_flag2`` gives 33, a falsy ``dup_flag3`` gives 66, and a row with
    all three flags set gives 100.
    """
    if not row['dup_flag1']:
        return 0
    if not row['dup_flag2']:
        return 33
    if not row['dup_flag3']:
        return 66
    return 100

# Score every row from its duplicate flags
row_scores = compr_df.apply(get_score, axis=1)
compr_df['score'] = row_scores

In [22]:
# The flags and codes were only needed to compute the score; remove them
helper_cols = [f'{prefix}{n}' for prefix in ('dup_flag', 'combo_code') for n in (1, 2, 3)]
compr_df = compr_df.drop(columns=helper_cols)

In [24]:
# Highest scores first
compr_df = compr_df.sort_values(by = ['score'], ascending =[False])
# Show og_loc, then the match columns the user selected, then the score.
# dict.fromkeys removes repeated names while keeping their order.
display_cols = list(dict.fromkeys(['og_loc', *col_to_use1, 'score']))
compr_df = compr_df[display_cols]

In [25]:
# Final scored rows, highest score first
# NOTE(review): this renders the whole frame; consider compr_df.head(n) for large inputs
compr_df

Unnamed: 0,og_loc,fname,lname,dob,state,score
4,4,robert,brown,1978-09-03,wa,100
5,5,emily,davis,1989-12-25,or,100
19,19,hannah,lewis,1997-08-13,fl,100
5,5,emily,davis,1989-12-25,or,100
15,15,rachel,martin,1992-07-02,nv,100
13,13,megan,white,1996-12-01,id,100
12,12,david,jackson,1980-10-05,az,100
11,11,laura,thomas,1993-09-29,tx,100
10,10,jake,anderson,1987-05-10,ca,100
7,7,jessica,wilson,1986-01-14,fl,100


In [None]:
# Dummy code below

def combined_score(row1, row2):
    """Blend a rule-based score with a fuzzy-similarity score for two records.

    Rule part: 30 points when the first three characters of ``fname`` match,
    30 when the first three characters of ``lname`` match, and 40 when the
    first four characters of ``dob`` (as text) match.

    Fuzzy part: the mean of ``fuzz.partial_ratio`` on ``fname``,
    ``fuzz.partial_ratio`` on ``lname`` and ``fuzz.ratio`` on ``dob`` (as text).

    Returns ``0.6 * rule_score + 0.4 * fuzzy_avg``.

    NOTE(review): ``fuzz`` is not imported in any visible cell; presumably it
    should come from the RapidFuzz package -- confirm and add the import.
    """
    # --- Deterministic rule score ---
    rule_score = 0
    for name_field in ('fname', 'lname'):
        if row1[name_field][:3] == row2[name_field][:3]:
            rule_score += 30
    dob_1 = str(row1['dob'])
    dob_2 = str(row2['dob'])
    if dob_1[:4] == dob_2[:4]:
        rule_score += 40

    # --- Fuzzy component ---
    fuzzy_parts = (
        fuzz.partial_ratio(row1['fname'], row2['fname']),
        fuzz.partial_ratio(row1['lname'], row2['lname']),
        fuzz.ratio(dob_1, dob_2),
    )
    fuzzy_avg = sum(fuzzy_parts) / 3

    # --- Weighted blend: 60% rules, 40% fuzzy ---
    return 0.6 * rule_score + 0.4 * fuzzy_avg






In [None]:
def confidence_label(score):
    """Map a numeric match score to a confidence label.

    Returns "High" for scores of 85 or more, "Medium" for scores of 70 or
    more, and "Low" for everything else.
    """
    # Thresholds are checked from highest to lowest; the first one met wins
    for floor, label in ((85, "High"), (70, "Medium")):
        if score >= floor:
            return label
    return "Low"

# RapidFuzz explained
- It scores similarity using the Levenshtein distance (the number of single-character edits needed to turn one string into another).
- Examples:
  - "ismael" vs "ishmael" → 86%
  - "bob" vs "robert" → 40%
  - "123 main st" vs "123 main street" → 95%