In [2]:
import pandas as pd
from fuzzywuzzy import process
from fuzzywuzzy import fuzz

In [3]:
# Load the letter signatories.
# The CSV carries a single text column, one signatory per row.

file = "/Users/ishitagopal/Desktop/Letters.csv"

df = pd.read_csv(filepath_or_buffer=file, skipinitialspace=True)
df.head()

Unnamed: 0,﻿Signatory
0,Senator Kevin Ranker Washington
1,Assemblyman Thomas Abinanti New York
2,Representative John Ager North Carolina
3,Senator Ben Allen California
4,Representative Sherry Appleton Washington


In [26]:
# Number of signatory rows that were loaded.
df.shape[0]

225

In [4]:
# Inspect the raw column labels exactly as they were read from the CSV header.
df.columns

Index(['﻿Signatory'], dtype='object')

In [5]:
# Rename the signatory column to 'Signing_Legis'.
# The header read from the CSV starts with an invisible character (likely a
# UTF-8 byte-order mark -- TODO confirm), so the column is located by its
# visible suffix instead of retyping that character in a string literal.

if 'Signing_Legis' not in df.columns:
    signatory_columns = [col for col in df.columns if str(col).strip().endswith('Signatory')]
    if len(signatory_columns) != 1:
        raise KeyError(
            f"Expected exactly one column ending in 'Signatory', found columns: {list(df.columns)}"
        )
    # Reassign rather than mutate in place; together with the guard above this
    # makes the cell safe to run more than once.
    df = df.rename(columns={signatory_columns[0]: 'Signing_Legis'})


In [6]:
# Lookup table: full U.S. state name -> two-letter postal abbreviation (50 entries).
# The keys are searched for inside the signatory text in a later cell, and the
# values become the 'Legis_State' codes.

State_Abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
}

In [7]:
# Split 'Signing_Legis' on its first space only:
#   - the first word (the legislator's designation) goes to 'Legis_Design'
#   - everything after it goes to 'Remaining_Data'
# `n` is passed by keyword because pandas 2.0 no longer accepts it positionally.
# NOTE(review): a designation made of two words would be cut after its first
# word -- confirm none occur in the letter.

df[['Legis_Design', 'Remaining_Data']] = df["Signing_Legis"].str.split(" ", n=1, expand=True)
df.head()


Unnamed: 0,Signing_Legis,Legis_Design,Remaining_Data
0,Senator Kevin Ranker Washington,Senator,Kevin Ranker Washington
1,Assemblyman Thomas Abinanti New York,Assemblyman,Thomas Abinanti New York
2,Representative John Ager North Carolina,Representative,John Ager North Carolina
3,Senator Ben Allen California,Senator,Ben Allen California
4,Representative Sherry Appleton Washington,Representative,Sherry Appleton Washington


In [10]:
# Save the state's abbreviation in the column 'Legis_State'
# Save the legislator's name in the column 'Legis_Name'


def _split_name_and_state(text, state_abbrev):
    """Separate a '<name> <state>' string into (name, state_code).

    Exactly one pair is returned per input, so the results always line up
    with the rows they came from. State names are tried longest first so that
    'West Virginia' is not mistaken for 'Virginia'. A state at the end of the
    text is preferred; otherwise a state found anywhere in the text is used
    and every occurrence of it is removed from the name.

    Returns (text, None) when no state name is found, and ('', None) when
    `text` is not a string (e.g. a missing value).
    """
    if not isinstance(text, str):
        return "", None
    text = text.strip()
    states_longest_first = sorted(state_abbrev, key=len, reverse=True)

    # Preferred case: the state closes the entry.
    for state in states_longest_first:
        if text.endswith(state):
            return text[:-len(state)].strip(), state_abbrev[state]

    # Fallback: the state appears somewhere else in the entry.
    for state in states_longest_first:
        if state in text:
            return text.replace(state, '').strip(), state_abbrev[state]

    return text, None


parsed_rows = [_split_name_and_state(row, State_Abbrev) for row in df["Remaining_Data"]]
Names = [legislator for legislator, _ in parsed_rows]
State_Codes = [code for _, code in parsed_rows]

df["Legis_Name"] = Names
df["Legis_State"] = State_Codes

df.head()

Unnamed: 0,Signing_Legis,Legis_Design,Remaining_Data,Legis_Name,Legis_State
0,Senator Kevin Ranker Washington,Senator,Kevin Ranker Washington,Kevin Ranker,WA
1,Assemblyman Thomas Abinanti New York,Assemblyman,Thomas Abinanti New York,Thomas Abinanti,NY
2,Representative John Ager North Carolina,Representative,John Ager North Carolina,John Ager,NC
3,Senator Ben Allen California,Senator,Ben Allen California,Ben Allen,CA
4,Representative Sherry Appleton Washington,Representative,Sherry Appleton Washington,Sherry Appleton,WA


In [11]:
# 'Remaining_Data' is now redundant (its content lives in 'Legis_Name' and
# 'Legis_State'), so drop it.

df = df.drop(columns=['Remaining_Data'])
df.head()

Unnamed: 0,Signing_Legis,Legis_Design,Legis_Name,Legis_State
0,Senator Kevin Ranker Washington,Senator,Kevin Ranker,WA
1,Assemblyman Thomas Abinanti New York,Assemblyman,Thomas Abinanti,NY
2,Representative John Ager North Carolina,Representative,John Ager,NC
3,Senator Ben Allen California,Senator,Ben Allen,CA
4,Representative Sherry Appleton Washington,Representative,Sherry Appleton,WA


In [12]:
### Ignore this box of Code ###
# Scratch example for self reference: turn "First Middle Last" into "Last, First Middle"

a = "Stanley Paige Zeigler"
name = a.split()

# Second word and last word joined with a comma
concat = f"{name[1]}, {name[-1]}"

# Everything but the last word is the given-name part; the last word is the surname
*given, b = name
a = " ".join(given)
print(a, b, sep="\n")

com = f"{b}, {a}"
com.strip()

Stanley Paige
Zeigler


'Zeigler, Stanley Paige'

In [13]:
# Format legislators' names in 'Legis_Name' to match the Shor-McCarty data
# Save it in 'Formated_Legis_Names'


def _to_last_first(full_name):
    """Reorder 'First [Middle ...] Last' into 'Last, First [Middle ...]'.

    The final whitespace-separated word is treated as the surname.
    A single-word name is returned unchanged (no trailing comma), which is how
    the reference data shows bare surnames such as 'Adams'.
    An empty or non-string value yields '' instead of raising.
    """
    parts = full_name.split() if isinstance(full_name, str) else []
    if not parts:
        return ""
    if len(parts) == 1:
        return parts[0]
    return f"{parts[-1]}, {' '.join(parts[:-1])}"


Formated_Names = [_to_last_first(name) for name in df["Legis_Name"]]

df["Formated_Legis_Names"] = Formated_Names

df.head(10)

Unnamed: 0,Signing_Legis,Legis_Design,Legis_Name,Legis_State,Formated_Legis_Names
0,Senator Kevin Ranker Washington,Senator,Kevin Ranker,WA,"Ranker, Kevin"
1,Assemblyman Thomas Abinanti New York,Assemblyman,Thomas Abinanti,NY,"Abinanti, Thomas"
2,Representative John Ager North Carolina,Representative,John Ager,NC,"Ager, John"
3,Senator Ben Allen California,Senator,Ben Allen,CA,"Allen, Ben"
4,Representative Sherry Appleton Washington,Representative,Sherry Appleton,WA,"Appleton, Sherry"
5,Representative David Arconti Connecticut,Representative,David Arconti,CT,"Arconti, David"
6,Representative John Autry North Carolina,Representative,John Autry,NC,"Autry, John"
7,Representative Phil Barnhart Oregon,Representative,Phil Barnhart,OR,"Barnhart, Phil"
8,Chairman Kumar Barve Maryland,Chairman,Kumar Barve,MD,"Barve, Kumar"
9,Representative Paul Baumbach Delaware,Representative,Paul Baumbach,DE,"Baumbach, Paul"


In [14]:
# Load the Shor-McCarty legislator reference data, keeping only the columns
# needed for name matching and ID lookup.

file2 = "/Users/ishitagopal/Box/science_backed_policy_diffusion/Dataset/shor_mcarty_1993_to_2016_with_Id.csv"

fields = ['name', 'party', 'st', 'st_id', 'Legislator_Id']
df2 = pd.read_csv(filepath_or_buffer=file2, usecols=fields, skipinitialspace=True)

df2.head()

Unnamed: 0,name,party,st,st_id,Legislator_Id
0,Adams,R,AL,1,AL_1
1,"Albritton, Greg",R,AL,2,AL_2
2,"Allen, Gerald",R,AL,3,AL_3
3,Amari,R,AL,4,AL_4
4,"Armistead, Bill",R,AL,5,AL_5


In [19]:
# Check which formatted signatory names appear verbatim in the Shor-McCarty names.
# The flag is also stored in 'ExactMatch': later output tables show that column,
# and no other cell creates it, so it has to be produced here for a clean
# Restart & Run All.

exact_matches = df["Formated_Legis_Names"].isin(df2["name"])
df["ExactMatch"] = exact_matches
exact_matches.value_counts()

False    140
True      85
Name: Formated_Legis_Names, dtype: int64

In [20]:
# Names to be matched: the formatted signatory names taken from the letter.
wrong_names = df['Formated_Legis_Names'].to_numpy()

# Candidate pool: every legislator name in the Shor-McCarty data ('df2').
correct_names = df2['name'].to_numpy()

In [34]:
# Number of names that will be fuzzy matched.
wrong_names.shape[0]

225

In [37]:
# Construct a function 'match_names' to *fuzzy* match the messy names in 'df' to the Shor-McCarty names in 'df2'
# Return only the best match along with its ratio as 2 lists


def match_names(wrong_names, correct_names):
    """Find the best fuzzy match in `correct_names` for each entry of `wrong_names`.

    Parameters
    ----------
    wrong_names : iterable of str
        Names to look up.
    correct_names : sequence of str
        Candidate names to match against.

    Returns
    -------
    (names_array, ratio_array) : tuple of two lists
        Parallel lists, one entry per input name: the best-scoring candidate
        and its score. If `process.extractOne` reports no match at all, the
        entry is (None, 0).

    The result lists are created on every call. They used to live at module
    level, so a second call appended to the first call's results and the lists
    no longer matched the number of input names.
    """
    names_array = []
    ratio_array = []
    for row in wrong_names:
        best = process.extractOne(row, correct_names)
        if best is None:
            names_array.append(None)
            ratio_array.append(0)
            continue
        names_array.append(best[0])
        ratio_array.append(best[1])
    return names_array, ratio_array


In [38]:
# Apply match_names to every formatted signatory name.
# It returns two parallel lists: 'name_match' (best candidate) and 'ratio_match' (its score).

name_match, ratio_match = match_names(wrong_names, correct_names)
 

In [40]:
# Show both lengths together: only the last expression of a cell is displayed,
# so a bare len(...) on an earlier line would be computed and thrown away.
len(name_match), len(ratio_match)

225

In [42]:
# Store the matched names and their match ratios in 'Fuzzy_Legis_Name' and 'Fuzzy_Legis_Ratio'
# The lists are assigned directly, i.e. by position. Wrapping them in pd.Series
# would align on index labels instead, which silently yields NaN (or drops
# values) whenever df's index is not exactly 0..n-1 or the lengths differ.

df['Fuzzy_Legis_Name'] = name_match
df['Fuzzy_Legis_Ratio'] = ratio_match

df.head(10)

Unnamed: 0,Signing_Legis,Legis_Design,Legis_Name,Legis_State,Formated_Legis_Names,ExactMatch,Fuzzy_Legis_Name,Fuzzy_Legis_Ratio
0,Senator Kevin Ranker Washington,Senator,Kevin Ranker,WA,"Ranker, Kevin",True,"Ranker, Kevin",100
1,Assemblyman Thomas Abinanti New York,Assemblyman,Thomas Abinanti,NY,"Abinanti, Thomas",False,"Abinanti, Thomas J.",95
2,Representative John Ager North Carolina,Representative,John Ager,NC,"Ager, John",False,"Jagler, John",91
3,Senator Ben Allen California,Senator,Ben Allen,CA,"Allen, Ben",True,"Allen, Ben",100
4,Representative Sherry Appleton Washington,Representative,Sherry Appleton,WA,"Appleton, Sherry",True,"Appleton, Sherry",100
5,Representative David Arconti Connecticut,Representative,David Arconti,CT,"Arconti, David",False,"Arconti, David Jr.",95
6,Representative John Autry North Carolina,Representative,John Autry,NC,"Autry, John",False,"England, Christopher John",86
7,Representative Phil Barnhart Oregon,Representative,Phil Barnhart,OR,"Barnhart, Phil",False,Hart,90
8,Chairman Kumar Barve Maryland,Chairman,Kumar Barve,MD,"Barve, Kumar",True,"Barve, Kumar",100
9,Representative Paul Baumbach Delaware,Representative,Paul Baumbach,DE,"Baumbach, Paul",False,"Baumbach, Paul S.",95


In [52]:
# Attach the 'Legislator_Id' values from the Shor-McCarty data ('df2') to the best name match in 'Fuzzy_Legis_Name'
#   'All_Legis_Id_Options' : every ID whose reference name equals the matched name
#   '1st_Legis_Id_Option'  : the first of those IDs
# NOTE(review): the lookup is by name only and ignores 'Legis_State', so the
# first option can belong to another state (the output shows 'Allen, Ben' from
# California receiving AR_1) -- confirm whether the state should narrow this.

A = []
B = []
for name in df["Fuzzy_Legis_Name"]:
    # Filter the reference table once per name and reuse the result for both columns.
    ids_for_name = df2.loc[df2['name'] == name, 'Legislator_Id']
    A.append(ids_for_name.values)
    B.append(ids_for_name.iloc[0])

df["All_Legis_Id_Options"] = A
df["1st_Legis_Id_Option"] = B

df.head(50)


Unnamed: 0,Signing_Legis,Legis_Design,Legis_Name,Legis_State,Formated_Legis_Names,ExactMatch,Fuzzy_Legis_Name,Fuzzy_Legis_Ratio,All_Legis_Id_Options,1st_Legis_Id_Option
0,Senator Kevin Ranker Washington,Senator,Kevin Ranker,WA,"Ranker, Kevin",True,"Ranker, Kevin",100,[WA_120],WA_120
1,Assemblyman Thomas Abinanti New York,Assemblyman,Thomas Abinanti,NY,"Abinanti, Thomas",False,"Abinanti, Thomas J.",95,[NY_162],NY_162
2,Representative John Ager North Carolina,Representative,John Ager,NC,"Ager, John",False,"Jagler, John",91,[WI_179],WI_179
3,Senator Ben Allen California,Senator,Ben Allen,CA,"Allen, Ben",True,"Allen, Ben",100,"[AR_1, CA_4, GA_258]",AR_1
4,Representative Sherry Appleton Washington,Representative,Sherry Appleton,WA,"Appleton, Sherry",True,"Appleton, Sherry",100,[WA_173],WA_173
5,Representative David Arconti Connecticut,Representative,David Arconti,CT,"Arconti, David",False,"Arconti, David Jr.",95,[CT_103],CT_103
6,Representative John Autry North Carolina,Representative,John Autry,NC,"Autry, John",False,"England, Christopher John",86,[AL_149],AL_149
7,Representative Phil Barnhart Oregon,Representative,Phil Barnhart,OR,"Barnhart, Phil",False,Hart,90,[CA_62],CA_62
8,Chairman Kumar Barve Maryland,Chairman,Kumar Barve,MD,"Barve, Kumar",True,"Barve, Kumar",100,[MD_138],MD_138
9,Representative Paul Baumbach Delaware,Representative,Paul Baumbach,DE,"Baumbach, Paul",False,"Baumbach, Paul S.",95,[DE_50],DE_50


In [None]:
## Ignore this box ##
# Useful link for dealing with paste warnings: https://www.youtube.com/watch?v=4R4WsDJ-KVc


In [70]:
# Rows whose best match scored 95 or more (likely correct matches)
df3 = df[df['Fuzzy_Legis_Ratio'].ge(95)].copy()

# Rows whose best match scored below 95 (likely incorrect matches)
df4 = df[df['Fuzzy_Legis_Ratio'].lt(95)].copy()


In [60]:
# Size and first rows of the likely-correct matches.
print(df3.shape[0])
df3.head(10)

134


Unnamed: 0,Signing_Legis,Legis_Design,Legis_Name,Legis_State,Formated_Legis_Names,ExactMatch,Fuzzy_Legis_Name,Fuzzy_Legis_Ratio,All_Legis_Id_Options,1st_Legis_Id_Option
0,Senator Kevin Ranker Washington,Senator,Kevin Ranker,WA,"Ranker, Kevin",True,"Ranker, Kevin",100,[WA_120],WA_120
1,Assemblyman Thomas Abinanti New York,Assemblyman,Thomas Abinanti,NY,"Abinanti, Thomas",False,"Abinanti, Thomas J.",95,[NY_162],NY_162
3,Senator Ben Allen California,Senator,Ben Allen,CA,"Allen, Ben",True,"Allen, Ben",100,"[AR_1, CA_4, GA_258]",AR_1
4,Representative Sherry Appleton Washington,Representative,Sherry Appleton,WA,"Appleton, Sherry",True,"Appleton, Sherry",100,[WA_173],WA_173
5,Representative David Arconti Connecticut,Representative,David Arconti,CT,"Arconti, David",False,"Arconti, David Jr.",95,[CT_103],CT_103
8,Chairman Kumar Barve Maryland,Chairman,Kumar Barve,MD,"Barve, Kumar",True,"Barve, Kumar",100,[MD_138],MD_138
9,Representative Paul Baumbach Delaware,Representative,Paul Baumbach,DE,"Baumbach, Paul",False,"Baumbach, Paul S.",95,[DE_50],DE_50
12,Assemblyman Daniel Benson New Jersey,Assemblyman,Daniel Benson,NJ,"Benson, Daniel",False,"Benson, Daniel R",95,[NJ_118],NJ_118
14,Representative Lori Berman Florida,Representative,Lori Berman,FL,"Berman, Lori",True,"Berman, Lori",100,[FL_169],FL_169
15,Representative Seth Berry Maine,Representative,Seth Berry,ME,"Berry, Seth",True,"Berry, Seth",100,[ME_178],ME_178


In [59]:
# Size and first rows of the likely-incorrect matches.
print(df4.shape[0])
df4.head(10)

91


Unnamed: 0,Signing_Legis,Legis_Design,Legis_Name,Legis_State,Formated_Legis_Names,ExactMatch,Fuzzy_Legis_Name,Fuzzy_Legis_Ratio,All_Legis_Id_Options,1st_Legis_Id_Option
2,Representative John Ager North Carolina,Representative,John Ager,NC,"Ager, John",False,"Jagler, John",91,[WI_179],WI_179
6,Representative John Autry North Carolina,Representative,John Autry,NC,"Autry, John",False,"England, Christopher John",86,[AL_149],AL_149
7,Representative Phil Barnhart Oregon,Representative,Phil Barnhart,OR,"Barnhart, Phil",False,Hart,90,[CA_62],CA_62
10,Representative Mary Belk North Carolina,Representative,Mary Belk,NC,"Belk, Mary",False,"McClurkin, Mary Sue",86,[AL_250],AL_250
11,Senator Shenna Bellows Maine,Senator,Shenna Bellows,ME,"Bellows, Shenna",False,Bell,90,"[OK_8, TN_105, WI_101]",OK_8
13,Representative Jennifer Benson Massachusetts,Representative,Jennifer Benson,MA,"Benson, Jennifer",False,Benson,90,[SD_10],SD_10
19,Senator Cathy Breen Maine,Senator,Cathy Breen,ME,"Breen, Cathy",False,"McMorris Rodgers, Cathy",86,[WA_377],WA_377
20,Representative Cecil Brockman North Carolina,Representative,Cecil Brockman,NC,"Brockman, Cecil",False,Brock,90,[AZ_169],AZ_169
23,Representative Deb Butler North Carolina,Representative,Deb Butler,NC,"Butler, Deb",False,"Butler, Timothy J.",86,[IL_197],IL_197
24,Senator Jeanine Calkin Rhode Island,Senator,Jeanine Calkin,RI,"Calkin, Jeanine",False,"Long, Jeanine",79,[WA_82],WA_82


In [328]:
# While reviewing the matched names in df4 (where the ratio is < 95),
# I find that the second-best match is sometimes the better one.

# So I define a second matching function below and store the second-best name option as well.



array(['Ager, John', 'Autry, John', 'Barnhart, Phil', 'Belk, Mary',
       'Bellows, Shenna'], dtype=object)

In [61]:
# Names from df4 (the low-ratio rows) that get a second matching pass.
# The candidate pool 'correct_names' is unchanged.

wrong_names2 = df4["Formated_Legis_Names"].to_numpy()
wrong_names2[:5]


array(['Ager, John', 'Autry, John', 'Barnhart, Phil', 'Belk, Mary',
       'Bellows, Shenna'], dtype=object)

In [62]:
# Create a match function 'match_second_best_names' which stores the second best predicted name and its associated ratio


def match_second_best_names(wrong_names2, correct_names, limit=2):
    """Return the second-ranked fuzzy match for each entry of `wrong_names2`.

    Parameters
    ----------
    wrong_names2 : iterable of str
        Names to look up.
    correct_names : sequence of str
        Candidate names to match against.
    limit : int, default 2
        Number of ranked candidates requested from `process.extract`.
        Only the candidate at position 1 is used, so `limit` must be at least 2
        and `correct_names` must hold at least two entries; otherwise indexing
        the second candidate raises IndexError.

    Returns
    -------
    (names_array2, ratio_array2) : tuple of two lists
        Parallel lists, one entry per input name: the second-ranked candidate
        and its score.

    The result lists are created on every call. They used to live at module
    level, so a second call appended to the first call's results and the lists
    no longer matched the number of input names.

    NOTE(review): when the candidate pool contains the same name more than
    once, the second-ranked candidate can be identical to the first (the
    output shows 'Bell' in both positions) -- confirm whether duplicates
    should be skipped.
    """
    names_array2 = []
    ratio_array2 = []
    for row in wrong_names2:
        ranked = process.extract(row, correct_names, limit=limit)
        runner_up = ranked[1]
        names_array2.append(runner_up[0])
        ratio_array2.append(runner_up[1])
    return names_array2, ratio_array2


In [63]:
# Apply the match_second_best_names function to wrong_names2
# (default limit=2, so the runner-up candidate is returned for each name)

name_match2, ratio_match2 = match_second_best_names(wrong_names2, correct_names)


In [71]:
# Add the runner-up legislator name and its score to df4 as
# 'Fuzzy_Legis_Name2' and 'Fuzzy_Legis_Ratio2'.

df4 = df4.assign(Fuzzy_Legis_Name2=name_match2, Fuzzy_Legis_Ratio2=ratio_match2)
df4.head(10)

Unnamed: 0,Signing_Legis,Legis_Design,Legis_Name,Legis_State,Formated_Legis_Names,ExactMatch,Fuzzy_Legis_Name,Fuzzy_Legis_Ratio,All_Legis_Id_Options,1st_Legis_Id_Option,Fuzzy_Legis_Name2,Fuzzy_Legis_Ratio2
2,Representative John Ager North Carolina,Representative,John Ager,NC,"Ager, John",False,"Jagler, John",91,[WI_179],WI_179,"Ager, John Curtis",90
6,Representative John Autry North Carolina,Representative,John Autry,NC,"Autry, John",False,"England, Christopher John",86,[AL_149],AL_149,"Rogers Jr, John W",86
7,Representative Phil Barnhart Oregon,Representative,Phil Barnhart,OR,"Barnhart, Phil",False,Hart,90,[CA_62],CA_62,"Barnhart, Philip N.",88
10,Representative Mary Belk North Carolina,Representative,Mary Belk,NC,"Belk, Mary",False,"McClurkin, Mary Sue",86,[AL_250],AL_250,"Broadaway, Mary",86
11,Senator Shenna Bellows Maine,Senator,Shenna Bellows,ME,"Bellows, Shenna",False,Bell,90,"[OK_8, TN_105, WI_101]",OK_8,Bell,90
13,Representative Jennifer Benson Massachusetts,Representative,Jennifer Benson,MA,"Benson, Jennifer",False,Benson,90,[SD_10],SD_10,"Bertino-Tarrant, Jennifer",86
19,Senator Cathy Breen Maine,Senator,Cathy Breen,ME,"Breen, Cathy",False,"McMorris Rodgers, Cathy",86,[WA_377],WA_377,"Breen, Catherine",79
20,Representative Cecil Brockman North Carolina,Representative,Cecil Brockman,NC,"Brockman, Cecil",False,Brock,90,[AZ_169],AZ_169,Rock,90
23,Representative Deb Butler North Carolina,Representative,Deb Butler,NC,"Butler, Deb",False,"Butler, Timothy J.",86,[IL_197],IL_197,"Butler Dixon, Deborah",86
24,Senator Jeanine Calkin Rhode Island,Senator,Jeanine Calkin,RI,"Calkin, Jeanine",False,"Long, Jeanine",79,[WA_82],WA_82,"Akin, W.",77


In [72]:
# Attach the 'Legislator_Id' values from 'df2' to the second-best names in 'Fuzzy_Legis_Name2'
#   'All_Legis_Id_Options2' : every ID whose reference name equals the second-best name
#   '1st_Legis_Id_Option2'  : the first of those IDs
# NOTE(review): as with the first-choice lookup, only the name is compared;
# the signatory's state is not used to pick among several IDs.

A_ = []
B_ = []

for name in df4["Fuzzy_Legis_Name2"]:
    # Filter the reference table once per name and reuse the result for both columns.
    ids_for_name = df2.loc[df2['name'] == name, 'Legislator_Id']
    A_.append(ids_for_name.values)
    B_.append(ids_for_name.iloc[0])

df4["All_Legis_Id_Options2"] = A_
df4["1st_Legis_Id_Option2"] = B_

df4.head(20)

Unnamed: 0,Signing_Legis,Legis_Design,Legis_Name,Legis_State,Formated_Legis_Names,ExactMatch,Fuzzy_Legis_Name,Fuzzy_Legis_Ratio,All_Legis_Id_Options,1st_Legis_Id_Option,Fuzzy_Legis_Name2,Fuzzy_Legis_Ratio2,All_Legis_Id_Options2,1st_Legis_Id_Option2
2,Representative John Ager North Carolina,Representative,John Ager,NC,"Ager, John",False,"Jagler, John",91,[WI_179],WI_179,"Ager, John Curtis",90,[NC_162],NC_162
6,Representative John Autry North Carolina,Representative,John Autry,NC,"Autry, John",False,"England, Christopher John",86,[AL_149],AL_149,"Rogers Jr, John W",86,[AL_295],AL_295
7,Representative Phil Barnhart Oregon,Representative,Phil Barnhart,OR,"Barnhart, Phil",False,Hart,90,[CA_62],CA_62,"Barnhart, Philip N.",88,[OR_92],OR_92
10,Representative Mary Belk North Carolina,Representative,Mary Belk,NC,"Belk, Mary",False,"McClurkin, Mary Sue",86,[AL_250],AL_250,"Broadaway, Mary",86,[AR_196],AR_196
11,Senator Shenna Bellows Maine,Senator,Shenna Bellows,ME,"Bellows, Shenna",False,Bell,90,"[OK_8, TN_105, WI_101]",OK_8,Bell,90,"[OK_8, TN_105, WI_101]",OK_8
13,Representative Jennifer Benson Massachusetts,Representative,Jennifer Benson,MA,"Benson, Jennifer",False,Benson,90,[SD_10],SD_10,"Bertino-Tarrant, Jennifer",86,[IL_8],IL_8
19,Senator Cathy Breen Maine,Senator,Cathy Breen,ME,"Breen, Cathy",False,"McMorris Rodgers, Cathy",86,[WA_377],WA_377,"Breen, Catherine",79,[ME_17],ME_17
20,Representative Cecil Brockman North Carolina,Representative,Cecil Brockman,NC,"Brockman, Cecil",False,Brock,90,[AZ_169],AZ_169,Rock,90,[KS_99],KS_99
23,Representative Deb Butler North Carolina,Representative,Deb Butler,NC,"Butler, Deb",False,"Butler, Timothy J.",86,[IL_197],IL_197,"Butler Dixon, Deborah",86,[MS_184],MS_184
24,Senator Jeanine Calkin Rhode Island,Senator,Jeanine Calkin,RI,"Calkin, Jeanine",False,"Long, Jeanine",79,[WA_82],WA_82,"Akin, W.",77,[MO_131],MO_131


In [334]:
# Order the likely-correct matches from highest to lowest match ratio.
df3 = df3.sort_values('Fuzzy_Legis_Ratio', ascending=False)

In [336]:
# Export the rows with match ratio >= 95 for manual review.
df3.to_csv("/Users/ishitagopal/Desktop/df3_95_to_100.csv")

In [251]:
# Export the rows with match ratio < 95 (including the second-best candidates).
# NOTE(review): the file name starts with "df3" although the frame written is df4 -- confirm the intended name.
df4.to_csv("/Users/ishitagopal/Desktop/df3_below_95.csv")

In [80]:
## Helper for probing a single name ##

def match_single_names(wrong_name, correct_names, limit):
    """Return the `limit` best fuzzy candidates for one name.

    Thin wrapper around `process.extract`: `wrong_name` is the query,
    `correct_names` the candidate pool, and the value returned is whatever
    `process.extract` produces (the outputs show a list of (name, score) pairs).
    """
    return process.extract(wrong_name, correct_names, limit=limit)
    

In [82]:
# List the 10 best-scoring reference names for 'Dhingra, Manka'.
match_single_names('Dhingra, Manka', correct_names,10)

[('King', 68),
 ('Mack', 68),
 ('King', 68),
 ('Gray', 68),
 ('King', 68),
 ('King', 68),
 ('Sinagra, Jack', 67),
 ('Diaz, Manny', 64),
 ('Ma, Fiona', 64),
 ('Chang, Mark', 64)]

In [85]:
# List the 10 best-scoring reference names for 'Creagan, Richard'.
match_single_names("Creagan, Richard",correct_names,10)

[('Reagan', 90),
 ('Perkins, Anthony Richard', 86),
 ('Garcia Richard, Stephanie', 86),
 ('Browning, Dwight Richard', 86),
 ('Richards', 84),
 ('Morgan, Richard', 84),
 ('Hardt', 80),
 ('Carey, Richard', 80),
 ('Rosen, Richard', 80),
 ('Cebra, Richard', 80)]

In [88]:
# List the 10 best-scoring reference names for 'Cyr, Julian'.
match_single_names("Cyr, Julian",correct_names,10)

[('Uliana', 82),
 ('Carroll, Julian', 77),
 ('Lynn, Julia', 73),
 ('Cryor, Jean', 73),
 ('Julian, Patti', 71),
 ('Fant III, Julian', 71),
 ('Bond, Julian', 71),
 ('Garrett, Julian', 71),
 ('Julian, Larry', 71),
 ('Curry, Julie', 70)]