In [22]:
import os 
import pandas as pd 
import ethnicolr

In [34]:
# Function
def race_eth(tab, methods = ["ce-l", "fl-f", "fl-l"]):
    
    # ce-l: census data, prediction on last name ------------------------------------------------
    if "ce-l" in methods:
        df1 = ethnicolr.pred_census_ln(df=tab, namecol="last_name")
        cols = {'api': 'pasian', 'black': 'pblack', 'hispanic': 'phispa', 'white': 'pwhite'}
        df1.rename(columns=cols, inplace=True)
        df1["method"]="ece-1-0-0-0-0-N"
    else:
        df1=pd.DataFrame()
    
    # fl-f: florida data, prediction on full name -----------------------------------------------
    if "fl-f" in methods:
        df2 = ethnicolr.pred_fl_reg_name(df=tab, lname_col="last_name", fname_col="first_name")
        cols={'asian': 'pasian', 'nh_black': 'pblack', 'hispanic': 'phispa', 'nh_white': 'pwhite'}
        df2.rename(columns=cols, inplace=True)
        df2["method"]="efl-1-1-0-0-0-N"
    else:
        df2=pd.DataFrame()
    
    # fl-l: florida data, prediction on last name -----------------------------------------------
    if "fl-l" in methods:
        df3 = ethnicolr.pred_fl_reg_ln(df=tab, namecol="last_name")
        cols={'asian': 'pasian', 'nh_black': 'pblack', 'hispanic': 'phispa', 'nh_white': 'pwhite'}
        df3.rename(columns=cols, inplace=True)
        df3["method"]="efl-1-1-0-0-0-N"
    else:
        df3=pd.DataFrame()
  
    # combine dataframes to single dataframe ----------------------------------------------------
    df_out=pd.concat([df1, df2, df3], ignore_index=True)
    df_out["pother"]=0
    
    # select and reorder columns ----------------------------------------------------------------
    cols_old=list(tab.columns.values)
    if "name__" in cols_old: cols_old.remove("__name", )
    cols_new=["method", "pother", "pasian", "pblack", "phispa", "pwhite", "pother", "race"]
    df_out=df_out[cols_old+cols_new]
    
    # recode race values ------------------------------------------------------------------------
    race_old=["hispanic", "nh_white", "nh_black"]
    race_new=["hispa", "white", "black"]
    df_out["race"]=df_out["race"].replace(race_old, race_new)
    
    # return output -----------------------------------------------------------------------------
    return df_out

In [24]:
# Test Dataframe
cols = {'id': ['ID1', 'ID2', 'ID3', 'ID4', 'ID5', 'ID6', 'ID7'],
        'first_name':  ['Paul','Jonas','Sara','Susane','Enrico','Zhan', 'Hande'],
        'last_name': ['Williams','Smith','Coleberg','White','Guzman','Wu','Wen']}

df = pd.DataFrame(cols , columns= ['id','first_name', 'last_name'])
df

Unnamed: 0,id,first_name,last_name
0,ID1,Paul,Williams
1,ID2,Jonas,Smith
2,ID3,Sara,Coleberg
3,ID4,Susane,White
4,ID5,Enrico,Guzman
5,ID6,Zhan,Wu
6,ID7,Hande,Wen


In [36]:
race_eth(df, "ce-l") # Only Census Data (Last Name)

Unnamed: 0,id,first_name,last_name,__name,method,pother,pasian,pblack,phispa,pwhite,pother.1,race
0,ID1,Paul,Williams,Williams Paul,ece-1-0-0-0-0-N,0,0.003612,0.488306,0.012068,0.496013,0,white
1,ID2,Jonas,Smith,Smith Jonas,ece-1-0-0-0-0-N,0,0.003291,0.243096,0.020006,0.733607,0,white
2,ID3,Sara,Coleberg,Coleberg Sara,ece-1-0-0-0-0-N,0,0.001656,0.033526,0.014904,0.949913,0,white
3,ID4,Susane,White,White Susane,ece-1-0-0-0-0-N,0,0.0034,0.303031,0.021746,0.671824,0,white
4,ID5,Enrico,Guzman,Guzman Enrico,ece-1-0-0-0-0-N,0,0.012233,0.000617,0.946036,0.041114,0,hispa
5,ID6,Zhan,Wu,Wu Zhan,ece-1-0-0-0-0-N,0,0.983361,0.000779,0.006229,0.009631,0,api
6,ID7,Hande,Wen,Wen Hande,ece-1-0-0-0-0-N,0,0.891676,0.011717,0.010623,0.085984,0,api


In [37]:
race_eth(df, "fl-f") # Only FLorida Data (Full Name)

Unnamed: 0,id,first_name,last_name,__name,method,pother,pasian,pblack,phispa,pwhite,pother.1,race
0,ID1,Paul,Williams,Williams Paul,efl-1-1-0-0-0-N,0,0.002019,0.32094,0.010021,0.66702,0,white
1,ID2,Jonas,Smith,Smith Jonas,efl-1-1-0-0-0-N,0,0.003128,0.341513,0.024439,0.630921,0,white
2,ID3,Sara,Coleberg,Coleberg Sara,efl-1-1-0-0-0-N,0,0.003303,0.023605,0.030684,0.942408,0,white
3,ID4,Susane,White,White Susane,efl-1-1-0-0-0-N,0,0.002678,0.039387,0.01037,0.947564,0,white
4,ID5,Enrico,Guzman,Guzman Enrico,efl-1-1-0-0-0-N,0,0.002886,0.007604,0.911398,0.078111,0,hispa
5,ID6,Zhan,Wu,Wu Zhan,efl-1-1-0-0-0-N,0,0.979094,0.00295,0.001496,0.016461,0,asian
6,ID7,Hande,Wen,Wen Hande,efl-1-1-0-0-0-N,0,0.266156,0.158126,0.038692,0.537027,0,white


In [38]:
race_eth(df, "fl-l") # Only FLorida Data (Last Name)

Unnamed: 0,id,first_name,last_name,__name,method,pother,pasian,pblack,phispa,pwhite,pother.1,race
0,ID1,Paul,Williams,Williams Paul,efl-1-1-0-0-0-N,0,0.005113,0.587987,0.010691,0.39621,0,black
1,ID2,Jonas,Smith,Smith Jonas,efl-1-1-0-0-0-N,0,0.004512,0.251722,0.017937,0.725829,0,white
2,ID3,Sara,Coleberg,Coleberg Sara,efl-1-1-0-0-0-N,0,0.013045,0.068452,0.031588,0.886916,0,white
3,ID4,Susane,White,White Susane,efl-1-1-0-0-0-N,0,0.004088,0.290451,0.014484,0.690977,0,white
4,ID5,Enrico,Guzman,Guzman Enrico,efl-1-1-0-0-0-N,0,0.00793,0.006714,0.929116,0.05624,0,hispa
5,ID6,Zhan,Wu,Wu Zhan,efl-1-1-0-0-0-N,0,0.986977,0.000855,0.002355,0.009812,0,asian
6,ID7,Hande,Wen,Wen Hande,efl-1-1-0-0-0-N,0,0.56893,0.028301,0.095547,0.307222,0,asian


In [39]:
race_eth(df) # All Methods/Data

Unnamed: 0,id,first_name,last_name,__name,method,pother,pasian,pblack,phispa,pwhite,pother.1,race
0,ID1,Paul,Williams,Williams Paul,ece-1-0-0-0-0-N,0,0.003612,0.488306,0.012068,0.496013,0,white
1,ID2,Jonas,Smith,Smith Jonas,ece-1-0-0-0-0-N,0,0.003291,0.243096,0.020006,0.733607,0,white
2,ID3,Sara,Coleberg,Coleberg Sara,ece-1-0-0-0-0-N,0,0.001656,0.033526,0.014904,0.949913,0,white
3,ID4,Susane,White,White Susane,ece-1-0-0-0-0-N,0,0.0034,0.303031,0.021746,0.671824,0,white
4,ID5,Enrico,Guzman,Guzman Enrico,ece-1-0-0-0-0-N,0,0.012233,0.000617,0.946036,0.041114,0,hispa
5,ID6,Zhan,Wu,Wu Zhan,ece-1-0-0-0-0-N,0,0.983361,0.000779,0.006229,0.009631,0,api
6,ID7,Hande,Wen,Wen Hande,ece-1-0-0-0-0-N,0,0.891676,0.011717,0.010623,0.085984,0,api
7,ID1,Paul,Williams,Williams Paul,efl-1-1-0-0-0-N,0,0.002019,0.32094,0.010021,0.66702,0,white
8,ID2,Jonas,Smith,Smith Jonas,efl-1-1-0-0-0-N,0,0.003128,0.341513,0.024439,0.630921,0,white
9,ID3,Sara,Coleberg,Coleberg Sara,efl-1-1-0-0-0-N,0,0.003303,0.023605,0.030684,0.942408,0,white


In [40]:
race_eth(df, ["cs-l", "fl-l", "fl-f"]) # All Methods/Data

Unnamed: 0,id,first_name,last_name,__name,method,pother,pasian,pblack,phispa,pwhite,pother.1,race
0,ID1,Paul,Williams,Williams Paul,efl-1-1-0-0-0-N,0,0.002019,0.32094,0.010021,0.66702,0,white
1,ID2,Jonas,Smith,Smith Jonas,efl-1-1-0-0-0-N,0,0.003128,0.341513,0.024439,0.630921,0,white
2,ID3,Sara,Coleberg,Coleberg Sara,efl-1-1-0-0-0-N,0,0.003303,0.023605,0.030684,0.942408,0,white
3,ID4,Susane,White,White Susane,efl-1-1-0-0-0-N,0,0.002678,0.039387,0.01037,0.947564,0,white
4,ID5,Enrico,Guzman,Guzman Enrico,efl-1-1-0-0-0-N,0,0.002886,0.007604,0.911398,0.078111,0,hispa
5,ID6,Zhan,Wu,Wu Zhan,efl-1-1-0-0-0-N,0,0.979094,0.00295,0.001496,0.016461,0,asian
6,ID7,Hande,Wen,Wen Hande,efl-1-1-0-0-0-N,0,0.266156,0.158126,0.038692,0.537027,0,white
7,ID1,Paul,Williams,Williams Paul,efl-1-1-0-0-0-N,0,0.005113,0.587987,0.010691,0.39621,0,black
8,ID2,Jonas,Smith,Smith Jonas,efl-1-1-0-0-0-N,0,0.004512,0.251722,0.017937,0.725829,0,white
9,ID3,Sara,Coleberg,Coleberg Sara,efl-1-1-0-0-0-N,0,0.013045,0.068452,0.031588,0.886916,0,white
