# Gender Identifier Exploration

**`Goal`:** Experiment with several gender-identification packages to work out the best way to combine them so that they complement each other.

## 1. Import packages

In [None]:
import pandas as pd
from genderize import Genderize
import gender_guesser.detector as gender

## 2. Load the data

In [None]:
# Load the cleaned accounting CSV from the interim data folder into a DataFrame
df = pd.read_csv('../data/interim/accounting-cleaned.csv')

In [None]:
# Display the loaded DataFrame
df

Unnamed: 0.1,Unnamed: 0,search_query,name,hourly_rate,pay_grade,avg_rating,num_reviews,num_recommendations,pct_jobs_completed,pct_on_budget,pct_on_time,days_joined
0,0,accountant,Shineeza,40.0,0.0,0.0,0,0,,,,42
1,1,accountant,Lisa,13.0,0.0,0.0,0,0,,,,77
2,2,accountant,Ahmed,15.0,0.0,0.0,0,0,,,,814
3,3,accountant,Alex,75.0,0.0,0.0,0,0,,,,212
4,4,accountant,Agustina,40.0,0.0,0.0,0,0,,,,181
...,...,...,...,...,...,...,...,...,...,...,...,...
468,468,accountant,Jessica,15.0,0.0,0.0,0,0,,,,66
469,469,accountant,Roy,20.0,0.0,0.0,0,0,,,,1370
470,470,accountant,Beverly,30.0,0.0,0.0,0,0,,,,61
471,471,accountant,Marian,15.0,0.0,0.0,0,0,,,,281


## 3. Gender Detection

In [None]:
# Initialize the second-option detector (gender_guesser); determine_gender uses it
# as the fallback when the preferred Genderize lookup fails
gender_detector_opt2 = gender.Detector()

In [None]:
def determine_gender(x,prob_threshold = 0.8,count_threshold=50):
    """
    Determine gender from a first name.

    The preferred tool (Genderize) is tried first and its prediction is only
    accepted when it clears both a probability and a frequency threshold.
    If the Genderize lookup fails for any reason (e.g. the daily request
    limit has been reached), the local gender_guesser detector
    `gender_detector_opt2` is used instead.

    Parameters
    ----------
    x : first name to look up.
    prob_threshold : minimum Genderize 'probability' needed to accept its prediction.
    count_threshold : minimum Genderize 'count' needed to accept its prediction.

    Returns
    -------
    - Genderize, both thresholds met: the value of results['gender'].
    - Genderize, below a threshold: the tuple ('low_conf', probability, count).
    - Fallback: 'low_conf' when gender_guesser answers 'andy'; otherwise its
      label with any 'mostly_' prefix removed.

    NOTE(review): the low-confidence result is a tuple on the Genderize path
    but a plain string on the fallback path; callers should expect both.
    """

    # Use preferred tool (only 1000 free requests daily)
    try:
        results = Genderize().get1(x)

        if results['probability'] >= prob_threshold and results['count'] >= count_threshold:
            return results['gender']

        return ('low_conf', results['probability'], results['count'])

    # If the preferred tool fails (e.g. request limit reached), use the second option.
    # Catch Exception rather than using a bare except so that KeyboardInterrupt and
    # SystemExit still propagate and a long-running apply can be interrupted.
    except Exception:

        result = gender_detector_opt2.get_gender(x)

        # 'andy' is gender_guesser's label for names it considers androgynous
        if result == 'andy':
            return 'low_conf'

        # 'mostly_male' -> 'male', 'mostly_female' -> 'female';
        # labels without the prefix pass through unchanged
        return result.replace('mostly_', '')
        


In [None]:
# Predict a gender for every first name in the `name` column
df['gender'] = df['name'].apply(determine_gender)

GenderizeException: ('Request limit reached', 429, {'Server': 'nginx/1.16.1', 'Date': 'Wed, 16 Mar 2022 15:32:38 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '33', 'Connection': 'keep-alive', 'Access-Control-Allow-Origin': '*', 'Access-Control-Allow-Methods': 'GET', 'Access-Control-Allow-Headers': 'Content-Type, X-Genderize-Source', 'X-Rate-Limit-Limit': '1000', 'X-Rate-Reset': '30442', 'X-Rate-Limit-Remaining': '0', 'ETag': 'W/"21-tYoIBroDGdB+35cIAOMCdpXfqjI"'})

ValueError: "isinstance" is not a supported function

In [None]:
# Move the `gender` column to column position 3 (next to `name` in the displayed output)
df.insert(3,'gender',df.pop('gender'))
# A missing value (NaN) is never equal to itself, so this selects the rows
# where `gender` is missing
df.query("gender != gender")

Unnamed: 0.1,Unnamed: 0,search_query,name,gender,hourly_rate,pay_grade,avg_rating,num_reviews,num_recommendations,pct_jobs_completed,pct_on_budget,pct_on_time,days_joined
0,0,accountant,Shineeza,,40.0,0.0,0.0,0,0,,,,42
5,5,accountant,fkoenigs,,18.0,0.0,0.0,0,0,,,,209
6,6,accountant,Soodong,,65.0,0.0,0.0,0,0,,,,70
7,7,accountant,Walsens,,15.0,0.0,0.0,0,0,,,,114
10,10,accountant,Bayley,,20.0,0.0,0.0,0,0,,,,47
...,...,...,...,...,...,...,...,...,...,...,...,...,...
452,452,accountant,Iveel,,30.0,0.0,0.0,0,0,,,,884
460,460,accountant,Vanaja,,10.0,0.0,0.0,0,0,,,,94
462,462,accountant,Hsundar,,15.0,0.0,0.0,0,0,,,,231
466,466,accountant,Oddailin,,40.0,0.0,0.0,0,0,,,,873


In [None]:
# Label every name with the raw gender_guesser prediction (no Genderize call)
df['gender2'] = df.name.apply(gender_detector_opt2.get_gender)
# Move `gender2` to column position 4 so it sits next to `gender`
df.insert(4,'gender2',df.pop('gender2'))

In [None]:
# Work on a copy so the clean-up that follows does not modify `df`
df2 = df.copy()

In [None]:
# Normalise both prediction columns so they can be compared directly:
# missing predictions become the explicit label 'unknown'
df2['gender'] = df2['gender'].fillna('unknown')
df2['gender2'] = df2['gender2'].fillna('unknown')
# Blank out gender_guesser's 'andy' label. Assign through a single .loc call on the
# frame itself: the chained form df2['gender2'].loc[...] = None writes to an
# intermediate object, raises SettingWithCopyWarning, and is not guaranteed to
# update df2 (it never does under pandas Copy-on-Write).
df2.loc[df2['gender2'] == 'andy', 'gender2'] = None
# Collapse 'mostly_male' / 'mostly_female' to 'male' / 'female' (literal match, not a regex)
df2['gender2'] = df2['gender2'].str.replace('mostly_', '', regex=False)

df2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0.1,Unnamed: 0,search_query,name,gender,gender2,hourly_rate,pay_grade,avg_rating,num_reviews,num_recommendations,pct_jobs_completed,pct_on_budget,pct_on_time,days_joined
0,0,accountant,Shineeza,unknown,unknown,40.0,0.0,0.0,0,0,,,,42
1,1,accountant,Lisa,female,female,13.0,0.0,0.0,0,0,,,,77
2,2,accountant,Ahmed,male,male,15.0,0.0,0.0,0,0,,,,814
3,3,accountant,Alex,male,male,75.0,0.0,0.0,0,0,,,,212
4,4,accountant,Agustina,female,female,40.0,0.0,0.0,0,0,,,,181
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
468,468,accountant,Jessica,female,female,15.0,0.0,0.0,0,0,,,,66
469,469,accountant,Roy,male,male,20.0,0.0,0.0,0,0,,,,1370
470,470,accountant,Beverly,female,female,30.0,0.0,0.0,0,0,,,,61
471,471,accountant,Marian,unknown,male,15.0,0.0,0.0,0,0,,,,281


In [None]:
# Share of rows on which the two tools give the same label
(df2['gender'] == df2['gender2']).mean()

0.8477801268498943

In [None]:
# Rows where the two tools disagree
df2[df2['gender'] != df2['gender2']]

Unnamed: 0.1,Unnamed: 0,search_query,name,gender,gender2,hourly_rate,pay_grade,avg_rating,num_reviews,num_recommendations,pct_jobs_completed,pct_on_budget,pct_on_time,days_joined
14,14,accountant,Qudrat,male,unknown,30.0,0.0,0.0,0,0,,,,19
21,21,accountant,Eman,unknown,female,100.0,0.0,0.0,0,0,,,,927
22,22,accountant,Rupali,female,unknown,25.0,0.0,0.0,0,0,,,,1315
32,32,accountant,Dat,unknown,male,25.0,0.0,0.0,0,0,,,,1945
34,34,accountant,Atika,female,unknown,5.0,0.0,0.0,0,0,,,,225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,437,accountant,Farheen,female,unknown,700.0,0.0,0.0,0,0,,,,1513
440,440,accountant,Nike,unknown,female,39.0,0.0,0.0,0,0,,,,2289
459,459,accountant,Paritosh,male,unknown,20.0,0.0,0.0,0,0,,,,53
460,460,accountant,Vanaja,unknown,female,10.0,0.0,0.0,0,0,,,,94


When both make a confident prediction, they match >99% of the time. 

In [None]:
# Recompute the raw gender_guesser labels as `gender3`; unlike `gender2`, these are
# not post-processed, so any 'mostly_' prefix is kept
df2['gender3'] = df2.name.apply(gender_detector_opt2.get_gender)
# Move `gender3` to column position 5, next to `gender2`
df2.insert(5,'gender3',df2.pop('gender3'))

In [None]:
# Rows where the two tools disagree, now shown alongside the raw `gender3` label
df2[df2['gender'] != df2['gender2']]

Unnamed: 0.1,Unnamed: 0,search_query,name,gender,gender2,gender3,hourly_rate,pay_grade,avg_rating,num_reviews,num_recommendations,pct_jobs_completed,pct_on_budget,pct_on_time,days_joined
14,14,accountant,Qudrat,male,unknown,unknown,30.0,0.0,0.0,0,0,,,,19
21,21,accountant,Eman,unknown,female,female,100.0,0.0,0.0,0,0,,,,927
22,22,accountant,Rupali,female,unknown,unknown,25.0,0.0,0.0,0,0,,,,1315
32,32,accountant,Dat,unknown,male,mostly_male,25.0,0.0,0.0,0,0,,,,1945
34,34,accountant,Atika,female,unknown,unknown,5.0,0.0,0.0,0,0,,,,225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,437,accountant,Farheen,female,unknown,unknown,700.0,0.0,0.0,0,0,,,,1513
440,440,accountant,Nike,unknown,female,female,39.0,0.0,0.0,0,0,,,,2289
459,459,accountant,Paritosh,male,unknown,unknown,20.0,0.0,0.0,0,0,,,,53
460,460,accountant,Vanaja,unknown,female,female,10.0,0.0,0.0,0,0,,,,94


Chinonso

- Option 2 (gender_guesser) seems to get it right most of the time when it returns a `mostly_*` label
- The two tools complement each other well: when one makes no prediction, the other often does, and it seems to be right a lot of the time
- Both tools agree on the same prediction ~85% of the time in this dataset

Thoughts: Start with option 2 and only resort to option 1 when option 2 makes no prediction


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=acc27b92-84be-4130-8026-204943f38189' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>