# Gender Detection using names

**`Goal`:** Determine the gender of the freelancer using the first names extracted and cleaned in previous steps/notebooks

### 1. Import packages

In [None]:
#General packages for dataframe and data manipulation
import pandas as pd
import numpy as np

#Gender detection packages
from genderize import Genderize
import gender_guesser.detector as gender

### 2. Load the data

In [None]:
#Load the cleaned, merged dataset produced by the previous steps.
#low_memory=False makes pandas read the file in one pass so each column's dtype is
#inferred from all rows (same option used for the checkpoint load in section 6)
df = pd.read_csv('../data/interim/cleaned-merge.csv',low_memory=False)
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,search_query,name,profile_link,location,hourly_rate,pay_grade,avg_rating,num_reviews,num_recommendations,pct_jobs_completed,...,pct_certifications_requirements_engineering_1,skill_furniture_removalist,skill_workday_security,skill_oracle_ebs_tech_integration,pct_certifications_google_webmaster_central_1,skill_modx,skill_cubecart,skill_phaser,skill_drilling_engineering,skill_casperjs
0,designer,Milen,https://www.freelancer.com/u/MsCaddServices,Edmonds,45.0,0.0,0.0,0,0,,...,,,,,,,,,,
1,designer,Jeremy,https://www.freelancer.com/u/Conescu,Orinda,90.0,0.0,0.0,0,0,,...,,,,,,,,,,
2,designer,Nichole,https://www.freelancer.com/u/NicholeMW,Holly,25.0,4.0,5.0,2,0,100.0,...,,,,,,,,,,
3,designer,Robert,https://www.freelancer.com/u/rhoenig1277,Beloit,75.0,0.0,0.0,0,0,,...,,,,,,,,,,
4,designer,Shea,https://www.freelancer.com/u/blaqsupply,Cockeysville,25.0,0.0,0.0,0,0,,...,,,,,,,,,,


### 3. Gender Detection Pipeline

In [None]:
#Initialize the main detector (gender_guesser).
#determine_gender() below reads this module-level object, so run this cell first
gender_detector = gender.Detector()

In [None]:
def determine_gender(name,prob_threshold = 0.8,count_threshold=50):

    """
    MAIN FUNCTION

    Function to determine gender for an inputted first name.
    The local gender_guesser detector is tried first; if it cannot decide, the
    genderize.io api is used as a second option and its prediction is only
    accepted when it passes a probability and frequency threshold.

    Inputs:
        - name (str): The first name to predict gender based on
        - prob_threshold (float): The probability of the classification being 
                                  correct (proxy for confidence). Default: 0.8
        - count_threshold (float): The number of samples the classification is based on. 
                                   Default: 50
    
    Output:
        - pred (str): Gender prediction. One of:
            'male' / 'female'  -> prediction from either detector
            'low_confidence'   -> missing/blank name, or second detector below
                                  the thresholds (left for human reannotation)
            'error'            -> the genderize.io request failed
                                  (mostly API request limit met)

    """

    #Missing or blank names (e.g. NaN from the dataframe) cannot be classified.
    #Flag them for human reannotation without spending an API request on them
    if not isinstance(name, str) or not name.strip():
        return 'low_confidence'

    #Try to detect gender for the inputted name using first model
    pred = gender_detector.get_gender(name)

    #If the model predicts male or female, return the prediction
    if pred in ('male','female'):
        return pred

    #If it predicts mostly male or mostly female, go with prediction (male or female)
    if 'mostly' in pred:
        return pred.replace('mostly_','')

    #Otherwise the model cannot determine the gender ('unknown') or is unsure ('andy'):
    #pass prediction task to the other model (Only 1000 free requests daily)
    try:
        alt_pred = Genderize().get1(name)
        alt_gender = alt_pred['gender']
        passes_threshold = (alt_pred['probability'] >= prob_threshold
                            and alt_pred['count'] >= count_threshold)

    #If an exception occurs (mostly API request limit met), report error.
    #Exception (not a bare except) so Ctrl-C / SystemExit still stop a long run
    except Exception:
        return 'error'

    #If the prediction stats pass the set threshold, trust the prediction.
    #A missing gender is never returned, even if the thresholds are lowered
    #(NOTE(review): presumably the api sends no gender for names it has no data on - confirm)
    if passes_threshold and alt_gender is not None:
        return alt_gender

    #If not, note the model's confidence level and leave for human reannotation
    return 'low_confidence'


In [None]:
def alt_determine_gender(name,prob_threshold = 0.8,count_threshold=50):

    """

    ALTERNATIVE FUNCTION – SEE USE CASE BELOW
    
    This function only leverages the genderize.io api unlike the main determine_gender() function.
    Prediction is adjusted based on a probability and frequency threshold.

    USE CASE FOR THIS FUNCTION
        - If the rate limit was reached when using the genderize.io api
          incorporated within main determine_gender() function. This function might 
          then be run the following day when the requests quota is restored
    
    Inputs:
        - name (str): The first name to predict gender based on
        - prob_threshold (float): The probability of the classification being 
                                  correct (proxy for confidence). Default: 0.8
        - count_threshold (float): The number of samples the classification is based on. 
                                   Default: 50
    
    Output:
        - pred (str): Gender prediction. One of:
            'male' / 'female'  -> prediction that passed both thresholds
            'low_confidence'   -> missing/blank name, or prediction below the
                                  thresholds (left for human reannotation)
            'error'            -> the genderize.io request failed
                                  (most likely request limit exceeded)

    """

    #Missing or blank names (e.g. NaN from the dataframe) cannot be classified.
    #Flag them for human reannotation without spending an API request on them
    if not isinstance(name, str) or not name.strip():
        return 'low_confidence'

    try:
        alt_pred = Genderize().get1(name)
        alt_gender = alt_pred['gender']
        passes_threshold = (alt_pred['probability'] >= prob_threshold
                            and alt_pred['count'] >= count_threshold)

    #If an exception occurs (most likely request limit exceeded).
    #Exception (not a bare except) so Ctrl-C / SystemExit still stop a long run
    except Exception:
        return 'error'

    #If the prediction stats pass the set threshold, trust the prediction.
    #A missing gender is never returned, even if the thresholds are lowered
    #(NOTE(review): presumably the api sends no gender for names it has no data on - confirm)
    if passes_threshold and alt_gender is not None:
        return alt_gender

    #If not, note the model's confidence level and leave for human reannotation
    return 'low_confidence'


### 4. Predict gender for the dataset

In [None]:
#Predict the gender.
#Many freelancers share a first name, so reuse the result for names that were already
#resolved instead of spending another genderize.io request on them (limited daily quota).
#'error' results are NOT reused: a later row with the same name is tried again
resolved_genders = {}

def _predict_gender_once(name):
    #Return the stored prediction for this name, calling determine_gender() only when needed
    pred = resolved_genders.get(name, 'error')
    if pred == 'error':
        pred = determine_gender(name)
        resolved_genders[name] = pred
    return pred

gender_preds = df.name.apply(_predict_gender_once)
df['gender'] = gender_preds

#Reorganize dataframe so gender is next to name
#(pop + insert keeps this cell re-runnable when the gender column already exists)
df.insert(df.columns.get_loc('name') + 1,'gender',df.pop('gender'))

In [None]:
#Display the dataframe to check the new gender column sits next to name
df

Unnamed: 0,search_query,name,gender,profile_link,location,hourly_rate,pay_grade,avg_rating,num_reviews,num_recommendations,...,pct_certifications_requirements_engineering_1,skill_furniture_removalist,skill_workday_security,skill_oracle_ebs_tech_integration,pct_certifications_google_webmaster_central_1,skill_modx,skill_cubecart,skill_phaser,skill_drilling_engineering,skill_casperjs
0,designer,Milen,male,https://www.freelancer.com/u/MsCaddServices,Edmonds,45.0,0.0,0.0,0,0,...,,,,,,,,,,
1,designer,Jeremy,male,https://www.freelancer.com/u/Conescu,Orinda,90.0,0.0,0.0,0,0,...,,,,,,,,,,
2,designer,Nichole,female,https://www.freelancer.com/u/NicholeMW,Holly,25.0,4.0,5.0,2,0,...,,,,,,,,,,
3,designer,Robert,male,https://www.freelancer.com/u/rhoenig1277,Beloit,75.0,0.0,0.0,0,0,...,,,,,,,,,,
4,designer,Shea,low_conf,https://www.freelancer.com/u/blaqsupply,Cockeysville,25.0,0.0,0.0,0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10089,software engineer,Jackie,error,https://www.freelancer.com/u/jackielinn,Tyler,30.0,0.0,0.0,0,0,...,,,,,,,,,,
10090,software engineer,Alex,male,https://www.freelancer.com/u/aresnik,Hamden,15.0,0.4,5.0,1,0,...,,,,,,,,,,
10091,software engineer,Jonathan,male,https://www.freelancer.com/u/tcoffin014,Denham Springs,150.0,0.0,0.0,0,0,...,,,,,,,,,,
10092,software engineer,Dee,female,https://www.freelancer.com/u/IntelligentDee,Arlington,25.0,0.0,0.0,0,3,...,,,,,,,,,,


### 5. Get cases where the model is not confident or where an error occurred

In [None]:
#Rows that still need attention: failed API requests and low-confidence predictions.
#determine_gender() returns 'low_confidence'; 'low_conf' is the older label and is
#still matched so data annotated before the rename is also picked up
df.query("(gender=='error') or (gender == 'low_conf') or (gender == 'low_confidence')")

Unnamed: 0,search_query,name,gender,profile_link,location,hourly_rate,pay_grade,avg_rating,num_reviews,num_recommendations,...,pct_certifications_requirements_engineering_1,skill_furniture_removalist,skill_workday_security,skill_oracle_ebs_tech_integration,pct_certifications_google_webmaster_central_1,skill_modx,skill_cubecart,skill_phaser,skill_drilling_engineering,skill_casperjs
4186,designer,Shang-Te,error,https://www.freelancer.com/u/tedchen0313,San Francisco,20.0,5.0,5.0,9,0,...,,,,,,,,,,
4197,designer,"LKrketing,c.",error,https://www.freelancer.com/u/lkonomos,Blue Springs,25.0,1.4,4.6,1,0,...,,,,,,,,,,
4203,designer,C,error,https://www.freelancer.com/u/cbrinks,Dunedin,10.0,0.0,0.0,0,3,...,,,,,,,,,,
4204,designer,Dariell,error,https://www.freelancer.com/u/dariellellis7,Memphis,20.0,0.0,0.0,0,0,...,,,,,,,,,,
4208,designer,Lamyaa,error,https://www.freelancer.com/u/lamyaahassan,Sugar Land,75.0,1.1,5.0,1,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10074,software engineer,Jousef,error,https://www.freelancer.com/u/akifcamizci,Woodbury,35.0,0.0,0.0,0,0,...,,,,,,,,,,
10082,software engineer,Jaimean,error,https://www.freelancer.com/u/jagcruz,Marianao,8.0,0.0,0.0,0,0,...,,,,,,,,,,
10085,software engineer,Prapulla,error,https://www.freelancer.com/u/abpp,Elkridge,35.0,0.0,0.0,0,0,...,,,,,,,,,,
10088,software engineer,Tunde,error,https://www.freelancer.com/u/tuns21,Kennesaw,60.0,0.0,0.0,0,0,...,,,,,,,,,,


In [None]:
#Count how many rows fall under each gender label (including 'error' and low-confidence rows)
df.gender.value_counts()

male        5404
female      2479
error       1491
low_conf     720
Name: gender, dtype: int64

### 6. Re-predict gender for 'error' values when request quota replenished

In [None]:
#Load the data from previous checkpoint
#NOTE(review): this reads the same v2 file that section 7 writes back to - confirm that
#overwriting the checkpoint in place is intended
gender_df = pd.read_csv('../data/gender-annotated/cleaned-gender-annotated-v2.csv',low_memory=False)

In [None]:
#Re-run gender identification only on the rows whose earlier prediction failed,
#using the genderize.io-only function; all other rows are left untouched
needs_retry = gender_df.gender == 'error'
gender_df.loc[needs_retry,'gender'] = gender_df.loc[needs_retry,'name'].apply(alt_determine_gender)

In [None]:
#An earlier version of the pipeline wrote 'low_conf'; normalise those rows to the
#'low_confidence' label the functions return now
has_old_label = gender_df['gender'].eq('low_conf')
gender_df.loc[has_old_label,'gender'] = 'low_confidence'

In [None]:
#Re-count the labels after the retry and relabel steps above
gender_df.gender.value_counts()

male              5750
female            2569
low_confidence    1775
Name: gender, dtype: int64

### 7. Export CSV for future runs and human reannotation

In [None]:
#First-pass export (v1), kept for reference; currently disabled:
#df.to_csv('../data/gender-annotated/cleaned-gender-annotated-v1.csv',index=False)
#Save the re-predicted data as the v2 checkpoint (index=False: do not write the row index)
gender_df.to_csv('../data/gender-annotated/cleaned-gender-annotated-v2.csv',index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=acc27b92-84be-4130-8026-204943f38189' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>