# Gender Detection using names [v2]

**`Goal`:** Determine the gender of each freelancer using the first names extracted and cleaned in previous steps/notebooks. This second name-based gender identification step (see 3.1 for the first step) builds on the data cleaning phase (see 2.1) by better accounting for compound names.

### 1. Import packages

In [None]:
#General packages for dataframe and data manipulation
import pandas as pd
import numpy as np
from glob import glob

#Gender detection packages
from genderize import Genderize
import gender_guesser.detector as gender

### 2. Load the data

In [None]:
# A. GETTING THE ORIGINAL (RAW) DATAFILES
# List every CSV in the raw folder, then drop any path that contains '_raw'
raw_csv_paths = glob("../data/raw/*.csv")
orig_file_paths = [path for path in raw_csv_paths if '_raw' not in path]

# Read each file into its own dataframe and stack them into one dataframe
# with a fresh 0..n-1 index
original_df = pd.concat(
    [pd.read_csv(path, low_memory=False) for path in orig_file_paths],
    ignore_index=True,
)

# B. GET THE LAST ANNOTATION CHECKPOINT
new_df = pd.read_csv('../data/gender-annotated/cleaned-gender-annotated-v3.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
#Preview the first rows of the merged raw data
original_df.head()

Unnamed: 0,search_query,name,profile_link,tagline,user_description,location,join_date,hourly_rate,pay_grade,avg_rating,...,skill_public_sector_and_taxation,skill_m&a_tax,skill_wave_accounting,skill_media_and_entertainment_tax,skill_shared_services,skill_payment_consulting,skill_investment_banking,skill_palm,skill_global_tax_compliance,skill_adobe_pagemaker
0,copywriter,Marty P.,https://www.freelancer.com/u/mpekar,Very experienced advertising copywriter,I'm a semi-retired advertising copywriter with...,"Greenwich, United States","July 18, 2011",$20,3.6,5.0,...,,,,,,,,,,
1,copywriter,Cat A.,https://www.freelancer.com/u/Cadduci,"Marketing, Business and Administrative Support",My name is Cat and I am a Marketing and Commun...,"Hatboro, United States","August 28, 2021",$15,0.0,0.0,...,,,,,,,,,,
2,copywriter,Jonathan G.,https://www.freelancer.com/u/JG24,"Copy Editor, Copywriter, and Marketing Specialist","Wordsmith, editing extraordinaire, and dedicat...","Warwick, United States","January 26, 2016",$20,0.0,0.0,...,,,,,,,,,,
3,copywriter,Melanie B.,https://www.freelancer.com/u/Melwritesindy,"Creative Writer, Copywriter, Mom",I studied Creative Writing and English Educati...,"Indianapolis, United States","December 14, 2021",$50,0.0,0.0,...,,,,,,,,,,
4,copywriter,Brittney G.,https://www.freelancer.com/u/satisfiedsoulcre,Copywriter & Brand Content Creator,I am your secret weapon to bettering your bran...,"Memphis, United States","April 12, 2021",$25,0.0,0.0,...,,,,,,,,,,


In [None]:
#Preview the first rows of the last annotation checkpoint
new_df.head()

Unnamed: 0,search_query,name,gender,profile_link,location,hourly_rate,pay_grade,avg_rating,num_reviews,num_recommendations,...,pct_certifications_requirements_engineering_1,skill_furniture_removalist,skill_workday_security,skill_oracle_ebs_tech_integration,pct_certifications_google_webmaster_central_1,skill_modx,skill_cubecart,skill_phaser,skill_drilling_engineering,skill_casperjs
0,designer,Milen,male,https://www.freelancer.com/u/MsCaddServices,Edmonds,45.0,0.0,0.0,0,0,...,,,,,,,,,,
1,designer,Jeremy,male,https://www.freelancer.com/u/Conescu,Orinda,90.0,0.0,0.0,0,0,...,,,,,,,,,,
2,designer,Nichole,female,https://www.freelancer.com/u/NicholeMW,Holly,25.0,4.0,5.0,2,0,...,,,,,,,,,,
3,designer,Robert,male,https://www.freelancer.com/u/rhoenig1277,Beloit,75.0,0.0,0.0,0,0,...,,,,,,,,,,
4,designer,Jean-Paul,male,https://www.freelancer.com/u/PaulCarriazo,Miami,19.0,0.0,0.0,0,0,...,,,,,,,,,,


### 3. Get original names (not just first names) from raw data

In [None]:
#Keep only the columns of the raw data that are needed for the merge
merge_columns = ['search_query','name','profile_link']
subset_original = original_df[merge_columns]

#Select the checkpoint records whose gender is neither 'male' nor 'female'
#(these are the rows the original names will be merged onto)
has_gender = new_df.gender.isin(['male','female'])
new_df_missing = new_df[~has_gender]

In [None]:
#Attach the original (full) names to the records that still lack a gender.
#Both frames have a 'name' column, so pandas suffixes them: name_x (checkpoint)
#and name_y (raw data)
main_df = pd.merge(new_df_missing, subset_original, on=['search_query','profile_link'])

#Place the original name directly after the previously extracted first name
full_name_position = main_df.columns.get_loc('name_x') + 1
full_names = main_df.pop('name_y')
main_df.insert(full_name_position, 'name', full_names)

#Drop the previously extracted first names
del main_df['name_x']

main_df.head()

Unnamed: 0,search_query,name,gender,profile_link,location,hourly_rate,pay_grade,avg_rating,num_reviews,num_recommendations,...,pct_certifications_requirements_engineering_1,skill_furniture_removalist,skill_workday_security,skill_oracle_ebs_tech_integration,pct_certifications_google_webmaster_central_1,skill_modx,skill_cubecart,skill_phaser,skill_drilling_engineering,skill_casperjs
0,designer,Dyllen G.,low_confidence,https://www.freelancer.com/u/DyllenGeorge,Fayetteville,16.0,3.1,5.0,1,1,...,,,,,,,,,,
1,designer,Shuishui Y.,prediction_error,https://www.freelancer.com/u/morningcarter,Austin,15.0,0.0,0.0,0,0,...,,,,,,,,,,
2,designer,Doshianique B.,prediction_error,https://www.freelancer.com/u/Doshianique,Hope,12.0,0.0,0.0,0,0,...,,,,,,,,,,
3,designer,Chean S.,prediction_error,https://www.freelancer.com/u/alstonshek,San Francisco,20.0,0.0,0.0,0,0,...,,,,,,,,,,
4,designer,Kanacia J.,prediction_error,https://www.freelancer.com/u/jame0813,Neversink,75.0,0.0,0.0,0,0,...,,,,,,,,,,


### 4. Try to get first name or names in compound name

In [None]:
#Break every full name into its space-separated parts
split_names = main_df.name.str.split(' ')

#One or two parts: keep the first part as a plain string.
#More than two parts (compound name): keep every part except the last one
#(the last-name initial), as a list
first_names = [
    parts[:-1] if len(parts) not in (1, 2) else parts[0]
    for parts in split_names
]

#Overwrite the name column with the extracted first name(s)
main_df['name'] = first_names

main_df

Unnamed: 0,search_query,name,gender,profile_link,location,hourly_rate,pay_grade,avg_rating,num_reviews,num_recommendations,...,pct_certifications_requirements_engineering_1,skill_furniture_removalist,skill_workday_security,skill_oracle_ebs_tech_integration,pct_certifications_google_webmaster_central_1,skill_modx,skill_cubecart,skill_phaser,skill_drilling_engineering,skill_casperjs
0,designer,Dyllen,low_confidence,https://www.freelancer.com/u/DyllenGeorge,Fayetteville,16.0,3.1,5.0,1,1,...,,,,,,,,,,
1,designer,Shuishui,prediction_error,https://www.freelancer.com/u/morningcarter,Austin,15.0,0.0,0.0,0,0,...,,,,,,,,,,
2,designer,Doshianique,prediction_error,https://www.freelancer.com/u/Doshianique,Hope,12.0,0.0,0.0,0,0,...,,,,,,,,,,
3,designer,Chean,prediction_error,https://www.freelancer.com/u/alstonshek,San Francisco,20.0,0.0,0.0,0,0,...,,,,,,,,,,
4,designer,Kanacia,prediction_error,https://www.freelancer.com/u/jame0813,Neversink,75.0,0.0,0.0,0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
622,software engineer,Ananta,prediction_error,https://www.freelancer.com/u/deepakg1995,United States,10.0,0.0,0.0,0,0,...,,,,,,,,,,
623,software engineer,Engineering,prediction_error,https://www.freelancer.com/u/EarleyEngg,Iselin,100.0,0.0,0.0,0,0,...,,,,,,,,,0.0,
624,software engineer,Mengyu,low_confidence,https://www.freelancer.com/u/kiwini2018,Ashburn,50.0,2.8,5.0,2,0,...,,,,,,,,,,
625,software engineer,Jansu,low_confidence,https://www.freelancer.com/u/Jansu2604,Charlottesville,30.0,0.0,0.0,0,0,...,,,,,,,,,,


### 5. Gender Detection Pipeline
Adapted to handle compound names (see `gender_predict_func`)

In [None]:
#Initialize the main (first-pass) detector from the gender_guesser package.
#It is created once here and reused by determine_gender for every name lookup.
gender_detector = gender.Detector()

def determine_gender(name,prob_threshold = 0.8,count_threshold=50):

    """
    MAIN FUNCTION

    Function to determine gender for an inputted first name.
    The name is first passed to the module-level `gender_detector`. If that
    detector cannot decide, the name is passed to the Genderize API, whose
    prediction is only accepted when it passes a probability and a
    frequency threshold.

    Inputs:
        - name (str): The first name to predict gender based on
        - prob_threshold (float): The probability of the classification being 
                                  correct (proxy for confidence). Default: 0.8
        - count_threshold (float): The number of samples the classification is based on. 
                                   Default: 50
    
    Output:
        - pred (str): Gender prediction. One of:
                      'male' / 'female'  - a usable prediction
                      'low_confidence'   - the second model answered but did not
                                           pass the thresholds (or had no gender);
                                           left for human reannotation
                      'error'            - the request to the second model failed

    """

    #Try to detect gender for the inputted name using first model
    pred = gender_detector.get_gender(name)

    #If the model predicts male or female, return the prediction
    if pred in ('male','female'):
        return pred

    #If it predicts mostly male or mostly female, go with prediction (male or female)
    if 'mostly' in pred:
        return pred.replace('mostly_','')

    #Otherwise the model cannot determine the gender ('unknown') or is unsure ('andy'):
    #pass prediction task to the other model (Only 1000 free requests daily)
    try:
        alt_pred = Genderize().get1(name)
        alt_gender = alt_pred['gender']
        passes_thresholds = (alt_pred['probability'] >= prob_threshold
                             and alt_pred['count'] >= count_threshold)

    #If an exception occurs (mostly API request limit met), report error.
    #Only Exception is caught so that KeyboardInterrupt / SystemExit still
    #stop a long-running annotation loop.
    except Exception:
        return 'error'

    #Accept the prediction only if a gender was actually returned and the
    #prediction stats pass the set thresholds
    if alt_gender is not None and passes_thresholds:
        return alt_gender

    #If not, leave for human reannotation
    return 'low_confidence'

def gender_predict_func(name):

    """
    Gender prediction function adapted to account for compound names.
    Makes call to the main determine_gender function –– see above

    Inputs:
        - name (str or list/tuple of str): A single first name, or the
                                           individual names of a compound name

    Output:
        - pred (str): For a single name, whatever determine_gender returns.
                      For a compound name, the first 'male'/'female' prediction
                      found among its parts; if no part yields one, the
                      prediction for the last part. An empty compound name
                      returns 'low_confidence' so the record is left for
                      human reannotation.
    """

    #If it is a single name, predict the gender for that name
    if not isinstance(name, (list, tuple)):
        return determine_gender(name)

    #Fallback for a compound name with no parts at all (nothing to predict on)
    pred = 'low_confidence'

    #Compound name (i.e. multiple names): iterate through the different names
    for part in name:

        #Try to get a gender prediction for the current name
        pred = determine_gender(part)

        #If a gender was predicted return the gender
        if pred in ('male','female'):
            return pred

    #No part gave a usable gender: report the outcome for the last part tried
    return pred


### 6. Predict gender for the dataset

In [None]:
#Run the prediction pipeline on every name (string or list of names)
gender_preds = main_df['name'].apply(gender_predict_func)
main_df['gender'] = gender_preds

#Make sure the gender column sits immediately after the name column
gender_position = main_df.columns.get_loc('name') + 1
gender_column = main_df.pop('gender')
main_df.insert(gender_position, 'gender', gender_column)

main_df

Unnamed: 0,search_query,name,gender,profile_link,location,hourly_rate,pay_grade,avg_rating,num_reviews,num_recommendations,...,pct_certifications_requirements_engineering_1,skill_furniture_removalist,skill_workday_security,skill_oracle_ebs_tech_integration,pct_certifications_google_webmaster_central_1,skill_modx,skill_cubecart,skill_phaser,skill_drilling_engineering,skill_casperjs
0,designer,Dyllen,low_confidence,https://www.freelancer.com/u/DyllenGeorge,Fayetteville,16.0,3.1,5.0,1,1,...,,,,,,,,,,
1,designer,Shuishui,low_confidence,https://www.freelancer.com/u/morningcarter,Austin,15.0,0.0,0.0,0,0,...,,,,,,,,,,
2,designer,Doshianique,low_confidence,https://www.freelancer.com/u/Doshianique,Hope,12.0,0.0,0.0,0,0,...,,,,,,,,,,
3,designer,Chean,low_confidence,https://www.freelancer.com/u/alstonshek,San Francisco,20.0,0.0,0.0,0,0,...,,,,,,,,,,
4,designer,Kanacia,low_confidence,https://www.freelancer.com/u/jame0813,Neversink,75.0,0.0,0.0,0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
622,software engineer,Ananta,low_confidence,https://www.freelancer.com/u/deepakg1995,United States,10.0,0.0,0.0,0,0,...,,,,,,,,,,
623,software engineer,Engineering,low_confidence,https://www.freelancer.com/u/EarleyEngg,Iselin,100.0,0.0,0.0,0,0,...,,,,,,,,,0.0,
624,software engineer,Mengyu,low_confidence,https://www.freelancer.com/u/kiwini2018,Ashburn,50.0,2.8,5.0,2,0,...,,,,,,,,,,
625,software engineer,Jansu,low_confidence,https://www.freelancer.com/u/Jansu2604,Charlottesville,30.0,0.0,0.0,0,0,...,,,,,,,,,,


In [None]:
#Distribution of the new predictions
main_df.gender.value_counts()

low_confidence    511
male               74
female             42
Name: gender, dtype: int64

In [None]:
#Count how many of the new gender predictions are missing (NA)
main_df.gender.isna().sum()

0

### 7. Replace gender predictions in annotated dataset

In [None]:
#Rows of the checkpoint that did not have a 'male'/'female' label
#(the same selection main_df was built from)
missing_mask = ~new_df.gender.isin(['male','female'])

#The write-back below is positional, so first make sure main_df still lines up
#row-for-row with the selected checkpoint rows (the merge could have dropped,
#duplicated or reordered records)
if int(missing_mask.sum()) != len(main_df):
    raise ValueError(
        'Row count mismatch: %d checkpoint rows without gender vs %d rows in main_df'
        % (int(missing_mask.sum()), len(main_df))
    )

checkpoint_links = new_df.loc[missing_mask,'profile_link'].reset_index(drop=True)
if not checkpoint_links.equals(main_df['profile_link'].reset_index(drop=True)):
    raise ValueError('main_df rows are not in the same order as the checkpoint rows')

#Replace records in the new dataframe with new predictions.
#Only name and gender were changed in main_df, so only those two columns are
#written back; this avoids pushing a mixed-type object array through every column
new_df.loc[missing_mask,'name'] = main_df['name'].to_numpy()
new_df.loc[missing_mask,'gender'] = main_df['gender'].to_numpy()

  arr_value = np.array(value)


In [None]:
#Label distribution of the full checkpoint after the replacement
new_df.gender.value_counts()

male              6634
female            2943
low_confidence     511
Name: gender, dtype: int64

### 8. Export CSV for future human reannotation

In [None]:
#New annotation checkpoint (v4): save the updated dataframe without writing
#the dataframe index as a column
new_df.to_csv('../data/gender-annotated/cleaned-gender-annotated-v4.csv',index=False)

### 9. Observing data after human reannotation

In [None]:
#Load the v5 annotation checkpoint
df_5 = pd.read_csv('../data/gender-annotated/cleaned-gender-annotated-v5.csv', low_memory=False)

In [None]:
#Number of rows and columns in the v5 checkpoint
df_5.shape

(9769, 2266)

In [None]:
#Print the number of missing gender labels, then show the label distribution
print(df_5.gender.isna().sum())
df_5.gender.value_counts()

0


male      6694
female    3075
Name: gender, dtype: int64

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=acc27b92-84be-4130-8026-204943f38189' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>