# Preprocessing: Encoding & NA Removal
### [Male=1]

**`Goal:`** Clean and encode the data in preparation for the matching procedure. In this notebook, male is encoded as 1 and female as 0.

### a. Load packages/libraries

In [None]:
import pandas as pd
import numpy as np

### b. Load data

In [None]:
# Read the location-cleaned interim dataset. low_memory=False has pandas parse
# the file in one pass so each column's dtype is inferred from all its values.
# NOTE(review): this input path is absolute while the export cell at the end of
# the notebook uses a relative path - confirm both resolve to the same project.
input_path = '/work/DS4SG-Gender-Inequality/data/interim/location-cleaned.csv'
df = pd.read_csv(input_path, low_memory=False)
df.head()

Unnamed: 0,search_query,name,gender,profile_link,location,location_size,hourly_rate,pay_grade,avg_rating,num_reviews,...,skill_oracle_ebs_tech_integration,pct_certifications_google_webmaster_central_1,skill_modx,skill_cubecart,skill_phaser,skill_drilling_engineering,skill_casperjs,join_date_from_earliest,badge_preferred_freelancer,badge_verified
0,designer,Milen,male,https://www.freelancer.com/u/MsCaddServices,Edmonds,1,45,0.0,0.0,0,...,,,,,,,,7063,False,False
1,designer,Jeremy,male,https://www.freelancer.com/u/Conescu,Orinda,1,90,0.0,0.0,0,...,,,,,,,,7526,False,False
2,designer,Nichole,female,https://www.freelancer.com/u/NicholeMW,Holly,0,25,4.0,5.0,2,...,,,,,,,,6430,False,False
3,designer,Robert,male,https://www.freelancer.com/u/rhoenig1277,Beloit,1,75,0.0,0.0,0,...,,,,,,,,3238,False,False
4,designer,Jean-Paul,male,https://www.freelancer.com/u/PaulCarriazo,Miami,5,19,0.0,0.0,0,...,,,,,,,,6661,False,False


### c. Check NAs

In [None]:
# Keep only the columns that are neither skill indicators nor certification
# percentages, so missing values can be inspected on the remaining columns.
excluded_markers = ('skill_', 'pct_certifications')
removal_condition = [
    col for col in df.columns
    if not any(marker in col for marker in excluded_markers)
]
no_skill_certifications = df.loc[:, removal_condition]

no_skill_certifications.shape

(9766, 24)

In [None]:
# Count missing values per column once, then keep only the columns that
# actually have some.
na_counts = no_skill_certifications.isna().sum()
na_counts = na_counts[na_counts > 0]

num_missing = na_counts.values
vars_missing = na_counts.index

# Print one "column : count" line per column with missing values.
for var, num in zip(vars_missing, num_missing):
    print(var, ':', num)

location : 2
pct_jobs_completed : 8069
pct_on_budget : 8165
pct_on_time : 8162


### d. Dealing with NAs

#### 1. Drop location and profile link columns

In [None]:
# Remove the free-text location and the profile URL; neither is referenced by
# any later cell of this notebook.
columns_to_drop = ['location', 'profile_link']
df = df.drop(columns=columns_to_drop)

#### 2. pct_{variable}
Here we fill the NAs with 0s. Since we are bucketing, all the 0s should end up in a single bucket, which allows us to match on individuals with NAs for these variables.

In [None]:
# Look at the distinct values of pct_jobs_completed and at the Python type of
# one of them (the element at position 1) before filling the NAs.
jobs_completed_values = df['pct_jobs_completed'].unique()
print(type(jobs_completed_values[1]))
jobs_completed_values

<class 'numpy.float64'>


array([ nan, 100.,  80.,  94.,  96.,  95.,  67.,  98.,  97.,  89.,  88.,
        85.,  83.,  50.,  75.,  90.,  99.,  86.,  92.,  71.,  53.,  93.,
        79.,  91.,  73.,  84.,  63.,  33.,  82.,  78.,  77.,  29.])

In [None]:
# Replace missing percentages with 0 in the three pct_* performance columns.
pct_columns = ['pct_jobs_completed', 'pct_on_budget', 'pct_on_time']
df.loc[:, pct_columns] = df.loc[:, pct_columns].fillna(0)

In [None]:
# Show (rows, columns) of the full frame after the drop and the NA fill.
row_count, column_count = df.shape
(row_count, column_count)

(9766, 2265)

#### 3. Check to see if all NAs (except in skills and certifications) are resolved

In [None]:
#Get dataframe without skills and certifications
removal_condition = [col for col in df.columns if ('skill_' not in col) and ('pct_certifications' not in col)]
no_skill_certifications = df.loc[:,removal_condition]

np.extract(no_skill_certifications.isna().sum().values > 0,
           no_skill_certifications.isna().sum().values)

array([], dtype=int64)

### e. Encode variables

In [None]:
# List the non-skill, non-certification columns; the columns encoded below are
# a subset of these.
no_skill_certifications.columns

Index(['search_query', 'name', 'gender', 'location_size', 'hourly_rate',
       'pay_grade', 'avg_rating', 'num_reviews', 'num_recommendations',
       'pct_jobs_completed', 'pct_on_budget', 'pct_on_time',
       'verification_preferred_freelancer', 'verification_identity_verified',
       'verification_payment_verified', 'verification_phone_verified',
       'verification_email_verified', 'verification_facebook_connected',
       'badge_plus_membership', 'join_date_from_earliest',
       'badge_preferred_freelancer', 'badge_verified'],
      dtype='object')

In [None]:
# Frequency of each value of the preferred-freelancer verification flag.
df['verification_preferred_freelancer'].value_counts()

False    9718
True       48
Name: verification_preferred_freelancer, dtype: int64

In [None]:
# Encode each categorical / boolean feature as integer codes and print the
# "-label:code" legend for every column.
#
# This cell overwrites the columns of df in place, so run it once on a freshly
# loaded frame: re-running it would re-encode already-encoded values.
columns_to_encode = [
    'search_query', 'gender', 'location_size',
    'verification_preferred_freelancer', 'verification_identity_verified',
    'verification_payment_verified', 'verification_phone_verified',
    'verification_email_verified', 'verification_facebook_connected',
    'badge_plus_membership', 'badge_preferred_freelancer', 'badge_verified',
]

for col in columns_to_encode:
    print(col.upper())

    if pd.api.types.is_bool_dtype(df[col]):
        # Boolean flags always map False -> 0 and True -> 1. Inferring the
        # categories from the data would give a column that only contains
        # True (e.g. verification_email_verified) the code 0, i.e. the
        # opposite meaning of every other flag.
        categories = [False, True]
        codes = df[col].to_numpy().astype('int8')
    else:
        categorical = pd.Categorical(df[col])
        categories = categorical.categories
        codes = categorical.codes
        # pandas encodes missing values as -1; fail loudly instead of
        # silently exporting -1 as if it were a real category.
        if (codes == -1).any():
            raise ValueError(f"Column '{col}' contains missing values; resolve them before encoding.")

    # The code of a category is its position in `categories`, so enumerate
    # yields the correct label/code pairs regardless of which codes occur.
    for code, category in enumerate(categories):
        print(f'-{category}:{code}')

    print('\n')

    # Encode the variable
    df[col] = codes

SEARCH_QUERY
-accountant:0
-copywriter:1
-designer:2
-software engineer:3


GENDER
-female:0
-male:1


LOCATION_SIZE
-0:0
-1:1
-2:2
-3:3
-4:4
-5:5


VERIFICATION_PREFERRED_FREELANCER
-False:0
-True:1


VERIFICATION_IDENTITY_VERIFIED
-False:0
-True:1


VERIFICATION_PAYMENT_VERIFIED
-False:0
-True:1


VERIFICATION_PHONE_VERIFIED
-False:0
-True:1


VERIFICATION_EMAIL_VERIFIED
-True:0


VERIFICATION_FACEBOOK_CONNECTED
-False:0
-True:1


BADGE_PLUS_MEMBERSHIP
-False:0
-True:1


BADGE_PREFERRED_FREELANCER
-False:0
-True:1


BADGE_VERIFIED
-False:0
-True:1




### f. Reorganize dataset

In [None]:
# Show the current column order before moving join_date_from_earliest.
# no_skill_certifications was built before the encoding cell, so its values are
# not encoded; only its column names are relevant here.
no_skill_certifications.columns

Index(['search_query', 'name', 'gender', 'location_size', 'hourly_rate',
       'pay_grade', 'avg_rating', 'num_reviews', 'num_recommendations',
       'pct_jobs_completed', 'pct_on_budget', 'pct_on_time',
       'verification_preferred_freelancer', 'verification_identity_verified',
       'verification_payment_verified', 'verification_phone_verified',
       'verification_email_verified', 'verification_facebook_connected',
       'badge_plus_membership', 'join_date_from_earliest',
       'badge_preferred_freelancer', 'badge_verified'],
      dtype='object')

In [None]:
# Move join_date_from_earliest so that it sits directly after gender.
# The target position is looked up before the column is popped, so it refers
# to the column layout that still contains join_date_from_earliest.
insert_at = int(np.flatnonzero(df.columns == 'gender')[0]) + 1
join_date = df.pop('join_date_from_earliest')
df.insert(insert_at, 'join_date_from_earliest', join_date)

In [None]:
# Write the encoded frame to the interim data folder, without the row index.
output_path = '../data/interim/encoded.csv'
df.to_csv(output_path, index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=acc27b92-84be-4130-8026-204943f38189' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>