# Preprocessing: Skill & Certification

**`Goal:`** Feature engineering and preprocessing of the skill and certification columns in preparation for the matching procedure.

### a. Load packages/libraries

In [None]:
import pandas as pd
import numpy as np

### b. Load data

In [None]:
# df1: presumably the per-category counts of skills listed on each profile
# (the file name marks it as the "female treatment" variant) - confirm upstream
df1 = pd.read_csv(
    '../data/processed/skills_certifications_categorized_skill_count_female_treatment.csv',
    low_memory=False,
)

# df2: presumably the per-category counts of skills utilized in projects - confirm upstream
df2 = pd.read_csv(
    '../data/processed/skills_certifications_categorized.csv',
    low_memory=False,
)

# Display the first rows as a sanity check
df2.head()

Unnamed: 0,search_query,name,gender,join_date_from_earliest,location_size,hourly_rate,pay_grade,avg_rating,num_reviews,num_recommendations,...,management_skills,marketing_business_skills,performance_arts_skills,design_skills,teaching_training_skills,miscellaneous_skills,language_certifications,freelancer_certifications,general_skill_certifications,programming_certifications
0,2,Milen,1,7063,1,45,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0
1,2,Jeremy,1,7526,1,90,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0
2,2,Nichole,0,6430,0,25,4.0,5.0,2,0,...,0.0,0.0,0.0,5.0,0.0,0.0,1,0,0,0
3,2,Robert,1,3238,1,75,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0
4,2,Jean-Paul,1,6661,5,19,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0


In [None]:
# List every column name to locate the skill and certification columns
df2.columns

Index(['search_query', 'name', 'gender', 'join_date_from_earliest',
       'location_size', 'hourly_rate', 'pay_grade', 'avg_rating',
       'num_reviews', 'num_recommendations', 'pct_jobs_completed',
       'pct_on_budget', 'pct_on_time', 'verification_preferred_freelancer',
       'verification_identity_verified', 'verification_payment_verified',
       'verification_phone_verified', 'verification_email_verified',
       'verification_facebook_connected', 'badge_plus_membership',
       'badge_preferred_freelancer', 'badge_verified', 'engineering_skills',
       'writing_skills', 'technical_programming_skills',
       'language_translation_skills', 'finance_accounting_skills',
       'management_skills', 'marketing_business_skills',
       'performance_arts_skills', 'design_skills', 'teaching_training_skills',
       'miscellaneous_skills', 'language_certifications',
       'freelancer_certifications', 'general_skill_certifications',
       'programming_certifications'],
      dtype='object')

### c. Get top skill category
- Pass 1: Get the skill category that has been utilized most in projects
- Pass 2: If Pass 1 yields none, get the skill category that contains the most skills listed on the profile

In [None]:
# Skill categories that compete for the "top skill" title
skill_columns = ['writing_skills', 'technical_programming_skills',
                 'language_translation_skills', 'finance_accounting_skills',
                 'marketing_business_skills', 'design_skills']

# Rows of df1 and df2 are paired by position, so both frames must have the
# same length; fail early instead of silently pairing the wrong individuals
if len(df1) != len(df2):
    raise ValueError(
        f'df1 and df2 must have the same number of rows, got {len(df1)} and {len(df2)}'
    )

# One row per individual, one column per skill category.
# Missing counts are treated as zero so they can neither win nor count as "utilized".
project_counts = df2[skill_columns].fillna(0).to_numpy()
profile_counts = df1[skill_columns].fillna(0).to_numpy()

# Pass 1 applies to individuals with at least one positive project count
has_project_skill = (project_counts > 0).any(axis=1)

# Pass 1: category used most in projects; Pass 2 (fallback): category with the
# most skills listed on the profile. argmax keeps the first column on ties.
top_skill_idx = np.where(has_project_skill,
                         project_counts.argmax(axis=1),
                         profile_counts.argmax(axis=1))

# Name of the winning category for each individual
winner_takes_all_skill_col = [skill_columns[idx] for idx in top_skill_idx]

# Number of profile skills in the winning category (in the fallback pass this
# is, by construction, the largest profile count of the row)
winner_takes_all_skill_cat_count = profile_counts[
    np.arange(len(top_skill_idx)), top_skill_idx
].tolist()


### d. Encode variables

In [None]:
# Encode the top skill category of each individual as an integer code.
# With no explicit categories given, pandas infers them from the unique labels.
categorical = pd.Categorical(winner_takes_all_skill_col)
categories = categorical.categories
codes = categorical.codes

# A code is the position of its label in `categories`, so enumerate() yields the
# exact label -> code mapping even if `codes` contains -1 for a missing label
for code, category in enumerate(categories):
    print(f'- {category}: {code}')

print('\n')

- design_skills: 0
- finance_accounting_skills: 1
- language_translation_skills: 2
- marketing_business_skills: 3
- technical_programming_skills: 4
- writing_skills: 5




In [None]:
# Attach the three derived columns in one step, in this order:
#   - the plain top skill category label
#   - its integer encoding
#   - the number of profile skills in that category
df1 = df1.assign(
    top_skill_category=winner_takes_all_skill_col,
    top_skill_category_encoded=codes,
    top_skill_category_count=winner_takes_all_skill_cat_count,
)

df1.head()

Unnamed: 0,search_query,name,gender,join_date_from_earliest,location_size,hourly_rate,pay_grade,avg_rating,num_reviews,num_recommendations,...,design_skills,teaching_training_skills,miscellaneous_skills,language_certifications,freelancer_certifications,general_skill_certifications,programming_certifications,top_skill_category,top_skill_category_encoded,top_skill_category_count
0,2,Milen,0,7063,1,45,0.0,0.0,0,0,...,8,0,1,0,0,0,0,design_skills,0,8
1,2,Jeremy,0,7526,1,90,0.0,0.0,0,0,...,18,0,0,1,0,0,0,design_skills,0,18
2,2,Nichole,1,6430,0,25,4.0,5.0,2,0,...,16,0,0,1,0,0,0,design_skills,0,16
3,2,Robert,0,3238,1,75,0.0,0.0,0,0,...,5,0,0,0,1,0,0,technical_programming_skills,4,9
4,2,Jean-Paul,0,6661,5,19,0.0,0.0,0,0,...,6,0,0,0,0,0,0,design_skills,0,6


In [None]:
# Persist the frame with the top-skill columns; index=False keeps the row index out of the CSV
df1.to_csv('../data/processed/winner_takes_all_v2.csv',index=False)

### e. Export based on skill categories

In [None]:
# Reload the winner-takes-all CSV as the source for the per-occupation exports
df = pd.read_csv('../data/processed/winner_takes_all_v2.csv', low_memory=False)

In [None]:
# (skill category, occupation label) pairs; the label becomes the output file name
skill_occ = [
    ('design_skills', 'designer'),
    ('technical_programming_skills', 'software_engineer'),
    ('writing_skills', 'copywriter'),
    ('marketing_business_skills', 'marketer'),
    ('language_translation_skills', 'translator'),
    ('finance_accounting_skills', 'accountant'),
]

# Write one CSV per occupation
for skill, occupation in skill_occ:

    # Keep only the individuals whose top skill category is the current one
    skill_df = df[df['top_skill_category'] == skill]
    print(f"Extracted {skill}.....")

    # Save the subset under the occupation's name, without the row index
    skill_df.to_csv(f'../data/processed/occupation/{occupation}.csv', index=False)
    print(f"Finished writing to {occupation}.csv")

    print('\n')

Extracted design_skills.....
Finished writing to designer.csv


Extracted technical_programming_skills.....
Finished writing to software_engineer.csv


Extracted writing_skills.....
Finished writing to copywriter.csv


Extracted marketing_business_skills.....
Finished writing to marketer.csv


Extracted language_translation_skills.....
Finished writing to translator.csv


Extracted finance_accounting_skills.....
Finished writing to accountant.csv




In [None]:
# Spot-check one exported file by reading it back and showing its first rows
pd.read_csv('../data/processed/occupation/software_engineer.csv', low_memory=False).head()

Unnamed: 0,search_query,name,gender,join_date_from_earliest,location_size,hourly_rate,pay_grade,avg_rating,num_reviews,num_recommendations,...,design_skills,teaching_training_skills,miscellaneous_skills,language_certifications,freelancer_certifications,general_skill_certifications,programming_certifications,top_skill_category,top_skill_category_encoded,top_skill_category_count
0,2,Robert,0,3238,1,75,0.0,0.0,0,0,...,5,0,0,0,1,0,0,technical_programming_skills,4,9
1,2,Katrin,1,6647,1,20,0.0,0.0,0,0,...,2,0,0,0,0,0,0,technical_programming_skills,4,12
2,2,Carson,0,4417,3,20,0.0,0.0,0,0,...,7,0,0,0,0,0,0,technical_programming_skills,4,13
3,2,Eliran,0,5229,1,150,4.0,5.0,10,1,...,6,0,0,1,0,0,0,technical_programming_skills,4,13
4,2,Michael,0,2672,2,65,0.0,0.0,0,0,...,1,1,2,0,0,0,0,technical_programming_skills,4,8


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=acc27b92-84be-4130-8026-204943f38189' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>