In [1]:
import numpy as np
import pandas as pd
import re
from collections import Counter

In [2]:
data= pd.read_excel('/Users/dongxujia/Dropbox/DS-Discovery_GenderDiversity/04_Data/raw/jobads.xlsx')

In [3]:
data.shape

(6472, 5)

In [4]:
data.id.nunique()

6472

In [5]:
data.head()

Unnamed: 0,id,job_title_en,job_requirement_en,job_description_en,job_category
0,5606,flash and deliver prototypes,,job description\n\nconstruction and electrical...,engineering--engineering
1,14,physical verification engineer lnd_150019,your duties as a system test verification eng...,Your duties as a system test & verification en...,"research and development--mechanical,engineeri..."
2,5371,3d printing specialist for packaging materials,academic degree in mechanical engineering pac...,Establishing a global 3d printing strategy for...,logistics--other
3,831,a strengthening support employee,successfully completed natural or engineering ...,we are looking for an active and motivated sup...,"research and development--other,research and d..."
4,360,account manager,a degree in the engineering field several year...,as accoutn manager (m / w) of the business uni...,"marketing and sales--service customer support,..."


### • 1) Classify job titles using a two-factor model
	○ Find the area of the job (e.g. software engineering)
		○ (for the area the first element of the job category might be helpful) and
	○ and the rank of the job (e.g. lead, senior, group lead, team leader, assistant) if applicable. 


### • 2) Classify job titles using the degree requirements
	○ Isolate the type of degree and the number of years of worker experience for each position
	○ Group positions by degree x experience groups 

In [6]:
data.head()

Unnamed: 0,id,job_title_en,job_requirement_en,job_description_en,job_category
0,5606,flash and deliver prototypes,,job description\n\nconstruction and electrical...,engineering--engineering
1,14,physical verification engineer lnd_150019,your duties as a system test verification eng...,Your duties as a system test & verification en...,"research and development--mechanical,engineeri..."
2,5371,3d printing specialist for packaging materials,academic degree in mechanical engineering pac...,Establishing a global 3d printing strategy for...,logistics--other
3,831,a strengthening support employee,successfully completed natural or engineering ...,we are looking for an active and motivated sup...,"research and development--other,research and d..."
4,360,account manager,a degree in the engineering field several year...,as accoutn manager (m / w) of the business uni...,"marketing and sales--service customer support,..."


### Use job_category to extract job_area

In [7]:
# some entries in job_category are null types
data.job_category.isnull().value_counts()

False    6440
True       32
Name: job_category, dtype: int64

In [8]:
# change these to None
data.loc[data.job_category.isnull(),['job_category']] = None

In [9]:
data.job_category.str.contains("--", na=False).value_counts()

True     6415
False      57
Name: job_category, dtype: int64

In [10]:
# note that job_categories that are without "--" are either None or "general management"

data[~data.job_category.str.contains("--", na=False)].head()

Unnamed: 0,id,job_title_en,job_requirement_en,job_description_en,job_category
40,4611,acquisition tpl,missing,? guided by supply phases to tire pressure con...,
392,2062,bu head of engine systems iran,academic degree in electrical engineering mech...,target is to build up and develop a joint vent...,general management
477,6423,category buyer for embedded software licenses,missing,description:\n license price and contract nego...,
562,2405,cbs reporting controller admin coach parttime...,successful completion of degree in business ad...,develop worldwide cbs reporting strategy for c...,general management
820,6419,control software engineer replacement panagio...,missing,Perform software projects for technical concep...,


In [11]:
# Create a new column called "job_area" that keeps the word before "--" in job_category.
# For entries in job_category that do not contain "--" (either None or "general management"),
# fill in None for the former and "management" for the latter.

In [12]:
data.loc[data.job_category.str.contains(
    "--", na=False),'job_area'] = data.job_category.str.split("--").str[0]

In [13]:
data.loc[data.job_category.isnull(),'job_area'] = None

In [14]:
data.loc[data.job_category == "general management",'job_area'] = "management"

In [15]:
data.job_area = data.job_area.str.replace(","," ")

In [16]:
# create another column job_area_oneword that combine each entry in job_area as one word using underscores

In [17]:
data['job_area_oneword'] = data.job_area.str.replace(" ","_")

In [18]:
data.head()

Unnamed: 0,id,job_title_en,job_requirement_en,job_description_en,job_category,job_area,job_area_oneword
0,5606,flash and deliver prototypes,,job description\n\nconstruction and electrical...,engineering--engineering,engineering,engineering
1,14,physical verification engineer lnd_150019,your duties as a system test verification eng...,Your duties as a system test & verification en...,"research and development--mechanical,engineeri...",research and development,research_and_development
2,5371,3d printing specialist for packaging materials,academic degree in mechanical engineering pac...,Establishing a global 3d printing strategy for...,logistics--other,logistics,logistics
3,831,a strengthening support employee,successfully completed natural or engineering ...,we are looking for an active and motivated sup...,"research and development--other,research and d...",research and development,research_and_development
4,360,account manager,a degree in the engineering field several year...,as accoutn manager (m / w) of the business uni...,"marketing and sales--service customer support,...",marketing and sales,marketing_and_sales


In [19]:
#Some entries in job_area_oneword still have multiple listings (general management + other stuff) 
# replace the ones that starts with general management with general management. 

In [20]:
data.loc[data.job_area_oneword.str.startswith(
    'general_management',na=False),'job_area_oneword'] = 'general_management'

In [21]:
job_area_oneword_counter = Counter(data.job_area_oneword.astype(str))

In [22]:
# 20 most common job_area_oneword

job_area_oneword_counter.most_common(20)

[('engineering', 1865),
 ('research_and_development', 931),
 ('quality', 443),
 ('marketing_and_sales', 439),
 ('finance_and_controlling', 373),
 ('information_technology', 373),
 ('project_management', 317),
 ('human_resources', 291),
 ('logistics', 261),
 ('purchasing', 256),
 ('manufacturing_operations_and_production', 208),
 ('industrial_engineering', 183),
 ('administration_and_assistance', 158),
 ('general_management', 68),
 ('key_account_management', 67),
 ('communications', 62),
 ('None', 32),
 ('law_patents_and_licences', 27),
 ('facility_management', 26),
 ('management', 25)]

In [23]:
# create a column called "job_area_broader" to create broader job area indicator using the classification below:

# engineering = engineering, industrial_engineering, research_and_development
# finance = finance_and_controlling, law_patents_and_licences, auditing
# marketing_and_sales = marketing_and_sales, key_account_management, communications
# health_safety_and_security_environment = health_safety_and_security, environment

In [24]:
data['job_area_broader'] = data.job_area_oneword

In [25]:
data.loc[(data.job_area_oneword == 'industrial_engineering')|(
    data.job_area_oneword == 'research_and_development'), 'job_area_broader'] = 'engineering'

In [26]:
data.loc[(data.job_area_oneword == 'finance_and_controlling')|(
    data.job_area_oneword == 'law_patents_and_licences')|(
    data.job_area_oneword == 'auditing'), 'job_area_broader'] = 'finance'

In [27]:
data.loc[(data.job_area_oneword == 'key_account_management')|(
    data.job_area_oneword == 'communications'), 'job_area_broader'] = 'marketing_and_sales'

In [28]:
data.loc[(data.job_area_oneword == 'health_safety_and_security')|(
    data.job_area_oneword == 'environment'), 'job_area_broader'] = 'health_safety_and_security_environment'

In [29]:
# There are 32 entries of job_category that are none
# impute a job_area_broader for these entries by looking at the job title and description 
# and find the closest match

In [30]:
l = '6419 51 29 41 3301 5738 3799 3825 6443 3394 783 4006 4007 4248 4250 4251 4454 4452 3380 4000 5295 4350 4402'

In [31]:
engineering_index = l.split(' ')

In [32]:
engineering_index = [int(i) for i in engineering_index]

In [33]:
for i in engineering_index:
    data.loc[data.id == i,'job_area_broader'] = 'engineering'

In [34]:
data.loc[data.id == 4611,'job_area_broader'] = 'facility_management'
data.loc[data.id == 6423,'job_area_broader'] = 'purchasing'
data.loc[data.id == 3824,'job_area_broader'] = 'marketing_and_sales'
data.loc[data.id == 3425,'job_area_broader'] = 'logistics'
data.loc[data.id == 4411,'job_area_broader'] = 'management'
data.loc[data.id == 3480,'job_area_broader'] = 'finance'
data.loc[data.id == 2556,'job_area_broader'] = 'human_resources'
data.loc[data.id == 3614,'job_area_broader'] = 'administration_and_assistance'
data.loc[data.id == 702,'job_area_broader'] = 'project_management'

In [35]:
data.job_area_broader.isnull().value_counts()

False    6472
Name: job_area_broader, dtype: int64

In [36]:
job_area_broader_counter = Counter(data.job_area_broader.astype(str))

In [37]:
# job_area_broader in descending order of popularity

job_area_broader_counter.most_common()

[('engineering', 3002),
 ('marketing_and_sales', 569),
 ('quality', 443),
 ('finance', 420),
 ('information_technology', 373),
 ('project_management', 318),
 ('human_resources', 292),
 ('logistics', 262),
 ('purchasing', 257),
 ('manufacturing_operations_and_production', 208),
 ('administration_and_assistance', 159),
 ('general_management', 68),
 ('health_safety_and_security_environment', 38),
 ('facility_management', 27),
 ('management', 26),
 ('mergers_and_acquisitions', 10)]

### Extract job_rank (lead, senior, group lead, team leader, assistant)

In [38]:
ranks = ['manager', 'specialist', 'head', 'coach', 'assistant', 
        'expert', 'lead', 'senior', 'junior', 'director', 'supervisor']

In [39]:
# create column job_rank that extract ranks from job_title_en if job_title_en contain any word in the ranks list
# if no matching word from ranks list is found, then the job_rank should be None

In [40]:
data['job_rank'] = None

In [41]:
for i in ranks:
    data.loc[data.job_title_en.str.contains(i,na=False),'job_rank'] = i

In [42]:
data.job_rank.isnull().value_counts()

True     3297
False    3175
Name: job_rank, dtype: int64

In [43]:
data.head()

Unnamed: 0,id,job_title_en,job_requirement_en,job_description_en,job_category,job_area,job_area_oneword,job_area_broader,job_rank
0,5606,flash and deliver prototypes,,job description\n\nconstruction and electrical...,engineering--engineering,engineering,engineering,engineering,
1,14,physical verification engineer lnd_150019,your duties as a system test verification eng...,Your duties as a system test & verification en...,"research and development--mechanical,engineeri...",research and development,research_and_development,engineering,
2,5371,3d printing specialist for packaging materials,academic degree in mechanical engineering pac...,Establishing a global 3d printing strategy for...,logistics--other,logistics,logistics,logistics,specialist
3,831,a strengthening support employee,successfully completed natural or engineering ...,we are looking for an active and motivated sup...,"research and development--other,research and d...",research and development,research_and_development,engineering,
4,360,account manager,a degree in the engineering field several year...,as accoutn manager (m / w) of the business uni...,"marketing and sales--service customer support,...",marketing and sales,marketing_and_sales,marketing_and_sales,manager


### Extract degree and experience information from job_requirement_en

In [44]:
# job_requirement_en column contains null type values

In [45]:
data.job_requirement_en.isnull().value_counts()

False    6457
True       15
Name: job_requirement_en, dtype: int64

In [46]:
# job_requirement_en column contains entries that only one or two words, such as
# - "tbd", "missing", "x" , "dd", "ot", 
# "xxx","vv","xx","s","bb" "d", "cf epr", "see above"

In [47]:
data[data.job_requirement_en.str.split(" ").str.len() < 3].shape[0]

405

In [48]:
# replacing these entries with None

In [49]:
data.loc[data.job_requirement_en.isnull(),'job_requirement_en'] = None

In [50]:
data.loc[data.job_requirement_en.str.split(" ").str.len() < 3,'job_requirement_en'] = None

In [51]:
data.job_requirement_en.isnull().value_counts()

False    6052
True      420
Name: job_requirement_en, dtype: int64

In [52]:
degrees = ['bachelor','bachelors','university','master','doctor','doctorate','doctoral']

In [53]:
# create column job_degree that extract degree requirements from job_requirement_en 
# if job_requirement_en contain any word in the degrees list
# if no matching word from degrees list is found, then the job_degree should be None

In [54]:
data['job_degree'] = None

In [55]:
for c in degrees:
    data.loc[data.job_requirement_en.str.contains(c,na=False),'job_degree'] = c

In [56]:
data.job_degree.isnull().value_counts()

True     5192
False    1280
Name: job_degree, dtype: int64

In [57]:
data.head()

Unnamed: 0,id,job_title_en,job_requirement_en,job_description_en,job_category,job_area,job_area_oneword,job_area_broader,job_rank,job_degree
0,5606,flash and deliver prototypes,,job description\n\nconstruction and electrical...,engineering--engineering,engineering,engineering,engineering,,
1,14,physical verification engineer lnd_150019,your duties as a system test verification eng...,Your duties as a system test & verification en...,"research and development--mechanical,engineeri...",research and development,research_and_development,engineering,,
2,5371,3d printing specialist for packaging materials,academic degree in mechanical engineering pac...,Establishing a global 3d printing strategy for...,logistics--other,logistics,logistics,logistics,specialist,
3,831,a strengthening support employee,successfully completed natural or engineering ...,we are looking for an active and motivated sup...,"research and development--other,research and d...",research and development,research_and_development,engineering,,
4,360,account manager,a degree in the engineering field several year...,as accoutn manager (m / w) of the business uni...,"marketing and sales--service customer support,...",marketing and sales,marketing_and_sales,marketing_and_sales,manager,


### Extract # years of experience information from job_requirement_en

In [58]:

# re.search('\w+\s\w+\s\w+\s\w+\s\w+\sexperience\s\w+\s\w+\s\w+\s\w+\s\w+\s(\w+)', str(data.iloc[2]['job_requirement_en']))
                                                                                    

In [59]:
data['extracted_experience'] = None

In [60]:
for i in np.arange(data.shape[0]):
    regex_search = re.search('\w+\s\w+\s\w+\s\w+\s\w+\sexperience\s\w+\s\w+\s\w+\s\w+\s\w+\s(\w+)', str(
        data.loc[i]['job_requirement_en']))
    if regex_search is not None:
        data.ix[i,'extracted_experience'] = regex_search.group(0)
    else:
        data.ix[i,'extracted_experience'] = None

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  import sys
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """


In [61]:
# 1243 entries did not find such pattern that includes "experience" information

data['extracted_experience'].isnull().value_counts()

False    5229
True     1243
Name: extracted_experience, dtype: int64

In [62]:
# only 135 entries contain numbers

data['extracted_experience'].str.contains('\d').value_counts()

False    5094
True      135
Name: extracted_experience, dtype: int64

In [63]:
data[data['extracted_experience'].str.contains('\d',na=False)]

Unnamed: 0,id,job_title_en,job_requirement_en,job_description_en,job_category,job_area,job_area_oneword,job_area_broader,job_rank,job_degree,extracted_experience
122,1969,algorithmic development engineer functional,preferably embedded programming skills in c kn...,Development of algorithms and functions accord...,engineering--engineering,engineering,engineering,engineering,,,experience after vzyklus desirable development...
166,231,application engineer in the area vordevelopmen...,degree preferably in the fields of mechanical ...,meter calibration of internal combustion engin...,"research and development--other,engineering--o...",research and development,research_and_development,engineering,,,at least 2 years of experience in the automoti...
181,564,area engineer it interfaces measurements systems,academic degree in electrical electronic engi...,as to area engineer (m / f) you develop and re...,engineering--other,engineering,engineering,engineering,,,engineering and minimum 3 years experience in ...
207,2386,asic ic developer radio frequency specialist,graduation of electrical engineering microele...,asic planification Including all Necessary doc...,research and development--hardware,research and development,research_and_development,engineering,specialist,,the frequency range of 2080ghz experience with...
315,367,assistant to president of powertrain division,degree of engineering industrial engineering o...,active support of the executive board of the d...,project management--project management,project management,project_management,project_management,assistant,,at least 3 years of experience in the automoti...
433,4763,business process manager,conditions training as engineer optional addit...,Main tasks:\n\n ensures that internal processe...,quality--other,quality,quality,quality,manager,,or project management 5 years experience in th...
563,4519,cbs specialist,completed studies in the technical area or sub...,as cbs specialist (m / f) for the powertrain d...,"general management,industrial engineering--ind...",general management industrial engineering,general_management,general_management,specialist,,solving 5s smed moderation of experience and k...
600,1971,cep cluster operations manager executive,bachelor degree or master degree or similar in...,support of cep plants in eastern europe and as...,"quality--manufacturing and production,manufact...",quality,quality,quality,manager,master,in engineering or equivalent professional expe...
826,103,controller,completed degree in business studies or econom...,in the firmc machinery (cm) complex tire build...,finance and controlling--controlling,finance and controlling,finance_and_controlling,finance,,,more than 5 years work experience in finance a...
947,1188,customer product quality manager,completed studies professional direction engin...,series beginning customer interface for high-q...,quality--engineering and technology,quality,quality,quality,manager,,equivalent 23 years of professional experience...


In [None]:
# try filter out the ones contain words of numbers from 1-10 "one" "two"

In [None]:
# filter out ones with words like "several" "a few" "a lot of"

In [64]:
data[data['extracted_experience'].notnull()]

Unnamed: 0,id,job_title_en,job_requirement_en,job_description_en,job_category,job_area,job_area_oneword,job_area_broader,job_rank,job_degree,extracted_experience
2,5371,3d printing specialist for packaging materials,academic degree in mechanical engineering pac...,Establishing a global 3d printing strategy for...,logistics--other,logistics,logistics,logistics,specialist,,several years of professional work experience ...
4,360,account manager,a degree in the engineering field several year...,as accoutn manager (m / w) of the business uni...,"marketing and sales--service customer support,...",marketing and sales,marketing_and_sales,marketing_and_sales,manager,,beneficial good business management basic expe...
5,5981,account manager,academic degree in economics engineering indus...,determination & development of customer relati...,key account management--key account management...,key account management,key_account_management,marketing_and_sales,manager,,engineering industrial engineering or similar ...
6,4907,account manager,degree ideally in the industrial engineering a...,development of strategic partnerships with wel...,"marketing and sales--service customer support,...",marketing and sales,marketing_and_sales,marketing_and_sales,manager,,pro several years of professional experience i...
7,80,account manager,degree in engineering mechanical engineering a...,We are looking for our automotive sector a key...,key account management--key account management,key account management,key_account_management,marketing_and_sales,manager,,and economics several years of experience in t...
8,1345,account manager,degree in industrial engineering engineering s...,support and build strategic partnerships and r...,key account management--key account management,key account management,key_account_management,marketing_and_sales,manager,,sciences or economics with technical experienc...
10,3161,account manager,successfully completed a technical or economic...,responsible and perception of the sales suppor...,key account management--key account management,key account management,key_account_management,marketing_and_sales,manager,,economics studies several years of experience ...
12,6058,account manager asian accounts emea,academic degree in engineering mechanical engi...,Ensure the business with the oem in order to a...,key account management--key account management,key account management,key_account_management,marketing_and_sales,manager,,or oe development first leadership experience ...
13,2361,account manager autohaus in passenger tire rep...,concluded kaufmtechn training with acquired in...,care and intensification of the business relat...,marketing and sales--sales and distribution,marketing and sales,marketing_and_sales,marketing_and_sales,manager,,or similar qualifications perennial sales expe...
14,3623,account manager car ersd,completed business administration studies sale...,development and implementation of strategy com...,key account management--key account management...,key account management,key_account_management,marketing_and_sales,manager,,more years of relevant work experience in sale...
