```markdown
1 - Understand the data
```

In [2]:
import pandas as pd

# load the data
market = pd.read_csv("../data/raw/monster_com-job_sample.csv")

2 - inspect the market data

In [3]:
market.head()

Unnamed: 0,country,country_code,date_added,has_expired,job_board,job_description,job_title,job_type,location,organization,page_url,salary,sector,uniq_id
0,United States of America,US,,No,jobs.monster.com,TeamSoft is seeing an IT Support Specialist to...,IT Support Technician Job in Madison,Full Time Employee,"Madison, WI 53702",,http://jobview.monster.com/it-support-technici...,,IT/Software Development,11d599f229a80023d2f40e7c52cd941e
1,United States of America,US,,No,jobs.monster.com,The Wisconsin State Journal is seeking a flexi...,Business Reporter/Editor Job in Madison,Full Time,"Madison, WI 53708",Printing and Publishing,http://jobview.monster.com/business-reporter-e...,,,e4cbb126dabf22159aff90223243ff2a
2,United States of America,US,,No,jobs.monster.com,Report this job About the Job DePuy Synthes Co...,Johnson & Johnson Family of Companies Job Appl...,"Full Time, Employee",DePuy Synthes Companies is a member of Johnson...,Personal and Household Services,http://jobview.monster.com/senior-training-lea...,,,839106b353877fa3d896ffb9c1fe01c0
3,United States of America,US,,No,jobs.monster.com,Why Join Altec? If you’re considering a career...,Engineer - Quality Job in Dixon,Full Time,"Dixon, CA",Altec Industries,http://jobview.monster.com/engineer-quality-jo...,,Experienced (Non-Manager),58435fcab804439efdcaa7ecca0fd783
4,United States of America,US,,No,jobs.monster.com,Position ID# 76162 # Positions 1 State CT C...,Shift Supervisor - Part-Time Job in Camphill,Full Time Employee,"Camphill, PA",Retail,http://jobview.monster.com/shift-supervisor-pa...,,Project/Program Management,64d0272dc8496abfd9523a8df63c184c


3 - check for the missing data

In [4]:
market.isna().sum()

country                0
country_code           0
date_added         21878
has_expired            0
job_board              0
job_description        0
job_title              0
job_type            1628
location               0
organization        6867
page_url               0
salary             18554
sector              5194
uniq_id                0
dtype: int64

In [5]:
# change the datatype

market['date_added'] = pd.to_datetime(market['date_added'], errors='coerce')
market['salary'] = pd.to_numeric(market['salary'], errors='coerce')

# fill missing values in 'date_added' with a specific date
market['date_added'] = market['date_added'].fillna(pd.Timestamp('2023-01-01'))

# fill missing values in 'organization' with 'Unknown'
market['organization'] = market['organization'].fillna('Unknown')

# fill missing values in 'salary' with 0 salary
market['salary'] = market['salary'].fillna(0)

# fill missing value in 'sector' with 'Unknown'
market['sector'] = market['sector'].fillna('Unknown')

# fill missing value in 'has_expired' with 'No'
market['has_expired'] = market['has_expired'].map({'Yes': True, 'No': False})


market.isna().any()

country            False
country_code       False
date_added         False
has_expired        False
job_board          False
job_description    False
job_title          False
job_type            True
location           False
organization       False
page_url           False
salary             False
sector             False
uniq_id            False
dtype: bool

In [6]:
print(market.duplicated().sum())

0


In [7]:
# salary check
market[(market['salary'] < 50) & (market['salary'] > 10)]

Unnamed: 0,country,country_code,date_added,has_expired,job_board,job_description,job_title,job_type,location,organization,page_url,salary,sector,uniq_id
17170,United States of America,US,2023-01-01,False,jobs.monster.com,"Details:Data entry using Delphi, Microsoft Wor...",Sales Administrative Assistant - Embassy Suite...,Full Time,"Irving, TX 75062",Hotels and Lodging,http://jobview.monster.com/Sales-Administrativ...,13.0,Experienced (Non-Manager),1becb4b00b9fc87392eaedd5f39dbe41
17637,United States of America,US,2023-01-01,False,jobs.monster.com,"Details:Maintains the guest rooms, public spac...",Maintenance Engineer - Embassy Suites DFW Sout...,Full Time,"Irving, TX 75062",Hotels and Lodging,http://jobview.monster.com/Maintenance-Enginee...,12.5,Experienced (Non-Manager),808253cdb3148849256fd6bb73d4a7ef
18289,United States of America,US,2023-01-01,False,jobs.monster.com,"Details:Maintains the guest rooms, public spac...",Maintenance Engineer - Embassy Suites DFW Sout...,Full Time,"Irving, TX 75062",Hotels and Lodging,http://jobview.monster.com/Maintenance-Enginee...,12.5,Experienced (Non-Manager),35ab0ba79acd6dfaa5dd11656057423f


In [8]:
market.isna().any()

country            False
country_code       False
date_added         False
has_expired        False
job_board          False
job_description    False
job_title          False
job_type            True
location           False
organization       False
page_url           False
salary             False
sector             False
uniq_id            False
dtype: bool

In [9]:
market.head()

Unnamed: 0,country,country_code,date_added,has_expired,job_board,job_description,job_title,job_type,location,organization,page_url,salary,sector,uniq_id
0,United States of America,US,2023-01-01,False,jobs.monster.com,TeamSoft is seeing an IT Support Specialist to...,IT Support Technician Job in Madison,Full Time Employee,"Madison, WI 53702",Unknown,http://jobview.monster.com/it-support-technici...,0.0,IT/Software Development,11d599f229a80023d2f40e7c52cd941e
1,United States of America,US,2023-01-01,False,jobs.monster.com,The Wisconsin State Journal is seeking a flexi...,Business Reporter/Editor Job in Madison,Full Time,"Madison, WI 53708",Printing and Publishing,http://jobview.monster.com/business-reporter-e...,0.0,Unknown,e4cbb126dabf22159aff90223243ff2a
2,United States of America,US,2023-01-01,False,jobs.monster.com,Report this job About the Job DePuy Synthes Co...,Johnson & Johnson Family of Companies Job Appl...,"Full Time, Employee",DePuy Synthes Companies is a member of Johnson...,Personal and Household Services,http://jobview.monster.com/senior-training-lea...,0.0,Unknown,839106b353877fa3d896ffb9c1fe01c0
3,United States of America,US,2023-01-01,False,jobs.monster.com,Why Join Altec? If you’re considering a career...,Engineer - Quality Job in Dixon,Full Time,"Dixon, CA",Altec Industries,http://jobview.monster.com/engineer-quality-jo...,0.0,Experienced (Non-Manager),58435fcab804439efdcaa7ecca0fd783
4,United States of America,US,2023-01-01,False,jobs.monster.com,Position ID# 76162 # Positions 1 State CT C...,Shift Supervisor - Part-Time Job in Camphill,Full Time Employee,"Camphill, PA",Retail,http://jobview.monster.com/shift-supervisor-pa...,0.0,Project/Program Management,64d0272dc8496abfd9523a8df63c184c


In [10]:
def standardize_job_type(job_type):
    if pd.isna(job_type):
        return 'Unknown'
    job_type = job_type.lower()
    if 'full' in job_type:
        return 'Full-time'
    elif 'part' in job_type:
        return 'Part-time'
    elif 'contract' in job_type:
        return 'Contract'
    elif 'intern' in job_type:
        return 'Internship'
    elif 'temp' in job_type or 'temporary' in job_type:
        return 'Temporary'
    else:
        return 'Other'
    
market['job_type'] = market['job_type'].apply(standardize_job_type)

In [11]:
market['location'] = market['location'].str.replace(r'\s+\d{5}(-\d{4})?$', '', regex=True)
market['clean_location'] = market['location'].str.strip()
market['city'] = market['location'].str.extract(r'^([^,]+)')
market['state'] = market['location'].str.extract(r',\s*([A-Z]{2})')

In [12]:
market.head()

Unnamed: 0,country,country_code,date_added,has_expired,job_board,job_description,job_title,job_type,location,organization,page_url,salary,sector,uniq_id,clean_location,city,state
0,United States of America,US,2023-01-01,False,jobs.monster.com,TeamSoft is seeing an IT Support Specialist to...,IT Support Technician Job in Madison,Full-time,"Madison, WI",Unknown,http://jobview.monster.com/it-support-technici...,0.0,IT/Software Development,11d599f229a80023d2f40e7c52cd941e,"Madison, WI",Madison,WI
1,United States of America,US,2023-01-01,False,jobs.monster.com,The Wisconsin State Journal is seeking a flexi...,Business Reporter/Editor Job in Madison,Full-time,"Madison, WI",Printing and Publishing,http://jobview.monster.com/business-reporter-e...,0.0,Unknown,e4cbb126dabf22159aff90223243ff2a,"Madison, WI",Madison,WI
2,United States of America,US,2023-01-01,False,jobs.monster.com,Report this job About the Job DePuy Synthes Co...,Johnson & Johnson Family of Companies Job Appl...,Full-time,DePuy Synthes Companies is a member of Johnson...,Personal and Household Services,http://jobview.monster.com/senior-training-lea...,0.0,Unknown,839106b353877fa3d896ffb9c1fe01c0,DePuy Synthes Companies is a member of Johnson...,DePuy Synthes Companies is a member of Johnson...,MA
3,United States of America,US,2023-01-01,False,jobs.monster.com,Why Join Altec? If you’re considering a career...,Engineer - Quality Job in Dixon,Full-time,"Dixon, CA",Altec Industries,http://jobview.monster.com/engineer-quality-jo...,0.0,Experienced (Non-Manager),58435fcab804439efdcaa7ecca0fd783,"Dixon, CA",Dixon,CA
4,United States of America,US,2023-01-01,False,jobs.monster.com,Position ID# 76162 # Positions 1 State CT C...,Shift Supervisor - Part-Time Job in Camphill,Full-time,"Camphill, PA",Retail,http://jobview.monster.com/shift-supervisor-pa...,0.0,Project/Program Management,64d0272dc8496abfd9523a8df63c184c,"Camphill, PA",Camphill,PA


In [13]:
market['job_title'] = market['job_title'].str.strip()
market['organization'] = market['organization'].str.strip()

In [14]:
market.head()

Unnamed: 0,country,country_code,date_added,has_expired,job_board,job_description,job_title,job_type,location,organization,page_url,salary,sector,uniq_id,clean_location,city,state
0,United States of America,US,2023-01-01,False,jobs.monster.com,TeamSoft is seeing an IT Support Specialist to...,IT Support Technician Job in Madison,Full-time,"Madison, WI",Unknown,http://jobview.monster.com/it-support-technici...,0.0,IT/Software Development,11d599f229a80023d2f40e7c52cd941e,"Madison, WI",Madison,WI
1,United States of America,US,2023-01-01,False,jobs.monster.com,The Wisconsin State Journal is seeking a flexi...,Business Reporter/Editor Job in Madison,Full-time,"Madison, WI",Printing and Publishing,http://jobview.monster.com/business-reporter-e...,0.0,Unknown,e4cbb126dabf22159aff90223243ff2a,"Madison, WI",Madison,WI
2,United States of America,US,2023-01-01,False,jobs.monster.com,Report this job About the Job DePuy Synthes Co...,Johnson & Johnson Family of Companies Job Appl...,Full-time,DePuy Synthes Companies is a member of Johnson...,Personal and Household Services,http://jobview.monster.com/senior-training-lea...,0.0,Unknown,839106b353877fa3d896ffb9c1fe01c0,DePuy Synthes Companies is a member of Johnson...,DePuy Synthes Companies is a member of Johnson...,MA
3,United States of America,US,2023-01-01,False,jobs.monster.com,Why Join Altec? If you’re considering a career...,Engineer - Quality Job in Dixon,Full-time,"Dixon, CA",Altec Industries,http://jobview.monster.com/engineer-quality-jo...,0.0,Experienced (Non-Manager),58435fcab804439efdcaa7ecca0fd783,"Dixon, CA",Dixon,CA
4,United States of America,US,2023-01-01,False,jobs.monster.com,Position ID# 76162 # Positions 1 State CT C...,Shift Supervisor - Part-Time Job in Camphill,Full-time,"Camphill, PA",Retail,http://jobview.monster.com/shift-supervisor-pa...,0.0,Project/Program Management,64d0272dc8496abfd9523a8df63c184c,"Camphill, PA",Camphill,PA


In [15]:
market['location'].value_counts().head()

location
Dallas, TX        1388
Cincinnati, OH     837
Columbus, OH       703
Chicago, IL        402
Houston, TX        399
Name: count, dtype: int64

In [16]:
market['state'] = market['state'].fillna('Unknown')

In [17]:
market.drop(columns=['location', 'job_board', 'country', 'country_code'])

Unnamed: 0,date_added,has_expired,job_description,job_title,job_type,organization,page_url,salary,sector,uniq_id,clean_location,city,state
0,2023-01-01,False,TeamSoft is seeing an IT Support Specialist to...,IT Support Technician Job in Madison,Full-time,Unknown,http://jobview.monster.com/it-support-technici...,0.0,IT/Software Development,11d599f229a80023d2f40e7c52cd941e,"Madison, WI",Madison,WI
1,2023-01-01,False,The Wisconsin State Journal is seeking a flexi...,Business Reporter/Editor Job in Madison,Full-time,Printing and Publishing,http://jobview.monster.com/business-reporter-e...,0.0,Unknown,e4cbb126dabf22159aff90223243ff2a,"Madison, WI",Madison,WI
2,2023-01-01,False,Report this job About the Job DePuy Synthes Co...,Johnson & Johnson Family of Companies Job Appl...,Full-time,Personal and Household Services,http://jobview.monster.com/senior-training-lea...,0.0,Unknown,839106b353877fa3d896ffb9c1fe01c0,DePuy Synthes Companies is a member of Johnson...,DePuy Synthes Companies is a member of Johnson...,MA
3,2023-01-01,False,Why Join Altec? If you’re considering a career...,Engineer - Quality Job in Dixon,Full-time,Altec Industries,http://jobview.monster.com/engineer-quality-jo...,0.0,Experienced (Non-Manager),58435fcab804439efdcaa7ecca0fd783,"Dixon, CA",Dixon,CA
4,2023-01-01,False,Position ID# 76162 # Positions 1 State CT C...,Shift Supervisor - Part-Time Job in Camphill,Full-time,Retail,http://jobview.monster.com/shift-supervisor-pa...,0.0,Project/Program Management,64d0272dc8496abfd9523a8df63c184c,"Camphill, PA",Camphill,PA
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21995,2023-01-01,False,This is a major premier Cincinnati based finan...,Assistant Vice President - Controller Job in C...,Full-time,Unknown,http://jobview.monster.com/Assistant-Vice-Pres...,0.0,Unknown,a80bc8cc3a90c17eef418963803bc640,"Cincinnati, OH",Cincinnati,OH
21996,2023-01-01,False,Luxury homebuilder in Cincinnati seeking multi...,Accountant Job in Cincinnati,Full-time,Construction - Residential & Commercial/Office,http://jobview.monster.com/Accountant-Job-Cinc...,0.0,Manager (Manager/Supervisor of Staff),419a3714be2b30a10f628de207d041de,"Cincinnati, OH",Cincinnati,OH
21997,2023-01-01,False,RE: Adobe AEM- Client - Loca...,AEM/CQ developer Job in Chicago,Full-time,Unknown,http://jobview.monster.com/AEM-CQ5-developer-J...,0.0,Unknown,5a590350b73b2cec46b05750a208e345,"Chicago, IL",Chicago,IL
21998,2023-01-01,False,Jernberg Industries was established in 1937 an...,Electrician - Experienced Forging Electrician ...,Full-time,"Jernberg Industries, Inc.",http://jobview.monster.com/Electrician-Experie...,0.0,Installation/Maintenance/Repair,40161cf61c283af9dc2b0a62947a5f1b,"Chicago, IL",Chicago,IL


In [18]:
market = market[["uniq_id", "job_title", "job_description", "job_type", "sector", "organization", "city", "state", "date_added", "salary", "has_expired"]]
market

Unnamed: 0,uniq_id,job_title,job_description,job_type,sector,organization,city,state,date_added,salary,has_expired
0,11d599f229a80023d2f40e7c52cd941e,IT Support Technician Job in Madison,TeamSoft is seeing an IT Support Specialist to...,Full-time,IT/Software Development,Unknown,Madison,WI,2023-01-01,0.0,False
1,e4cbb126dabf22159aff90223243ff2a,Business Reporter/Editor Job in Madison,The Wisconsin State Journal is seeking a flexi...,Full-time,Unknown,Printing and Publishing,Madison,WI,2023-01-01,0.0,False
2,839106b353877fa3d896ffb9c1fe01c0,Johnson & Johnson Family of Companies Job Appl...,Report this job About the Job DePuy Synthes Co...,Full-time,Unknown,Personal and Household Services,DePuy Synthes Companies is a member of Johnson...,MA,2023-01-01,0.0,False
3,58435fcab804439efdcaa7ecca0fd783,Engineer - Quality Job in Dixon,Why Join Altec? If you’re considering a career...,Full-time,Experienced (Non-Manager),Altec Industries,Dixon,CA,2023-01-01,0.0,False
4,64d0272dc8496abfd9523a8df63c184c,Shift Supervisor - Part-Time Job in Camphill,Position ID# 76162 # Positions 1 State CT C...,Full-time,Project/Program Management,Retail,Camphill,PA,2023-01-01,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...
21995,a80bc8cc3a90c17eef418963803bc640,Assistant Vice President - Controller Job in C...,This is a major premier Cincinnati based finan...,Full-time,Unknown,Unknown,Cincinnati,OH,2023-01-01,0.0,False
21996,419a3714be2b30a10f628de207d041de,Accountant Job in Cincinnati,Luxury homebuilder in Cincinnati seeking multi...,Full-time,Manager (Manager/Supervisor of Staff),Construction - Residential & Commercial/Office,Cincinnati,OH,2023-01-01,0.0,False
21997,5a590350b73b2cec46b05750a208e345,AEM/CQ developer Job in Chicago,RE: Adobe AEM- Client - Loca...,Full-time,Unknown,Unknown,Chicago,IL,2023-01-01,0.0,False
21998,40161cf61c283af9dc2b0a62947a5f1b,Electrician - Experienced Forging Electrician ...,Jernberg Industries was established in 1937 an...,Full-time,Installation/Maintenance/Repair,"Jernberg Industries, Inc.",Chicago,IL,2023-01-01,0.0,False


In [19]:
dirty_titles = market['job_title'].str.contains('>|<|=|var', regex=True, case=False, na=False)
market.loc[dirty_titles, 'job_title'] = market.loc[dirty_titles, 'job_title'].str.split('|').str[0].str.split('{').str[0].str.strip()

In [20]:
market

Unnamed: 0,uniq_id,job_title,job_description,job_type,sector,organization,city,state,date_added,salary,has_expired
0,11d599f229a80023d2f40e7c52cd941e,IT Support Technician Job in Madison,TeamSoft is seeing an IT Support Specialist to...,Full-time,IT/Software Development,Unknown,Madison,WI,2023-01-01,0.0,False
1,e4cbb126dabf22159aff90223243ff2a,Business Reporter/Editor Job in Madison,The Wisconsin State Journal is seeking a flexi...,Full-time,Unknown,Printing and Publishing,Madison,WI,2023-01-01,0.0,False
2,839106b353877fa3d896ffb9c1fe01c0,Johnson & Johnson Family of Companies Job Appl...,Report this job About the Job DePuy Synthes Co...,Full-time,Unknown,Personal and Household Services,DePuy Synthes Companies is a member of Johnson...,MA,2023-01-01,0.0,False
3,58435fcab804439efdcaa7ecca0fd783,Engineer - Quality Job in Dixon,Why Join Altec? If you’re considering a career...,Full-time,Experienced (Non-Manager),Altec Industries,Dixon,CA,2023-01-01,0.0,False
4,64d0272dc8496abfd9523a8df63c184c,Shift Supervisor - Part-Time Job in Camphill,Position ID# 76162 # Positions 1 State CT C...,Full-time,Project/Program Management,Retail,Camphill,PA,2023-01-01,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...
21995,a80bc8cc3a90c17eef418963803bc640,Assistant Vice President - Controller Job in C...,This is a major premier Cincinnati based finan...,Full-time,Unknown,Unknown,Cincinnati,OH,2023-01-01,0.0,False
21996,419a3714be2b30a10f628de207d041de,Accountant Job in Cincinnati,Luxury homebuilder in Cincinnati seeking multi...,Full-time,Manager (Manager/Supervisor of Staff),Construction - Residential & Commercial/Office,Cincinnati,OH,2023-01-01,0.0,False
21997,5a590350b73b2cec46b05750a208e345,AEM/CQ developer Job in Chicago,RE: Adobe AEM- Client - Loca...,Full-time,Unknown,Unknown,Chicago,IL,2023-01-01,0.0,False
21998,40161cf61c283af9dc2b0a62947a5f1b,Electrician - Experienced Forging Electrician ...,Jernberg Industries was established in 1937 an...,Full-time,Installation/Maintenance/Repair,"Jernberg Industries, Inc.",Chicago,IL,2023-01-01,0.0,False


In [21]:
market.to_csv("../data/cleaned/cleaned_monster_com_job_sample.csv", index=False)