### Data cleaning and exploration of the Indeed 2020 dataset

In [1]:
import glob
import os
import re

import numpy as np
import pandas as pd
# Root folder of the raw job-posting CSV files. read_csv() builds its glob
# pattern by plain string concatenation, so the trailing slash is required.
INDEED_JOB_POSTING_DATA = '../data/'

In [2]:
def read_csv(dataset, data_dir=None):
    """Load every CSV of one dataset into a single de-duplicated DataFrame.

    Parameters
    ----------
    dataset : str
        Path prefix (appended to ``data_dir``) of the files to load, e.g.
        ``'Indeed_2020/'``. Every file matching ``<data_dir><dataset>*.csv``
        is read with its first column as the index.
    data_dir : str, optional
        Root folder, including the trailing separator. Defaults to the
        notebook-level ``INDEED_JOB_POSTING_DATA``.

    Returns
    -------
    pandas.DataFrame
        Rows of all matching files, with ``postdate_yyyymmdd`` parsed from
        ``YYYYMMDD`` into datetimes and only the first row kept for each
        ``uniq_id``. The per-file index labels are preserved.

    Raises
    ------
    FileNotFoundError
        If no CSV file matches the pattern.
    """
    root = INDEED_JOB_POSTING_DATA if data_dir is None else data_dir
    pattern = root + dataset + "*.csv"
    # Sort the matches: glob returns files in arbitrary order, and the row
    # that survives drop_duplicates depends on which file is read first.
    csv_files = sorted(glob.glob(pattern))
    if not csv_files:
        raise FileNotFoundError("No CSV files match " + pattern)

    # Read all files, then concatenate once. DataFrame.append was removed in
    # pandas 2.0 and re-copied the growing frame on every iteration.
    indeed_df = pd.concat([pd.read_csv(csv_file, index_col=0) for csv_file in csv_files])
    indeed_df['postdate_yyyymmdd'] = pd.to_datetime(indeed_df['postdate_yyyymmdd'], format='%Y%m%d')

    print("Before dropping duplicate ids: " + str(indeed_df.shape[0]))
    # Keep one row per posting id (first occurrence wins).
    indeed_df = indeed_df.drop_duplicates(subset=['uniq_id'])
    print("After dropping duplicate ids: " + str(indeed_df.shape[0]))
    return indeed_df

In [3]:
indeed_2020_df = read_csv('Indeed_2020/')
# Sanity check: number of postings whose post date is missing.
indeed_2020_df['postdate_yyyymmdd'].isnull().sum()

Before dropping duplicate ids: 29995
After dropping duplicate ids: 29995


0

In [4]:
# Preview the first five rows of the loaded postings.
indeed_2020_df.head(5)

Unnamed: 0,uniq_id,crawl_timestamp,url,job_title,category,company_name,city,state,country,post_date,...,salary_offered,test_contact_email,contact_email,inferred_salary_time_unit,inferred_salary_currency,is_currency_valid,is_salary_unit_correct,is_from_range_valid,is_to_range_valid,dataset
0,970589e837dfe5d7b96a4edb68ed53fc,2020-06-27 22:48:05 +0000,https://www.indeed.com/viewjob?jk=6e2f10dbcc82...,Assistant Teacher,Education-or-training,Tutor Time Learning Centers,Powell,OH,US,2020-06-27,...,,,,,,,,,,
1,5b8e77e647a868558bd14d77faa20d11,2020-06-03 14:34:46 +0000,https://www.indeed.com/viewjob?jk=e2ef934dfe3d...,Team Member,Restaurant-or-food-Service,Jack's Family Restaurant | Jacks Family Restau...,Gadsden,AL,US,2020-06-03,...,,,,,,,,,,
2,ffaa733004c1791099e8481a5fead2e4,2020-05-10 03:25:40 +0000,https://www.indeed.com/cmp/ZEELAND-FARM-SERVIC...,Driver ZFS - Elevator Miscellaneous,Construction-or-facilities,"Zeeland Farm Services, Inc.",Zeeland,MI,US,2020-05-10,...,,,,,,,,,,
3,6388e9094ff67beb57a04c1bb84613d1,2020-05-30 06:55:44 +0000,https://www.indeed.com/viewjob?jk=e44bcf868eea...,NURSE (FLIGHT OPERATIONS),Accounting-or-finance,US Department of the Air Force,Keesler AFB,MS,US,2020-05-30,...,,,,,,,,,,
4,f34f23997811c9c163d9c14347725cd0,2020-05-14 07:10:50 +0000,https://www.indeed.com/viewjob?jk=b551a641bce8...,"Sr Mgr, Clinic Operations",Administrative,Emory,Decatur,GA,US,2020-05-14,...,,,,,,,,,,


In [5]:
# Distinct values of the country column.
indeed_2020_df['country'].unique()

array(['US'], dtype=object)

In [6]:
# List every column name of the loaded frame.
indeed_2020_df.columns.tolist()

['uniq_id',
 'crawl_timestamp',
 'url',
 'job_title',
 'category',
 'company_name',
 'city',
 'state',
 'country',
 'post_date',
 'job_description',
 'job_type',
 'company_description',
 'job_board',
 'geo',
 'job_post_lang',
 'inferred_iso2_lang_code',
 'inferred_iso3_lang_code',
 'site_name',
 'html_job_description',
 'domain',
 'postdate_yyyymmdd',
 'has_expired',
 'last_expiry_check_date',
 'latest_expiry_check_date',
 'duplicate_status',
 'postdate_in_indexname_format',
 'inferred_city',
 'inferred_state',
 'inferred_country',
 'fitness_score',
 'inferred_salary_from',
 'inferred_salary_to',
 'salary_offered',
 'test_contact_email',
 'contact_email',
 'inferred_salary_time_unit',
 'inferred_salary_currency',
 'is_currency_valid',
 'is_salary_unit_correct',
 'is_from_range_valid',
 'is_to_range_valid',
 'dataset']

In [7]:
# Number of postings without a category.
indeed_2020_df['category'].isnull().sum()

0

In [8]:
# Postings per category: count the non-null uniq_id values in each group.
indeed_2020_cat = indeed_2020_df.groupby('category')['uniq_id'].count().reset_index()
indeed_2020_cat

Unnamed: 0,category,uniq_id
0,Accounting-or-finance,1396
1,Administrative,3750
2,Arts-or-entertainment-or-publishing,302
3,Banking-or-loans,403
4,Computer-or-internet,2032
5,Construction-or-facilities,2439
6,Customer-Service,1215
7,Education-or-training,1043
8,Engineering-or-architecture,869
9,Government-or-military,212


### Keep only jobs posted in May or June 2020

In [9]:
# Reporting window: May and June of 2020.
selected_year = 2020
selected_months = [5, 6]

posted_on = pd.DatetimeIndex(indeed_2020_df['postdate_yyyymmdd'])
indeed_2020_df['posted_month'] = posted_on.month

# Check the year as well as the month; a month-only test would also keep
# postings dated May/June of any other year.
in_window = (posted_on.year == selected_year) & posted_on.month.isin(selected_months)

# reset_index is chained rather than called in place on the .loc slice, so
# the result is a fresh frame with a clean 0..n-1 index.
indeed_2020_filtered = indeed_2020_df.loc[in_window].reset_index(drop=True)
indeed_2020_filtered

Unnamed: 0,uniq_id,crawl_timestamp,url,job_title,category,company_name,city,state,country,post_date,...,test_contact_email,contact_email,inferred_salary_time_unit,inferred_salary_currency,is_currency_valid,is_salary_unit_correct,is_from_range_valid,is_to_range_valid,dataset,posted_month
0,970589e837dfe5d7b96a4edb68ed53fc,2020-06-27 22:48:05 +0000,https://www.indeed.com/viewjob?jk=6e2f10dbcc82...,Assistant Teacher,Education-or-training,Tutor Time Learning Centers,Powell,OH,US,2020-06-27,...,,,,,,,,,,6
1,5b8e77e647a868558bd14d77faa20d11,2020-06-03 14:34:46 +0000,https://www.indeed.com/viewjob?jk=e2ef934dfe3d...,Team Member,Restaurant-or-food-Service,Jack's Family Restaurant | Jacks Family Restau...,Gadsden,AL,US,2020-06-03,...,,,,,,,,,,6
2,ffaa733004c1791099e8481a5fead2e4,2020-05-10 03:25:40 +0000,https://www.indeed.com/cmp/ZEELAND-FARM-SERVIC...,Driver ZFS - Elevator Miscellaneous,Construction-or-facilities,"Zeeland Farm Services, Inc.",Zeeland,MI,US,2020-05-10,...,,,,,,,,,,5
3,6388e9094ff67beb57a04c1bb84613d1,2020-05-30 06:55:44 +0000,https://www.indeed.com/viewjob?jk=e44bcf868eea...,NURSE (FLIGHT OPERATIONS),Accounting-or-finance,US Department of the Air Force,Keesler AFB,MS,US,2020-05-30,...,,,,,,,,,,5
4,f34f23997811c9c163d9c14347725cd0,2020-05-14 07:10:50 +0000,https://www.indeed.com/viewjob?jk=b551a641bce8...,"Sr Mgr, Clinic Operations",Administrative,Emory,Decatur,GA,US,2020-05-14,...,,,,,,,,,,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29045,df3c978e729a912b16f19b59d1472058,2020-06-20 02:46:06 +0000,https://www.indeed.com/cmp/Law-Offices/jobs/Re...,Real Estate Paralegal,Legal,Law Offices of Gregory Schadone Ltd,North Providence,RI,US,2020-06-20,...,,,,,,,,,,6
29046,7d22175efec4bfe607ace13086c8dcfa,2020-05-14 17:44:10 +0000,https://www.indeed.com/cmp/Herrington-Harbour-...,Administrative Assistant / Slip Rentals,Insurance,Herrington Harbour Marinas,North Beach,MD,US,2020-05-14,...,,,,,,,,,,5
29047,062c00200a61772c89a43090c6e8ad4a,2020-06-16 01:15:28 +0000,https://www.indeed.com/cmp/Fertility-Specialis...,New Patient Coordinator,Customer-Service,Fertility Specialists of Texas,Frisco,TX,US,2020-06-16,...,,,,,,,,,,6
29048,333d49a311dc601890c16d72607b5131,2020-06-27 20:52:36 +0000,https://www.indeed.com/viewjob?jk=0eb431c23fc3...,VP Medical Affairs,Sales,Bon Secours,Portsmouth,VA,US,2020-06-27,...,['recruitment@mercy.com'],recruitment@mercy.com,,,,,,,,6


In [10]:
# Number of retained postings per posting month.
indeed_2020_filtered.groupby(by='posted_month').size()

posted_month
5    14071
6    14979
dtype: int64

### Select jobs based on categories 

In [11]:
# Categories kept for the rest of the analysis.
selected_categories = ['Computer-or-internet', 'Healthcare', 'Retail', 'Restaurant-or-food-Service']
in_selected_category = indeed_2020_filtered['category'].isin(selected_categories)
indeed_2020_filtered = indeed_2020_filtered.loc[in_selected_category].reset_index(drop=True)

In [12]:
# Number of retained postings per selected category.
indeed_2020_filtered.groupby(by='category').size()

category
Computer-or-internet          1944
Healthcare                    3115
Restaurant-or-food-Service    1461
Retail                        1041
dtype: int64

In [13]:
# Postings per category, broken down by posting month.
indeed_2020_filtered.groupby(by=['category', 'posted_month']).size()

category                    posted_month
Computer-or-internet        5                943
                            6               1001
Healthcare                  5               1495
                            6               1620
Restaurant-or-food-Service  5                583
                            6                878
Retail                      5                479
                            6                562
dtype: int64

In [14]:
# Columns shown in the preview below.
preview_columns = [
    'uniq_id', 'job_title', 'category', 'company_name',
    'city', 'state', 'country', 'post_date',
    'job_description', 'job_type', 'company_description', 'html_job_description',
    'inferred_city', 'inferred_state', 'inferred_country', 'fitness_score',
    'inferred_salary_from', 'inferred_salary_to', 'salary_offered',
]
indeed_2020_filtered[preview_columns].head()

Unnamed: 0,uniq_id,job_title,category,company_name,city,state,country,post_date,job_description,job_type,company_description,html_job_description,inferred_city,inferred_state,inferred_country,fitness_score,inferred_salary_from,inferred_salary_to,salary_offered
0,5b8e77e647a868558bd14d77faa20d11,Team Member,Restaurant-or-food-Service,Jack's Family Restaurant | Jacks Family Restau...,Gadsden,AL,US,2020-06-03,JOB TITLE: Team Member Job Summary Prepare and...,,Jack’s Family Restaurants was started in 1960 ...,"<div id=""jobDescriptionText"" class=""jobsearch-...",Gadsden,Alabama,United states,10,,,
1,12f862ed9eeada12ca7a15d2163c4351,"Clinical Nurse(PT) 24 hours, Radiation Oncolog...",Healthcare,D-H Lebanon-MHMH,Lebanon,NH,US,2020-06-21,The Clinical Nurse is an engaged and credentia...,Part-time,Dartmouth-Hitchcock Medical Center (DHMC) is a...,"<div id=""jobDescriptionText"" class=""jobsearch-...",Lebanon,New hampshire,United states,10,,,
2,68f57af0b7eaff77b6d7a06748d92604,Nurse Specialist,Healthcare,East Carolina University,Greenville,NC,US,2020-06-09,The primary function of this position will be ...,,"East Carolina University is a public, doctoral...","<div id=""jobDescriptionText"" class=""jobsearch-...",Greenville,North carolina,United states,10,.,.,If no applicants apply who meet the required c...
3,50873a53f1c51b599030a9edc8e2ea4a,Grill/Fry Cook (Seasonal),Restaurant-or-food-Service,Whale's Tale Water Park,Lincoln,NH,US,2020-05-21,We're looking for a Grill/Fry Cook for the sum...,,,"<div id=""jobDescriptionText"" class=""jobsearch-...",Lincoln,New hampshire,United states,10,9.00,13.00,Salary: $9.00 to $13.00 /hour
4,2862980353e9a2c8cbf8adf5f662961d,Business Systems Analyst,Computer-or-internet,"City of El Paso, TX",El Paso,TX,US,2020-05-21,Requirements Candidate must have a Bachelor's ...,,,"<div id=""jobDescriptionText"" class=""jobsearch-...",El paso,Texas,United states,10,.,.,"For complete job specification, click here. Sa..."


In [17]:
# Show the job description of one sample posting, looked up by its id.
is_sample_posting = indeed_2020_filtered['uniq_id'] == '2862980353e9a2c8cbf8adf5f662961d'
print(indeed_2020_filtered.loc[is_sample_posting, 'job_description'])

4    Requirements Candidate must have a Bachelor's ...
Name: job_description, dtype: object


In [21]:
def export_category(jobs_df, category, csv_path):
    """Save the postings of one category to a CSV file and return them.

    Parameters
    ----------
    jobs_df : pandas.DataFrame
        Postings with a ``category`` column.
    category : str
        Exact category value to keep.
    csv_path : str
        Destination file. Missing parent folders are created.

    Returns
    -------
    pandas.DataFrame
        The matching rows with a fresh 0..n-1 index. That index is written
        as the first CSV column.
    """
    category_df = jobs_df.loc[jobs_df['category'] == category].reset_index(drop=True)
    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(os.path.dirname(csv_path) or '.', exist_ok=True)
    category_df.to_csv(csv_path)
    return category_df


it_df = export_category(
    indeed_2020_filtered, 'Computer-or-internet',
    '../processed_data/2020/20200501_20200630_IT.csv')

healthcare_df = export_category(
    indeed_2020_filtered, 'Healthcare',
    '../processed_data/2020/20200501_20200630_healthcare.csv')

retail_df = export_category(
    indeed_2020_filtered, 'Retail',
    '../processed_data/2020/20200501_20200630_retail.csv')

food_df = export_category(
    indeed_2020_filtered, 'Restaurant-or-food-Service',
    '../processed_data/2020/20200501_20200630_food.csv')