In [1]:
import json
import os
from collections import Counter

import pandas as pd

from data_preparation import keywords_finder, salary_processor, date_processor
from seek_scraper import SeekScraper

In [None]:
# Read the raw-data CSV into a DataFrame and preview its first rows.
raw_data_file = 'data/seek_scraper_raw_data.csv'
data = pd.read_csv(filepath_or_buffer=raw_data_file)
data.head()

In [None]:
# Run salary_processor over every posting's salary text and spread the results
# into their own columns, appended to the right of `data`.
# NOTE: this cell is not idempotent — re-running it appends the salary columns
# a second time and passes already-processed dates through date_processor again.
sal_processed = data['jobSalary'].apply(salary_processor)
# Give the salary frame the same index as `data`: pd.concat(axis=1) aligns rows
# on index labels, not on position, so a frame with a fresh 0..n-1 index would
# be misaligned (and NaN-padded) if `data` were ever filtered or re-indexed.
sal_df = pd.DataFrame(sal_processed.tolist(), index=data.index)
data = pd.concat([data, sal_df], axis=1)
# Replace each posted-time value with the result of date_processor
# (imported from data_preparation).
data['jobPostedTime'] = data['jobPostedTime'].apply(date_processor)

In [None]:
# Load the two keyword lists used for keyword extraction.
# NOTE: the name `langugages` (sic) is kept as-is because the extraction cell
# refers to it by exactly this spelling.

# Skill keywords
with open('data/keywords/skill_keywords.json') as skills_file:
    skills = json.load(skills_file)

# Programming-language keywords
with open('data/keywords/programming_keywords.json') as languages_file:
    langugages = json.load(languages_file)

In [None]:
# Run keywords_finder over every job description, once per keyword list.
# The keyword list is handed to keywords_finder as its second positional
# argument via `args`.
descriptions = data['jobDescription']
data['skills'] = descriptions.apply(keywords_finder, args=(skills,))
data['programmingLanguage'] = descriptions.apply(keywords_finder, args=(langugages,))
# The description text is no longer needed once the keywords are extracted.
data = data.drop(columns='jobDescription')
data.head()

In [None]:
# Show the column labels of `data` to check which columns are present before saving.
data.columns

In [None]:
# Save the processed data to CSV.
processed_location = 'data/processed'
# exist_ok=True creates the directory only when it is missing, so no separate
# exists() check is needed (such a check can race with another process that
# creates the directory between the check and the makedirs call).
os.makedirs(processed_location, exist_ok=True)
# Build the target path once so the write and the message cannot drift apart.
processed_file = f"{processed_location}/seek_scraper_processed_data.csv"
data.to_csv(processed_file, index=False)
# Print success message.
print(f"Processed data saved to {processed_file}")

In [None]:
# Count how many times each skill and each programming language appears
# across all postings.
#
# The loop variables deliberately do NOT reuse the names `skills` and
# `langugages`: those hold the keyword lists loaded from JSON, and rebinding
# them here would make any later re-run of the keyword-extraction cell search
# for the wrong keywords.
skill_counter = Counter()
language_counter = Counter()

# NOTE(review): skills are split on ':' while languages are split on ',' —
# confirm both match the delimiter keywords_finder actually emits.
for skills_cell in data['skills']:
    skill_counter.update(token.strip() for token in skills_cell.split(':'))
for languages_cell in data['programmingLanguage']:
    language_counter.update(token.strip() for token in languages_cell.split(','))

# Sort the skills and languages by count, highest first. most_common() keeps
# first-seen order among equal counts, and dicts preserve insertion order.
skills_count = dict(skill_counter.most_common())
langugages_count = dict(language_counter.most_common())

In [4]:
# Reload the processed CSV from disk and preview the first rows.
df = pd.read_csv(
    'data/processed/seek_scraper_processed_data.csv',
    na_filter=False,  # turn off pandas' missing-value detection while parsing
)
df.head()

Unnamed: 0,jobId,searchKeywords,searchLocation,searchDate,country,advertiserName,jobTitle,jobStatus,jobListingType,jobPostedTime,...,jobWorkType,jobSalary,Skill,Qualification,Programming,min,max,per_annum,per_hour,per_day
0,66075582,Data analyst,Perth,2023-03-10,au,Australian Investment Exchange,Lead Technical Business Analyst - Financial Se...,active,Premium,2023-03-11,...,full time,Unknown,Atlassian,,SQL,0.0,0.0,0.0,0.0,0.0
1,66069973,Data analyst,Perth,2023-03-10,au,Environmental Water Solutions Pty Ltd,CRM Management & Marketing Automation,active,Premium,2023-03-11,...,full time,"$70,000 - $89,999",,,,70000.0,89999.0,79999.0,40.0,307.0
2,66065686,Data analyst,Perth,2023-03-10,au,Indigo Australasia Incorporated,Data Analyst,active,Branded,2023-03-10,...,contract/temp,Unknown,Excel,Higher Degree,SQL,0.0,0.0,0.0,0.0,0.0
3,66086597,Data analyst,Perth,2023-03-10,au,DialogIT,Data Analyst,active,Branded,2023-03-11,...,full time,Unknown,Tableau,,,0.0,0.0,0.0,0.0,0.0
4,65999050,Data analyst,Perth,2023-03-10,au,Peoplebank Australia VIC,Data Analyst,active,Branded,2023-03-08,...,full time,Unknown,,,SQL,0.0,0.0,0.0,0.0,0.0


In [5]:
# Top 10 jobs by salary per annum (highest first), showing the original salary
# text alongside the numeric salary columns.
# head(10) matches the "top 10" stated here; the cell previously asked for 20.
salary_columns = ['jobTitle', 'jobSalary', 'min', 'max', 'per_annum', 'per_hour', 'per_day']
df.sort_values(by='per_annum', ascending=False).head(10)[salary_columns]

Unnamed: 0,jobTitle,jobSalary,min,max,per_annum,per_hour,per_day
8393,Program Director - Cyber Security,$0 - $1500 per day,0.0,390000.0,390000.0,197.0,1500.0
9011,SAP Data Migration Lead,"$1,400 per day (inclusive of super)",364000.0,364000.0,364000.0,184.0,1400.0
3340,Senior AWS Solution Architect,$1300.00 - $1350.00 p.d. + + Super,338000.0,351000.0,344500.0,174.0,1325.0
1286,Senior PowerBI Developer,Median expected day rate: $1300 + super p/d,338000.0,338000.0,338000.0,171.0,1300.0
7901,General Manager - Residential Role in Onslow,$320k - $350000.00 p.a. + benefits,320000.0,350000.0,335000.0,169.0,1288.0
2951,Threat Detection Engineer,$150 - $180 per hour,296400.0,355680.0,326040.0,165.0,1254.0
2834,Python Quantitative Developer - Prop Trading F...,"$250,000 - $400,000",250000.0,400000.0,325000.0,164.0,1250.0
2913,Data Solution Architect,$1k - $1500 p.d.,260000.0,390000.0,325000.0,164.0,1250.0
1727,Senior Business Analyst,$160 – $170 per hour,316160.0,316160.0,316160.0,160.0,1216.0
9050,Senior Manager Operations Research & Insights,$1100 - $1300 per day,286000.0,338000.0,312000.0,157.0,1200.0


In [None]:
# Spot check: run salary_processor on the first ten salary strings.
salary_sample = df['jobSalary'][:10]
salary_sample.apply(salary_processor)

In [None]:
# Show the distinct values found in the jobWorkType column.
df['jobWorkType'].unique()

In [None]:
# Print each distinct work type on its own line.
work_types = df['jobWorkType'].unique()
for work_type in work_types:
    print(work_type)