# Data Engineering Jobs Exploration and Salary Prediction Project based on Glassdoor Listed Jobs 2023

## I. Data Cleaning

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

color = sns.color_palette()
sns.set_style('darkgrid')

In [3]:
df = pd.read_csv("../data/glassdoor-data-engineer.csv")
df.head()

Unnamed: 0,company,company_rating,location,job_title,job_description,salary_estimate,company_size,company_type,company_sector,company_industry,company_founded,company_revenue
0,PCS Global Tech\n4.7,4.7,"Riverside, CA",Data Engineer | PAID BOOTCAMP,Responsibilities\n· Analyze and organize raw d...,"$70,000 /yr (est.)",501 to 1000 Employees,Company - Private,Information Technology,Information Technology Support Services,,Unknown / Non-Applicable
1,Futuretech Consultants LLC,,"Newton, MS",Snowflake Data Engineer,My name is Dileep and I am a recruiter at Futu...,$42.50 /hr (est.),,,,,,
2,Clairvoyant\n4.4,4.4,Remote,Data Engineer (MDM),Required Skills:\nMust have 5-8+ Years of expe...,$67.50 /hr (est.),51 to 200 Employees,Company - Private,Pharmaceutical & Biotechnology,Biotech & Pharmaceuticals,,Unknown / Non-Applicable
3,Apple\n4.2,4.2,"Cupertino, CA",Data Engineer,"Summary\nPosted: Dec 22, 2021\nWeekly Hours: 4...",,10000+ Employees,Company - Public,Information Technology,Computer Hardware Development,1976.0,$10+ billion (USD)
4,Skytech Consultancy Services\n5.0,5.0,"Baltimore, MD",Data Engineer,Description of Work:\nTechnical experience in ...,$65.00 /hr (est.),1 to 50 Employees,Company - Public,,,,Unknown / Non-Applicable


In [4]:
df.isnull().sum()

company             150
company_rating      265
location            150
job_title           150
job_description     150
salary_estimate     197
company_size        236
company_type        236
company_sector      391
company_industry    391
company_founded     464
company_revenue     236
dtype: int64

In [5]:
df.shape

(750, 12)

Glassdoor stops showing the job listing details after you reach page 20. So all scraped jobs after page 20 contains no informations so they must be dropped from the dataframe.

In [6]:
df = df.dropna(subset=['company'])

In [7]:
df.isnull().sum()

company               0
company_rating      115
location              0
job_title             0
job_description       0
salary_estimate      47
company_size         86
company_type         86
company_sector      241
company_industry    241
company_founded     314
company_revenue      86
dtype: int64

In [8]:
df.shape

(600, 12)

Cleaning the company name by removing the associated rating

In [9]:
df['company'] = df['company'].apply(lambda x: x.split('\n')[0].strip())
df.head()

Unnamed: 0,company,company_rating,location,job_title,job_description,salary_estimate,company_size,company_type,company_sector,company_industry,company_founded,company_revenue
0,PCS Global Tech,4.7,"Riverside, CA",Data Engineer | PAID BOOTCAMP,Responsibilities\n· Analyze and organize raw d...,"$70,000 /yr (est.)",501 to 1000 Employees,Company - Private,Information Technology,Information Technology Support Services,,Unknown / Non-Applicable
1,Futuretech Consultants LLC,,"Newton, MS",Snowflake Data Engineer,My name is Dileep and I am a recruiter at Futu...,$42.50 /hr (est.),,,,,,
2,Clairvoyant,4.4,Remote,Data Engineer (MDM),Required Skills:\nMust have 5-8+ Years of expe...,$67.50 /hr (est.),51 to 200 Employees,Company - Private,Pharmaceutical & Biotechnology,Biotech & Pharmaceuticals,,Unknown / Non-Applicable
3,Apple,4.2,"Cupertino, CA",Data Engineer,"Summary\nPosted: Dec 22, 2021\nWeekly Hours: 4...",,10000+ Employees,Company - Public,Information Technology,Computer Hardware Development,1976.0,$10+ billion (USD)
4,Skytech Consultancy Services,5.0,"Baltimore, MD",Data Engineer,Description of Work:\nTechnical experience in ...,$65.00 /hr (est.),1 to 50 Employees,Company - Public,,,,Unknown / Non-Applicable


Correctly formating the salary estimate, and converting the hourly to annually

In [10]:
import re

df['salary_estimate'] = df['salary_estimate'].astype(str)

for index, row in df.iterrows():
    salary_string = row['salary_estimate']
    match_year = re.search(r'\$(\d{1,3},?\d{0,3},?\d{0,3}) \/yr \(est.\)', salary_string)
    match_hour = re.search(r'\$(\d+(\.\d+)?) \/hr \(est.\)', salary_string)
    
    if match_year:
        salary_amount = float(match_year.group(1).replace(',', ''))
    elif match_hour:
        hourly_salary = float(match_hour.group(1))
        salary_amount = hourly_salary * 2080
    else:
        salary_amount = None

    df.at[index, 'salary_estimate'] = salary_amount

In [11]:
df.head()

Unnamed: 0,company,company_rating,location,job_title,job_description,salary_estimate,company_size,company_type,company_sector,company_industry,company_founded,company_revenue
0,PCS Global Tech,4.7,"Riverside, CA",Data Engineer | PAID BOOTCAMP,Responsibilities\n· Analyze and organize raw d...,70000.0,501 to 1000 Employees,Company - Private,Information Technology,Information Technology Support Services,,Unknown / Non-Applicable
1,Futuretech Consultants LLC,,"Newton, MS",Snowflake Data Engineer,My name is Dileep and I am a recruiter at Futu...,88400.0,,,,,,
2,Clairvoyant,4.4,Remote,Data Engineer (MDM),Required Skills:\nMust have 5-8+ Years of expe...,140400.0,51 to 200 Employees,Company - Private,Pharmaceutical & Biotechnology,Biotech & Pharmaceuticals,,Unknown / Non-Applicable
3,Apple,4.2,"Cupertino, CA",Data Engineer,"Summary\nPosted: Dec 22, 2021\nWeekly Hours: 4...",,10000+ Employees,Company - Public,Information Technology,Computer Hardware Development,1976.0,$10+ billion (USD)
4,Skytech Consultancy Services,5.0,"Baltimore, MD",Data Engineer,Description of Work:\nTechnical experience in ...,135200.0,1 to 50 Employees,Company - Public,,,,Unknown / Non-Applicable


Extracting the state from the job location

In [12]:
df['location'] = df['location'].astype(str)
df['job_state'] = df['location'].apply(lambda x: x if x.lower() == 'remote' else x.split(', ')[-1])

In [13]:
df.job_state.value_counts()

Remote           100
CA                66
TX                62
GA                61
NJ                51
DC                35
VA                33
MN                29
WI                27
IL                24
MD                16
CT                15
MS                15
NY                14
United States     11
MA                10
OR                 9
UT                 5
PA                 5
TN                 4
FL                 4
OH                 3
DE                 1
Name: job_state, dtype: int64

Replace 'United States' in states with 'CA' as it is the most common state

In [14]:
df['job_state']= df['job_state'].replace('United States','CA')
df.job_state.value_counts()

Remote    100
CA         77
TX         62
GA         61
NJ         51
DC         35
VA         33
MN         29
WI         27
IL         24
MD         16
CT         15
MS         15
NY         14
MA         10
OR          9
UT          5
PA          5
TN          4
FL          4
OH          3
DE          1
Name: job_state, dtype: int64

Replacing rating null values with median

In [15]:
cr_median = df.company_rating.median()

In [16]:
df['company_rating'] = df['company_rating'].fillna(cr_median)

Adding a new column that contains the age of the company

In [17]:
df['company_founded'] = df['company_founded'].fillna(-1)
df['company_founded'] = df['company_founded'].astype(int)

In [18]:
import datetime

today = datetime.datetime.now()

df['company_age'] = df.company_founded.apply(lambda x: x if x < 0 else today.year - x)

df['company_age'].head()

0    -1
1    -1
2    -1
3    47
4    -1
Name: company_age, dtype: int64

In [19]:
df.head()

Unnamed: 0,company,company_rating,location,job_title,job_description,salary_estimate,company_size,company_type,company_sector,company_industry,company_founded,company_revenue,job_state,company_age
0,PCS Global Tech,4.7,"Riverside, CA",Data Engineer | PAID BOOTCAMP,Responsibilities\n· Analyze and organize raw d...,70000.0,501 to 1000 Employees,Company - Private,Information Technology,Information Technology Support Services,-1,Unknown / Non-Applicable,CA,-1
1,Futuretech Consultants LLC,4.4,"Newton, MS",Snowflake Data Engineer,My name is Dileep and I am a recruiter at Futu...,88400.0,,,,,-1,,MS,-1
2,Clairvoyant,4.4,Remote,Data Engineer (MDM),Required Skills:\nMust have 5-8+ Years of expe...,140400.0,51 to 200 Employees,Company - Private,Pharmaceutical & Biotechnology,Biotech & Pharmaceuticals,-1,Unknown / Non-Applicable,Remote,-1
3,Apple,4.2,"Cupertino, CA",Data Engineer,"Summary\nPosted: Dec 22, 2021\nWeekly Hours: 4...",,10000+ Employees,Company - Public,Information Technology,Computer Hardware Development,1976,$10+ billion (USD),CA,47
4,Skytech Consultancy Services,5.0,"Baltimore, MD",Data Engineer,Description of Work:\nTechnical experience in ...,135200.0,1 to 50 Employees,Company - Public,,,-1,Unknown / Non-Applicable,MD,-1


Simplifying the job title

In [20]:
def title_simplifier(title):
    if 'data scientist' in title.lower():
        return 'data scientist'
    elif 'data engineer' in title.lower():
        return 'data engineer'
    elif 'data analyst' in title.lower():
        return 'data analyst'
    elif 'machine learning' in title.lower():
        return 'mle'
    else:
        return 'na'

In [21]:
df['job_simp'] = df['job_title'].apply(title_simplifier)
df.job_simp.value_counts()

data engineer     556
na                 37
data scientist      7
Name: job_simp, dtype: int64

In [26]:
df = df[df['job_simp'] != 'na']
df = df[df['job_simp'] != 'data scientist']

df.job_simp.value_counts()

data engineer    556
Name: job_simp, dtype: int64

In [27]:
def seniority(title):
    if 'sr' in title.lower() or 'senior' in title.lower() or 'sr' in title.lower() or 'lead' in title.lower() or 'principal' in title.lower():
            return 'senior'
    elif 'jr' in title.lower() or 'jr.' in title.lower():
        return 'junior'
    else:
        return 'na'

In [28]:
df['seniority'] = df['job_title'].apply(seniority)
df.seniority.value_counts()

na        402
senior    153
junior      1
Name: seniority, dtype: int64

In [40]:
df = df[df['seniority'] != "junior"]

df.seniority.value_counts()

na        401
senior    153
Name: seniority, dtype: int64

Extracting relevant skills from job description

In [29]:
prog_languages = ['python', 'java', 'scala', 'go', 'r', 'c', 'c++', 'sql', 'nosql']
cloud_tools = ['aws', 'azure', 'google cloud']
viz_tools = ['power bi', 'tableau']
bigdata_tools = ['spark', 'hadoop', 'flink', 'mongodb', 'gcp']
data_tools = ['kafka', 'hive', 'snowflake', 'airflow']

In [30]:
import re

def extract_keywords(description, keywords):
    pattern = r'\b(?:{})\b'.format('|'.join(map(re.escape, keywords)))
    matches = set(re.findall(pattern, description.lower(), flags=re.IGNORECASE))
    
    return list(matches)

In [31]:
df['job_languages'] = df['job_description'].apply(lambda x: extract_keywords(x, prog_languages))
df['job_cloud'] = df['job_description'].apply(lambda x: extract_keywords(x, cloud_tools))
df['job_viz'] = df['job_description'].apply(lambda x: extract_keywords(x, viz_tools))
df['job_bigdata'] = df['job_description'].apply(lambda x: extract_keywords(x, bigdata_tools))
df['job_data'] = df['job_description'].apply(lambda x: extract_keywords(x, data_tools))

Extracting Education from job description

In [32]:
education = ['associate', 'bachelor', 'master', 'phd']

In [33]:
def extract_degree(description, degrees):
    pattern = r'\b(?:{})\b'.format('|'.join(map(re.escape, degrees)))
    matches = re.findall(pattern, description.lower(), flags=re.IGNORECASE)
    
    if matches:
        return matches[0]
    else:
        return None

In [36]:
df['job_education'] = df['job_description'].apply(lambda x: extract_degree(x, education))

df['job_education'].value_counts()

bachelor     129
master        55
associate      1
Name: job_education, dtype: int64

In [41]:
df = df[df['job_education'] != "associate"]

df['job_education'].value_counts()

bachelor    129
master       55
Name: job_education, dtype: int64

In [42]:
df.head()

Unnamed: 0,company,company_rating,location,job_title,job_description,salary_estimate,company_size,company_type,company_sector,company_industry,...,job_state,company_age,job_simp,seniority,job_languages,job_cloud,job_viz,job_bigdata,job_data,job_education
0,PCS Global Tech,4.7,"Riverside, CA",Data Engineer | PAID BOOTCAMP,Responsibilities\n· Analyze and organize raw d...,70000.0,501 to 1000 Employees,Company - Private,Information Technology,Information Technology Support Services,...,CA,-1,data engineer,na,"[python, sql, java]",[],[],[],[],
1,Futuretech Consultants LLC,4.4,"Newton, MS",Snowflake Data Engineer,My name is Dileep and I am a recruiter at Futu...,88400.0,,,,,...,MS,-1,data engineer,na,"[sql, c]",[],[],[],[snowflake],bachelor
2,Clairvoyant,4.4,Remote,Data Engineer (MDM),Required Skills:\nMust have 5-8+ Years of expe...,140400.0,51 to 200 Employees,Company - Private,Pharmaceutical & Biotechnology,Biotech & Pharmaceuticals,...,Remote,-1,data engineer,na,"[python, sql]",[aws],[],[spark],[],master
3,Apple,4.2,"Cupertino, CA",Data Engineer,"Summary\nPosted: Dec 22, 2021\nWeekly Hours: 4...",,10000+ Employees,Company - Public,Information Technology,Computer Hardware Development,...,CA,47,data engineer,na,"[python, c]",[],[tableau],[],[],
4,Skytech Consultancy Services,5.0,"Baltimore, MD",Data Engineer,Description of Work:\nTechnical experience in ...,135200.0,1 to 50 Employees,Company - Public,,,...,MD,-1,data engineer,na,[sql],[],[tableau],[],[],bachelor


Exporting the dataframe as a new cleaned data file

In [43]:
data_path = '../data/'

df.to_csv(data_path + "glassdoor-data-engineer-cleaned.csv", index=False)