In [2]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Read the Glassdoor postings and keep every column except the first one
# (presumably an index column that was written out with the CSV - confirm).
df = pd.read_csv("glassdoor_jobs.csv").iloc[:, 1:]
# The remaining column labels are the cell's displayed output.
df.columns

Index(['Job Title', 'Salary Estimate', 'Job Description', 'Rating',
       'Company Name', 'Location', 'Headquarters', 'Size', 'Founded',
       'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Competitors'],
      dtype='object')

## Data cleaning and parsing

In [4]:
# salary parsing
# Drop postings whose estimate is the literal "-1" (presumably the scraper's
# "no salary" marker - confirm). The .copy() makes df an independent frame, so
# the column assignments below do not write into a slice of the loaded frame
# (the situation pandas reports as SettingWithCopyWarning).
df = df[df['Salary Estimate'] != "-1"].copy()

# Flag hourly postings from the untouched text, before it is reduced to numbers.
df['Hourly Salary'] = df['Salary Estimate'].apply(lambda x: 1 if 'per hour' in x.lower() else 0)

# Reduce each estimate to a bare "<min>-<max>" string:
#   1. keep only the text before the first "(",
#   2. remove the "K" and "$" characters (regex=False: "$" must be literal),
#   3. lower-case, then remove the "per hour" / "employer provided salary:" labels.
salary = (
    df['Salary Estimate']
    .str.split('(').str[0]
    .str.replace('K', '', regex=False)
    .str.replace('$', '', regex=False)
    .str.lower()
    .str.replace('per hour', '', regex=False)
    .str.replace('employer provided salary:', '', regex=False)
)

# int() ignores the whitespace left around the numbers by the stripping above.
# A value without a "-" separator raises here, exactly as before.
df['min_salary'] = salary.apply(lambda x: int(x.split('-')[0]))
df['max_salary'] = salary.apply(lambda x: int(x.split('-')[1]))
df['average_salary'] = (df.min_salary + df.max_salary) / 2
# NOTE(review): rows flagged in 'Hourly Salary' keep their per-hour figures,
# while the stripped "K" suggests the other rows are in thousands per year -
# confirm whether hourly rows need rescaling before the columns are compared.

In [5]:
# company name (text only)
# Cut a trailing "<digits>.<digits>" suffix off each raw name (presumably the
# rating appended to the company name - confirm), then trim the surrounding
# whitespace that is left behind.
company_names = df['Company Name'].map(lambda raw_name: re.sub(r'\d+\.\d+$', '', raw_name))
df['company_name'] = company_names.map(str.strip)

In [6]:
# state field
# Take the text after the LAST comma of the location. Splitting from the right
# with n=1 means a location containing several commas yields its final
# component instead of the middle one; a location without any comma still
# produces NaN, as it did before.
df['job_state'] = df['Location'].str.rsplit(',', n=1).str[1].str.strip()

# 1 when the Location string is exactly equal to the Headquarters string, else 0.
# Built in a single expression so re-running the cell always gives the same column.
# NOTE(review): despite the column name, this compares the full location
# strings rather than the extracted state - confirm that is the intent.
df['same_state'] = (df['Location'] == df['Headquarters']).astype('int64')

In [7]:
# age of company
# Year the ages are measured against; named so it can be changed in one place.
REFERENCE_YEAR = 2023

# Rows with Founded == -1 get NaN instead of an age (presumably -1 marks an
# unknown founding year - confirm). Series.where builds the column in one step,
# rather than assigning NaN into an already-computed integer column afterwards,
# which relies on silent dtype upcasting that pandas >= 2.1 deprecates.
df['company_age'] = (REFERENCE_YEAR - df['Founded']).where(df['Founded'] != -1)

In [8]:
# parsing job description
# Lower-case every description once and reuse it for all checks below,
# instead of lower-casing the same text again for each skill.
descriptions = df['Job Description'].str.lower()

# The patterns are raw strings so that "\s" and "\." reach the regex engine
# without going through Python's string-escape handling (non-raw, they are
# invalid escape sequences and warn on recent Python versions). Matching is
# unchanged from the non-raw form.
# NOTE(review): inside a character class "\b" means a backspace character, not
# a word boundary, so a standalone "r" is only detected when it sits between
# whitespace, "." or "," - e.g. "(r)" or an "r" at the very start of the text
# is missed. Also, "." in "r.studio" / "power.bi" matches ANY character.
df['r_yn'] = descriptions.apply(
    lambda text: 1 if re.search(r"[\b\s\.,]r[\b\s\.,]|rstudio|r.studio", text) else 0
)
df['powerBI_yn'] = descriptions.apply(
    lambda text: 1 if re.search(r"powerbi|power.bi", text) else 0
)

# One 0/1 column per skill, named "<skill>_yn".
# NOTE(review): this is a plain substring test, so e.g. 'excel' is also found
# inside "excellent" and 'aws' inside "laws" - confirm this is acceptable.
skill_list = ['python', 'aws', 'spark', 'tableau', 'matlab', 'excel', 'sql']
for skill in skill_list:
    df[f'{skill}_yn'] = descriptions.apply(lambda text: 1 if skill in text else 0)


In [10]:
# Sum of the 0/1 powerBI_yn column, i.e. how many postings were flagged.
df['powerBI_yn'].sum()

56

## Exploratory data analysis