In [33]:
import re

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [34]:
# Load the cleaned job data from CSV.
# NOTE(review): the loaded frame contains an 'Unnamed: 0' column, which looks
# like a row index that was written into the CSV -- consider index_col=0 after
# confirming that nothing downstream relies on that column.
df = pd.read_csv('job_data_cleaned.csv')

In [35]:
# Inspect which columns are available in the loaded frame
df.columns

Index(['Unnamed: 0', 'Job Title', 'Salary Estimate', 'Job Description',
       'Rating', 'Company Name', 'Location', 'Headquarters', 'Size', 'Founded',
       'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Competitors',
       'min_salary', 'max_salary', 'avg_salary', 'state', 'age', 'python',
       'r_studio', 'spark', 'aws', 'tableau', 'excel', 'ml'],
      dtype='object')

In [36]:
# Ordered (keyword, category) pairs: the first keyword found in the title wins,
# so e.g. "Data Scientist Manager" is classified as 'data_scientist'.
_TITLE_CATEGORIES = (
    ('data scientist', 'data_scientist'),
    ('data engineer', 'data_engineer'),
    ('analyst', 'analyst'),
    ('machine learning', 'mle'),
    ('manager', 'manager'),
    ('director', 'director'),
)


def job_title_cleaning(title):
    """
    Map a raw job title onto a simplified job category.

    input: Job title as a string
    output: One of 'data_scientist', 'data_engineer', 'analyst', 'mle',
            'manager' or 'director' (case-insensitive substring match,
            checked in that order), or 'na' if none apply or if the title
            is not a string (e.g. a missing value read as NaN)
    """
    # Missing titles arrive as float NaN / None; treat them as "no category"
    # instead of failing on .lower().
    if not isinstance(title, str):
        return 'na'

    lowered = title.lower()
    for keyword, category in _TITLE_CATEGORIES:
        if keyword in lowered:
            return category
    return 'na'

In [37]:
def seniority(title):
    """
    Extract the seniority level advertised in a job title.

    input: Job title as a string
    output: 'senior' if the title mentions sr / senior / lead / principal,
            'junior' if it mentions jr / junior, otherwise 'na'
            (also 'na' when the title is not a string, e.g. NaN)

    Matching is case-insensitive. The abbreviations 'sr' and 'jr' must appear
    as whole words; the longer keywords are matched as substrings.
    """
    # Missing titles arrive as float NaN / None; they carry no seniority info.
    if not isinstance(title, str):
        return 'na'

    lowered = title.lower()

    # \b keeps 'sr' from matching inside unrelated words such as "newsroom".
    if re.search(r'\bsr\b', lowered) or any(
        keyword in lowered for keyword in ('senior', 'lead', 'principal')
    ):
        return 'senior'
    # 'junior' is checked alongside 'jr', mirroring 'senior' alongside 'sr'.
    if re.search(r'\bjr\b', lowered) or 'junior' in lowered:
        return 'junior'
    return 'na'
    

In [40]:
# Collapse each raw job title into a simplified job category
df['job_simplified'] = df['Job Title'].map(job_title_cleaning)

# Derive a seniority label from the job title
df['seniority'] = df['Job Title'].map(seniority)

In [50]:
# Number of postings per simplified job category
df['job_simplified'].value_counts()

data_scientist    503
na                171
data_engineer      57
analyst            37
mle                19
manager             9
director            4
Name: job_simplified, dtype: int64

In [51]:
# Number of postings per seniority level
df['seniority'].value_counts()

na        600
senior    200
Name: seniority, dtype: int64

In [46]:
# Character count of each job description
df['desc_length'] = df['Job Description'].apply(len)

In [56]:
# How often each description length occurs
df['desc_length'].value_counts()

2495    4
3361    3
8173    3
4286    3
3204    3
       ..
1673    1
2696    1
1671    1
3284    1
3073    1
Name: desc_length, Length: 679, dtype: int64

In [49]:
# No competitors were found while scraping: the only value in the column is -1
df['Competitors'].unique()

array([-1], dtype=int64)

In [65]:
# Summary statistics of the numeric columns
df.describe()

Unnamed: 0.1,Unnamed: 0,Rating,Headquarters,Competitors,min_salary,max_salary,avg_salary,age,python,r_studio,spark,aws,tableau,excel,ml,desc_length
count,800.0,800.0,800.0,800.0,800.0,800.0,800.0,680.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0
mean,399.5,3.783125,-1.0,-1.0,76.5025,142.57375,109.538125,51.302941,0.0825,0.0025,0.03625,0.0525,0.02875,0.2525,0.25375,3885.5375
std,231.0844,1.151525,0.0,0.0,15.07254,27.060763,20.621009,50.563913,0.275297,0.049969,0.187029,0.223173,0.167208,0.434718,0.435428,1843.944127
min,0.0,-1.0,-1.0,-1.0,48.0,97.0,73.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0
25%,199.75,3.7,-1.0,-1.0,65.0,127.0,94.5,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2695.0
50%,399.5,4.0,-1.0,-1.0,74.0,136.0,108.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3596.5
75%,599.25,4.3,-1.0,-1.0,90.0,158.0,122.0,73.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,4766.5
max,799.0,5.0,-1.0,-1.0,104.0,196.0,146.5,275.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,18466.0
