In [85]:
import pandas as pd
import numpy as np
import statistics
import os
import shutil

In [None]:
# Download the dataset archive with the Kaggle CLI (IPython shell escape)
!kaggle datasets download rashikrahmanpritom/data-science-job-posting-on-glassdoor

# Unzip the archive into the local 'data' directory
shutil.unpack_archive("data-science-job-posting-on-glassdoor.zip",extract_dir='data')

# Delete the zip file and the already-cleaned CSV; only the uncleaned file is kept
os.remove("data-science-job-posting-on-glassdoor.zip")
os.remove("data/Cleaned_DS_Jobs.csv")

# Rename the uncleaned CSV to the file name used when the DataFrame is loaded
os.rename("data/Uncleaned_DS_jobs.csv", "data/Data_Science_Jobs.csv")

In [None]:
# Load the job postings, using the file's own 'index' column as the row index
df = pd.read_csv(
    "data/Data_Science_Jobs.csv",
    index_col="index",
)

In [None]:
# Getting the Salary Estimate
def get_salary_estimate(x):
    """Reduce a raw salary string to its digits and hyphens.

    Every character that is neither a digit (per ``str.isdigit``) nor ``'-'``
    is dropped, so ``"$137K-$171K (Glassdoor est.)"`` becomes ``"137-171"``.

    Parameters
    ----------
    x : str
        Raw salary text.

    Returns
    -------
    str
        The kept characters in their original order (possibly empty).
    """
    return ''.join(ch for ch in x if ch == '-' or ch.isdigit())

# Overwrite the raw salary text with its compact "low-high" form
df['Salary Estimate'] = df['Salary Estimate'].map(get_salary_estimate)

In [None]:
# Cleaning the Company Name
def get_company_name(x):
    """Return the company name without any trailing lines.

    When the value contains a newline, only the text before the first
    newline is kept and surrounding whitespace is stripped. A value without
    a newline is returned exactly as given (it is not stripped).

    Parameters
    ----------
    x : str
        Raw company-name cell, e.g. ``"Healthfirst\\n3.1"``.

    Returns
    -------
    str
        The cleaned company name.
    """
    newline = "\n"
    if newline not in x:
        return x
    first_line = x.split(newline)[0]
    return first_line.strip()
    
# Replace each raw company cell with just the company name
df['Company Name'] = df['Company Name'].map(get_company_name)

In [None]:
# Cleaning the Company Size
def get_size(x):
    """Convert a company-size description into a compact numeric range.

    ``' to '`` is first replaced by ``'-'``; afterwards only digits and
    hyphens are kept, so ``"1001 to 5000 employees"`` becomes
    ``"1001-5000"`` and ``"10000+ employees"`` becomes ``"10000"``.

    Parameters
    ----------
    x : str
        Raw size text.

    Returns
    -------
    str
        The digits and hyphens of the normalized text (possibly empty).
    """
    normalized = x.replace(' to ', '-')
    kept = [c for c in normalized if c == '-' or c.isdigit()]
    return ''.join(kept)
    
# Replace the size description with its compact numeric range
df['Size'] = df['Size'].map(get_size)

In [None]:
# Getting the Minimum Salary
# The bounds are converted to int *before* min()/max(): comparing the raw
# strings is lexicographic, so "99-132" would give a minimum of 132 and a
# maximum of 99.
df['Min Salary'] = df['Salary Estimate'].apply(lambda x: min(map(int, x.split('-'))))

# Getting the Maximum Salary
df['Max Salary'] = df['Salary Estimate'].apply(lambda x: max(map(int, x.split('-'))))

# Getting the Average Salary
df['Avg Salary'] = df['Salary Estimate'].apply(lambda x: statistics.mean(list(map(int,x.split('-')))))

In [None]:
# Creating the Job State Column
def get_job_state(x):
    """Return the state code for a job location string.

    Locations containing a comma (e.g. ``"New York, NY"``) yield the text
    after the first comma, stripped of surrounding whitespace. Locations
    without a comma are looked up in a small table of known full names.

    Parameters
    ----------
    x : str
        Location text such as ``"Chantilly, VA"`` or ``"Remote"``.

    Returns
    -------
    str
        The second comma-separated component, the mapped acronym, or -- for
        a comma-less location that is not in the table -- the stripped
        location itself.
    """
    # Locations that are given by name only, mapped to the code to use
    standardize_states = {"Utah":"UT",
                      "United States":"US",
                      "Remote":"RMT",
                      "New Jersey":"NJ",
                      "Texas":"TX",
                      "California":"CA"}

    parts = x.split(",")
    if len(parts) > 1:
        # NOTE(review): for inputs with more than one comma
        # (e.g. "City, County, ST") this is the second component, not the
        # last one -- confirm that this is the intended behaviour.
        return parts[1].strip()

    # No comma: translate a known name, otherwise pass the location through.
    # A direct lookup with a default means an unknown name can no longer
    # leave the result unassigned and raise UnboundLocalError.
    location = x.strip()
    return standardize_states.get(location, location)

# Derive the state of each posting from its location (computed twice so that
# `find_different` stays an independent Series from the new column)
find_different = df['Location'].map(get_job_state)
df['Job State'] = df['Location'].map(get_job_state)

In [75]:
# Creating the Same State Boolean Column
def get_headquarters_state(x):
    """Return the state (or country) part of a headquarters string.

    For text containing a comma (e.g. ``"Herndon, VA"``) the component after
    the first comma is returned, stripped of surrounding whitespace. Text
    without a comma, and any non-string value, is returned unchanged.

    Parameters
    ----------
    x : str or any
        Headquarters cell value.

    Returns
    -------
    str or any
        The second comma-separated component, or ``x`` itself.
    """
    # Non-string cells are passed through explicitly instead of relying on a
    # bare ``except`` to swallow the resulting AttributeError.
    if not isinstance(x, str):
        return x

    parts = x.split(",")
    if len(parts) > 1:
        return parts[1].strip()
    return x

# Flag postings whose job state equals the state parsed from the headquarters
headquarters_states = df['Headquarters'].map(get_headquarters_state)
df['Same State'] = (df['Job State'] == headquarters_states)

In [79]:
# Including the most appeared skills in boolean columns form.
# Each skill maps to a case-insensitive regex searched in the job description.
# "excel" and "aws" are anchored on word boundaries because a plain substring
# test also fires on words such as "excellent" or "laws"; the other skills
# keep substring matching (so e.g. "PySpark" still counts as Spark).
skill_patterns = {
    'Python': r'python',
    'Excel': r'\bexcel\b',
    'Hadoop': r'hadoop',
    'Spark': r'spark',
    'AWS': r'\baws\b',
    'Tableau': r'tableau',
    'Big Data': r'big data',
}

for skill, pattern in skill_patterns.items():
    # na=False keeps the column strictly boolean even if a description is missing
    df[skill + ' - Required'] = df['Job Description'].str.contains(
        pattern, case=False, regex=True, na=False
    )


In [88]:
# Getting a simple version of the Job
def get_simpler_job(x):
    """Map a job title onto one of a few coarse job categories.

    The title is lower-cased and searched for each keyword in a fixed order;
    the first keyword found decides the category.

    Parameters
    ----------
    x : str
        Job title.

    Returns
    -------
    str or float
        The category label, or ``np.nan`` when no keyword occurs in the title.
    """
    title = x.lower()

    # Order matters: earlier entries take precedence over later ones.
    keyword_to_label = (
        ("data scientist", "Data Scientist"),
        ("machine learning engineer", "M.L.E"),
        ("analyst", "Analyst"),
        ("data engineer", "Data Engineer"),
        ("computer scientist", "Computer Scientist"),
    )
    for keyword, label in keyword_to_label:
        if keyword in title:
            return label
    return np.nan
    
# Add the coarse job category derived from each job title
df['Simpler Job'] = df['Job Title'].map(get_simpler_job)

In [94]:
# Ending the project getting the desired level of the job
def get_job_level(x):
    """Infer the seniority level advertised in a job title.

    The title is lower-cased and searched for level keywords as plain
    substrings. Junior keywords are checked first, so a title containing
    both kinds of keyword is reported as ``"Junior"``.

    Parameters
    ----------
    x : str
        Job title.

    Returns
    -------
    str or float
        ``"Junior"``, ``"Senior"``, or ``np.nan`` when no keyword is found.
    """
    title = x.lower()

    # "junior" is listed next to its abbreviation, mirroring "senior"/"sr":
    # the spelled-out word does not contain the substring "jr".
    junior_keywords = ("junior", "jr")
    senior_keywords = ("principal", "senior", "experienced", "sr")

    if any(k in title for k in junior_keywords):
        return "Junior"
    if any(k in title for k in senior_keywords):
        return "Senior"
    return np.nan

# Add the seniority level inferred from each job title
df['Desired Level'] = df['Job Title'].map(get_job_level)

In [95]:
# Display the cleaned DataFrame, including the columns derived above
df

Unnamed: 0_level_0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,...,Same State,Python - Required,Excel - Required,Hadoop - Required,Spark - Required,AWS - Required,Tableau - Required,Big Data - Required,Simpler Job,Desired Level
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Sr Data Scientist,137-171,Description\n\nThe Senior Data Scientist is re...,3.1,Healthfirst,"New York, NY","New York, NY",1001-5000,1993,Nonprofit Organization,...,True,False,False,False,False,True,False,False,Data Scientist,Senior
1,Data Scientist,137-171,"Secure our Nation, Ignite your Future\n\nJoin ...",4.2,ManTech,"Chantilly, VA","Herndon, VA",5001-10000,1968,Company - Public,...,True,False,False,True,False,False,False,True,Data Scientist,
2,Data Scientist,137-171,Overview\n\n\nAnalysis Group is one of the lar...,3.8,Analysis Group,"Boston, MA","Boston, MA",1001-5000,1981,Private Practice / Firm,...,True,True,True,False,False,True,False,False,Data Scientist,
3,Data Scientist,137-171,JOB DESCRIPTION:\n\nDo you have a passion for ...,3.5,INFICON,"Newton, MA","Bad Ragaz, Switzerland",501-1000,2000,Company - Public,...,False,True,True,False,False,True,False,False,Data Scientist,
4,Data Scientist,137-171,Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions,"New York, NY","New York, NY",51-200,1998,Company - Private,...,True,True,True,False,False,False,False,False,Data Scientist,
5,Data Scientist,137-171,About Us:\n\nHeadquartered in beautiful Santa ...,4.2,HG Insights,"Santa Barbara, CA","Santa Barbara, CA",51-200,2010,Company - Private,...,True,True,True,True,True,False,False,False,Data Scientist,
6,Data Scientist / Machine Learning Expert,137-171,Posting Title\nData Scientist / Machine Learni...,3.9,Novartis,"Cambridge, MA","Basel, Switzerland",10000,1996,Company - Public,...,False,True,False,False,False,False,False,False,Data Scientist,
7,Data Scientist,137-171,Introduction\n\nHave you always wanted to run ...,3.5,iRobot,"Bedford, MA","Bedford, MA",1001-5000,1990,Company - Public,...,True,True,False,False,False,False,False,False,Data Scientist,
8,Staff Data Scientist - Analytics,137-171,Intuit is seeking a Staff Data Scientist to co...,4.4,Intuit - Data,"San Diego, CA","Mountain View, CA",5001-10000,1983,Company - Public,...,True,False,False,False,False,False,False,False,Data Scientist,
9,Data Scientist,137-171,Ready to write the best chapter of your career...,3.6,XSELL Technologies,"Chicago, IL","Chicago, IL",51-200,2014,Company - Private,...,True,True,False,False,False,False,False,False,Data Scientist,
