In [56]:
# Importing Libraries
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt

# Loading Data
# Fetch the 'lukebarousse/data_jobs' dataset and turn its 'train' split into a
# pandas DataFrame; `df` is the frame the later cells work from.
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Data cleaning
# Run the posting dates through pd.to_datetime and write the result back into
# the same 'job_posted_date' column.
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

In [None]:

# Boolean-mask filtering on a single column, written as one .loc lookup each.
# NOTE: a notebook cell renders only its final expression, so the salary
# selection on the first line is evaluated but not displayed.
df.loc[df['salary_year_avg'].notna(), 'salary_year_avg']
df.loc[df['job_skills'].isna(), 'job_skills']


28        109500.0
77        140000.0
92        120000.0
100       228222.0
109        89000.0
            ...   
785624    139216.0
785641    150000.0
785648    221875.0
785682    157500.0
785692    157500.0
Name: salary_year_avg, Length: 22003, dtype: float64

In [20]:
# Rows that report a yearly salary, salary column only (row mask + column label in one step).
df.loc[df['salary_year_avg'].notna(), 'salary_year_avg']

28        109500.0
77        140000.0
92        120000.0
100       228222.0
109        89000.0
            ...   
785624    139216.0
785641    150000.0
785648    221875.0
785682    157500.0
785692    157500.0
Name: salary_year_avg, Length: 22003, dtype: float64

# Calculate Projected Salary Next Year

In [23]:
# Using apply with a named function.
# First keep only the postings that report a yearly salary; .copy() makes
# df_salary an independent frame, so columns added to it never touch `df`.
df_salary = df.dropna(subset=['salary_year_avg']).copy()

def projected_salary(salary):
    """Return `salary` scaled up by a flat 3%.

    Parameters
    ----------
    salary : number
        The current yearly salary.

    Returns
    -------
    number
        `salary` multiplied by 1.03.
    """
    growth_factor = 1.03
    return salary * growth_factor

# Run projected_salary on every salary value and store the result as a new column.
df_salary['salary_year_inflated'] = (
    df_salary['salary_year_avg']
    .apply(projected_salary)
)

# Show the current and projected salary side by side.
df_salary.loc[:, ['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [None]:
# Same projection, this time with an inline (anonymous) lambda instead of a named function.

df_salary['salary_year_inflated'] = (
    df_salary['salary_year_avg']
    .apply(lambda pay: pay * 1.03)
)

df_salary.loc[:, ['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [None]:
# Simplest version: no apply at all, arithmetic on a Series is element-wise.
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].mul(1.03)

df_salary.loc[:, ['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [57]:
# Look at the skills value stored under row label 1.
df.loc[1, 'job_skills']

"['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']"

In [58]:
import ast

# Parse that one value from its string form into a real Python list.
ast.literal_eval(df.loc[1, 'job_skills'])

['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']

In [45]:
# to apply the ast.literal_eval function to the entire job skills column
def clean_list(skill_list):
    """Convert one 'job_skills' cell from its string form into a Python object.

    Parameters
    ----------
    skill_list : str, list, or missing value
        A string such as "['r', 'python']", a value that has already been
        parsed, or a missing value (NaN / None).

    Returns
    -------
    object or None
        The parsed literal for string input, the input itself when it is
        already a container, and None for missing values.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ast.literal_eval when a non-missing value is not a
        valid Python literal.
    """
    # Already parsed (for example this cell was run a second time): return it
    # untouched. pd.notna on a list yields an array, which cannot be used in
    # an `if`, and ast.literal_eval does not accept a list either.
    if isinstance(skill_list, (list, tuple, dict, set)):
        return skill_list
    if pd.notna(skill_list):
        return ast.literal_eval(skill_list)
    # Missing values map to None, as before.
    return None
# Convert every cell of the skills column with clean_list and write the result back.
df['job_skills'] = (
    df['job_skills']
    .apply(clean_list)
)

In [None]:
# Confirm the conversion: the value under row label 1 should now be a list.
df.loc[1, 'job_skills']

['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']

In [None]:
# To inspect the rows with missing skills: df[pd.isna(df['job_skills'])]  (or, equivalently, df[df['job_skills'].isna()])

In [None]:
# doing the same with a lambda function
# Only strings are parsed. Missing values and lists produced by an earlier
# conversion pass through unchanged, so running this cell after the clean_list
# cell (or running it twice) no longer raises on already-parsed values.
df['job_skills'] = df['job_skills'].apply(
    lambda skill_list: ast.literal_eval(skill_list) if isinstance(skill_list, str) else skill_list
)

In [60]:
# Check the value under row label 1 again after the lambda-based conversion.
df.loc[1, 'job_skills']

['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']

# Calculate projected salary next year
- Senior roles: assume a 5% increase
- Other roles: assume a 3% increase


In [None]:
# using apply on whole rows (axis=1)
def projected_salary(row):
    """Return the projected yearly salary for one posting.

    Parameters
    ----------
    row : mapping-like
        Must provide 'job_title_short' and 'salary_year_avg'.

    Returns
    -------
    number
        'salary_year_avg' times 1.05 when 'Senior' occurs in
        'job_title_short', otherwise times 1.03.
    """
    # NOTE(review): this reuses the name of the scalar projected_salary(salary)
    # defined earlier in the notebook; once this cell runs, that version is shadowed.
    multiplier = 1.05 if 'Senior' in row['job_title_short'] else 1.03
    return multiplier * row['salary_year_avg']


# Hand each row to projected_salary and store the result as the projected-salary column.
df_salary['salary_year_inflated'] = df_salary.apply(projected_salary, axis='columns')

# Display positions 200-249, restricted to the three columns that matter here.
df_salary[['job_title_short', 'salary_year_avg', 'salary_year_inflated']].iloc[200:250]

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
5554,Business Analyst,64600.0,66538.0
5579,Senior Data Scientist,170000.0,178500.0
5580,Data Engineer,200000.0,206000.0
5628,Data Scientist,361000.0,371830.0
5658,Senior Data Engineer,200000.0,210000.0
5684,Data Engineer,125000.0,128750.0
5688,Senior Data Scientist,180000.0,189000.0
5703,Data Analyst,65000.0,66950.0
5709,Data Scientist,192000.0,197760.0
5795,Data Engineer,100000.0,103000.0


In [None]:
# Row-wise apply again, with the rate selection written as an inline lambda.

df_salary['salary_year_inflated'] = df_salary.apply(
    lambda row: (1.05 if 'Senior' in row['job_title_short'] else 1.03) * row['salary_year_avg'],
    axis='columns',
)

df_salary.iloc[200:250]

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills,salary_year_inflated
5554,Business Analyst,Associate Business Intelligence Analyst,Canada,via Ai-Jobs.net,Full-time,False,Canada,2023-10-31 13:37:08,False,False,Canada,year,64600.0,,Manulife,"['sql', 'power bi']","{'analyst_tools': ['power bi'], 'programming':...",66538.0
5579,Senior Data Scientist,Senior Data Scientist,United States,via LinkedIn,Full-time,False,"Illinois, United States",2023-05-03 13:05:09,False,False,United States,year,170000.0,,Harnham,"['python', 'sql']","{'programming': ['python', 'sql']}",178500.0
5580,Data Engineer,Data Engineer,"New York, NY",via Dice,Full-time,False,"Texas, United States",2023-09-29 13:07:23,False,False,United States,year,200000.0,,Jobot,"['python', 'sql', 'mongodb', 'mongodb', 'spark...","{'databases': ['mongodb'], 'libraries': ['spar...",206000.0
5628,Data Scientist,"Data Scientist Lead, Global Monetization Strat...","San Jose, CA",via LinkedIn,Full-time,False,"California, United States",2023-11-29 13:04:38,False,True,United States,year,361000.0,,TikTok,"['r', 'python', 'sql']","{'programming': ['r', 'python', 'sql']}",371830.0
5658,Senior Data Engineer,Senior Data Engineer,"Toronto, ON, Canada",via Ladders,Full-time,False,Canada,2023-08-14 13:10:47,True,False,Canada,year,200000.0,,Mercury,"['snowflake', 'react']","{'cloud': ['snowflake'], 'libraries': ['react']}",210000.0
5684,Data Engineer,Lead Data Engineer,"Charlotte, NC",via BeBee,Full-time,False,"California, United States",2023-12-16 13:04:22,False,True,United States,year,125000.0,,LTI - Larsen & Toubro Infotech,"['sql', 'snowflake', 'aws', 'tableau']","{'analyst_tools': ['tableau'], 'cloud': ['snow...",128750.0
5688,Senior Data Scientist,Senior Data Scientist,"San Diego, CA",via Central Illinois Proud Jobs,Full-time,False,"California, United States",2023-01-04 13:03:59,False,False,United States,year,180000.0,,Harnham,"['python', 'sql', 'aws', 'tensorflow', 'spark'...","{'cloud': ['aws'], 'libraries': ['tensorflow',...",189000.0
5703,Data Analyst,Healthcare Data Analyst,"Dallas, TX",via Dice,Full-time,False,"Texas, United States",2023-10-31 13:01:13,False,False,United States,year,65000.0,,"Citron IT, Inc.","['sql', 'python', 'r', 'azure', 'power bi', 't...","{'analyst_tools': ['power bi', 'tableau'], 'cl...",66950.0
5709,Data Scientist,"Data Scientist, Product Analytics (Multiple Op...","San Jose, CA",via LinkedIn,Full-time,False,"California, United States",2023-12-29 13:00:33,False,True,United States,year,192000.0,,TikTok,"['python', 'r', 'sql']","{'programming': ['python', 'r', 'sql']}",197760.0
5795,Data Engineer,Higher Education Data Engineer,"Tempe, AZ",via Get.It,Full-time,False,"New York, United States",2023-08-29 13:25:09,False,True,United States,year,100000.0,,Get It Recruit - Educational Services,"['python', 'java', 'r', 'spark', 'tensorflow',...","{'analyst_tools': ['excel', 'power bi'], 'libr...",103000.0
