In [2]:
import pandas as pd
import matplotlib.pyplot as plt

from datasets import load_dataset
# Load the 'lukebarousse/data_jobs' dataset through the `datasets` library.
dataset = load_dataset('lukebarousse/data_jobs')
# Convert its 'train' split to a pandas DataFrame; every example below works on `df`.
df = dataset['train'].to_pandas()

# Parse job_posted_date into pandas datetime values.
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

Example 1
Calculate next year's projected salaries, assuming a flat 3.0% increase for all roles.

In [3]:
def inflation(salary):
    """Return `salary` scaled by the flat 3% projection factor (x 1.03)."""
    growth_factor = 1.03
    projected = salary * growth_factor
    return projected

# Run inflation() on each salary, one element at a time.
df['salary_year_inflated'] = df['salary_year_avg'].apply(inflation)

# Show only the rows that report a salary, with the two salary columns side by side.
df.loc[
    pd.notna(df['salary_year_avg']),
    ['salary_year_avg', 'salary_year_inflated'],
]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


We can actually simplify this with a lambda function.

In [4]:
# Same 3% projection, with the helper replaced by an inline anonymous function.
df['salary_year_inflated'] = df['salary_year_avg'].apply(lambda pay: pay * 1.03)

# Show only the rows that report a salary.
df.loc[
    pd.notna(df['salary_year_avg']),
    ['salary_year_avg', 'salary_year_inflated'],
]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


Technically, this could also have been done without `apply` at all, by multiplying the whole column directly:

In [5]:
# Multiply the whole column at once; no per-element Python call is needed.
df['salary_year_inflated'] = 1.03 * df['salary_year_avg']

# Show only the rows that report a salary.
df.loc[
    pd.notna(df['salary_year_avg']),
    ['salary_year_avg', 'salary_year_inflated'],
]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00



Example 2
Calculate projected salaries next year, but:

For senior roles (e.g., Senior Data Analysts), assume the rate is 5%
For all other roles, assume the rate is 3%

In [6]:
def projected_salary(row):
    """Project next year's salary for a single posting.

    `row` must support lookup by 'job_title_short' and 'salary_year_avg'.
    Titles containing 'Senior' are scaled by 1.05 (5%); all others by 1.03 (3%).
    """
    multiplier = 1.05 if 'Senior' in row['job_title_short'] else 1.03
    return row['salary_year_avg'] * multiplier

# axis=1 hands projected_salary one row at a time, so it can read both the title and the salary.
df['salary_year_inflated'] = df.apply(projected_salary, axis=1)

# Show only the rows that report a salary, now including the job title.
df.loc[
    pd.notna(df['salary_year_avg']),
    ['job_title_short', 'salary_year_avg', 'salary_year_inflated'],
]

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,109500.0,112785.00
77,Data Engineer,140000.0,144200.00
92,Data Engineer,120000.0,123600.00
100,Data Scientist,228222.0,235068.66
109,Data Analyst,89000.0,91670.00
...,...,...,...
785624,Data Engineer,139216.0,143392.48
785641,Data Engineer,150000.0,154500.00
785648,Data Scientist,221875.0,228531.25
785682,Data Scientist,157500.0,162225.00


Technically you could write this with a lambda function:

In [7]:
# Row-wise lambda: 5% for titles containing 'Senior', 3% for everything else.
df['salary_year_inflated'] = df.apply(
    lambda posting: posting['salary_year_avg'] * (
        1.05 if 'Senior' in posting['job_title_short'] else 1.03
    ),
    axis=1,
)

# Show only the rows that report a salary, including the job title.
df.loc[
    pd.notna(df['salary_year_avg']),
    ['job_title_short', 'salary_year_avg', 'salary_year_inflated'],
]

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,109500.0,112785.00
77,Data Engineer,140000.0,144200.00
92,Data Engineer,120000.0,123600.00
100,Data Scientist,228222.0,235068.66
109,Data Analyst,89000.0,91670.00
...,...,...,...
785624,Data Engineer,139216.0,143392.48
785641,Data Engineer,150000.0,154500.00
785648,Data Scientist,221875.0,228531.25
785682,Data Scientist,157500.0,162225.00


Example 3
Convert the `job_skills` column from strings (which pandas stores under the generic `object` dtype) to actual Python lists (hint: this is very important for later). Let's try doing that by just using `ast.literal_eval`, and then look at our new column.

In [8]:
# Python type of the job_skills entry at row label 1, before any conversion.
type(df.loc[1, 'job_skills'])

str

In [9]:
import ast 

# Evaluate one entry to see what literal_eval turns the stored string into.
ast.literal_eval(df.loc[1, 'job_skills'])

['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']

In [10]:
# Check the type of the evaluated result for the same entry.
type(ast.literal_eval(df.loc[1, 'job_skills']))

list

In [11]:
# Naive attempt: run ast.literal_eval on every entry of the column.
# literal_eval raises ValueError when an entry is None instead of a string.
# The error is caught and printed so the failure stays visible without halting
# a "Restart & Run All". apply() raises before the assignment happens, so
# df['job_skills'] is left unchanged when this fails.
try:
    df['job_skills'] = df['job_skills'].apply(ast.literal_eval)
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: malformed node or string: None

In [None]:
# Parse only the entries that are still raw strings. Missing values (None/NaN)
# and entries that are already lists pass through untouched, so this cell can
# be re-run safely. A pd.notna() guard breaks on re-run: called on a list it
# returns an element-wise array whose truth value is ambiguous, and
# literal_eval rejects non-string input.
df['job_skills'] = df['job_skills'].apply(
    lambda skills: ast.literal_eval(skills) if isinstance(skills, str) else skills
)


In [None]:

# Re-check the type of the entry at row label 1 now that the column has been converted.
type(df.loc[1, 'job_skills'])

list

In [None]:
def clean_list(skill_list):
    """Turn one stringified job_skills entry into a Python object.

    Parameters
    ----------
    skill_list : str, list, None or NaN
        A single value taken from the ``job_skills`` column.

    Returns
    -------
    object or None
        The result of ``ast.literal_eval`` for a string (a list such as
        ``['r', 'python']`` for the entries shown in this notebook), the same
        list object when the value is already a list, and ``None`` for a
        missing value.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when the value is not a valid
        Python literal.
    """
    # Already parsed (e.g. the conversion cell ran earlier): return it unchanged.
    # pd.notna() on a list returns an element-wise array, which cannot be used
    # as an `if` condition, and literal_eval would reject the list anyway.
    if isinstance(skill_list, list):
        return skill_list
    if pd.notna(skill_list):
        return ast.literal_eval(skill_list)
    # Missing value: return None explicitly rather than falling off the end.
    return None
    
# Run clean_list on every entry of job_skills and store the result back in the column.
df['job_skills']= df['job_skills'].apply(clean_list)

In [None]:
# Display the entry at row label 1 to confirm it is now a list of skills.
df.loc[1, 'job_skills']

['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']