Applying Functions

In [1]:
#Importing Libraries 

import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset

# Load the job-postings dataset and turn its 'train' split into a DataFrame

dataset = load_dataset('lukebarousse/data_jobs')
train_split = dataset['train']
df = train_split.to_pandas()

# Data cleanup: parse job_posted_date into pandas datetime values

df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

In [9]:
# Keep only the rows whose salary_year_avg is present (not NaN).
# .copy() makes df_salary an independent frame, so adding columns to it
# later does not touch df.

df_salary = df.loc[df['salary_year_avg'].notna()].copy()

df_salary.head(3)

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
28,Data Scientist,CRM Data Specialist,"San José Province, San José, Costa Rica",via Ai-Jobs.net,Full-time,False,Costa Rica,2023-08-01 13:37:57,False,False,Costa Rica,year,109500.0,,Netskope,"['gdpr', 'excel']","{'analyst_tools': ['excel'], 'libraries': ['gd..."
77,Data Engineer,Data Engineer,"Arlington, VA",via LinkedIn,Full-time,False,Sudan,2023-06-26 14:22:54,False,False,Sudan,year,140000.0,,Intelletec,"['mongodb', 'mongodb', 'python', 'r', 'sql', '...","{'analyst_tools': ['tableau'], 'cloud': ['orac..."
92,Data Engineer,Remote - Data Engineer - Permanent - W2,Anywhere,via LinkedIn,Full-time,True,"Illinois, United States",2023-02-21 13:29:59,False,True,United States,year,120000.0,,Apex Systems,"['sql', 'python']","{'programming': ['sql', 'python']}"


In [11]:
#I want to calculate the projected salaries next year 
#I can use the apply method to apply a function to all the salaries 
# so they are adjusted for possible inflation next year
#You pass a function as an argument in the apply method 


def projected_salary(salary, rate=1.03):
    """Return ``salary`` multiplied by ``rate`` as next year's projection.

    Parameters
    ----------
    salary : number
        Current yearly salary (one element of the column when used with
        ``Series.apply``).
    rate : float, default 1.03
        Growth multiplier. The default reproduces the original 3% uplift;
        pass a different value via ``.apply(projected_salary, rate=...)``.

    Returns
    -------
    number
        ``salary * rate``.
    """
    return salary * rate

# Series.apply calls projected_salary once per salary value
df_salary['salary_year_inflated'] = (
    df_salary['salary_year_avg'].apply(projected_salary)
)

# Show the original and projected salaries side by side (first five rows)
df_salary.loc[:, ['salary_year_avg', 'salary_year_inflated']].head(5)

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.0
77,140000.0,144200.0
92,120000.0,123600.0
100,228222.0,235068.66
109,89000.0,91670.0


In [17]:
#another example is converting job_skills column from a string to a list

import ast 

# Look at the raw value at row label 1: a string that only looks like a list
df.loc[1, 'job_skills']

"['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']"

In [20]:
# literal_eval parses the string into an actual Python list
ast.literal_eval(df.loc[1, 'job_skills'])

['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']

In [21]:
def clean_list(skill_list):
    """Convert a stringified list of skills into a real Python list.

    Parameters
    ----------
    skill_list : str, list, or missing value
        A string such as ``"['sql', 'python']"``, an already-parsed list,
        or a missing value (None / NaN).

    Returns
    -------
    list or None
        The parsed list; the input itself if it is already a list; None
        when the value is missing.
    """
    # Already parsed (e.g. the cell was re-run): return as-is. Without this
    # guard pd.notna() on a list yields an array, and `if` on it raises.
    if isinstance(skill_list, list):
        return skill_list
    if pd.isna(skill_list):
        return None
    return ast.literal_eval(skill_list)
    


# Replace each stringified skills value with the result of clean_list.
# NOTE: this overwrites the job_skills column in place, so re-running this
# cell passes the already-converted values back through clean_list.
df['job_skills'] = df['job_skills'].apply(clean_list)


In [22]:
# First three entries of the converted job_skills column
df['job_skills'].iloc[:3]

0                                                 None
1           [r, python, sql, nosql, power bi, tableau]
2    [python, sql, c#, azure, airflow, dax, docker,...
Name: job_skills, dtype: object

In [30]:
#you can also use the apply method across rows instead of columns 
#I want to see next year projected salary assuming all senior roles get a 5 % increase
#and all other roles get a 3% increase
#you can use keyword argument axis=1 to specify that this will be applied to rows

def projected_salary(row, senior_rate=1.05, default_rate=1.03):
    """Project next year's salary for one row, with a higher rate for seniors.

    NOTE: this reuses the name of the scalar ``projected_salary(salary)``
    defined in an earlier cell and replaces it once this cell has run.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``apply(..., axis=1)``)
        Must provide ``'job_title_short'`` and ``'salary_year_avg'``.
    senior_rate : float, default 1.05
        Multiplier used when ``"Senior"`` appears in ``job_title_short``.
    default_rate : float, default 1.03
        Multiplier used for every other title.

    Returns
    -------
    number
        The chosen rate times ``row['salary_year_avg']``.
    """
    rate = senior_rate if "Senior" in row['job_title_short'] else default_rate
    return rate * row['salary_year_avg']


# axis=1 passes each row to projected_salary, one row per call
df_salary['salary_year_inflated'] = df_salary.apply(projected_salary, axis=1)

# Job title next to the original and projected salary, first five rows
df_salary.loc[:, ['job_title_short', 'salary_year_avg', 'salary_year_inflated']].head(5)


Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,109500.0,112785.0
77,Data Engineer,140000.0,144200.0
92,Data Engineer,120000.0,123600.0
100,Data Scientist,228222.0,235068.66
109,Data Analyst,89000.0,91670.0
