In [117]:
# Importing Libraries
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt  

# Load the job-postings dataset, turn its 'train' split into a DataFrame, and
# convert the job_posted_date column to datetimes in the same chain.
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas().assign(
    job_posted_date=lambda postings: pd.to_datetime(postings['job_posted_date'])
)

In [118]:
# Keep only the postings that report a yearly salary. The explicit .copy() gives
# df_salary its own data, so columns added to it later never touch df.
df_salary = df.dropna(subset=["salary_year_avg"]).copy()

# Note: df["salary_year_avg"].dropna() is NOT equivalent -- it returns just that
# column as a Series rather than the filtered DataFrame.

In [119]:
# Project every posted salary forward by a flat 3% raise.
# Multiplying the column directly is vectorised: it produces the same values as
# .apply(lambda salary: salary * 1.03) without calling Python once per row.
SALARY_RAISE_FACTOR = 1.03

df_salary["salary_year_inflated"] = df_salary["salary_year_avg"] * SALARY_RAISE_FACTOR

# Only the last expression of a cell is rendered, so show the comparison once, here.
df_salary[["salary_year_avg", "salary_year_inflated"]]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [123]:
import ast


def parse_skills(raw_skills):
    """Convert a stringified list of skills into a real Python list.

    Anything that is not a string (None, or a value that is already a list)
    is handed back unchanged, so re-running this cell is harmless.
    """
    if not isinstance(raw_skills, str):
        return raw_skills
    return ast.literal_eval(raw_skills)


df["job_skills"] = df["job_skills"].apply(parse_skills)

# To list the rows with no skills recorded: df[pd.isna(df["job_skills"])]

In [121]:
# Preview the job_skills column (pandas elides the middle of a long Series).
df.loc[:, "job_skills"]

0                                                      None
1                [r, python, sql, nosql, power bi, tableau]
2         [python, sql, c#, azure, airflow, dax, docker,...
3         [python, c++, java, matlab, aws, tensorflow, k...
4         [bash, python, oracle, aws, ansible, puppet, j...
                                ...                        
785736    [bash, python, perl, linux, unix, kubernetes, ...
785737                               [sas, sas, sql, excel]
785738                                  [powerpoint, excel]
785739    [python, go, nosql, sql, mongo, shell, mysql, ...
785740                                          [aws, flow]
Name: job_skills, Length: 785741, dtype: object

In [122]:
# Spot-check the entry at index label 1: its literal representation and its type.
sample_skills = df.loc[1, "job_skills"]
print(repr(sample_skills))
print(type(sample_skills))


['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']
<class 'list'>


In [168]:
# Project salaries forward: 5% for titles containing "senior", 3% for all others.
# A boolean mask plus one vectorised multiplication replaces the row-wise
# DataFrame.apply(axis=1), which ran a Python lambda once per row.
SENIOR_RAISE_FACTOR = 1.05
STANDARD_RAISE_FACTOR = 1.03

# case=False makes the match case-insensitive; na=False counts a missing title as
# non-senior, which is what the previous str(title).lower() check did as well.
is_senior_title = df_salary["job_title_short"].str.contains("senior", case=False, na=False)

# Turn the mask into a per-row multiplier, then scale the whole column at once.
raise_factor = is_senior_title.map({True: SENIOR_RAISE_FACTOR, False: STANDARD_RAISE_FACTOR})
df_salary["salary_year_inflated"] = df_salary["salary_year_avg"] * raise_factor

In [169]:
# Boolean row masks over df_salary. Despite the df_ prefix these are Series of
# True/False values, not DataFrames; the names are kept as-is in case other cells
# use them. The non-senior mask is simply the complement of the senior one.
df_senior = df_salary["job_title_short"].str.contains("senior", case=False, na=False)
df_nonsenior = ~df_senior

# Senior postings that have a salary, selected with a single .loc call
# (row mask + column list) instead of three chained [] lookups.
senior_with_salary = df_senior & df_salary["salary_year_avg"].notna()
df_salary.loc[senior_with_salary, ["job_title_short", "salary_year_inflated", "salary_year_avg"]]

Unnamed: 0,job_title_short,salary_year_inflated,salary_year_avg
495,Senior Data Engineer,176925.000,168500.0
573,Senior Data Engineer,168000.000,160000.0
657,Senior Data Engineer,173250.000,165000.0
726,Senior Data Engineer,182175.000,173500.0
733,Senior Data Engineer,168000.000,160000.0
...,...,...,...
784947,Senior Data Engineer,157500.000,150000.0
785106,Senior Data Engineer,131250.000,125000.0
785132,Senior Data Scientist,104107.500,99150.0
785330,Senior Data Scientist,210982.275,200935.5
