In [1]:
# Importing Libraries
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt  

# Loading Data
# Fetch 'lukebarousse/data_jobs' through the `datasets` library, then convert
# its 'train' entry into a pandas DataFrame for the analysis below.
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Data Cleanup
# Parse 'job_posted_date' into pandas datetime values.
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

In [2]:
# Keep only the postings that report a yearly salary; work on a copy so that
# columns added later do not touch the full job-postings frame.
df_salary = df.dropna(subset=['salary_year_avg']).copy()

def projected_salary(salary, growth_factor=1.03):
    """Return ``salary`` scaled by ``growth_factor``.

    Args:
        salary: Salary value to project (any value supporting ``*``).
        growth_factor: Multiplier applied to ``salary``. Defaults to 1.03
            (a 3% increase), the value that was previously hard-coded.

    Returns:
        ``salary * growth_factor``.
    """
    return salary * growth_factor



In [3]:
# Call projected_salary on every 'salary_year_avg' value and store the results
# in the 'df_salary_inflation' column.
df_salary['df_salary_inflation'] = df_salary['salary_year_avg'].apply(projected_salary)
# Show the original and projected salaries side by side.
df_salary[['salary_year_avg', 'df_salary_inflation']]

Unnamed: 0,salary_year_avg,df_salary_inflation
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [4]:
# Same 3% projection, written inline as an anonymous function instead of a named one
df_salary['df_salary_inflation'] = df_salary['salary_year_avg'].apply(lambda pay: pay * 1.03)

df_salary[['salary_year_avg', 'df_salary_inflation']]

Unnamed: 0,salary_year_avg,df_salary_inflation
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [5]:
import ast

def clean_list(skill_list):
    """Convert a stringified Python list of skills into an actual list.

    Args:
        skill_list: A string holding a Python literal (for example
            "['sql', 'python']"), an already-parsed list, or a missing
            value (None / NaN).

    Returns:
        The parsed value for strings, the input itself when it is already a
        list, or None for missing values.
    """
    # Already parsed (e.g. this cell was re-run): hand it back unchanged.
    # Without this guard pd.notna(list) returns an array, and `if` on that
    # array raises ValueError.
    if isinstance(skill_list, list):
        return skill_list
    if pd.notna(skill_list):
        # literal_eval only evaluates Python literals, never arbitrary code.
        return ast.literal_eval(skill_list)
    return None

# Replace every 'job_skills' entry with the value returned by clean_list.
df['job_skills'] = df['job_skills'].apply(clean_list)

In [6]:
# Preview the first rows of the 'job_skills' column.
df['job_skills'].head()

0                                                 None
1           [r, python, sql, nosql, power bi, tableau]
2    [python, sql, c#, azure, airflow, dax, docker,...
3    [python, c++, java, matlab, aws, tensorflow, k...
4    [bash, python, oracle, aws, ansible, puppet, j...
Name: job_skills, dtype: object

In [7]:
def calc_avg_salary(job_title_short, jobs_df=None):
    """Return the median yearly salary for one job title.

    Despite the name, this uses the median (not the mean) of
    'salary_year_avg'.

    Args:
        job_title_short: Value matched exactly against the
            'job_title_short' column.
        jobs_df: DataFrame with 'job_title_short' and 'salary_year_avg'
            columns. Defaults to the notebook-level ``df`` at call time.

    Returns:
        The median of 'salary_year_avg' over the matching rows (NaN when
        there is no matching row with a salary).
    """
    # Resolve the default lazily: `df` is rebound to a different frame later
    # in this notebook, so callers can pass the jobs frame explicitly.
    frame = df if jobs_df is None else jobs_df
    title_mask = frame['job_title_short'] == job_title_short
    # Single .loc lookup instead of chained indexing.
    return frame.loc[title_mask, 'salary_year_avg'].median()

In [8]:
# Median 'salary_year_avg' over the rows whose job_title_short is 'Data Analyst'.
calc_avg_salary('Data Analyst')

90000.0

In [21]:
def projected_salary(row, senior_factor=1.05, base_factor=1.03):
    """Project a row's yearly salary, with a larger raise for senior titles.

    NOTE: this re-uses the name of the scalar ``projected_salary`` defined in
    an earlier cell and replaces it. It expects a whole row (use it with
    ``DataFrame.apply(..., axis=1)``), not a single salary value.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with
            'job_title_short' and 'salary_year_avg' entries.
        senior_factor: Multiplier used when the title contains "Senior".
        base_factor: Multiplier used for every other title.

    Returns:
        'salary_year_avg' multiplied by the applicable factor.
    """
    title = row['job_title_short']
    # A missing title is NaN/None rather than text, and `"Senior" in <float>`
    # raises TypeError, so non-string titles are treated as non-senior.
    is_senior = isinstance(title, str) and "Senior" in title
    factor = senior_factor if is_senior else base_factor
    return factor * row['salary_year_avg']

# Row-wise application: every row is handed to projected_salary as a whole.
df_salary['salary_year_inflated'] = df_salary.apply(projected_salary, axis='columns')

# First 20 rows: title, original salary, projected salary.
df_salary[['job_title_short', 'salary_year_avg', 'salary_year_inflated']].head(20)

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,109500.0,112785.0
77,Data Engineer,140000.0,144200.0
92,Data Engineer,120000.0,123600.0
100,Data Scientist,228222.0,235068.66
109,Data Analyst,89000.0,91670.0
116,Data Scientist,114000.0,117420.0
146,Data Engineer,129500.0,133385.0
180,Data Analyst,90250.0,92957.5
212,Data Scientist,157500.0,162225.0
257,Data Scientist,103128.0,106221.84


# Marketing Analytics Problem

In [9]:
import pandas as pd

# Toy purchase log: one entry per transaction, keyed by customer.
# NOTE: `df` is rebound below, replacing the job-postings frame loaded earlier.
data = dict(
    customer_id=[101, 102, 103, 101, 102, 101],
    purchase_amount=[50, 75, 100, 60, 90, 40],
    purchase_date=pd.to_datetime([
        '2024-01-05', '2024-02-10', '2024-03-15',
        '2024-04-20', '2024-05-25', '2024-06-30',
    ]),
)

df = pd.DataFrame(data)

In [10]:
# Total spend per customer; as_index=False keeps customer_id as a regular column.
clv = df.groupby('customer_id', as_index=False)['purchase_amount'].sum()

In [11]:
# Relabel the two columns; the summed purchase amount becomes 'lifetime_value'.
clv.columns = ['customer_id', 'lifetime_value']

# Display the per-customer totals.
clv

Unnamed: 0,customer_id,lifetime_value
0,101,150
1,102,165
2,103,100


In [12]:
# Pivot-table form of the per-customer total: sum purchase_amount for each
# customer_id, with customer_id as the index of the result.
df_pivot = df.pivot_table(index='customer_id', values='purchase_amount', aggfunc='sum')

In [13]:
# Display df_pivot with customer_id moved from the index to a column.
# The result is not assigned, so df_pivot itself is left unchanged.
df_pivot.reset_index()

Unnamed: 0,customer_id,purchase_amount
0,101,150
1,102,165
2,103,100


In [14]:
# customer_id is already the index of df_pivot (the earlier reset_index() result
# was not assigned), so set_index('customer_id') on it directly raises KeyError.
# Move it to a column first, then set it as the index again.
df_pivot.reset_index().set_index('customer_id')