In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

In [3]:
path = Path("data") / "2025 Mercer DA project.xlsx"
df = pd.read_excel(path, sheet_name="mercer benchmark")
print(df.head())

          Geo Comptryx Code              Major Function  \
0  GB: London         RSMP3  R - Research & Development   
1  GB: London         RSMP4  R - Research & Development   
2  GB: London         RSIP1  R - Research & Development   
3  GB: London         RSIP2  R - Research & Development   
4  GB: London         RSIP3  R - Research & Development   

                          Sub Function       Function             Level  \
0  RSM - Mobile Applications Developer  RS - Software  P3 - Proficiency   
1  RSM - Mobile Applications Developer  RS - Software      P4 - Mastery   
2          RSI - User Interface Design  RS - Software        P1 - Entry   
3          RSI - User Interface Design  RS - Software   P2 - Developing   
4          RSI - User Interface Design  RS - Software  P3 - Proficiency   

  Breakout Breakout Value Currency  # Co's  ...  Mkt LTI Total 30th %''ile  \
0  Overall        Overall      GBP     5.0  ...                        0.0   
1  Overall        Overall      GBP    

In [None]:
def standardize_location(Geo):
    if pd.isna(Geo):
        return np.nan
    Geo = str(Geo).strip()

    if "Tokyo" in Geo or Geo.startswith("JP"):
        return np.nan
    # drop Tokyo and Japan-based entries

    if Geo.startswith("GB"):
        if "London" in Geo:
            return "London"
    # standardize UK locations

    if Geo.startswith("SE") or "Sweden" in Geo or "Malmo" in Geo or "Malmö" in Geo:
        return "Malmö"
    # standardize Sweden locations

    if Geo.startswith("PT") or "Lisbon" in Geo or "Portugal" in Geo:
        return "Lisbon"
    # standardize Portugal locations

    if Geo.startswith("US") and ("NY" in Geo or "Tri State" in Geo or "New York" in Geo):
        return "New York"
    # standardize New York locations

    return Geo

df["location_clean"] = df["Geo"].apply(standardize_location)
df = df.dropna(subset=["location_clean"]).copy()



<function print(*args, sep=' ', end='\n', file=None, flush=False)>

In [5]:
subfunc_title = {
    "RSM - Mobile Applications Developer": "Mobile Engineer",
    "RSG - Software - Generalist": "Full Stack Engineer",
    "RQS - Software Quality Assurance": "QA Engineer",
}

ux_title = {
    "RSI - User Interface Design",
    "RUH - Human Factors Engineering"
}
# standardize job titles

def standardize_job_title(subfunc):
    if pd.isna(subfunc):
        return np.nan
    subfunc = str(subfunc).strip()

    if subfunc in ux_title:
        return "UX Designer"
    return subfunc_title.get(subfunc, np.nan)

df["job_title_clean"] = df["Sub Function"].apply(standardize_job_title)
df = df.dropna(subset=["job_title_clean"]).copy()

In [6]:
level_mapping = {
    "P1 - Entry": "Junior",
    "P2 - Developing": "Mid",
    "P3 - Proficiency": "Senior",
    "P4 - Mastery": "Lead",
    "P5 - Expert": "Principal",
    "M5 - Director": "Head of"
}
# standardize job levels

df["job_level_clean"] = df["Level"].map(level_mapping)
df = df.dropna(subset=["job_level_clean"]).copy()

In [7]:
tx_gbp = {
    "GBP": 1.00,
    "USD": 0.80,
    "EUR": 0.85,
    "SEK": 0.075
}
# currency conversion rates to GBP

def to_gbp(amount, currency):
    if pd.isna(amount) or pd.isna(currency):
        return np.nan
    currency = str(currency).strip().upper()
    rate = tx_gbp.get(currency)
    if rate is None:
        return np.nan
    return amount * rate
# convert salaries to GBP

salary_columns = [
    "Mkt Base Salary 25th %''ile",
    "Mkt Base Salary 50th %''ile",
    "Mkt Base Salary 75th %''ile",
]
for col in salary_columns:
    df[col + "_GBP"] = df.apply(lambda row: to_gbp(row[col], row["Currency"]), axis=1)
# convert all salary columns to GBP

df["mkt_base_avg_GBP"] = df[[col + "_GBP" for col in salary_columns]].mean(axis=1)
# average market base salary in GBP

    

In [8]:
mercer_clean = df[
    [
        "location_clean",
        "job_title_clean",
        "job_level_clean",
        "Mkt Base Salary 25th %''ile_GBP",
        "Mkt Base Salary 50th %''ile_GBP",
        "Mkt Base Salary 75th %''ile_GBP",
        "mkt_base_avg_GBP"
    ]
].rename(columns={
    "Mkt Base Salary 25th %''ile_GBP": "p25_GBP",
    "Mkt Base Salary 50th %''ile_GBP": "p50_GBP",
    "Mkt Base Salary 75th %''ile_GBP": "p75_GBP",
})
# final cleaned DataFrame
print(mercer_clean.head(10))
mercer_clean

  location_clean      job_title_clean job_level_clean      p25_GBP  \
0         London      Mobile Engineer          Senior   65182.0948   
1         London      Mobile Engineer            Lead   87476.0651   
2         London          UX Designer          Junior   42740.2442   
3         London          UX Designer             Mid   42344.5234   
4         London          UX Designer          Senior   60424.4016   
5         London          UX Designer            Lead   75569.3333   
6         London          UX Designer       Principal   90897.6310   
7         London          UX Designer         Head of  113252.9600   
8         London  Full Stack Engineer          Junior   35872.6943   
9         London  Full Stack Engineer             Mid   64366.3255   

       p50_GBP      p75_GBP  mkt_base_avg_GBP  
0   71944.8550   85015.8194      74047.589733  
1   88593.3182   90590.5172      88886.633500  
2   46535.4578   49760.7136      46345.471867  
3   49554.1926   53958.4365      4861

Unnamed: 0,location_clean,job_title_clean,job_level_clean,p25_GBP,p50_GBP,p75_GBP,mkt_base_avg_GBP
0,London,Mobile Engineer,Senior,65182.09480,71944.85500,85015.81940,74047.589733
1,London,Mobile Engineer,Lead,87476.06510,88593.31820,90590.51720,88886.633500
2,London,UX Designer,Junior,42740.24420,46535.45780,49760.71360,46345.471867
3,London,UX Designer,Mid,42344.52340,49554.19260,53958.43650,48619.050833
4,London,UX Designer,Senior,60424.40160,73127.88960,82628.55890,72060.283367
...,...,...,...,...,...,...,...
67,New York,Full Stack Engineer,Principal,175112.20744,193963.78584,215681.16240,194919.051893
68,New York,Full Stack Engineer,Head of,192021.82400,225458.34520,247376.02936,221618.732853
69,New York,QA Engineer,Mid,77075.69712,82832.56000,91115.81600,83674.691040
70,New York,QA Engineer,Senior,98157.05680,102114.32488,110709.51600,103660.299227


In [9]:
mercer_clean["location_clean"].value_counts()
mercer_clean["job_title_clean"].value_counts()
mercer_clean["job_level_clean"].value_counts()
mercer_clean.isnull().sum()

location_clean      0
job_title_clean     0
job_level_clean     0
p25_GBP             0
p50_GBP             0
p75_GBP             0
mkt_base_avg_GBP    0
dtype: int64

In [10]:
import os, sys, importlib
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
print("cwd:", Path.cwd())
print("project root:", PROJECT_ROOT)

if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

import src.api as api
importlib.reload(api)


cwd: /Users/marisa/Desktop/Data Analytics/Week7/Tech-Salary-Benchmarking-Cross-Market-Analysis-for-Competitive-Salary-Bands/notebooks
project root: /Users/marisa/Desktop/Data Analytics/Week7/Tech-Salary-Benchmarking-Cross-Market-Analysis-for-Competitive-Salary-Bands


<module 'src.api' from '/Users/marisa/Desktop/Data Analytics/Week7/Tech-Salary-Benchmarking-Cross-Market-Analysis-for-Competitive-Salary-Bands/src/api.py'>

In [11]:
import json
from pathlib import Path
import pandas as pd

RAW_API_DIR = Path("data/raw_api")
print("Loading from:", RAW_API_DIR.resolve())

Loading from: /Users/marisa/Desktop/Data Analytics/Week7/Tech-Salary-Benchmarking-Cross-Market-Analysis-for-Competitive-Salary-Bands/notebooks/data/raw_api


In [12]:
def load_cached_api_rows(raw_dir=RAW_API_DIR):
    rows = []
    for fp in raw_dir.glob("*.json"):
        with open(fp, "r", encoding="utf-8") as f:
            playload = json.load(f)

        data = playload.get("data", [])
        if isinstance(data, dict):
            data = [data]
        if not data:
            continue

        for d in data:
            if d.get("min_salary") is None and d.get("max_salary") is None:
                continue

            rows.append({
                "job_title": d.get("job_title"),
                "location": d.get("location"),
                "min_salary": d.get("min_salary"),
                "max_salary": d.get("max_salary"),
                "avg_salary": d.get("avg_salary"),
                "currency": d.get("currency"),
                "salary_period": d.get("salary_period"),
                "confidence": d.get("confidence"),
                "publisher_name": d.get("publisher_name"),
                "source_file": fp.name
            })

    return pd.DataFrame(rows)

api_rows = load_cached_api_rows()
api_rows.head(10)

Unnamed: 0,job_title,location,min_salary,max_salary,avg_salary,currency,salary_period,confidence,publisher_name,source_file
0,UX Designer,"Lund, Sweden",37152.004167,46809.8875,,,MONTH,VERY_HIGH,Glassdoor,"ux-designer_malmo,-sweden.json"
1,QA Engineer,"Lisbon, Portugal",1354.791667,2708.0,,,MONTH,VERY_HIGH,Glassdoor,"qa_engineer__lisbon,_portugal.json"
2,Software Developer,"Lisbon, Portugal",1612.5,2679.166667,,,MONTH,VERY_HIGH,Glassdoor,"software_developer__lisbon,_portugal.json"
3,Backend Engineer,New York State,121388.53,212611.99,,,YEAR,CONFIDENT,Glassdoor,"backend_engineer__new_york,_usa.json"
4,UX Designer,"Lund, Sweden",37148.458333,46810.3125,,,MONTH,VERY_HIGH,Glassdoor,"ux_designer__malmo,_sweden.json"
5,Frontend Engineer,New York State,113983.96,193714.94,,,YEAR,CONFIDENT,Glassdoor,"frontend_engineer__new_york,_usa.json"
6,QA Engineer,"Lisbon, Portugal",1354.791667,2708.0,,,MONTH,VERY_HIGH,Glassdoor,"qa-engineer_lisbon,-portugal.json"


In [16]:
def standardize_location_api(loc):
    if pd.isna(loc):
        return None
    loc = str(loc).lower()

    if "london" in loc or "gb" in loc or "uk" in loc:
        return "London"
    if "lisbon" in loc or "portugal" in loc or loc.startswith("pt"):
        return "Portugal"
    if "malmo" in loc or "sweden" in loc:
        return "Sweden"
    if "new york" in loc or "ny" in loc or "tri state" in loc:
        return "New York"
    if "tokyo" in loc or "japan" in loc:
        return None 
    return loc.title()
# Standardize locations and drop Tokyo

api_clean = api_rows.copy()
api_clean["location_clean"] = api_clean["location"].apply(standardize_location_api)
api_clean = api_clean.dropna(subset=["location_clean"])

In [18]:
def standardize_job_title_api(title):
    if pd.isna(title):
        return None
    t = str(title).lower()

    if "mobile" in t:
        return "Mobile Engineer"
    if "ux" in t or "user interface" in t or "human factors" in t:
        return "UX Designer"
    if "full stack" in t or "software generalist" in t or "software engineer" in t or "software developer" in t:
        return "Full Stack Engineer"
    if "qa" in t or "quality assurance" in t:
        return "QA Engineer"

    return title
#Standardize Job Titles 

api_clean["job_title_clean"] = api_clean["job_title"].apply(standardize_job_title_api)
api_clean = api_clean.dropna(subset=["job_title_clean"])


In [19]:
for c in ["min_salary", "max_salary", "avg_salary"]:
    api_clean[c] = pd.to_numeric(api_clean[c], errors="coerce")
# Convert to numbers

api_clean["avg_salary"] = api_clean["avg_salary"].fillna(
    (api_clean["min_salary"] + api_clean["max_salary"]) / 2
)
# Add missing average

In [20]:
FX_TO_GBP = {
    "GBP": 1.00,
    "USD": 0.80,
    "EUR": 0.85,
    "SEK": 0.075
}
#Convertion rates

def to_gbp(amount, currency):
    if pd.isna(amount) or pd.isna(currency):
        return None
    rate = FX_TO_GBP.get(str(currency).upper())
    if rate is None:
        return None
    return amount * rate

api_clean["avg_salary_gbp"] = api_clean.apply(
    lambda r: to_gbp(r["avg_salary"], r["currency"]), axis=1
)
# Currency convertion

api_clean = api_clean.dropna(subset=["avg_salary_gbp"])
api_clean.head(10)
print("API clean shape:", api_clean.shape)

API clean shape: (0, 13)


In [21]:
def annualize_gbp(avg_gbp, period):
    if pd.isna(avg_gbp) or pd.isna(period):
        return None
    p = str(period).upper()
    if p == "MONTH":
        return avg_gbp * 12
    if p == "YEAR":
        return avg_gbp
    return None

api_clean["avg_salary_annual_gbp"] = api_clean.apply(
    lambda r: annualize_gbp(r["avg_salary_gbp"], r["salary_period"]), axis=1
)
# Convert any monthly salary to annual

api_clean = api_clean.dropna(subset=["avg_salary_annual_gbp"])
api_clean.head(10)

Unnamed: 0,job_title,location,min_salary,max_salary,avg_salary,currency,salary_period,confidence,publisher_name,source_file,location_clean,job_title_clean,avg_salary_gbp,avg_salary_annual_gbp
