In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np
from dotenv import load_dotenv
load_dotenv()
import chart_studio.plotly as py
import chart_studio.tools as tls
import chart_studio
import os

In [6]:
# To embed charts in online article
# Credentials are read from the environment (load_dotenv() is called in the imports cell);
# indexing os.environ raises KeyError immediately if either variable is missing.
plotly_username = os.environ['plotly_username']
plotly_apikey = os.environ['plotly_apikey']

# Hand the credentials to chart_studio so later uploads can authenticate
chart_studio.tools.set_credentials_file(
    username=plotly_username,
    api_key=plotly_apikey,
)

# Introduction

This notebook generates the graphs and tables used in the **GES 2024** article analysis. 

In [7]:
# To track "tech-related" degrees
# Maps each university code to the course names treated as tech degrees in this notebook.
tech_degrees = {
    'NUS': [
        'Data Science and Analytics',
        'Business Analytics',
        'Computer Science',
        'Computer Engineering',
        'Information Security',
        'Information Systems',
    ],
    'NTU': [
        'Computer Engineering',
        'Computer Science',
        'Data Science & Artificial Intelligence',
    ],
    'SMU': [
        'Information Systems',
    ],
}

In [8]:
# Data Preparation

# Data has been updated to include '24 GES + IGP data
orig_df = pd.read_csv("../../data/graphing_table.csv")
# Drop columns that are entirely missing
orig_df = orig_df.dropna(axis=1, how='all')

# Metrics that receive a per-year 25th percentile / median / 75th percentile summary.
# Defined before any loop so later cells can rely on it even if the data has no universities.
metrics = ['Places', 'GPA', 'RP',  'employment_rate_overall','employment_rate_ft_perm','basic_monthly_mean','basic_monthly_median','gross_monthly_mean','gross_monthly_median','gross_mthly_25_percentile','gross_mthly_75_percentile']

# Every row read from the CSV is a course-level record; summary rows are appended below
orig_df['summary_row'] = 0


def _quantile_summary_rows(course_rows, uni_value, label_prefix):
    """Build per-year quantile summary rows for the columns listed in ``metrics``.

    Parameters
    ----------
    course_rows : pd.DataFrame
        Course-level rows to summarise. Must not contain summary rows, otherwise
        the quantiles are computed over a mix of courses and earlier summaries.
    uni_value : object
        Value written to the ``uni`` column of the generated rows.
    label_prefix : str
        Prefix of the ``course`` label, e.g. ``'NUS'`` yields ``'NUS Median'``.

    Returns
    -------
    pd.DataFrame
        One row per year and quantile, ordered Median, 25th Percentile,
        75th Percentile, with ``summary_row`` set to 1.
    """
    quantile_labels = (('Median', 0.5), ('25th Percentile', 0.25), ('75th Percentile', 0.75))
    tables = []
    for suffix, q in quantile_labels:
        table = course_rows.groupby('year', as_index=False)[metrics].agg(lambda col: col.quantile(q))
        table['uni'] = uni_value
        table['course'] = f'{label_prefix} {suffix}'
        tables.append(table)
    summary = pd.concat(tables, ignore_index=True)
    summary['summary_row'] = 1
    return summary


# Frame holding only the course-level rows. All summaries (per university AND overall)
# are computed from this frame, so the university summary rows never leak into the
# "Overall" quantiles.
course_level_df = orig_df

# University level summaries
summary_tables = [
    _quantile_summary_rows(course_level_df.loc[course_level_df['uni'] == uni_tmp], uni_tmp, uni_tmp)
    for uni_tmp in course_level_df['uni'].unique().tolist()
]

# Summary at the overall level (no university attached)
summary_tables.append(_quantile_summary_rows(course_level_df, np.nan, 'Overall'))

# Join back to orig_df: course rows first, then each university's summaries, then overall
orig_df = pd.concat([course_level_df, *summary_tables], ignore_index=True)

# track if a row is a course row or not (exact inverse of summary_row)
orig_df['course_row'] = 1 - orig_df['summary_row']

# Create the is_tech_degree field explicitly
orig_df['is_tech_degree'] = 0
for uni in tech_degrees:
    for course in tech_degrees[uni]:
        orig_df.loc[(orig_df['uni'] == uni) & (orig_df['course'] == course), 'is_tech_degree'] = 1

In [5]:
# orig_df.to_csv("../../data/2024_graphing_aggrowsIncluded.csv", index=False)

# Ideas

Points to investigate for this year's article: 

- Last year it was established that tech degrees experienced the greatest surge in popularity (2013 -> 2023), has that changed this year for these degrees? Can produce a table / chart of RP changes for tech degrees from '23 to '24 + see which non-tech degrees experienced a rise in RP(?)

- Recreate the yearly Degree Starting Salaries Scatterplot including '24. Further drill down into tech-degree median salaries only? 
    - Could relate this to the "Change in Median Salaries for Tech Degrees" plot, this time comparing '23 to '24.

- Try coming up with a visualization for the higher variance in starting salaries for tech degrees in '24

- Deeper analysis of employment trends for '24. Maybe some plot of salaries vs employment? 


# Admission Criteria Changes

Charting how the admissions criteria (RP) changed from '23 to '24 for tech degrees. 

Also look for any unexpected rises in popularity

In [26]:
# Duplicate Original Table
df = orig_df.copy()

# Create a flat list of all tech degrees tuples (uni, course)
all_tech_degrees = [
    (uni_name, course_name)
    for uni_name, course_names in tech_degrees.items()
    for course_name in course_names
]
# Set version of the same pairs so each per-row membership test is O(1)
tech_pairs = set(all_tech_degrees)

# Course-level rows (summary rows excluded) for the two years being compared
recent_df = df[df['year'].isin([2023, 2024]) & (df['summary_row'] != 1)]

# Split into tech / non-tech with a single boolean mask instead of growing frames
# row-by-row; this also keeps the original column dtypes intact.
is_tech_row = np.array(
    [pair in tech_pairs for pair in zip(recent_df['uni'], recent_df['course'])],
    dtype=bool,
)
tech_df = recent_df[is_tech_row]
non_tech_df = recent_df[~is_tech_row]

# Combine tech and non-tech dataframes
combined_df = pd.concat([tech_df, non_tech_df])

# Add a column to indicate tech degrees
combined_df['tech_degree'] = [
    int(pair in tech_pairs)
    for pair in zip(combined_df['uni'], combined_df['course'])
]

# Create a pivot table to compare RP scores for all degrees
rp_comparison = combined_df.pivot_table(
    index=['uni', 'course', 'tech_degree'],
    columns='year',
    values='RP',
    aggfunc='first'  # Take the first value in case of duplicates
)

# Calculate the change in RP from 2023 to 2024
rp_comparison['Change'] = rp_comparison[2024] - rp_comparison[2023]
rp_comparison['Change_abs'] = abs(rp_comparison['Change'])

# Reset index for better display
rp_comparison = rp_comparison.reset_index()

# Sort by tech_degree first (tech degrees on top) and then by absolute change
rp_comparison = rp_comparison.sort_values(['tech_degree', 'Change_abs'], ascending=[False, False])

# Format the final output
result_table = rp_comparison.copy()
result_table['2023 RP'] = result_table[2023]
result_table['2024 RP'] = result_table[2024]
# Arrow for a rise, a fall, or no change (NaN changes also fall through to the dash)
result_table['Change Direction'] = result_table['Change'].apply(
    lambda x: '↑' if x > 0 else ('↓' if x < 0 else '−')
)
result_table = result_table[['uni', 'course', 'tech_degree', '2023 RP', '2024 RP', 'Change', 'Change Direction']]\
                .reset_index(drop=True)
result_table.columns.name=None

# Display the combined table
print("RP Changes for Degrees (2023-2024)")

# # Alternatively, you can limit to top changes if the table is too large
# # Uncomment the following lines to show only tech degrees and top 15 non-tech degrees

# tech_result = result_table[result_table['tech_degree'] == 1]
# non_tech_result = result_table[result_table['tech_degree'] == 0].head(15)
# final_result = pd.concat([tech_result, non_tech_result])
# print(final_result.to_string(index=False))


RP Changes for Degrees (2023-2024)


Concerning tech-degrees, not much change. Still as popular as they ever were. 

In [27]:
# Ten non-tech degrees with the largest absolute RP change (result_table is already sorted)
result_table[result_table['tech_degree'] == 0].head(10)

Unnamed: 0,uni,course,tech_degree,2023 RP,2024 RP,Change,Change Direction
10,NUS,Real Estate,0,73.75,82.5,8.75,↑
11,NTU,Physics / Applied Physics,0,62.5,67.5,5.0,↑
12,NTU,Environmental Engineering,0,68.75,65.0,-3.75,↓
13,NTU,Mathematical Sciences,0,68.75,72.5,3.75,↑
14,NTU,Science (Education),0,76.25,80.0,3.75,↑
15,NTU,Accountancy,0,72.5,75.0,2.5,↑
16,NTU,"Art, Design and Media",0,72.5,70.0,-2.5,↓
17,NTU,Arts (Education),0,70.0,72.5,2.5,↑
18,NTU,Chemistry and Biological Chemistry,0,72.5,70.0,-2.5,↓
19,NTU,Civil Engineering,0,63.75,61.25,-2.5,↓


Among non-tech degrees, Real Estate (NUS) shows a *seemingly* large jump in RP from 2023 to 2024 (73.75 → 82.5). However, this is <mark>unlikely to be because the course itself became more popular</mark>. 

From 2024 onwards the admissions for Real Estate, Business Administration, and Accountancy were merged into a single intake. The higher RP admission scores are likely from students applying for the other two courses, not for Real Estate. 

# Degree Starting Salaries

In [29]:
df = orig_df.copy()
# Flag tech-related degrees: a course is "tech" when it is listed under its university in tech_degrees
df['is_tech_degree'] = [
    1 if course_name in tech_degrees.get(uni_name, []) else 0
    for uni_name, course_name in zip(df['uni'], df['course'])
]

# Adjust gross_monthly_median to thousands
df['gross_monthly_median'] = df['gross_monthly_median'].div(1000)

tech_degree_color = '#FF6173'
non_tech_degree_color = 'grey'

# Create the figure
fig = go.Figure()

# Plot the line plot for the overall median
median_df = df[(df['course'] == 'Overall Median') & (df['summary_row'] == 1)]
fig.add_trace(
    go.Scatter(
        x=median_df['year'],
        y=median_df['gross_monthly_median'],
        mode='lines',
        line={'color': 'black', 'dash': 'dash'},
        name='Overall Median',
    )
)

# Compute the yearly median salary for tech degrees (course-level rows only)
tdf = df[(df['is_tech_degree'] == 1) & (df['summary_row'] == 0)]
# Median Rows
tdf_median = tdf.groupby('year', as_index=False)[metrics].agg(lambda x: x.quantile(0.5))
tdf_median['uni'] = np.nan
tdf_median['course'] = 'Tech Degree Median'
tdf_median['summary_row'] = 1

fig.add_trace(
    go.Scatter(
        x=tdf_median['year'],
        y=tdf_median['gross_monthly_median'],
        mode='lines',
        line={'color': 'red', 'dash': 'dash'},
        name='Tech Degree Median',
    )
)

# Plot scatter plots for each non-summary course.
# Sorting puts tech degrees first, which decides the order the traces are added in.
df = df.sort_values('is_tech_degree', ascending=False)
for uni in df['uni'].unique():
    uni_df = df[(df['uni'] == uni) & (df['course_row'] == 1)]
    for course in uni_df['course'].unique():
        course_df = uni_df[uni_df['course'] == course]
        if course_df['is_tech_degree'].iloc[0] == 1:
            color = tech_degree_color
        else:
            color = non_tech_degree_color
        legend_name = 'Tech Degree' if color == tech_degree_color else 'Non-Tech Degree'
        # Only the first trace of each degree type gets a legend entry
        already_in_legend = any(trace['name'] == legend_name for trace in fig.data)
        fig.add_trace(
            go.Scatter(
                x=course_df['year'],
                y=course_df['gross_monthly_median'],
                mode='markers',
                marker={'color': color, 'opacity': 0.7},
                name=legend_name,
                showlegend=not already_in_legend,
                hovertemplate=f"{uni}<br>{course}<br>Salary: %{{y:.2f}}K",
            )
        )

# Customize the layout
fig.update_layout(
    xaxis_fixedrange=True,
    yaxis_fixedrange=True,
    title={
        'text': 'Degree starting salaries',
        'y': 0.95,
        'x': 0.1,
        'xanchor': 'left',
        'yanchor': 'top',
    },
    xaxis_title="Year",
    yaxis_title="Gross Monthly Median (SGD Thousands)",
    legend={
        'orientation': "h",
        'yanchor': "bottom",
        'y': 1.02,
        'xanchor': "left",
        'x': 0,
    },
    # Yearly ticks
    xaxis={'tickmode': 'linear', 'dtick': 1},
    # $500 increments (the axis is in thousands)
    yaxis={'tickmode': 'linear', 'dtick': 0.5},
    template='ggplot2',
    margin={'r': 10},
)

# Show the figure
fig.show()


The gap between the starting salaries of tech-related degrees vs non tech-related degrees continues to widen. 

In [None]:
# Host on plotly
# py.plot(fig, filename= 'yearly_salaries_2024', auto_open=True)

'https://chart-studio.plotly.com/~carel/16/'

## Aggregate statistics for tech vs non-tech degrees

In [7]:
# Load the data
df = orig_df.copy()

# Filter for 2022, 2023 and 2024 data
df_recent = df[(df['year'] >= 2022) & (df['year'] <= 2024)]

# Filter out summary rows (rows where summary_row is 1).
# The explicit .copy() makes df_courses an independent frame, so adding the 'is_tech'
# column below no longer writes to a slice of df_recent (SettingWithCopyWarning).
df_courses = df_recent[df_recent['summary_row'] != 1].copy()


def is_tech_degree(row):
    """Return True when the row's (uni, course) pair is listed in ``tech_degrees``."""
    uni = row['uni']
    course = row['course']
    return uni in tech_degrees and course in tech_degrees[uni]


# Apply the function to create a new column
df_courses['is_tech'] = df_courses.apply(is_tech_degree, axis=1)

# Group by year and tech status, then calculate statistics
stats = df_courses.groupby(['year', 'is_tech'])[
    ['gross_mthly_25_percentile', 'gross_monthly_median', 'gross_mthly_75_percentile']
].median()

# Create a comparison table
rows = [
    'Tech 2022', 'Tech 2023', 'Tech 2024',
    'Tech % Change 2022-2023', 'Tech % Change 2023-2024',
    'Non-Tech 2022', 'Non-Tech 2023', 'Non-Tech 2024',
    'Non-Tech % Change 2022-2023', 'Non-Tech % Change 2023-2024',
    'Tech vs Non-Tech % Diff 2022', 'Tech vs Non-Tech % Diff 2023', 'Tech vs Non-Tech % Diff 2024'
]

comparison = pd.DataFrame(index=rows,
                          columns=['25th Percentile', 'Median', '75th Percentile'])

# Mapping of column names
col_mapping = {
    'gross_mthly_25_percentile': '25th Percentile',
    'gross_monthly_median': 'Median',
    'gross_mthly_75_percentile': '75th Percentile'
}


def _pct_diff(value, baseline):
    """Percentage difference of ``value`` relative to ``baseline``."""
    return ((value / baseline) - 1) * 100


# Fill in the data. The labels built below must match the entries of `rows` exactly;
# otherwise .loc would append a new row instead of filling an existing one.
salary_years = [2022, 2023, 2024]
for metric, col_name in col_mapping.items():
    # Per-year group medians for tech (is_tech == True) and non-tech (is_tech == False)
    yearly = {
        label: {year: stats.loc[(year, flag), metric] for year in salary_years}
        for label, flag in (('Tech', True), ('Non-Tech', False))
    }

    for label, values in yearly.items():
        # Raw values (no special formatting)
        for year in salary_years:
            comparison.loc[f'{label} {year}', col_name] = values[year]
        # Year-on-year percentage change
        for prev_year, year in zip(salary_years, salary_years[1:]):
            comparison.loc[f'{label} % Change {prev_year}-{year}', col_name] = _pct_diff(
                values[year], values[prev_year]
            )

    # Tech premium over non-tech within each year
    for year in salary_years:
        comparison.loc[f'Tech vs Non-Tech % Diff {year}', col_name] = _pct_diff(
            yearly['Tech'][year], yearly['Non-Tech'][year]
        )

# Let's also see how many tech vs non-tech courses we're comparing
# Check that these are fairly consistent every year
tech_counts = df_courses.groupby(['year', 'is_tech']).size()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_courses['is_tech'] = df_courses.apply(is_tech_degree, axis=1)


In [8]:
# Display the tech vs non-tech salary comparison table
comparison

Unnamed: 0,25th Percentile,Median,75th Percentile
Tech 2022,4850.0,5500.0,6447.5
Tech 2023,4850.0,5450.0,6230.0
Tech 2024,4950.0,5500.0,6550.0
Tech % Change 2022-2023,0.0,-0.909091,-3.373401
Tech % Change 2023-2024,2.061856,0.917431,5.136437
Non-Tech 2022,3625.0,4000.0,4562.5
Non-Tech 2023,3800.0,4129.0,4550.0
Non-Tech 2024,4000.0,4342.0,4917.5
Non-Tech % Change 2022-2023,4.827586,3.225,-0.273973
Non-Tech % Change 2023-2024,5.263158,5.158634,8.076923


From '23 to '24, the salary gap between tech and non-tech degrees is holding steady / dropping very slightly. 

But it's not all good, need to look at employment rates. 

# Change in Tech Salaries by year

In [16]:
# Initial years for comparison
start_year, end_year = 2023, 2024
df = orig_df.copy()

# Flag tech-related degrees: a course is "tech" when it is listed under its university in tech_degrees
df['is_tech_degree'] = [
    1 if course_name in tech_degrees.get(uni_name, []) else 0
    for uni_name, course_name in zip(df['uni'], df['course'])
]

# Course-level rows of tech degrees in the two years being compared
tech_degrees_df = df[
    (df['is_tech_degree'] == 1)
    & (df['course_row'] == 1)
    & df['year'].isin([start_year, end_year])
]

# Compute the change in salary (one row per degree, one column per year)
change_df = tech_degrees_df.pivot_table(
    index=['uni', 'course'],
    columns='year',
    values='gross_monthly_median',
).reset_index()
change_df['change'] = change_df[end_year] - change_df[start_year]
change_df['course_uni'] = change_df.apply(lambda x: f"{x['course']} ({x['uni']})", axis=1)

# Sort the DataFrame by the 'change' column, largest increase first
change_df = change_df.sort_values(by='change', ascending=False)


def _signed_int_label(value):
    """Format a change as an integer string, prefixing strictly positive values with '+'."""
    return f"+{int(value)}" if value > 0 else f"{int(value)}"


# Create the bar plot with flipped axes
fig = go.Figure()
fig.add_trace(
    go.Bar(
        y=change_df['course_uni'],
        x=change_df['change'],
        orientation='h',
        # Red for a drop, green otherwise
        marker_color=['red' if x < 0 else 'green' for x in change_df['change']],
        text=[_signed_int_label(x) for x in change_df['change']],
        textposition='outside',
    )
)

# Add a vertical dashed line at x=0 spanning all bars
fig.add_shape(
    type="line",
    x0=0,
    y0=-0.5,
    x1=0,
    y1=len(change_df) - 0.5,
    line={'color': "black", 'width': 2, 'dash': "dash"},
)

# Customize the layout; the x-range is padded by 100 on each side so outside labels fit
fig.update_layout(
    xaxis_fixedrange=True,
    yaxis_fixedrange=True,
    title=f"Change in Median Salaries for Tech Degrees ({start_year} to {end_year})",
    xaxis_title="Change in Gross Monthly Median Salary",
    xaxis={
        'tickformat': "$,.0f",
        'zeroline': False,
        'range': [min(change_df['change']) - 100, max(change_df['change']) + 100],
    },
    yaxis={'autorange': "reversed"},
    margin={'r': 10, 'l': 15},
)

# Show the figure for Task 1
fig.show()

In [None]:
# # Visualization to compare changes across three years
# # Years to include
# years = [2022, 2023, 2024]

# # Filter tech degrees for all three years
# tech_three_years = df[(df['is_tech_degree'] == 1) & (df['course_row'] == 1) & df['year'].isin(years)]

# # Pivot the data to get median salaries for each year
# pivot_df = tech_three_years.pivot_table(
#     index=['uni', 'course'], 
#     columns='year', 
#     values='gross_monthly_median'
# ).reset_index()

# # Calculate changes between years and net change
# pivot_df['change_22_23'] = pivot_df[2023] - pivot_df[2022]
# pivot_df['change_23_24'] = pivot_df[2024] - pivot_df[2023]
# pivot_df['change_22_24'] = pivot_df[2024] - pivot_df[2022]  # Net change

# # Create course_uni label
# pivot_df['course_uni'] = pivot_df.apply(lambda x: f"{x['course']} ({x['uni']})", axis=1)

# # Sort by net change
# pivot_df.sort_values(by='change_22_24', ascending=False, inplace=True)

# # Create the grouped bar chart
# fig2 = go.Figure()

# # Add bars for each type of change
# fig2.add_trace(go.Bar(
#     y=pivot_df['course_uni'],
#     x=pivot_df['change_22_23'],
#     name='2022 to 2023',
#     orientation='h',
#     marker_color='blue',
#     # Round to nearest integer
#     text=[f"+{int(round(x))}" if x > 0 else f"{int(round(x))}" for x in pivot_df['change_22_23']],
#     textposition='outside'
# ))

# fig2.add_trace(go.Bar(
#     y=pivot_df['course_uni'],
#     x=pivot_df['change_23_24'],
#     name='2023 to 2024',
#     orientation='h',
#     marker_color='green',
#     # Round to nearest integer
#     text=[f"+{int(round(x))}" if x > 0 else f"{int(round(x))}" for x in pivot_df['change_23_24']],
#     textposition='outside'
# ))

# fig2.add_trace(go.Bar(
#     y=pivot_df['course_uni'],
#     x=pivot_df['change_22_24'],
#     name='Net (2022 to 2024)',
#     orientation='h',
#     # Changed color from purple to pink
#     marker_color='pink',
#     # Round to nearest integer
#     text=[f"+{int(round(x))}" if x > 0 else f"{int(round(x))}" for x in pivot_df['change_22_24']],
#     textposition='outside'
# ))

# # Add a vertical dashed line at x=0
# fig2.add_shape(
#     type="line",
#     x0=0, y0=-0.5, x1=0, y1=len(pivot_df) - 0.5,
#     line=dict(color="black", width=2, dash="dash")
# )

# # Customize the layout
# fig2.update_layout(
#     title="Changes in Median Salaries for Tech Degrees (2022-2024)",
#     xaxis_title="Change in Gross Monthly Median Salary",
#     xaxis=dict(
#         tickformat="$,.0f",
#         zeroline=False,
#         # Widen the x-axis range to give annotations more space
#         range=[min(pivot_df['change_22_23'].min(), pivot_df['change_23_24'].min(), 
#                pivot_df['change_22_24'].min()) - 100, 
#                max(pivot_df['change_22_23'].max(), pivot_df['change_23_24'].max(), 
#                pivot_df['change_22_24'].max()) + 100]
#     ),
#     yaxis=dict(
#         autorange="reversed",
#     ),
#     # Add spacing between bar groups
#     bargap=0.3,
#     bargroupgap=0.1,
#     barmode='group',
#     legend=dict(
#         orientation="h",
#         yanchor="bottom",
#         y=1.02,
#         xanchor="right",
#         x=1
#     ),
#     # Increase the height of the plot to prevent crowding
#     height=600,
#     margin=dict(r=30, l=30, t=80, b=30)
# )

# # Show the figure for Task 2
# fig2.show()

In [38]:
# Create a copy of the original dataframe
df = orig_df.copy()

# Create the is_tech_degree field explicitly: default 0, then 1 for every listed (uni, course)
df['is_tech_degree'] = 0
for uni, tech_courses in tech_degrees.items():
    for course in tech_courses:
        is_listed = (df['uni'] == uni) & (df['course'] == course)
        df.loc[is_listed, 'is_tech_degree'] = 1

# Years to include
years = [2022, 2023, 2024]

# Filter tech degrees for all three years (course-level rows only)
tech_three_years = df[
    (df['is_tech_degree'] == 1)
    & (df['course_row'] == 1)
    & df['year'].isin(years)
]

# Pivot the data to get median salaries for each year
pivot_df = tech_three_years.pivot_table(
    index=['uni', 'course'],
    columns='year',
    values='gross_monthly_median',
).reset_index()

# Calculate changes between years and net change
pivot_df['change_22_23'] = pivot_df[2023] - pivot_df[2022]
pivot_df['change_23_24'] = pivot_df[2024] - pivot_df[2023]
pivot_df['change_22_24'] = pivot_df[2024] - pivot_df[2022]  # Net change

# Create course_uni label
pivot_df['course_uni'] = pivot_df.apply(lambda x: f"{x['course']} ({x['uni']})", axis=1)

# Sort by net change, largest first
pivot_df = pivot_df.sort_values(by='change_22_24', ascending=False)

# Define colors
green_color = 'green'
net_color = '#a83268'  # burgundy/pink


def _signed_rounded_label(value):
    """Format a change rounded to the nearest integer, prefixing strictly positive values with '+'."""
    return f"+{int(round(value))}" if value > 0 else f"{int(round(value))}"


# Create simplified chart with 2023-2024 and Net change
fig_simple = go.Figure()

# Add only the two most relevant bar types, in this order
bar_specs = (
    ('change_23_24', '2023 to 2024', green_color),
    ('change_22_24', 'Net (2022 to 2024)', net_color),
)
for change_col, trace_name, bar_color in bar_specs:
    fig_simple.add_trace(
        go.Bar(
            y=pivot_df['course_uni'],
            x=pivot_df[change_col],
            name=trace_name,
            orientation='h',
            marker_color=bar_color,
            text=[_signed_rounded_label(x) for x in pivot_df[change_col]],
            textposition='outside',
            # Match annotation color to bar color
            textfont={'color': bar_color},
        )
    )

# Add a vertical dashed line at x=0
fig_simple.add_shape(
    type="line",
    x0=0,
    y0=-0.5,
    x1=0,
    y1=len(pivot_df) - 0.5,
    line={'color': "black", 'width': 2, 'dash': "dash"},
)

# X-range covers both plotted series, padded by 100 on each side for the outside labels
x_low = min(pivot_df['change_23_24'].min(), pivot_df['change_22_24'].min()) - 100
x_high = max(pivot_df['change_23_24'].max(), pivot_df['change_22_24'].max()) + 100

# Customize the layout with horizontal gridlines
fig_simple.update_layout(
    title={
        'text': "Changes in Median Salaries for Tech Degrees (2022-2024)",
        'y': 0.95,
        'x': 0.5,  # Center the title
        'xanchor': 'center',
        'yanchor': 'top',
    },
    xaxis_title="Change in Gross Monthly Median Salary",
    xaxis={
        'tickformat': "$,.0f",
        'zeroline': False,
        'range': [x_low, x_high],
    },
    yaxis={
        'autorange': "reversed",
        'gridcolor': 'lightgray',
        'gridwidth': 1,
        'griddash': 'dot',
    },
    bargap=0.3,
    bargroupgap=0.1,
    barmode='group',
    legend={
        'orientation': "h",
        'yanchor': "bottom",
        'y': 1.02,  # Position above the plot area
        'xanchor': "center",
        'x': 0.5,   # Center horizontally
        'font': {'size': 14},  # Make legend text larger
    },
    height=600,
    # Increase top margin to accommodate the legend
    margin={'r': 30, 'l': 30, 't': 100, 'b': 30},
    plot_bgcolor='white',  # White background makes gridlines more visible
)

# Show the figure
fig_simple.show()

In [None]:
# Host on plotly
# py.plot(fig_simple, filename= 'tech_degree_salary_changes_2024', auto_open=True)

'https://chart-studio.plotly.com/~carel/18/'

# Employment Trends

In [None]:
df = orig_df.copy()

# Filter out summary rows
df_courses = df[df['summary_row'] == 0]

# One filtered frame per (uni, course) pair listed in tech_degrees
tech_df_rows = [
    df_courses[(df_courses['uni'] == uni_name) & (df_courses['course'] == course_name)]
    for uni_name, course_names in tech_degrees.items()
    for course_name in course_names
]

# Combine all filtered rows
tech_df = pd.concat(tech_df_rows, ignore_index=True)

# Group by year and calculate average employment rates
rate_columns = ['employment_rate_overall', 'employment_rate_ft_perm']
yearly_avg = tech_df.groupby('year')[rate_columns].mean().reset_index()

# Create a line graph using Plotly
fig = go.Figure()

# One line per employment measure: overall first, then full-time permanent
line_specs = (
    ('employment_rate_overall', 'Overall Employment Rate', 'royalblue'),
    ('employment_rate_ft_perm', 'Full-time Permanent Employment Rate', 'firebrick'),
)
for rate_column, trace_name, line_color in line_specs:
    fig.add_trace(
        go.Scatter(
            x=yearly_avg['year'],
            y=yearly_avg[rate_column],
            mode='lines+markers',
            name=trace_name,
            line={'color': line_color, 'width': 2},
            hovertemplate='Year: %{x}<br>Rate: %{y:.1f}%<extra></extra>',
        )
    )

# Update layout
fig.update_layout(
    title={
        'text': 'Average Employment Rates for Tech Degrees (2013-2024)',
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    xaxis_title='Year',
    yaxis_title='Employment Rate (%)',
    yaxis={
        'tickvals': [80, 85, 90, 95, 100],  # Set specific tick values at intervals of 5
        'range': [80, 100],  # Fixed y-axis range from 80 to 100
        'gridcolor': 'lightgray',
    },
    xaxis={
        'gridcolor': 'lightgray',
        'dtick': 1,  # Show every year on x-axis
    },
    legend={
        'x': 0.05,  # Horizontal position (left)
        'y': 0.05,  # Vertical position (bottom)
        'yanchor': 'bottom',
        'xanchor': 'left',
        'bgcolor': 'rgba(255, 255, 255, 0.8)',
        'bordercolor': 'lightgray',
        'borderwidth': 1,
    },
    hovermode='x unified',
    plot_bgcolor='white',
    margin={'l': 20, 'r': 20, 't': 60, 'b': 20},
)

# Add a gap annotation if the two rates differ by more than 5 points in the last year
latest_overall = yearly_avg['employment_rate_overall'].iloc[-1]
latest_ft_perm = yearly_avg['employment_rate_ft_perm'].iloc[-1]
if abs(latest_overall - latest_ft_perm) > 5:
    fig.add_annotation(
        x=yearly_avg['year'].iloc[-1],
        # Midway between the two lines
        y=(latest_overall + latest_ft_perm)/2,
        text="Gap between overall and<br>full-time employment",
        showarrow=True,
        arrowhead=2,
        arrowsize=1,
        arrowwidth=1,
        arrowcolor='gray',
        ax=-60,
        ay=0,
    )

# Show the graph
fig.show()

In [6]:
# # Save to figure
# py.plot(fig, filename= 'tech_degree_emp_linechart_2024', auto_open=True)

In [11]:
df = orig_df.copy()

# Filter out summary rows
df_courses = df[df['summary_row'] == 0]

# Mean employment rates across all courses per year, rounded to 2 decimals for display
(
    df_courses
    .groupby(['year'])[['employment_rate_overall', 'employment_rate_ft_perm']]
    .mean()
    .reset_index()
    .round(2)
)

Unnamed: 0,year,employment_rate_overall,employment_rate_ft_perm
0,2013,89.61,84.02
1,2014,89.51,83.16
2,2015,89.97,83.76
3,2016,89.5,80.87
4,2017,89.23,79.21
5,2018,89.63,80.94
6,2019,90.87,82.53
7,2020,93.99,72.47
8,2021,94.43,84.9
9,2022,93.79,87.86


## Tech Degree Emp Change Table '23 to '24

In [7]:
# Read the data
df = orig_df.copy()

# Filter out summary rows
df_courses = df[df['summary_row'] == 0]

# Create a list to store filtered rows
tech_df_rows = []

# Filter for tech degrees
for uni, courses in tech_degrees.items():
    for course in courses:
        # Get all rows matching this university and course
        filtered_rows = df_courses[(df_courses['uni'] == uni) & (df_courses['course'] == course)]
        tech_df_rows.append(filtered_rows)

# Combine all filtered rows
tech_df = pd.concat(tech_df_rows, ignore_index=True)

# Filter for years 2023 and 2024
tech_df_recent = tech_df[tech_df['year'].isin([2023, 2024])]

# Create a pivot table
pivot_data = []

for _, row in tech_df_recent.iterrows():
    uni = row['uni']
    course = row['course']
    year = row['year']
    overall_rate = row['employment_rate_overall']
    ft_perm_rate = row['employment_rate_ft_perm']
    
    # Add data for overall rate
    pivot_data.append({
        'uni': uni,
        'course': course,
        'employment_type': 'Overall',
        'year': year,
        'rate': overall_rate
    })
    
    # Add data for full-time permanent rate
    pivot_data.append({
        'uni': uni,
        'course': course,
        'employment_type': 'FT / Perm',
        'year': year,
        'rate': ft_perm_rate
    })

# Convert to DataFrame
pivot_df = pd.DataFrame(pivot_data)

# Create the pivot table with multi-index
table = pivot_df.pivot_table(
    index=['uni', 'course'],
    columns=['employment_type', 'year'],
    values='rate',
    aggfunc='first'
)

# Reorder columns to group by employment type
table = table.reindex(columns=[
    ('FT / Perm', 2023), ('FT / Perm', 2024), 
    ('Overall', 2023), ('Overall', 2024)
])

# Reorder to have FT/Perm and then Overall
table = table[[('FT / Perm', 2023), ('FT / Perm', 2024), ('Overall', 2023), ('Overall', 2024)]]

# Calculate change between 2023 and 2024
table[('FT / Perm', 'Change')] = table[('FT / Perm', 2024)] - table[('FT / Perm', 2023)]
table[('Overall', 'Change')] = table[('Overall', 2024)] - table[('Overall', 2023)]

# Sort the table by university and then by course
table = table.sort_index()

# Format the values as percentages with 1 decimal place
formatted_table = table.applymap(lambda x: f"{x:.1f}%" if pd.notnull(x) else "N/A")

# For the change columns, add + sign for positive values
formatted_table[('FT / Perm', 'Change')] = table[('FT / Perm', 'Change')].apply(
    lambda x: f"+{x:.1f}%" if x > 0 else f"{x:.1f}%" if pd.notnull(x) else "N/A"
)
formatted_table[('Overall', 'Change')] = table[('Overall', 'Change')].apply(
    lambda x: f"+{x:.1f}%" if x > 0 else f"{x:.1f}%" if pd.notnull(x) else "N/A"
)

# Print the formatted table
print(formatted_table)

# Create a styled HTML table
def style_cell(val):
    """Return the CSS background for a formatted change cell.

    Strings starting with '+' get a light green background, strings starting
    with '-' get a light red one; anything else (including non-strings) gets
    an empty style string.
    """
    if not isinstance(val, str):
        return ''
    # Leading sign character -> highlight colour
    highlight_by_sign = {
        '+': '#d4f7d4',  # Light green for positive
        '-': '#ffd6d6',  # Light red for negative
    }
    colour = highlight_by_sign.get(val[:1])
    return f'background-color: {colour}' if colour else ''

# Colour only the two change columns.
# Styler.map is the replacement for the deprecated Styler.applymap.
styled_table = formatted_table.style.map(
    style_cell,
    subset=[('FT / Perm', 'Change'), ('Overall', 'Change')]
)

# Save the styled table to HTML (written relative to the current working directory)
styled_table.to_html('tech_employment_table.html')

# Work on a copy of the numeric table to rank degrees by their change
analysis = table.copy()
# Single-name aliases of the change columns so they can be used as sort keys
analysis['Overall_Change'] = analysis[('Overall', 'Change')]
analysis['FT_Perm_Change'] = analysis[('FT / Perm', 'Change')]

# Five largest drops / gains in the full-time permanent employment rate
biggest_declines = analysis.sort_values('FT_Perm_Change', ascending=True).head(5)
biggest_improvements = analysis.sort_values('FT_Perm_Change', ascending=False).head(5)

# print("\nTech degrees with biggest decline in full-time employment (2023-2024):")
# for idx, row in biggest_declines.iterrows():
#     uni, course = idx
#     change = row['FT_Perm_Change']
#     print(f"{uni} - {course}: {change:.1f}%")

# print("\nTech degrees with biggest improvement in full-time employment (2023-2024):")
# for idx, row in biggest_improvements.iterrows():
#     uni, course = idx
#     change = row['FT_Perm_Change']
#     print(f"{uni} - {course}: {change:.1f}%")

employment_type                            FT / Perm        Overall         \
year                                            2023   2024    2023   2024   
uni course                                                                   
NTU Computer Engineering                       87.5%  89.9%   89.3%  91.3%   
    Computer Science                           86.5%  79.6%   88.3%  82.0%   
    Data Science & Artificial Intelligence     83.3%  76.0%   83.3%  84.0%   
NUS Business Analytics                         93.0%  87.8%   93.6%  92.5%   
    Computer Engineering                       86.2%  84.2%   89.9%  90.4%   
    Computer Science                           91.9%  87.8%   92.4%  89.1%   
    Data Science and Analytics                 80.4%  77.0%   83.2%  85.9%   
    Information Security                       89.5%  88.2%   89.5%  91.2%   
    Information Systems                        91.4%  87.5%   91.4%  90.3%   
SMU Information Systems                        88.4%  79.2%   92


DataFrame.applymap has been deprecated. Use DataFrame.map instead.


Styler.applymap has been deprecated. Use Styler.map instead.



# Salary Ranges

## Interquartile Range Table

In [8]:
# Work on a copy of the prepared data
df = orig_df.copy()

# Keep course-level records only (summary rows are flagged with 1)
course_data = df.loc[df['summary_row'].ne(1)].copy()

# Flag tech courses: a row is tech when its (uni, course) pair is listed in tech_degrees
tech_flags = np.zeros(len(course_data), dtype=bool)
for uni, courses in tech_degrees.items():
    mask = course_data['uni'].eq(uni) & course_data['course'].isin(courses)
    tech_flags |= mask.to_numpy()
course_data['is_tech'] = tech_flags

# Interquartile range of gross monthly salary per course: 75th minus 25th percentile
course_data['salary_iqr'] = course_data['gross_mthly_75_percentile'].sub(
    course_data['gross_mthly_25_percentile']
)

summary_stats = ['mean', 'median', 'count']

# Table 1: IQR statistics by year, tech vs non-tech
iqr_by_type = (
    course_data
    .groupby(['year', 'is_tech'])['salary_iqr']
    .agg(summary_stats)
    .reset_index()
)
iqr_by_type_pivot = iqr_by_type.pivot(index='year', columns='is_tech', values=summary_stats)

# Flatten the (statistic, is_tech) column pairs into names such as "mean_Tech"
iqr_by_type_pivot.columns = [
    f"{stat}_{'Tech' if tech_flag else 'Non-Tech'}"
    for stat, tech_flag in iqr_by_type_pivot.columns
]
iqr_table1 = iqr_by_type_pivot.reset_index()

# Table 2: IQR statistics by year and university, tech courses only
tech_iqr_by_uni = (
    course_data.loc[course_data['is_tech']]
    .groupby(['year', 'uni'])['salary_iqr']
    .agg(summary_stats)
    .reset_index()
)
tech_iqr_by_uni_pivot = tech_iqr_by_uni.pivot(index='year', columns='uni', values=summary_stats)

# Flatten the (statistic, uni) column pairs into names such as "mean_NUS"
tech_iqr_by_uni_pivot.columns = [
    f"{stat}_{uni_name}" for stat, uni_name in tech_iqr_by_uni_pivot.columns
]
iqr_table2 = tech_iqr_by_uni_pivot.reset_index()

# Show both tables as plain text
print("Table 1: Average Interquartile Range by Year for Tech vs Non-Tech Degrees")
print(iqr_table1.to_string(index=False))
print("\n")
print("Table 2: Average Interquartile Range by Year and University for Tech Degrees Only")
print(iqr_table2.to_string(index=False))

# Optional: year-over-year percentage change in the mean IQR
# (pct_change compares each year with the previous row, so the first year is NaN)
iqr_by_type_pivot['mean_Tech_pct_change'] = iqr_by_type_pivot['mean_Tech'].pct_change() * 100
iqr_by_type_pivot['mean_Non-Tech_pct_change'] = iqr_by_type_pivot['mean_Non-Tech'].pct_change() * 100
iqr_table3 = iqr_by_type_pivot[['mean_Tech', 'mean_Tech_pct_change', 'mean_Non-Tech', 'mean_Non-Tech_pct_change']].reset_index()

print("\nTable 3: Year-over-Year Change in Average Interquartile Range")
# The heading used to be printed with no table under it; print the table as well
print(iqr_table3.to_string(index=False))



Table 1: Average Interquartile Range by Year for Tech vs Non-Tech Degrees
 year  mean_Non-Tech   mean_Tech  median_Non-Tech  median_Tech  count_Non-Tech  count_Tech
 2013     587.611111  795.000000            550.0        800.0            54.0         6.0
 2014     632.945455  803.166667            600.0        790.0            55.0         6.0
 2015     674.767857  936.333333            678.0        905.0            56.0         6.0
 2016     709.421053 1131.333333            675.0       1076.5            57.0         6.0
 2017     724.655172  998.428571            700.0       1000.0            58.0         7.0
 2018     747.736842 1042.857143            725.0       1000.0            57.0         7.0
 2019     737.655738 1052.142857            700.0       1100.0            61.0         7.0
 2020     752.377049 1321.555556            700.0       1320.0            61.0         9.0
 2021     777.066667 1291.111111            700.0       1200.0            60.0         9.0
 2022     950.20

In [10]:
# Show Table 1 (IQR statistics by year, tech vs non-tech) rounded to 2 decimals.
# round() returns a new frame, so iqr_table1 itself is left unrounded.
iqr_table1.round(2)

Unnamed: 0,year,mean_Non-Tech,mean_Tech,median_Non-Tech,median_Tech,count_Non-Tech,count_Tech
0,2013,587.61,795.0,550.0,800.0,54.0,6.0
1,2014,632.95,803.17,600.0,790.0,55.0,6.0
2,2015,674.77,936.33,678.0,905.0,56.0,6.0
3,2016,709.42,1131.33,675.0,1076.5,57.0,6.0
4,2017,724.66,998.43,700.0,1000.0,58.0,7.0
5,2018,747.74,1042.86,725.0,1000.0,57.0,7.0
6,2019,737.66,1052.14,700.0,1100.0,61.0,7.0
7,2020,752.38,1321.56,700.0,1320.0,61.0,9.0
8,2021,777.07,1291.11,700.0,1200.0,60.0,9.0
9,2022,950.2,1714.0,952.5,1560.0,60.0,10.0


## Percentile Ratio

In [15]:
# Work on a copy of the prepared data
df = orig_df.copy()

# Keep course-level records only (summary rows are flagged with 1)
course_data = df.loc[df['summary_row'].ne(1)].copy()

# Flag tech courses: a row is tech when its (uni, course) pair is listed in tech_degrees
tech_flags = np.zeros(len(course_data), dtype=bool)
for uni, courses in tech_degrees.items():
    mask = course_data['uni'].eq(uni) & course_data['course'].isin(courses)
    tech_flags |= mask.to_numpy()
course_data['is_tech'] = tech_flags

# Percentile ratio per course: 75th percentile divided by 25th percentile
course_data['salary_ratio'] = course_data['gross_mthly_75_percentile'].div(
    course_data['gross_mthly_25_percentile']
)

summary_stats = ['mean', 'median', 'count']

# Ratio statistics by year, tech vs non-tech
ratio_by_type = (
    course_data
    .groupby(['year', 'is_tech'])['salary_ratio']
    .agg(summary_stats)
    .reset_index()
)
ratio_by_type_pivot = ratio_by_type.pivot(index='year', columns='is_tech', values=summary_stats)

# Flatten the (statistic, is_tech) column pairs into names such as "mean_Tech"
ratio_by_type_pivot.columns = [
    f"{stat}_{'Tech' if tech_flag else 'Non-Tech'}"
    for stat, tech_flag in ratio_by_type_pivot.columns
]
ratio_table1 = ratio_by_type_pivot.reset_index()

# Show the table as plain text
print("Average Percentile Ratio (75th/25th) by Year for Tech vs Non-Tech Degrees")
print(ratio_table1.to_string(index=False))

# Absolute and percentage change in the mean ratio from the earliest to the latest year
first_year = ratio_table1['year'].min()
last_year = ratio_table1['year'].max()

# Year-indexed view so each endpoint is a direct label lookup
ratio_by_year = ratio_table1.set_index('year')
tech_first = ratio_by_year.loc[first_year, 'mean_Tech']
tech_last = ratio_by_year.loc[last_year, 'mean_Tech']
nontech_first = ratio_by_year.loc[first_year, 'mean_Non-Tech']
nontech_last = ratio_by_year.loc[last_year, 'mean_Non-Tech']

tech_change = tech_last - tech_first
nontech_change = nontech_last - nontech_first
tech_pct_change = (tech_last / tech_first - 1) * 100
nontech_pct_change = (nontech_last / nontech_first - 1) * 100

print(f"\nChange in Average Ratio from {first_year} to {last_year}:")
print(f"Tech Degrees: {tech_change:.3f} ({tech_pct_change:.1f}%)")
print(f"Non-Tech Degrees: {nontech_change:.3f} ({nontech_pct_change:.1f}%)")

Average Percentile Ratio (75th/25th) by Year for Tech vs Non-Tech Degrees
 year  mean_Non-Tech  mean_Tech  median_Non-Tech  median_Tech  count_Non-Tech  count_Tech
 2013       1.199376   1.265000         1.201852     1.266667            54.0         6.0
 2014       1.210361   1.257365         1.200000     1.254167            55.0         6.0
 2015       1.220491   1.288942         1.222273     1.282812            56.0         6.0
 2016       1.226938   1.336405         1.214286     1.325163            57.0         6.0
 2017       1.230691   1.281147         1.233105     1.285714            58.0         7.0
 2018       1.234554   1.282977         1.237188     1.277778            57.0         7.0
 2019       1.220400   1.268503         1.200000     1.283951            61.0         7.0
 2020       1.220889   1.324547         1.200000     1.335815            61.0         9.0
 2021       1.224806   1.290950         1.202941     1.277584            60.0         9.0
 2022       1.253982   1.3

## Range Graph

In [16]:
import plotly.graph_objects as go
import locale

# Switch to the user's default locale (original intent: thousands separators)
locale.setlocale(locale.LC_ALL, '')

# Work on a copy of the prepared data
df = orig_df.copy()

# Keep course-level records only (summary rows are flagged with 1)
course_data = df.loc[df['summary_row'].ne(1)].copy()

# Flatten tech_degrees into (uni, course) pairs, keeping the dictionary's order
all_tech_courses = [
    (uni, course)
    for uni, courses in tech_degrees.items()
    for course in courses
]

# Collect the 2023/2024 rows of each tech course, one frame per course, in list order
in_target_years = course_data['year'].isin([2023, 2024])
per_course_frames = []
for uni, course in all_tech_courses:
    course_rows = course_data[
        course_data['uni'].eq(uni) & course_data['course'].eq(course) & in_target_years
    ]
    if len(course_rows) > 0:
        per_course_frames.append(course_rows)

tech_data = pd.concat(per_course_frames)

# x-axis label "<uni>: <course>", with two long course names shortened
tech_data['label'] = (
    (tech_data['uni'] + ': ' + tech_data['course'])
    .str.replace('Data Science and Analytics', 'Data Science & Analytics')
    .str.replace('Data Science & Artificial Intelligence', 'Data Science & AI')
)

# One row per degree, numbered 0..n-1 to give each degree its x-axis slot
unique_degrees = tech_data[['uni', 'course', 'label']].drop_duplicates()
unique_degrees['x_pos'] = range(len(unique_degrees))

# Attach each degree's x position to its yearly rows
tech_data = tech_data.merge(
    unique_degrees[['uni', 'course', 'x_pos']],
    on=['uni', 'course'],
    how='left',
)

# 75th minus 25th percentile, shown as "Range" in the hover text
tech_data['salary_range'] = tech_data['gross_mthly_75_percentile'].sub(
    tech_data['gross_mthly_25_percentile']
)

# Split by year so each year can be drawn as its own trace
data_2023 = tech_data.loc[tech_data['year'].eq(2023)]
data_2024 = tech_data.loc[tech_data['year'].eq(2024)]

# Create the figure
fig = go.Figure()

# Hover text shared by both years; the year itself goes in the <extra> box.
# customdata columns: [0] 25th percentile, [1] 75th percentile, [2] 75th - 25th range.
hover_body = (
    '<b>%{text}</b><br>Median: $%{y:,.0f}<br>Range: $%{customdata[2]:,.0f}'
    '<br>25th: $%{customdata[0]:,.0f}<br>75th: $%{customdata[1]:,.0f}'
)

# One trace per year: 2023 in orange nudged left, 2024 in purple nudged right,
# so the two markers of a degree sit side by side around its tick.
# Both traces are built by the same code instead of two copy-pasted blocks.
for year_label, year_df, colour, x_offset in (
    ('2023', data_2023, 'orange', -0.2),
    ('2024', data_2024, 'purple', 0.2),
):
    median = year_df['gross_monthly_median']
    fig.add_trace(go.Scatter(
        x=year_df['x_pos'] + x_offset,
        y=median,
        mode='markers',
        name=year_label,
        marker=dict(color=colour, size=10),
        # Asymmetric error bars span the 25th to the 75th percentile around the median
        error_y=dict(
            type='data',
            symmetric=False,
            array=year_df['gross_mthly_75_percentile'] - median,
            arrayminus=median - year_df['gross_mthly_25_percentile'],
            color=colour,
            thickness=1.5,
            width=6
        ),
        hovertemplate=hover_body + f'<extra>{year_label}</extra>',
        text=year_df['label'],
        customdata=year_df[['gross_mthly_25_percentile', 'gross_mthly_75_percentile', 'salary_range']].values
    ))

# Tick labels use the shortened course names
unique_labels = unique_degrees['label']

# Axes, title, legend and sizing
fig.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=unique_degrees['x_pos'],
        ticktext=unique_labels,
        tickangle=45
    ),
    yaxis=dict(
        title='Monthly Gross Salary (SGD)',
        tickprefix='$',
        tickformat=',',  # Add thousands separator
    ),
    title='Tech Degrees Salary Ranges: 2023 vs 2024',
    legend=dict(
        title='Year',
        yanchor='top',
        y=0.99,
        xanchor='right',
        x=0.99
    ),
    height=600,
    width=1000,  # Increased width for a longer plot
    margin=dict(b=150),  # Add bottom margin for rotated labels
    template='plotly_white'  # Use a cleaner template
)

# Show the figure
fig.show()

In [23]:
# Host on plotly: upload `fig` to Chart Studio (py is chart_studio.plotly) under
# the given filename, using the credentials configured at the top of the notebook.
# The call returns the hosted chart's URL, shown as this cell's output;
# auto_open=True presumably also opens that URL in a browser -- confirm against the chart_studio docs.
py.plot(fig, filename= 'tech_salary_range_compare', auto_open=True)

'https://chart-studio.plotly.com/~carel/24/'

In [19]:
import pandas as pd
import locale

# Switch to the user's default locale (original intent: thousands separators)
locale.setlocale(locale.LC_ALL, '')

# Work on a copy of the prepared data
df = orig_df.copy()

# Keep course-level records only (summary rows are flagged with 1)
course_data = df.loc[df['summary_row'].ne(1)].copy()

# Flatten tech_degrees into (uni, course) pairs, keeping the dictionary's order
all_tech_courses = [
    (uni, course)
    for uni, courses in tech_degrees.items()
    for course in courses
]

# Collect the 2023/2024 rows of each tech course, one frame per course, in list order
in_target_years = course_data['year'].isin([2023, 2024])
per_course_frames = []
for uni, course in all_tech_courses:
    course_rows = course_data[
        course_data['uni'].eq(uni) & course_data['course'].eq(course) & in_target_years
    ]
    if len(course_rows) > 0:
        per_course_frames.append(course_rows)

tech_data = pd.concat(per_course_frames)

# Shorten two long course names
short_course_names = {
    'Data Science and Analytics': 'Data Science & Analytics',
    'Data Science & Artificial Intelligence': 'Data Science & AI'
}
tech_data['course'] = tech_data['course'].replace(short_course_names)

# Salary range: 75th minus 25th percentile of gross monthly salary
tech_data['salary_range'] = tech_data['gross_mthly_75_percentile'].sub(
    tech_data['gross_mthly_25_percentile']
)

# Degree label in the form "(<uni>) <course>"
uni_prefix = '(' + tech_data['uni'] + ') '
tech_data['degree'] = uni_prefix + tech_data['course']

# One row per degree with its 2023 and 2024 salary ranges side by side
range_table = tech_data.pivot_table(
    index='degree',
    columns='year',
    values='salary_range',
    aggfunc='first'
)

# Sort by 2024 range (descending)
range_table = range_table.sort_values(by=2024, ascending=False)

# Difference between the 2024 and 2023 ranges, absolute and relative
range_table['diff'] = range_table[2024] - range_table[2023]
range_table['pct_change'] = (range_table[2024] / range_table[2023] - 1) * 100


def _as_dollars(x):
    """Format a number as whole dollars with a thousands separator; NaN -> 'N/A'."""
    return f"${x:,.0f}" if pd.notnull(x) else "N/A"


def _as_percent(x):
    """Format a number as a percentage with 1 decimal place; NaN -> 'N/A'."""
    return f"{x:.1f}%" if pd.notnull(x) else "N/A"


# Format the two year columns with $ and thousands separator.
# DataFrame.map is the replacement for the deprecated DataFrame.applymap.
range_table_formatted = range_table[[2023, 2024]].map(_as_dollars)

# Add the change columns while both frames are still indexed by degree.
# Column assignment aligns on the index: doing this after reset_index()
# (as the cell used to) matched degree labels against 0..n-1 and filled
# both columns with NaN.
range_table_formatted['Change'] = range_table['diff'].apply(_as_dollars)
range_table_formatted['% Change'] = range_table['pct_change'].apply(_as_percent)

# Reset index to make 'degree' a column
range_table_formatted = range_table_formatted.reset_index()

# Rename columns for clarity
range_table_formatted.columns = ['Tech Degree', '2023 Range', '2024 Range', 'Change', '% Change']

# Display the table
# print("Salary Ranges (75th - 25th Percentile) for Tech Degrees: 2023 vs 2024")
# print(range_table_formatted.to_string(index=False))


DataFrame.applymap has been deprecated. Use DataFrame.map instead.

