In [2]:
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np

# Setup

In [3]:
# Load the per-course table that every chart in this notebook is built from
GRAPHING_TABLE_CSV = "../../data/graphing_table.csv"
df = pd.read_csv(GRAPHING_TABLE_CSV)

In [4]:
# Degree and university under examination
selected_course = 'Arts (Hons)'
selected_uni = 'NUS'

# Look up the school of the selected course; .item() requires that exactly
# one distinct school matches and raises otherwise
is_selected = (df.course == selected_course) & (df.uni == selected_uni)
school = df.loc[is_selected, 'school'].drop_duplicates().item()

# Year-ordered working copy, so the loaded frame itself stays untouched
input_df = df.sort_values('year', ascending=True).copy()

In [5]:
# Append per-university summary rows (median / 25th / 75th percentile of each
# metric, per year) to input_df and flag every row as course or summary.

# Make the cell safe to re-run: drop summary rows appended by a previous run,
# otherwise they would be reset to summary_row=0 below and then aggregated as
# if they were real courses.
if 'summary_row' in input_df.columns:
    input_df = input_df.loc[input_df['summary_row'] == 0].copy()

# Everything left at this point is a genuine course row
input_df['summary_row'] = 0

# Years for which the selected course has data at the selected university
course_years = input_df.loc[(input_df.course == selected_course)
                            & (input_df.uni == selected_uni), 'year']
min_year, max_year = course_years.min().item(), course_years.max().item()

# Metrics to aggregate (the same for every university, so defined once)
metrics = ['Places', 'GPA', 'RP',  'employment_rate_overall','employment_rate_ft_perm','basic_monthly_mean','basic_monthly_median','gross_monthly_mean','gross_monthly_median','gross_mthly_25_percentile','gross_mthly_75_percentile']

# (label, quantile) pairs; the order fixes the order of the appended rows
summary_quantiles = [('Median', 0.5), ('25th Percentile', 0.25), ('75th Percentile', 0.75)]

# Only records inside the selected course's year range feed the summaries
in_year_range = input_df.year.between(min_year, max_year)

# Collect the summary tables for EVERY university and concatenate once at the
# end, instead of growing input_df inside the loop
summary_tables = []
for uni_tmp in df.uni.unique().tolist():

    # Course rows of this university within the year range
    tdf = input_df.loc[(input_df.uni == uni_tmp) & in_year_range]

    for label, q in summary_quantiles:
        # One row per year holding the q-quantile of each metric
        stats_table = tdf.groupby('year', as_index=False)[metrics].agg(lambda col, q=q: col.quantile(q))
        stats_table['uni'] = uni_tmp
        stats_table['course'] = f'{uni_tmp} {label}'
        stats_table['summary_row'] = 1
        summary_tables.append(stats_table)

# Join the summaries back to input_df
input_df = pd.concat([input_df, *summary_tables], ignore_index=True)

# track if a row is a course row or not (the complement of summary_row)
input_df['course_row'] = 1 - input_df['summary_row']

# Admission (RP)

In [6]:
# RP cutoff over time: the selected course against its university's summary
# rows (median / 25th / 75th percentile).

# Extracting data for the selected course and the university's aggregation rows
course_data = input_df[(input_df['course'] == selected_course) & (input_df['uni'] == selected_uni)]
agg_data = input_df[(input_df['summary_row'] == 1) & (input_df['uni'] == selected_uni)]

# Creating the plot
fig = go.Figure()

# Adding line for the selected course
fig.add_trace(go.Scatter(
    x=course_data['year'], y=course_data['RP'],
    mode='lines', name=f"{selected_course}",
    line=dict(color='#0F7A8F', width=3, shape='linear')
))

# Colormapping. Cycles through the whole palette when there are more
# aggregation series than colours.
agg_col_map = ['#7c6354', '#DB3F3F', '#317A41', '#222323', '#011638']
# Adding lines for the aggregation rows
for idx, agg_row in enumerate(agg_data['course'].unique()):
    agg_row_data = agg_data[agg_data['course'] == agg_row]
    fig.add_trace(go.Scatter(
        x=agg_row_data['year'], y=agg_row_data['RP'],
        mode='lines', name=agg_row,
        # Modulo the full palette length so the last colour is reachable too
        line=dict(color=agg_col_map[idx % len(agg_col_map)], width=2, dash='dash'),
        opacity=1
    ))

# Lower RP limit rounded down to the nearest 5.
# Take the minimum over the combined values: pandas skips NaN, whereas the
# builtin min() returns NaN whenever its first argument is NaN.
min_rp = pd.concat([course_data['RP'], agg_data['RP']]).min()
min_rp_limit = 5 * (min_rp // 5)

# Adding padding to the X-axis
padding = 0.5
x_min = course_data['year'].min() - padding
x_max = course_data['year'].max() + padding


# Updating layout for the presentation preferences
fig.update_layout(
    title = f"RP cutoff standing for {selected_course} in {selected_uni}", 
    xaxis_title = 'Year', 
    yaxis_title = 'Rank Points', 
    # Upper limit is the highest RP anywhere in input_df, not just this uni
    yaxis=dict(range=[min_rp_limit, input_df['RP'].max()]),
    legend=dict(orientation="h", yanchor="bottom", y=-0.3, xanchor="center", x=0.5),
    # One tick per year, starting at the course's first year
    xaxis=dict(range = [x_min, x_max], tickmode='linear', tick0=course_data['year'].min(), dtick=1),
    margin=dict(l=50, r=50, t=50, b=100), 
    hovermode='x unified'
)

fig.update_xaxes(tickangle=45)
fig.show()


# RP vs Median

In [7]:
# Scatter of RP against gross monthly median salary for every course in the
# latest year available for the selected course, with that course highlighted.

# Latest year with a record for the selected course at the selected university
is_selected_course = (input_df['course'] == selected_course) & (input_df['uni'] == selected_uni)
latest_year = input_df.loc[is_selected_course, 'year'].max()

# Course rows only (no summary rows) for that year
is_latest_course_row = (input_df['year'] == latest_year) & (input_df.summary_row == 0)
filtered_df = input_df[is_latest_course_row].copy()

# Express the median salary in thousands for display
filtered_df['gross_monthly_median'] /= 1000


def _tooltip_text(row):
    """Build the hover text (course name, RP, median salary) for one row."""
    return f"{row['course']}<br>RP: {row['RP']}<br>Median Salary: ${row['gross_monthly_median']:.2f}K"


filtered_df['tooltip'] = filtered_df.apply(_tooltip_text, axis=1)
filtered_df['empty'] = ""

# One colour per university, assigned in order of first appearance and
# wrapping around if there are more universities than colours
colors = ['#fe4a49', '#09814a', '#1438CA']
color_map = {}
for position, uni_name in enumerate(filtered_df['uni'].unique()):
    color_map[uni_name] = colors[position % len(colors)]

# The highlighted point: the selected course itself
selected_data = filtered_df[(filtered_df['course'] == selected_course)
                            & (filtered_df.uni == selected_uni)]
selected_rp = selected_data['RP'].iloc[0]
selected_salary = selected_data['gross_monthly_median'].iloc[0]
selected_trace = go.Scatter(
    x=selected_data['RP'],
    y=selected_data['gross_monthly_median'],
    mode='markers',
    name=f"{selected_course} ({selected_uni})",
    marker=dict(color='#121619', size=16),
    hovertemplate=f"<b>{selected_course}</b><br>RP: {selected_rp}<br>Median Salary: ${selected_salary:.2f}K",
)

# Graph Object traces, starting with the highlighted course
traces = [selected_trace]

# Every other course, drawn as one faded trace per university
other_courses_df = filtered_df.loc[~((filtered_df.course == selected_course)
                                     & (filtered_df.uni == selected_uni))]
for uni in other_courses_df['uni'].unique():
    uni_data = other_courses_df[other_courses_df['uni'] == uni]
    faded_marker = dict(color=color_map[uni], size=8, opacity=0.4)
    trace = go.Scatter(
        x=uni_data['RP'],
        y=uni_data['gross_monthly_median'],
        mode='markers',
        name=uni,
        marker=faded_marker,
        customdata=uni_data['tooltip'],
        hovertemplate='%{customdata}',  # show only the prebuilt tooltip text
    )
    traces.append(trace)

# Titles, horizontal legend below the plot, and margins
layout = go.Layout(
    title=f"RP vs Gross Monthly Median Salary (Year: {latest_year})",
    xaxis=dict(title='RP'),
    yaxis=dict(title='Gross Monthly Median Salary (in thousands)'),
    legend=dict(title=None, orientation="h", yanchor="bottom", y=-0.25, xanchor="center", x=0.5),
    margin=dict(l=50, r=50, t=50, b=100),
)

fig = go.Figure(data=traces, layout=layout)

fig.show()

Note: some points will overlap. There is currently no workaround for this in Plotly.

# Salary Changes

In [18]:
# Gross monthly salary over time for one course (median with 25th-75th
# percentile error bars) against the university's "Median" summary rows.
plot_course = 'Data Science and Analytics'
plot_uni = 'NUS'

# Course rows for the chosen course and university
is_plot_course = (
    (input_df['course'] == plot_course)
    & (input_df['uni'] == plot_uni)
    & (input_df['course_row'] == 1)
)
course_data = input_df[is_plot_course].copy()

# The university's "Median" summary rows
is_uni_median = (input_df['course'] == f"{plot_uni} Median") & (input_df['summary_row'] == 1)
uni_median_data = input_df[is_uni_median].copy()

# X axis: the years present for the course
years = course_data['year']

# Convert the salary columns of both frames to thousands of dollars
salary_columns = ['gross_monthly_median', 'gross_mthly_75_percentile', 'gross_mthly_25_percentile']
course_data[salary_columns] = course_data[salary_columns] / 1000
uni_median_data[salary_columns] = uni_median_data[salary_columns] / 1000

# Asymmetric error bars: distance from the median up to the 75th percentile
# and down to the 25th percentile
upper_spread = course_data['gross_mthly_75_percentile'] - course_data['gross_monthly_median']
lower_spread = course_data['gross_monthly_median'] - course_data['gross_mthly_25_percentile']

# Course line with error bars; customdata carries the two percentiles so the
# hover text can show them
course_hover = "<br>".join([
    '%{x}',
    'Median: %{y:.2f}k',
    '75th Pct: %{customdata[0]:.2f}k',
    '25th Pct: %{customdata[1]:.2f}k',
])
course_trace = go.Scatter(
    x=years,
    y=course_data['gross_monthly_median'],
    error_y=dict(type='data', symmetric=False, array=upper_spread, arrayminus=lower_spread, visible=True),
    mode='lines+markers',
    name=f"{plot_course}",
    hovertemplate=course_hover,
    customdata=course_data[['gross_mthly_75_percentile', 'gross_mthly_25_percentile']].to_numpy(),
)


def _uni_reference_line(column, label):
    """Return a dotted line of `column` from the university's Median rows."""
    return go.Scatter(
        x=uni_median_data['year'],
        y=uni_median_data[column],
        mode='lines',
        line=dict(dash='dot'),
        name=label,
        hovertemplate="%{x}<br>%{y:.2f}k",
    )


# University reference lines (median, 75th and 25th percentile columns)
uni_median_trace = _uni_reference_line('gross_monthly_median', f'{plot_uni} Median')
uni_75th_trace = _uni_reference_line('gross_mthly_75_percentile', f'{plot_uni} 75th Percentile')
uni_25th_trace = _uni_reference_line('gross_mthly_25_percentile', f'{plot_uni} 25th Percentile')

# Assemble the figure: course first, then the reference lines
fig = go.Figure(data=[course_trace, uni_median_trace, uni_75th_trace, uni_25th_trace])

# Half a year of padding on either side of the course's year span
padding = 0.5
first_year = course_data['year'].min()
x_min = first_year - padding
x_max = course_data['year'].max() + padding

# Titles, yearly ticks, and a horizontal legend below the x-axis
fig.update_layout(
    title=f"Gross Monthly Salary for {plot_course} at {plot_uni} Over Time",
    title_x=0.5,
    xaxis_title="Year",
    xaxis=dict(range=[x_min, x_max], tickmode='linear', tick0=first_year, dtick=1),
    yaxis_title="Gross Monthly Salary (Thousands)",
    legend=dict(orientation="h", yanchor="top", y=-0.2, xanchor="center", x=0.5),
)

# Tick interval of one year on the x-axis
fig.update_xaxes(dtick=1)

# Show the figure
fig.show()


Aggregated values are the **median** for that field. E.g. `NUS 75th Percentile` is the median, taken across all NUS degrees, of each degree's 75th-percentile income for that year.

In [10]:
# Peek at the first two rows of the loaded table to check its columns
df.head(2)

Unnamed: 0,uni,school,course,year,Places,GPA,RP,employment_rate_overall,employment_rate_ft_perm,basic_monthly_mean,basic_monthly_median,gross_monthly_mean,gross_monthly_median,gross_mthly_25_percentile,gross_mthly_75_percentile,university
0,NUS,Faculty of Arts and Social Sciences,Arts (Hons),2013,1562.0,3.68,77.5,86.6,74.6,3057.0,3200.0,3154.0,3200.0,2800.0,3500.0,National University of Singapore
1,NUS,Faculty of Arts and Social Sciences,Arts (Hons),2014,1591.0,3.76,76.25,83.1,76.6,3141.0,3210.0,3277.0,3300.0,3000.0,3600.0,National University of Singapore


In [13]:
# Sanity check: median of the 75th-percentile gross salary across the
# selected university's 2022 rows, computed directly from the loaded table
is_2022_selected_uni = (df.year == 2022) & (df.uni == selected_uni)
df.loc[is_2022_selected_uni, 'gross_mthly_75_percentile'].median()

np.float64(5000.0)

# Employment rates

In [22]:
# Grouped bars of the course's employment rates per year, overlaid with the
# university's median rates as dotted lines.
plot_course = selected_course
plot_uni = selected_uni

# Course rows for the chosen course and university
is_plot_course = (
    (input_df['course'] == plot_course)
    & (input_df['uni'] == plot_uni)
    & (input_df['course_row'] == 1)
)
course_data = input_df[is_plot_course].copy()

# The university's "Median" summary rows
is_uni_median = (
    (input_df['course'] == f"{plot_uni} Median")
    & (input_df['uni'] == plot_uni)
    & (input_df['summary_row'] == 1)
)
summary_data = input_df[is_uni_median].copy()

fig = go.Figure()

# Bars for the course itself: (column, legend label, bar colour)
bar_specs = [
    ('employment_rate_ft_perm', 'FT Permanent Employment Rate', 'indianred'),
    ('employment_rate_overall', 'Overall Employment Rate', 'lightsalmon'),
]
for column, label, colour in bar_specs:
    fig.add_trace(go.Bar(
        x=course_data['year'],
        y=course_data[column],
        name=label,
        marker_color=colour,
        hovertemplate='%{y:.1f}%',  # one decimal place, as a percentage
    ))

# Dotted lines for the university medians: (column, legend label, line colour)
line_specs = [
    ('employment_rate_ft_perm', 'Median FT Permanent Employment Rate', 'royalblue'),
    ('employment_rate_overall', 'Median Overall Employment Rate', 'darkblue'),
]
for column, label, colour in line_specs:
    fig.add_trace(go.Scatter(
        x=summary_data['year'],
        y=summary_data[column],
        mode='lines',
        name=label,
        line=dict(color=colour, dash='dot'),
    ))

# Grouped bars, horizontal legend below the plot, one x tick per year
legend_below = dict(orientation="h", yanchor="top", y=-0.15, xanchor="center", x=0.5)
yearly_ticks = dict(tickmode='linear', tick0=course_data['year'].min(), dtick=1)
fig.update_layout(
    barmode='group',
    title=f"Employment Rates for {plot_course} at {plot_uni}",
    xaxis_title="Year",
    yaxis_title="Employment Rate (%)",
    legend=legend_below,
    xaxis=yearly_ticks,
)

# Show the plot
fig.show()

