In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [15]:
df_programs = []

for i in range(4):
    #Train data
    df = pd.read_excel('BSA-dataset_2122_2223-Train_stud_a.xlsx', sheet_name=i)
    df1 = df[df["Year"] == "21/22"]
    df2 =df[df["Year"] == "22/23"]

    df1_val = df1.sample(frac=0.3, random_state=25)
    df1_train = df1.drop(df1_val.index)

    df1_val["Origin"] = "test"
    df1_train["Origin"] = "train"

    df1_final = pd.concat([df1_val,df1_train])



    df2_val = df2.sample(frac=0.3, random_state=25)
    df2_train = df2.drop(df2_val.index)

    df2_val["Origin"] = "test"
    df2_train["Origin"] = "train"


    df2_final = pd.concat([df1_final,df2_train]) 


    df_final = pd.concat([df2_val,df2_final]) 
    df_programs.append(df_final)


In [46]:
df = pd.concat(df_programs,ignore_index=True)
columns_to_drop = [col for col in df.columns if col.startswith("Co")]
df = df.drop(columns=columns_to_drop)
df.columns

Index(['train', 'Gender', 'Nationality', 'PreEducation', 'Program', 'Year',
       'BSA', 'Credits-Y1', 'Crd-B1B2', 'Origin'],
      dtype='object')

In [35]:
# Define possible values for each column
programs = ['Program1', 'Program2', 'Program3', 'Program4']
years = ['21/22', '22/23']
pre_educations = ['VWO', 'HBO', 'Buitenlands']
bsa_categories = ['STGA', 'NE', 'PS', 'DI']
credits_y1_values = np.arange(0, 61, 6)
credits_b1b2_values = np.arange(0, 25, 6)

# Generate random data
num_rows = 2000  # Specify the number of rows you want
data = {
    'train': np.random.choice(['Yes', 'No'], size=num_rows),
    'Gender': np.random.choice(['Male', 'Female'], size=num_rows),
    'Nationality': np.random.choice(['Dutch', 'Foreign'], size=num_rows),
    'PreEducation': np.random.choice(pre_educations, size=num_rows),
    'Program': np.random.choice(programs, size=num_rows),
    'Year': np.random.choice(years, size=num_rows),
    'BSA': np.random.choice(bsa_categories, size=num_rows),
    'Credits-Y1': np.random.choice(credits_y1_values, size=num_rows),
    'Crd-B1B2': np.random.choice(credits_b1b2_values, size=num_rows)
}

# Create DataFrame
df = pd.DataFrame(data)

In [36]:
df

Unnamed: 0,train,Gender,Nationality,PreEducation,Program,Year,BSA,Credits-Y1,Crd-B1B2
0,Yes,Male,Dutch,VWO,Program1,21/22,NE,42,6
1,Yes,Female,Foreign,HBO,Program2,21/22,NE,30,6
2,Yes,Male,Foreign,VWO,Program4,22/23,NE,6,24
3,No,Female,Dutch,HBO,Program1,22/23,PS,0,6
4,No,Female,Dutch,HBO,Program4,21/22,STGA,12,12
...,...,...,...,...,...,...,...,...,...
1995,Yes,Male,Dutch,Buitenlands,Program1,21/22,PS,60,18
1996,No,Female,Foreign,Buitenlands,Program1,21/22,NE,42,24
1997,Yes,Male,Foreign,HBO,Program4,22/23,DI,42,12
1998,No,Male,Foreign,VWO,Program1,22/23,STGA,0,24


In [65]:
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots

In [72]:
# Define the order of 'Program' facet columns
program_order = ['Program1', 'Program2', 'Program3', 'Program4']
bsa_order = ['STGA', 'NE', 'PS', 'DI']
bsa_colors = {
    'STGA': '#4d4d4d',
    'NE': '#b30000',
    'PS': '#258e25',
    'DI': 'lightblue'
}

df_agg = df.groupby(['Year', 'Program', 'BSA']).size().reset_index(name='Count')

# Create bar count plot using Plotly Express
fig = px.bar(df_agg, x='Year', y='Count', color='BSA', barmode='stack', facet_col='Program', facet_col_wrap=2,
             category_orders={'Year': years, 'BSA': bsa_order, 'Program': program_order},
             color_discrete_map=bsa_colors,
             labels={'Year': 'Year', 'Count': 'Count', 'BSA': 'BSA'}, title='Count of Years Layered by BSA and Program')
fig.update_xaxes(title_text='')
fig.update_yaxes(title_text='Count')

# Adjust size and spacing
fig.update_layout(height=800, width=1200, bargap=0.2, xaxis={'categoryorder':'total ascending'})

fig.show()

In [93]:
# Define the order of 'Program' facet columns
program_order = ['Program1', 'Program2', 'Program3', 'Program4']
bsa_order = ['STGA', 'NE', 'PS', 'DI']
bsa_colors = {
    'STGA': '#4d4d4d',
    'NE': '#b30000',
    'PS': '#258e25',
    'DI': 'lightblue'
}

# Group by 'Year', 'Program', and 'BSA', and count the number of occurrences
df_agg = df.groupby(['Year', 'Program', 'BSA']).size().reset_index(name='Count')

# Create a figure with subplots arranged in a 2x2 grid
fig = make_subplots(rows=2, cols=2, subplot_titles=program_order, horizontal_spacing=0.1, vertical_spacing=0.1)

# Loop through each 'Program' facet column
for i, program in enumerate(program_order, start=1):
    # Filter data for the current program
    df_program = df_agg[df_agg['Program'] == program]
    
    # Loop through each 'BSA' category
    for j, bsa in enumerate(bsa_order, start=1):
        # Filter data for the current BSA category
        df_bsa = df_program[df_program['BSA'] == bsa]
        
        # Create bar trace
        bar = go.Bar(
            x=df_bsa['Year'],
            y=df_bsa['Count'],
            name=bsa,
            marker=dict(color=bsa_colors[bsa]),
            text=df_bsa['Count'],  # Add text on top of each bar
            textposition='auto'  # Automatically position the text
        )
        
        # Add trace to the subplot
        fig.add_trace(bar, row=(i+1)//2, col=(i+1)%2+1)

# Update layout
fig.update_layout(title='Count of Years Layered by BSA and Program', height=800, width=1200)
fig.update_layout(bargap=0.2)
fig.update_layout(height=1200, width=1600, bargap=0.2)
fig.update_xaxes(title_text='Year', row=1, col=1)
fig.update_xaxes(title_text='Year', row=1, col=2)
fig.update_yaxes(title_text='Count', row=1, col=1)
fig.update_yaxes(title_text='Count', row=2, col=1)
fig.show()

In [90]:
# Group by 'Year', 'Program', 'BSA', and 'Crd-B1B2', and count the number of occurrences
df_agg = df.groupby(['Year', 'Program', 'BSA', 'Crd-B1B2']).size().reset_index(name='Count')

# Define custom order and colors for BSA categories
bsa_order = ['STGA', 'NE', 'PS', 'DI']
bsa_colors = {
    'STGA': '#4d4d4d',
    'NE': '#b30000',
    'PS': '#258e25',
    'DI': 'lightblue'
}


# Create a figure with subplots arranged in a 4x2 grid
fig = make_subplots(rows=4, cols=2, subplot_titles=[f'{program} - {year}' for program in programs for year in years], horizontal_spacing=0.05, vertical_spacing=0.1)

# Loop through each subplot
for i, program in enumerate(programs):
    for j, year in enumerate(years):
        # Filter data for the current year and program
        df_subset = df_agg[(df_agg['Year'] == year) & (df_agg['Program'] == program)]
        
        # Change the order of df_subset based on bsa_order
        df_subset['BSA'] = pd.Categorical(df_subset['BSA'], categories=bsa_order, ordered=True)
        df_subset = df_subset.sort_values(by='BSA')
        
        # Create bar plot
        for bsa in bsa_order:
            df_bsa = df_subset[df_subset['BSA'] == bsa]
            bar = go.Bar(
                x=df_bsa['Crd-B1B2'].astype(str),  # Treat Crd-B1B2 values as strings
                y=df_bsa['Count'],
                marker=dict(color=bsa_colors[bsa]),
                name=bsa,
                legendgroup=f'{program}-{year}',  # Specify the legend group
                text=df_bsa['Count'],  # Text for each bar
                textposition='auto'  # Position of the text
            )
            
            # Add bar plot to the corresponding subplot
            fig.add_trace(bar, row=i+1, col=j+1)

        # Update x-axis and y-axis labels
        fig.update_xaxes(title_text=f'Credits B1B2 - {program} - {year}', type='category', row=i+1, col=j+1)
        fig.update_yaxes(title_text='Count', row=i+1, col=j+1)


# Manually create legend annotations
legend_annotations = [
    dict(
        x=0.5,
        y=-0.1,
        xref='paper',
        yref='paper',
        text='Legend:',
        showarrow=False,
        font=dict(size=14)
    )
]

# Add legend annotations for each BSA category
for i, bsa in enumerate(bsa_order):
    legend_annotations.append(
        dict(
            x=0.5 + 0.15 * i,
            y=-0.1,
            xref='paper',
            yref='paper',
            text=f'{bsa}',
            showarrow=False,
            font=dict(size=14),
            bgcolor=bsa_colors[bsa],
            bordercolor='black',
            borderwidth=1,
            borderpad=4,
            yshift=-30
        )
    )

# Update layout
fig.update_layout(title='Count of Credits B1B2 Layered by BSA and Program', height=800, width=1200)
fig.update_layout(height=1200, width=1600, bargap=0.2)
fig.update_xaxes(categoryorder='array', categoryarray=[str(x) for x in credits_b1b2_values])
fig.update_yaxes(title_text='Count')
fig.update_layout(legend=dict(title=''), annotations=legend_annotations)  # Remove legend title
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/