# Hausarbeit - Reporting und Visualisierung
### Einflussfaktoren auf die Schulleistungen im Rahmen der Oberschule
#### MADS 2023oB

## Environment setup

In [1]:
%%capture
# !pip install numpy pandas plotly
# or
# !. venv/bin/activate

In [2]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

## Data transformations

### Data description

**Attributes for both student-mat.csv (Math course) and student-por.csv (Portuguese language course) datasets:**

1 school - student's school (binary: 'GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira)

2 sex - student's sex (binary: 'F' - female or 'M' - male)

3 age - student's age (numeric: from 15 to 22)

4 address - student's home address type (binary: 'U' - urban or 'R' - rural)

5 famsize - family size (binary: 'LE3' - less or equal to 3 or 'GT3' - greater than 3)

6 Pstatus - parent's cohabitation status (binary: 'T' - living together or 'A' - apart)

7 Medu - mother's education (numeric: 0 - none,  1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – 
secondary education or 4 – higher education)

8 Fedu - father's education (numeric: 0 - none,  1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – 
secondary education or 4 – higher education)

9 Mjob - mother's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or 
police), 'at_home' or 'other')

10 Fjob - father's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or 
police), 'at_home' or 'other')

11 reason - reason to choose this school (nominal: close to 'home', school 'reputation', 'course' preference or 
'other')

12 guardian - student's guardian (nominal: 'mother', 'father' or 'other')

13 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 
4 - >1 hour)

14 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)

15 failures - number of past class failures (numeric: n if 1<=n<3, else 4)

16 schoolsup - extra educational support (binary: yes or no)

17 famsup - family educational support (binary: yes or no)

18 paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)

19 activities - extra-curricular activities (binary: yes or no)

20 nursery - attended nursery school (binary: yes or no)

21 higher - wants to take higher education (binary: yes or no)

22 internet - Internet access at home (binary: yes or no)

23 romantic - with a romantic relationship (binary: yes or no)

24 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)

25 freetime - free time after school (numeric: from 1 - very low to 5 - very high)

26 goout - going out with friends (numeric: from 1 - very low to 5 - very high)

27 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)

28 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)

29 health - current health status (numeric: from 1 - very bad to 5 - very good)

30 absences - number of school absences (numeric: from 0 to 93)

**these grades are related with the course subject, Math or Portuguese:**

31 G1 - first period grade (numeric: from 0 to 20)

31 G2 - second period grade (numeric: from 0 to 20)

32 G3 - final grade (numeric: from 0 to 20, output target)

Additional note: there are several (382) students that belong to both datasets . 
These students can be identified by searching for identical attributes
that characterize each student, as shown in the annexed R file.

### Data loading and examination


In [3]:
mat_df = pd.read_csv('data/student-mat.csv', sep=';')
por_df = pd.read_csv('data/student-por.csv', sep=';')

info_df = pd.DataFrame([['mat_df', len(mat_df), len(mat_df.columns), np.mean(mat_df.isnull().sum())],
                        ['por_df', len(por_df), len(por_df.columns), np.mean(por_df.isnull().sum())]], 
                        columns=['table name', 'table size', 'number of columns', 'average missing values per column'])
info_df

Unnamed: 0,table name,table size,number of columns,average missing values per column
0,mat_df,395,33,0.0
1,por_df,649,33,0.0


In [4]:
print(f'mat_df columns: \n\n {mat_df.columns} \n')
print(f'por_df columns: \n\n {por_df.columns} \n')

mat_df columns: 

 Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object') 

por_df columns: 

 Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object') 



In [5]:
mat_df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [6]:
por_df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


In [7]:
px.imshow(mat_df.corr(numeric_only=True))
px.imshow(por_df.corr(numeric_only=True))

### Data merging

In [8]:
merging_colums = ['school','sex','age','address','famsize','Pstatus','Medu','Fedu','Mjob','Fjob','reason','nursery','internet']

In [9]:
left_mixed_df = pd.merge(por_df,mat_df,'left', on=merging_colums)

# uncomment to save data file
# left_mixed_df.to_csv('data/student-merge-left.csv')

In [10]:
inner_mixed_df = pd.merge(por_df,mat_df,'inner', on=merging_colums)

# uncomment to save data file
# inner_mixed_df.to_csv('data/student-merge-inner.csv')

In [11]:
def check_contraditories(df:pd.DataFrame, columns='all'):
    # This cell checks for contraditory values within the merged dataset
    double_columns = [x for x in list(por_df.columns) if x not in merging_colums]
    check_columns = columns if columns != 'all' else double_columns
    df = pd.DataFrame(inner_mixed_df).copy()

    for column in check_columns: 
        df['equal'] = df[str(column)+'_x'] == df[str(column)+'_y'] # TODO: fix overwriting after each column 
        
        # anticipate that the following attributes from student-por also apply for student-mat
        # df[str(column)+'_y'] = df[str(column)+'_y'].combine_first(df[str(column)+'_x'])

    xy_columns = []
    xy_columns.extend([str(column)+'_x' for column in check_columns])
    xy_columns.extend([str(column)+'_y' for column in check_columns])
    xy_columns.sort()
    return df[xy_columns][df['equal'] == False]

check_contraditories(inner_mixed_df)

Unnamed: 0,Dalc_x,Dalc_y,G1_x,G1_y,G2_x,G2_y,G3_x,G3_y,Walc_x,Walc_y,...,paid_x,paid_y,romantic_x,romantic_y,schoolsup_x,schoolsup_y,studytime_x,studytime_y,traveltime_x,traveltime_y
0,1,1,0,5,11,6,11,6,1,1,...,no,no,no,no,yes,yes,2,2,2,2
1,1,1,9,5,11,5,11,6,1,1,...,no,no,no,no,no,no,2,2,1,1
2,2,2,12,7,13,8,12,10,3,3,...,no,yes,no,no,yes,yes,2,2,1,1
3,1,1,14,15,14,14,14,15,1,1,...,no,yes,yes,yes,no,no,3,3,1,1
4,1,1,11,6,13,10,13,10,2,2,...,no,yes,no,no,no,no,2,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
377,1,1,10,7,11,5,10,0,2,2,...,no,no,no,no,no,no,3,3,1,1
378,1,1,15,7,15,9,16,8,1,1,...,no,yes,no,no,no,no,2,2,1,1
379,1,1,11,6,12,5,9,0,1,1,...,no,no,no,no,no,no,2,2,2,2
380,3,3,10,14,10,16,10,16,4,4,...,no,no,no,no,no,no,1,1,2,2


In [12]:
inner_mixed_info_series = pd.DataFrame([['inner_mixed_df', len(inner_mixed_df), len(inner_mixed_df.columns), np.mean(inner_mixed_df.isnull().sum())]], 
                                      columns=['table name', 'table size', 'number of columns', 'average missing values per column'])
left_mixed_info_series = pd.DataFrame([['left_mixed_df', len(left_mixed_df), len(left_mixed_df.columns), np.mean(left_mixed_df.isnull().sum())]], 
                                      columns=['table name', 'table size', 'number of columns', 'average missing values per column'])
info_df = pd.concat([info_df, inner_mixed_info_series, left_mixed_info_series])
info_df

Unnamed: 0,table name,table size,number of columns,average missing values per column
0,mat_df,395,33,0.0
1,por_df,649,33,0.0
0,inner_mixed_df,382,53,0.0
0,left_mixed_df,657,53,103.773585


## Data analysis

### Individuelle und strukturelle Faktoren

#### Individuelle Faktoren

In [13]:
# g3 groups are the median of both g3 points
stacked_bar_df = pd.DataFrame()
stacked_bar_df['g3'] = pd.concat([inner_mixed_df['G3_x'], inner_mixed_df['G3_y']], axis=1).median(axis=1).astype(float)

# check contradictories
result = check_contraditories(inner_mixed_df, ['Dalc'])
print(f'{np.round(len(result) / len(inner_mixed_df),2)} % of contradictory values')
result2 = check_contraditories(inner_mixed_df, ['Walc'])
print(f'{np.round(len(result2) / len(inner_mixed_df),2)} % of contradictory values')

# data filtering
stacked_bar_df['dalc'] = pd.concat([inner_mixed_df['Dalc_x'], inner_mixed_df['Dalc_y']], axis=1).median(axis=1)
stacked_bar_df['walc'] = pd.concat([inner_mixed_df['Walc_x'], inner_mixed_df['Walc_y']], axis=1).median(axis=1)
stacked_bar_df.drop(index=result.index.append(result2.index), inplace=True)

# Define the g3 ranges (bins)
g3_bins = range(2,22,2)

# Create a new column with the bin labels for each g3 value
stacked_bar_df['g3_range'] = pd.cut(stacked_bar_df['g3'], bins=g3_bins)

# Group by the g3_range column and compute the median dalc and walc
stacked_bar_df = stacked_bar_df.groupby('g3_range')[['dalc', 'walc']].median().reset_index()

# Convert the Interval objects in the 'g3_range' column to strings
stacked_bar_df['g3_range'] = stacked_bar_df['g3_range'].astype(str)
stacked_bar_df

0.01 % of contradictory values
0.03 % of contradictory values






Unnamed: 0,g3_range,dalc,walc
0,"(2, 4]",2.0,3.0
1,"(4, 6]",1.0,1.0
2,"(6, 8]",1.0,2.0
3,"(8, 10]",1.0,2.0
4,"(10, 12]",1.0,2.0
5,"(12, 14]",1.0,2.0
6,"(14, 16]",1.0,1.5
7,"(16, 18]",1.0,1.5
8,"(18, 20]",1.0,1.0


In [14]:
# Create a stacked bar graph using Plotly
stacked_bar = go.Figure()

# Add the first trace (bar) to the stacked bar
stacked_bar.add_trace(go.Bar(
    x=stacked_bar_df['g3_range'],
    y=stacked_bar_df['dalc'],
    name='Alkoholkonsum an Werktagen',
    marker_color='#B11226',  # Set color to wine
    width=0.7
))

# Add the second trace (bar) to the stacked bar
stacked_bar.add_trace(go.Bar(
    x=stacked_bar_df['g3_range'],
    y=stacked_bar_df['walc'],
    name='Alkoholkonsum am Wochenende',
    marker_color='#FBB117',  # Set color to yellow
    width=0.7
))

# Configure the layout for a stacked bar graph
stacked_bar.update_layout(
    barmode='stack',
    title='Einfluss von Alkoholkonsum auf die Lernleistung',
    yaxis = dict(range=[0,10]),

    height=700,
    width=1200,

    legend=dict(title='', orientation='v', traceorder='normal', x=0.75, y=1.1, font=dict(size=14)),
    margin=dict(l=50, r=20, t=10, b=50),
)

stacked_bar.update_xaxes(title_text='Median der erreichten Notenpunkte <br>in beiden Fächern', title_font=dict(size=15))
stacked_bar.update_yaxes(title_text='Median der Alkoholkonsumeinschätzung pro Kategorie<br>(1: sehr gering - 5: sehr stark)', title_font=dict(size=15))

# Show the plot
stacked_bar.write_image('stacked_bar.png')
stacked_bar.show()

#### Strukturelle Faktoren

In [15]:
violin_df = pd.DataFrame()

mapping = {1: '<15 min', 2: '15 - 30 min', 3: '30-60 min', 4: '>60 min'}
violin_df['traveltime'] = inner_mixed_df['traveltime_x'].map(mapping)

violin_df['internet'] = inner_mixed_df['internet']
violin_df['g3'] = pd.concat([inner_mixed_df['G3_x'], inner_mixed_df['G3_y']], axis=1).median(axis=1)

In [16]:
# Define colors for 'internet' categories
colors = {'yes': 'green', 'no': 'red'}

# Create a violin plot
violin = go.Figure()

def translate(x):
    return 'Ja' if x=='yes' else 'Nein'

for internet_status, color in colors.items():
    subset = violin_df[violin_df['internet'] == internet_status]
    side_value = 'positive' if internet_status=='yes' else 'negative'
    violin.add_trace(go.Violin(
        x=subset['traveltime'],
        y=subset['g3'],
        box_visible=False,
        side=side_value,
        meanline_visible=True,
        line_color= 'black',
        fillcolor=color,
        opacity=0.5,
        name=translate(internet_status)
    ))

# Update layout
violin.update_layout(
    title='Zusammenhang struktureller Faktoren und Lernleistungen',
    yaxis = dict(range=[0,20]),
    xaxis=dict(
        categoryorder='array',
        categoryarray=['<15 min', '15 - 30 min', '30-60 min', '>60 min']
    ),

    legend=dict(title='Internet verfügbar', orientation='v', traceorder='normal', x=0.83, y=1.15, font=dict(size=14)),
    height=700,
    width=1200,
    margin=dict(l=50, r=20, t=10, b=50),
)

violin.update_xaxes(title_text='Pendelzeit', title_font=dict(size=15))
violin.update_yaxes(title_text='Median der erreichten Notenpunkte <br>in beiden Fächern', title_font=dict(size=15))

# Show the plot
violin.write_image('violin.png')
violin.show()

### Soziale Faktoren

#### familiäres Umfeld

In [17]:
bar_df = pd.DataFrame()
bar_df['g3'] = pd.concat([inner_mixed_df['G3_x'], inner_mixed_df['G3_y']], axis=1).median(axis=1)
bar_df['famrel'] = pd.concat([inner_mixed_df['famrel_x'], inner_mixed_df['famrel_y']], axis=1).median(axis=1)
bar_df['famsize'] = inner_mixed_df['famsize']

bar_df

Unnamed: 0,g3,famrel,famsize
0,8.5,4.0,GT3
1,8.5,5.0,GT3
2,11.0,4.0,LE3
3,14.5,3.0,GT3
4,11.5,4.0,GT3
...,...,...,...
377,5.0,5.0,GT3
378,12.0,4.0,LE3
379,4.5,1.0,GT3
380,13.0,2.0,LE3


In [18]:
agg_bar_df = bar_df.groupby(['famsize', 'famrel'])['g3'].median().reset_index()

# Create histograms for each family size
bar = go.Figure()
bar.add_trace(go.Bar(x=agg_bar_df[agg_bar_df['famsize'] == 'LE3']['famrel'], y=agg_bar_df['g3'],name='Kleine Familie (<=3 Pers.)'))
bar.add_trace(go.Bar(x=agg_bar_df[agg_bar_df['famsize'] == 'GT3']['famrel'], y=agg_bar_df['g3'], name='Große Familie (>3 Pers.)'))

# Update layout
bar.update_layout(
    title='Einfluss der Familiengröße und dessen <br>Beziehungsqualität auf die Lernleistung',
    yaxis = dict(range=[0,20]),

    legend=dict(title='Familiengröße', orientation='v', traceorder='normal', x=0.78, y=1.15, font=dict(size=14)),
    height=700,
    width=1200,
    margin=dict(l=50, r=20, t=10, b=50),
)

bar.update_xaxes(title_text='Qualität der Familienbeziehung<br>(1: sehr schlecht - 5: sehr gut)', title_font=dict(size=15))
bar.update_yaxes(title_text='Median der erreichten Notenpunkte <br>in beiden Fächern', title_font=dict(size=15))

# Show the plot
bar.write_image('bar.png')
bar.show()

#### partnerliches und freundschaftliches Umfeld

In [19]:
# data filtering
result = check_contraditories(inner_mixed_df, ['goout'])
print(f'{np.round(len(result) / len(inner_mixed_df),2)} % of contradictory values')
result2 = check_contraditories(inner_mixed_df, ['romantic'])
print(f'{np.round(len(result2) / len(inner_mixed_df),2)} % of contradictory values')

box_df = pd.DataFrame()
box_df['g3'] = pd.concat([inner_mixed_df['G3_x'], inner_mixed_df['G3_y']], axis=1).median(axis=1)
box_df['goout'] = pd.concat([inner_mixed_df['goout_x'], inner_mixed_df['goout_y']], axis=1).median(axis=1)
box_df['romantic'] = inner_mixed_df['romantic_x']
box_df.drop(index=result.index.append(result2.index), inplace=True)
box_df

0.03 % of contradictory values
0.02 % of contradictory values


Unnamed: 0,g3,goout,romantic
0,8.5,4.0,no
1,8.5,3.0,no
2,11.0,2.0,no
3,14.5,2.0,yes
4,11.5,2.0,no
...,...,...,...
377,5.0,2.0,no
378,12.0,4.0,no
379,4.5,1.0,no
380,13.0,5.0,no


In [20]:
box = go.Figure()

category_map = {'yes': 'Ja', 'no': 'Nein'}

for index, (category, color) in enumerate(zip(['no', 'yes'], ['grey', '#E41B17'])):
    df_category = box_df[box_df['romantic'] == category]
    box.add_trace(go.Box(
        x=df_category['goout'],
        y=df_category['g3'],
        name=f'{category_map[category]}',
        marker_color=color,
        offsetgroup=index
    ))

box.update_layout(
    title='Lernleistungseinfluss des Partnerstatus und sozialer Aktivitäten',
    yaxis = dict(range=[0,20]),
    boxmode='group',
    legend=dict(title='Partner vorhanden', orientation='v', traceorder='normal', x=0.85, y=1.15, font=dict(size=14)),
    height=700,
    width=1200,
    margin=dict(l=50, r=20, t=10, b=50),
)

box.update_xaxes(title_text='Intensität sozialer Aktivitäten im Freundeskreis<br>(1: sehr gering - 5: sehr hoch)', title_font=dict(size=15))
box.update_yaxes(title_text='Median der erreichten Notenpunkte <br>in beiden Fächern', title_font=dict(size=15))

box.write_image('box.png')
box.show()

### Einfluss der individuellen Leistungsbereitschaft

#### Einfluss von Lernunterstützung

In [21]:
balk_df = pd.DataFrame()
mat_df['mat_sup_sum'] = (mat_df[['schoolsup', 'famsup', 'paid']] == 'yes').sum(axis=1)
por_df['por_sup_sum'] = (por_df[['schoolsup', 'famsup', 'paid']] == 'yes').sum(axis=1)

aggr_mat_balk_df = mat_df.groupby(['mat_sup_sum'])['G3'].median().reset_index()
aggr_por_balk_df = por_df.groupby(['por_sup_sum'])['G3'].median().reset_index()

print(aggr_mat_balk_df)
print(aggr_por_balk_df)

   mat_sup_sum    G3
0            0  11.0
1            1  11.0
2            2  11.0
3            3   9.0
   por_sup_sum    G3
0            0  12.0
1            1  12.0
2            2  12.0
3            3  12.0


In [22]:
balk = go.Figure()
balk = make_subplots(rows=1, cols=2, subplot_titles=('Mathematik', 'Portugiesisch'))

balk.add_trace(go.Box(
    x=mat_df['mat_sup_sum'],
    y=mat_df['G3'],
    name='Mathematik',
    marker_color='blue',
    offsetgroup=0,
    orientation='v'),
    row=1,
    col=1
)

balk.add_trace(go.Box(
    y=por_df['G3'],
    x=por_df['por_sup_sum'],
    name='Portugiesisch',
    marker_color='#FF0000',
    offsetgroup=0,
    orientation='v'),
    row=1,
    col=2
)

balk.update_layout(
    title='Einfluss von Lernunterstützungen zu den Lernleistungen von Schülern',
    boxmode='group',
    legend=dict(title='Schulfach', orientation='v', traceorder='normal', x=0.89, y=1.15, font=dict(size=14)),
    height=700,
    width=1200,
    margin=dict(l=50, r=20, t=10, b=50),
)

balk.update_yaxes(title_text='Final erreichte Notenpunkte', row=1, col=1, title_font=dict(size=15))
balk.update_xaxes(title_text='Summe der Lernunterstützungen', row=1, col=1, title_font=dict(size=15))
balk.update_xaxes(title_text='Summe der Lernunterstützungen', row=1, col=2, title_font=dict(size=15))

balk.write_image('balk.png')
balk.show()

#### Einfluss individueller Lernbereitschaft

In [23]:
studytime_map = {1: '<2 Stunden', 2: '2 bis 5 Stunden', 3: '5 bis 10 Stunden',4: '>10 Stunden'}

# create database for math 
mat_nextbar_df = pd.DataFrame()
mat_nextbar_df['G3'] = mat_df['G3']
mat_nextbar_df['studytime'] = mat_df['studytime']
mat_nextbar_df['absences_bins'] = pd.cut(mat_df['absences'], bins=4).astype(str)
mat_grouped = mat_nextbar_df.groupby(['studytime', 'absences_bins'])['G3'].median().reset_index()

mat_grouped['absences_bins'] = mat_grouped['absences_bins'].str[1:-1]
mat_grouped['absences_bins'] = mat_grouped['absences_bins'].replace('-0.075, 18.75', '0, 18.75')
mat_grouped['studytime_order'] = mat_grouped['studytime'] 
mat_grouped['studytime'] = mat_grouped['studytime'].map(studytime_map)
mat_grouped = mat_grouped.sort_values(by='studytime_order')
mat_grouped.drop(columns='studytime_order', inplace=True)

# create database for por
por_nextbar_df = pd.DataFrame()
por_nextbar_df['G3'] = por_df['G3']
por_nextbar_df['studytime'] = por_df['studytime']
por_nextbar_df['absences_bins'] = pd.cut(por_df['absences'], bins=4).astype(str)
por_grouped = por_nextbar_df.groupby(['studytime', 'absences_bins'])['G3'].median().reset_index()

por_grouped['absences_bins'] = por_grouped['absences_bins'].str[1:-1]
por_grouped['absences_bins'] = por_grouped['absences_bins'].replace('-0.032, 8.0', '0, 8.0')
por_grouped['absences_bins'] = por_grouped['absences_bins'].replace(',', 'bis')
por_grouped['studytime_order'] = por_grouped['studytime'] 
por_grouped['studytime'] = por_grouped['studytime'].map(studytime_map)
por_grouped = por_grouped.sort_values(by='studytime_order')
por_grouped.drop(columns='studytime_order', inplace=True)

In [24]:
# Creating a figure with subplots for each studytime value
nextbar = go.Figure()
nextbar = make_subplots(rows=1, cols=2, subplot_titles=('Mathematik', 'Portugiesisch'))

# Define color scale ranging from yellow to red based on absence values
colors = ['#ffc100', '#ff9a00','#ff7400','#ff0000']
colors = ['#bbdb44','#f7e379','#f2a134', '#e51f1f']

index_map = {1: 'Mathematik', 2: 'Portugiesisch'}

datasets = [mat_grouped, por_grouped]
for index, data in enumerate(datasets):
    for i, absence_bin in enumerate(data['absences_bins'].unique()):
        filtered_data = data[data['absences_bins'] == absence_bin]
        nextbar.add_trace(go.Bar(x=filtered_data['studytime'], 
                                 y=filtered_data['G3'], 
                                 name=f'{index_map[index+1]} Fehltage im Bereich von: {absence_bin} Tagen',
                                 marker=dict(color=colors[i])
                                 ),
                                row=1,
                                col=index+1
                         )

# Updating layout
nextbar.update_layout(
    title='Lernleistung anhand der Lernzeit und Anzahl der Schulfehltage',
    barmode='group',
    # Adjust spacing between subplots
    margin=dict(l=20, r=20, t=50, b=50),
    yaxis = dict(range=[0,20]),
    # Show legend
    legend=dict(title='Anzahl der Fehltage', orientation='v', traceorder='normal', x=0.65, y=1.65, font=dict(size=14)),
    height=700,
    width=1200,
)

# Set x-axis titles for each subplot
nextbar.update_xaxes(title_text='Wöchentliche Lernzeit', row=1, col=1, title_font=dict(size=15))
nextbar.update_xaxes(title_text='Wöchentliche Lernzeit', row=1, col=2, title_font=dict(size=15))
nextbar.update_yaxes(title_text='Final erreichte Notenpunkte', row=1, col=1, title_font=dict(size=15))

# Show the plot
nextbar.write_image('nextbar.png')
nextbar.show()