# Hausarbeit - Reporting und Visualisierung
### Einflussfaktoren auf die Schulleistungen im Rahmen der Oberschule
#### MADS 2023oB

### Environment setup

In [None]:
%%capture
# Optional one-time setup: uncomment the line below to install the packages this notebook imports.
# The %%capture cell magic above keeps the installer output out of the notebook.
# !pip install numpy pandas plotly

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

### Data description

**Attributes for both student-mat.csv (Math course) and student-por.csv (Portuguese language course) datasets:**

1 school - student's school (binary: 'GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira)

2 sex - student's sex (binary: 'F' - female or 'M' - male)

3 age - student's age (numeric: from 15 to 22)

4 address - student's home address type (binary: 'U' - urban or 'R' - rural)

5 famsize - family size (binary: 'LE3' - less or equal to 3 or 'GT3' - greater than 3)

6 Pstatus - parent's cohabitation status (binary: 'T' - living together or 'A' - apart)

7 Medu - mother's education (numeric: 0 - none,  1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – 
secondary education or 4 – higher education)

8 Fedu - father's education (numeric: 0 - none,  1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – 
secondary education or 4 – higher education)

9 Mjob - mother's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or 
police), 'at_home' or 'other')

10 Fjob - father's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or 
police), 'at_home' or 'other')

11 reason - reason to choose this school (nominal: close to 'home', school 'reputation', 'course' preference or 
'other')

12 guardian - student's guardian (nominal: 'mother', 'father' or 'other')

13 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 
4 - >1 hour)

14 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)

15 failures - number of past class failures (numeric: n if 1<=n<3, else 4)

16 schoolsup - extra educational support (binary: yes or no)

17 famsup - family educational support (binary: yes or no)

18 paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)

19 activities - extra-curricular activities (binary: yes or no)

20 nursery - attended nursery school (binary: yes or no)

21 higher - wants to take higher education (binary: yes or no)

22 internet - Internet access at home (binary: yes or no)

23 romantic - with a romantic relationship (binary: yes or no)

24 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)

25 freetime - free time after school (numeric: from 1 - very low to 5 - very high)

26 goout - going out with friends (numeric: from 1 - very low to 5 - very high)

27 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)

28 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)

29 health - current health status (numeric: from 1 - very bad to 5 - very good)

30 absences - number of school absences (numeric: from 0 to 93)

**These grades relate to the course subject (Math or Portuguese):**

31 G1 - first period grade (numeric: from 0 to 20)

32 G2 - second period grade (numeric: from 0 to 20)

33 G3 - final grade (numeric: from 0 to 20, output target)

Additional note: several (382) students belong to both datasets.
These students can be identified by searching for identical values of the attributes
that characterize each student, as shown in the annexed R file.

### Data loading and examination


In [None]:
# Load both course datasets; the CSV files use ';' as field separator.
mat_df = pd.read_csv('data/student-mat.csv', sep=';')
por_df = pd.read_csv('data/student-por.csv', sep=';')

# One summary row per table: row count, column count and the mean number of
# missing values per column.
info_df = pd.DataFrame(
    [
        [table_name, len(table), len(table.columns), np.mean(table.isnull().sum())]
        for table_name, table in (('mat_df', mat_df), ('por_df', por_df))
    ],
    columns=['table name', 'table size', 'number of columns', 'average missing values per column'],
)
info_df

In [None]:
# Show the column index of each table; a single print call with a newline
# separator emits exactly the same text as two consecutive prints.
print(
    f'mat_df columns: \n\n {mat_df.columns} \n',
    f'por_df columns: \n\n {por_df.columns} \n',
    sep='\n',
)

In [None]:
# Preview the first five rows of the Math course table.
mat_df.head(n=5)

In [None]:
# Preview the first five rows of the Portuguese course table.
por_df.head(n=5)

In [None]:
# Correlation heatmaps of the numeric columns of both tables.
# A notebook cell only renders its last expression, so each figure is shown
# explicitly; otherwise the Math heatmap would be built and silently discarded.
# Titles are set so the two heatmaps can be told apart.
px.imshow(
    mat_df.corr(numeric_only=True),
    title='Correlation matrix - student-mat (Math course)',
).show()
px.imshow(
    por_df.corr(numeric_only=True),
    title='Correlation matrix - student-por (Portuguese course)',
).show()

### Data merging

In [None]:
# Attributes used as join keys to identify the same student in both tables.
# (The variable name is kept as-is because other cells reference it.)
merging_colums = [
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus',
    'Medu', 'Fedu', 'Mjob', 'Fjob',
    'reason', 'nursery', 'internet',
]

In [None]:
# Left join on the key attributes: every Portuguese-course row is kept and the
# matching Math-course rows are attached. Non-key columns present in both
# tables receive the pandas default suffixes '_x' (por) and '_y' (mat).
left_mixed_df = por_df.merge(mat_df, how='left', on=merging_colums)
# left_mixed_df.to_csv('data/student-merge-left.csv')

In [None]:
# Inner join on the key attributes: only rows whose key combination occurs in
# both tables are kept. Non-key columns present in both tables receive the
# pandas default suffixes '_x' (por) and '_y' (mat).
inner_mixed_df = por_df.merge(mat_df, how='inner', on=merging_colums)
# inner_mixed_df.to_csv('data/student-merge-inner.csv')

In [None]:
# NOTE(review): this whole cell is disabled exploratory code. It compares the '_x' and '_y'
# variants of selected columns in the inner merge and prints the rows where they differ.
# anticipate that the following attributes from student-por also apply for student-mat
# double_columns = ['traveltime', 'internet', 'G3'] # 'guardian', 'romantic', 'famrel', 'goout', 'health', 'Dalc', 'Walc', 'higher'

# df = pd.DataFrame(inner_mixed_df).copy()

# for column in double_columns: 
#     df['equal'] = df[str(column)+'_x'] == df[str(column)+'_y']
#     # df[str(column)+'_y'] = df[str(column)+'_y'].combine_first(df[str(column)+'_x'])
#     print(df[[str(column)+'_x', str(column)+'_y']][df['equal'] == False])
# df[['guardian_x', 'guardian_y', 'romantic_x', 'romantic_y', 'famrel_x', 'famrel_y', 'goout_x', 'goout_y']][df['equal'] == False]

In [None]:
# Summarise the two merged tables with the same columns as info_df and append
# them to the overview table.
summary_columns = ['table name', 'table size', 'number of columns', 'average missing values per column']

# Despite the historical '_series' suffix these are one-row DataFrames; the
# names are kept in case other cells reference them.
inner_mixed_info_series = pd.DataFrame(
    [['inner_mixed_df', len(inner_mixed_df), len(inner_mixed_df.columns), np.mean(inner_mixed_df.isnull().sum())]],
    columns=summary_columns,
)
left_mixed_info_series = pd.DataFrame(
    [['left_mixed_df', len(left_mixed_df), len(left_mixed_df.columns), np.mean(left_mixed_df.isnull().sum())]],
    columns=summary_columns,
)
merged_info_df = pd.concat([inner_mixed_info_series, left_mixed_info_series], ignore_index=True)

# Make the cell idempotent: rows for these tables left over from an earlier run
# of this cell are dropped before appending, so re-running never duplicates them.
# ignore_index=True avoids the repeated index labels (0, 1, 0, 0) a plain concat yields.
info_df = pd.concat(
    [info_df[~info_df['table name'].isin(merged_info_df['table name'])], merged_info_df],
    ignore_index=True,
)
info_df

### Data encoding

In [None]:
# Columns that take exactly two values in the raw data.
binary_columns = [
    'school', 'sex', 'address', 'famsize', 'Pstatus',
    'schoolsup', 'famsup', 'paid', 'activities',
    'nursery', 'higher', 'internet', 'romantic',
]

# Per-column value mapping, still empty: one independent dict per binary column
# (a comprehension is used so the dicts are not shared between keys).
binary_map = {column_name: {} for column_name in binary_columns}

# TODO: iterate over all binary columns and apply the according map
# mat_df[binary_columns].map()

### Individuelle und strukturelle Faktoren

#### individuelle Faktoren

In [None]:
# Median workday (dalc) and weekend (walc) alcohol consumption per final-grade range.
# Per student, each value is the median of the Portuguese ('_x') and Math ('_y') entry.
per_student_alcohol_df = pd.DataFrame({
    'g3': inner_mixed_df[['G3_x', 'G3_y']].median(axis=1).astype(float),
    'dalc': inner_mixed_df[['Dalc_x', 'Dalc_y']].median(axis=1),
    'walc': inner_mixed_df[['Walc_x', 'Walc_y']].median(axis=1),
})

# Grade ranges of width 2 covering the whole 0..20 scale.
g3_bins = list(range(0, 22, 2))

# Explicit labels that state the real bin boundaries: the bins are right-closed,
# and the first one also contains its lower edge (see include_lowest below).
g3_labels = [f'[{g3_bins[0]}, {g3_bins[1]}]'] + [
    f'({lower}, {upper}]' for lower, upper in zip(g3_bins[1:-1], g3_bins[2:])
]

# include_lowest=True keeps a grade of exactly 0 in the first range; without it
# such rows fall outside every bin and silently drop out of the aggregation.
per_student_alcohol_df['g3_range'] = pd.cut(
    per_student_alcohol_df['g3'],
    bins=g3_bins,
    labels=g3_labels,
    include_lowest=True,
)

# observed=False keeps empty grade ranges in the result (as NaN) so the x axis
# always spans the full scale, and avoids relying on the deprecated implicit default.
stacked_bar_df = (
    per_student_alcohol_df
    .groupby('g3_range', observed=False)[['dalc', 'walc']]
    .median()
    .reset_index()
)

# Plain strings are easier to use as categorical axis labels than a Categorical.
stacked_bar_df['g3_range'] = stacked_bar_df['g3_range'].astype(str)
stacked_bar_df

In [None]:
# Stacked bar chart: one bar trace per alcohol measure, both sharing the
# grade-range labels on the x axis.
stacked_bar = go.Figure(data=[
    go.Bar(
        x=stacked_bar_df["g3_range"],
        y=stacked_bar_df[measure],
        name=legend_label,
    )
    for measure, legend_label in (
        ("dalc", 'Alkoholkonsum an Arbeitstagen'),
        ("walc", 'Alkoholkonsum am Wochenende'),
    )
])

# barmode='stack' piles the two traces on top of each other; the y axis is fixed
# to 0..10 because two measures on a 1..5 scale are stacked.
stacked_bar.update_layout(
    barmode='stack',
    title='Einfluss von Alkoholkonsum auf die Lernleistung',
    xaxis_title='Median der finalen Noten',
    yaxis_title='Alkoholkonsumeeinschätzung<br>(1: sehr gering - 5: sehr stark)',
    yaxis_range=[0, 10],
)

stacked_bar.show()

#### Strukturelle Faktoren

In [None]:
# Readable labels for the coded travel-time categories (1..4).
mapping = {1: '<15 min', 2: '15 - 30 min', 3: '30-60 min', 4: '>60 min'}

# Per-student frame for the structural-factor plot: labelled travel time (taken
# from the Portuguese table), home internet access and the median of both final grades.
scatter_df = pd.DataFrame({
    'traveltime': inner_mixed_df['traveltime_x'].map(mapping),
    'internet': inner_mixed_df['internet'],
    'g3': inner_mixed_df[['G3_x', 'G3_y']].median(axis=1),
})

In [None]:
# Fill colour per 'internet' category.
colors = {'yes': 'green', 'no': 'red'}

# Split violin plot: for every travel-time category the grade distribution of
# students with internet is drawn on one side and of those without on the other.
violin = go.Figure()

def translate(x):
    """Return the German legend label for an 'internet' value.

    'yes' maps to 'Ja'; every other value maps to 'Nein'.
    """
    return 'Ja' if x=='yes' else 'Nein'

for internet_status, color in colors.items():
    subset = scatter_df[scatter_df['internet'] == internet_status]
    # 'yes' is drawn on the positive side of the violin, 'no' on the negative side.
    side_value = 'positive' if internet_status=='yes' else 'negative'
    violin.add_trace(go.Violin(
        x=subset['traveltime'],
        y=subset['g3'],
        box_visible=False,
        side=side_value,
        meanline_visible=True,
        line_color= 'black',# 'dark'+color,
        fillcolor=color,
        opacity=0.5,
        name=translate(internet_status)
    ))

# violinmode='overlay' places both half-violins on the same centre line; with the
# default grouping each trace gets its own slot and the two halves do not meet.
# The category array fixes the x-axis order to increasing travel time.
violin.update_layout(
    title='Zusammenhang struktureller Faktoren und Lernleistungen',
    xaxis_title='Pendelzeit',
    yaxis_title='finale Note',
    violinmode='overlay',
    xaxis=dict(
        categoryorder='array',
        categoryarray=['<15 min', '15 - 30 min', '30-60 min', '>60 min']
    )
)

# Show the plot
violin.show()

### Faktoren des sozialen Umfelds

In [None]:
# Per-student values: median of the Portuguese ('_x') and Math ('_y') entries
# for the final grade and the family-relationship rating, plus the family size.
family_per_student_df = pd.DataFrame({
    'g3': inner_mixed_df[['G3_x', 'G3_y']].median(axis=1),
    'famrel': inner_mixed_df[['famrel_x', 'famrel_y']].median(axis=1),
    'famsize': inner_mixed_df['famsize'],
})

# Median final grade for every (family size, family-relationship rating) pair.
histo_df = (
    family_per_student_df
    .groupby(['famsize', 'famrel'])['g3']
    .median()
    .reset_index()
)
histo_df


In [None]:
# Grouped bar chart: median final grade per family-relationship rating, with one
# bar series per family size.
fig = go.Figure()
for famsize_code in ('LE3', 'GT3'):
    # Take x AND y from the same filtered rows. Filtering only x while passing the
    # full 'g3' column as y pairs the bars with the medians of the wrong rows.
    famsize_rows = histo_df[histo_df["famsize"] == famsize_code]
    fig.add_trace(go.Bar(
        x=famsize_rows['famrel'],
        y=famsize_rows['g3'],
        name=famsize_code,
    ))

# Update layout
fig.update_layout(
    title='Distribution of g3 Based on Family Size',
    xaxis_title='Qualität der Familienbeziehung<br>(1: sehr schlecht - 5: sehr gut)',
    yaxis_title='Median der finalen Noten'
)

# Show the plot
fig.show()


### Einfluss der individuellen Leistungsbereitschaft