# Merge Children without secondary degree, Repeaters, Teachers and Children


In [35]:
from school_analysis.preprocessing.helpers.students_teachers import combine_school_type
from school_analysis.preprocessing.load import Loader
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tueplots.constants.color import rgb
import school_analysis as sa
from school_analysis.plotting.general import GeneralPlots
from school_analysis.preprocessing import SCHOOL_TYPE_MAPPING

# One Loader instance serves every dataset used in this notebook.
loader = Loader()

# Repeaters: rename the columns to the labels used by the other tables, then
# reshape so that each gender column ("all", "m", "f") becomes its own row.
repeaters = loader.load("number_of_repeaters")
repeaters = repeaters.rename(columns={"state": "Federal State", "school": "School Type",
                             "year": "Year", "grade": "Grade", "total": "all", "male": "m", "female": "f"})
repeaters = pd.melt(
    repeaters,
    id_vars=["Federal State", "School Type", "Year", "Grade"],
    var_name="Gender",
    value_name="Repeaters",
)
# "-" marks a missing count. Missing counts stay NaN, so the column is float
# whenever at least one value is missing; casting only the non-missing rows to
# int would be reverted by index alignment on assignment anyway.
repeaters["Repeaters"] = pd.to_numeric(
    repeaters["Repeaters"].replace("-", np.nan))
# School types without an entry in SCHOOL_TYPE_MAPPING become NaN here.
repeaters["School Type"] = repeaters["School Type"].map(SCHOOL_TYPE_MAPPING)

# Teachers: translate the gender codes to the shared "all"/"f"/"m" labels.
teachers = loader.load("teachers-per-schooltype")
teachers["Gender"] = teachers["Gender"].map({"z": "all", "w": "f", "m": "m"})

# Students: same gender labels as above.
students = loader.load("school-children-by-type")
students["Gender"] = students["Gender"].map(
    {'Male': "m", 'Female': 'f', 'Total': 'all'})
students_rel_state = loader.load("school-children-by-state-percents")

# Teachers to Students (exact duplicate rows removed right after loading)
teachers_students_state = loader.load("students-per-teacher-by-state")
teachers_students_state = teachers_students_state.drop_duplicates()
teachers_students_type = loader.load("students-per-teacher-by-type")
teachers_students_type = teachers_students_type.drop_duplicates()

# Without Secondary Degrees: only the year column needs the shared "Year" label.
children_wo_degree = loader.load(
    "children_wo_degree").rename(columns={"year": "Year"})

In [36]:
# Preprocess
# Repeaters: average the counts over all grades, keeping one row per
# (federal state, school type, year, gender).
repeater_keys = ["Federal State", "School Type", "Year", "Gender"]
repeaters_c = (
    repeaters
    .groupby(repeater_keys)["Repeaters"]
    .mean()
    .reset_index()
)

# Students rel state: keep only the rows whose "Type" is "Pupils".
students_rel_state_c = students_rel_state.copy().loc[
    lambda frame: frame["Type"] == "Pupils"
]

In [37]:
# Identifier (non-value) columns that define one unique row once repeaters
# and teachers are joined.
none_value_cols = ["Federal State", "School Type",
                   "Year", "Gender", "Contract Type"]
merged_state = pd.merge(
    repeaters_c,
    teachers,
    on=[
        "Federal State",
        "School Type",
        "Year",
        "Gender",
    ]
).dropna().drop_duplicates(none_value_cols)

# The students table adds one more identifier column ("Type"). Rebind instead
# of extending in place so that "Contract Type" is not listed twice.
none_value_cols = none_value_cols + ["Type"]
merged_state = pd.merge(
    merged_state,
    students_rel_state_c,
    on=["Federal State", "Year", "Gender"],
    suffixes=("", "_rel_state")
).dropna().drop_duplicates(none_value_cols)
merged_state = merged_state.rename(columns={
                                   "Type": "Children Type", "Number of Teachers": "Teachers", "Students": "Children", "Percentage": "Children (rel.)"})

# Sort columns: identifier columns first, value columns last
value_cols = ["Repeaters", "Teachers", "Children", "Children (rel.)"]
sorted_cols = [
    col for col in merged_state.columns if col not in value_cols] + value_cols
merged_state = merged_state[sorted_cols]
merged_state

Unnamed: 0,Federal State,School Type,Year,Gender,Contract Type,Children Type,Repeaters,Teachers,Children,Children (rel.)
0,Baden-Württemberg,Grundschulen,1998,all,Vollzeitbeschäftigte Lehrkräfte,Pupils,944.449275,12495.0,1276629.0,0.830485
1,Baden-Württemberg,Grundschulen,1998,all,Teilzeitbeschäftigte Lehrkräfte,Pupils,944.449275,16829.0,1276629.0,0.830485
2,Baden-Württemberg,Grundschulen,1998,all,Stundenweise beschäftigte Lehrkräfte,Pupils,944.449275,4659.0,1276629.0,0.830485
3,Baden-Württemberg,Grundschulen,1998,f,Vollzeitbeschäftigte Lehrkräfte,Pupils,401.840580,6484.0,625852.0,0.840116
4,Baden-Württemberg,Grundschulen,1998,f,Teilzeitbeschäftigte Lehrkräfte,Pupils,401.840580,15674.0,625852.0,0.840116
...,...,...,...,...,...,...,...,...,...,...
13177,Thüringen,Schularten mit mehreren Bildungsgängen,2020,f,Teilzeitbeschäftigte Lehrkräfte,Pupils,112.651163,700.0,96883.0,0.852018
13178,Thüringen,Schularten mit mehreren Bildungsgängen,2020,f,Stundenweise beschäftigte Lehrkräfte,Pupils,112.651163,101.0,96883.0,0.852018
13179,Thüringen,Schularten mit mehreren Bildungsgängen,2020,m,Vollzeitbeschäftigte Lehrkräfte,Pupils,163.511628,800.0,100611.0,0.842490
13180,Thüringen,Schularten mit mehreren Bildungsgängen,2020,m,Teilzeitbeschäftigte Lehrkräfte,Pupils,163.511628,109.0,100611.0,0.842490


In [38]:
# Pairwise correlation (pandas default method) between the value columns.
merged_state.loc[:, value_cols].corr()

Unnamed: 0,Repeaters,Teachers,Children,Children (rel.)
Repeaters,1.0,0.201505,0.258294,-0.24476
Teachers,0.201505,1.0,0.529151,-0.032435
Children,0.258294,0.529151,1.0,0.044618
Children (rel.),-0.24476,-0.032435,0.044618,1.0


! I think the stuff above makes no sense :-/


## Students per Teacher


In [39]:
# Students per teacher: keep the "Pupils" rows, drop the columns that are not
# needed for the state/year merge and expose the teacher total as "Teachers".
st_state_c = (
    teachers_students_state
    .copy()
    .loc[lambda frame: frame["Type"] == "Pupils"]
    .drop(columns=["Type", "Contract Type", "Number of Teachers"])
    .rename(columns={"Sum Teachers": "Teachers"})
)

# Repeaters: restrict to the gender total ("all") and average the counts per
# federal state and year.
repeaters_c = (
    repeaters
    .loc[repeaters["Gender"] == "all"]
    .drop(columns=["Gender"])
    .groupby(["Federal State", "Year"])["Repeaters"]
    .mean()
    .reset_index()
)

# Goal: Merge on "Year", "Federal State"

In [40]:
# Join repeaters with the students-per-teacher table and keep the first row
# for every (federal state, year) pair.
state_year_keys = ["Federal State", "Year"]
merged_state = (
    repeaters_c
    .merge(st_state_c, on=state_year_keys, suffixes=("", "_st"))
    .dropna()
    .drop_duplicates(state_year_keys)
)

merged_state

Unnamed: 0,Federal State,Year,Repeaters,Students per Teacher,Teachers,Students
0,Baden-Württemberg,1998,654.092269,12.706190,100473.0,1276629.0
3,Baden-Württemberg,1999,655.927861,12.682710,101888.0,1292216.0
6,Baden-Württemberg,2000,712.584810,12.458252,104399.0,1300629.0
9,Baden-Württemberg,2001,721.058376,12.432342,105250.0,1308504.0
12,Baden-Württemberg,2002,663.558974,12.366889,106291.0,1314489.0
...,...,...,...,...,...,...
1020,Thüringen,2016,261.020690,10.333568,18431.0,190458.0
1023,Thüringen,2017,271.838488,10.603964,18165.0,192621.0
1026,Thüringen,2018,269.052632,11.007629,17695.0,194780.0
1029,Thüringen,2019,260.903915,11.263431,17367.0,195612.0


In [41]:
# Value columns of the state/year table and their pairwise correlation.
value_cols = [
    "Repeaters",
    "Teachers",
    "Students per Teacher",
    "Students",
]
merged_state.loc[:, value_cols].corr()

Unnamed: 0,Repeaters,Teachers,Students per Teacher,Students
Repeaters,1.0,0.04346,0.523966,0.113287
Teachers,0.04346,1.0,0.04945,0.984875
Students per Teacher,0.523966,0.04945,1.0,0.169685
Students,0.113287,0.984875,0.169685,1.0


## Average over federal states


In [42]:
# Children w/o degree: work on a separate frame without the "Total students"
# column (drop returns a new frame, the loaded table stays untouched).
children_wo_degree_c = children_wo_degree.drop(columns=["Total students"])

In [43]:
# Collapse the federal states into one row per year: counts are summed,
# the students-per-teacher ratio is averaged over the states.
# The columns are spelled out (instead of being derived from `value_cols`) so
# that the column order is the same on every run and the cell can be re-run
# after `value_cols` has been reassigned further down.
sum_cols = ["Repeaters", "Students", "Teachers"]
mean_cols = ["Students per Teacher"]
merged_avg = (
    merged_state
    .groupby("Year")
    # A single aggregation keeps sums and mean aligned by year label rather
    # than by row position.
    .agg({**{col: "sum" for col in sum_cols},
          **{col: "mean" for col in mean_cols}})
    .reset_index()
)
merged_avg = pd.merge(
    merged_avg,
    children_wo_degree_c,
    on=["Year"],
    suffixes=("", "_wo_degree")
).dropna().drop_duplicates(["Year"])
# NOTE(review): "Without degree (rel.)" is not computed here; it appears to be
# delivered with the loaded children_wo_degree table - confirm its denominator.
merged_avg

Unnamed: 0,Year,Repeaters,Students,Teachers,Students per Teacher,Without degree,Without degree (rel.)
0,1998,8798.431035,9693481.0,697159.0,13.86199,655234.0,0.029675
1,1999,8899.98087,9644719.0,705394.0,13.612149,677697.0,0.028672
2,2000,9815.450769,9568577.0,711250.0,13.360669,693118.0,0.029622
3,2001,9807.300019,9490139.0,710198.0,13.208677,661708.0,0.028552
4,2002,8903.1631,9409219.0,712650.0,12.952638,652647.0,0.028293
5,2003,8495.604834,9362322.0,713179.0,12.759696,632157.0,0.027349
6,2004,6122.159653,9268273.0,711537.0,12.572353,598069.0,0.026222
7,2005,6312.164046,9157927.0,711162.0,12.42626,581089.0,0.025718
8,2006,5941.497687,9016008.0,718772.0,12.158302,533897.0,0.024019
9,2007,5799.968846,8850784.0,719149.0,11.876377,492524.0,0.022622


In [44]:
# All columns except "Year", in the order they appear in merged_avg. A list
# comprehension (rather than a set difference) keeps the order of the
# correlation matrix identical on every run.
value_cols = [col for col in merged_avg.columns if col != "Year"]
merged_avg[value_cols].corr()

Unnamed: 0,Without degree,Teachers,Students per Teacher,Students,Without degree (rel.),Repeaters
Without degree,1.0,-0.823187,0.96061,0.958685,0.994622,0.967023
Teachers,-0.823187,1.0,-0.889498,-0.832355,-0.821945,-0.81336
Students per Teacher,0.96061,-0.889498,1.0,0.966595,0.953097,0.959248
Students,0.958685,-0.832355,0.966595,1.0,0.936134,0.946839
Without degree (rel.),0.994622,-0.821945,0.953097,0.936134,1.0,0.956111
Repeaters,0.967023,-0.81336,0.959248,0.946839,0.956111,1.0
