# Merge Children without secondary degree, Repeaters, Teachers and Children


In [1]:
from school_analysis.preprocessing.helpers.students_teachers import combine_school_type
from school_analysis.preprocessing.load import Loader
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tueplots.constants.color import rgb
import school_analysis as sa
from school_analysis.plotting.general import GeneralPlots
from school_analysis.preprocessing import SCHOOL_TYPE_MAPPING

loader = Loader()

# Repeaters: rename the columns to the labels used by the other tables, then
# reshape from one column per gender (wide) to one row per gender (long).
repeaters = loader.load("number_of_repeaters")
repeaters = repeaters.rename(columns={"state": "Federal State", "school": "School Type",
                             "year": "Year", "grade": "Grade", "total": "all", "male": "m", "female": "f"})
repeaters = pd.melt(
    repeaters,
    id_vars=["Federal State", "School Type", "Year", "Grade"],
    var_name="Gender",
    value_name="Repeaters",
)
# "-" placeholders are treated as missing values. The column is converted to a
# numeric dtype in one step; the previous `.dropna().astype(int)` result was
# re-aligned on assignment, so the NaNs came back and the column was float
# anyway. `pd.to_numeric` still raises on any other non-numeric entry.
repeaters["Repeaters"] = pd.to_numeric(
    repeaters["Repeaters"].replace("-", np.nan))
repeaters["School Type"] = repeaters["School Type"].map(SCHOOL_TYPE_MAPPING)

# Teachers: translate the gender codes to the shared "all" / "f" / "m" labels
teacher_gender_labels = {"z": "all", "w": "f", "m": "m"}
teachers = loader.load("teachers-per-schooltype")
teachers["Gender"] = teachers["Gender"].map(teacher_gender_labels)

# Students: same label scheme, plus the per-state percentage table
student_gender_labels = {'Male': "m", 'Female': 'f', 'Total': 'all'}
students = loader.load("school-children-by-type")
students["Gender"] = students["Gender"].map(student_gender_labels)
students_rel_state = loader.load("school-children-by-state-percents")

# Teachers to Students: load both tables and remove exact duplicate rows
teachers_students_state = loader.load(
    "students-per-teacher-by-state").drop_duplicates()
teachers_students_type = loader.load(
    "students-per-teacher-by-type").drop_duplicates()

# Without Secondary Degrees: align the year column name with the other tables
children_wo_degree = loader.load("children_wo_degree")
children_wo_degree = children_wo_degree.rename(columns={"year": "Year"})

In [2]:
# Preprocess
# Repeaters: average the counts so that each combination of state, school
# type, year and gender is represented by a single row.
repeater_keys = ["Federal State", "School Type", "Year", "Gender"]
repeaters_c = (
    repeaters
    .groupby(repeater_keys)["Repeaters"]
    .mean()
    .reset_index()
)

# Students rel state: keep only the rows whose "Type" is "Pupils"
is_pupil_row = students_rel_state["Type"] == "Pupils"
students_rel_state_c = students_rel_state[is_pupil_row].copy()

In [3]:
# Key columns (everything that is not a measured value). They are used to
# drop rows that are duplicated by the merges below.
none_value_cols = ["Federal State", "School Type",
                   "Year", "Gender", "Contract Type"]

# Step 1: repeaters x teachers, matched per state, school type, year and gender.
merged_state = pd.merge(
    repeaters_c,
    teachers,
    on=[
        "Federal State",
        "School Type",
        "Year",
        "Gender",
    ],
    suffixes=("_repeaters", "_teachers")
).dropna().drop_duplicates(none_value_cols)

# Step 2: add the relative student numbers per state. Only "Type" is a new key
# column here; "Contract Type" is already in the list and must not be added a
# second time. A new list is built instead of mutating the old one in place.
none_value_cols = none_value_cols + ["Type"]
merged_state = pd.merge(
    merged_state,
    students_rel_state_c,
    on=["Federal State", "Year", "Gender"],
    suffixes=("", "_rel_state")
).dropna().drop_duplicates(none_value_cols)
merged_state = merged_state.rename(columns={
                                   "Type": "Children Type", "Number of Teachers": "Teachers", "Students": "Children", "Percentage": "Children (rel.)"})

# Sort columns: key columns first, value columns last
value_cols = ["Repeaters", "Teachers", "Children", "Children (rel.)"]
sorted_cols = [
    col for col in merged_state.columns if col not in value_cols] + value_cols
merged_state = merged_state[sorted_cols]
merged_state

Unnamed: 0,Federal State,School Type,Year,Gender,Contract Type,Children Type,Repeaters,Teachers,Children,Children (rel.)
0,Baden-Württemberg,Grundschulen,1998,all,Vollzeitbeschäftigte Lehrkräfte,Pupils,944.449275,12495.0,1276629.0,0.830485
1,Baden-Württemberg,Grundschulen,1998,all,Teilzeitbeschäftigte Lehrkräfte,Pupils,944.449275,16829.0,1276629.0,0.830485
2,Baden-Württemberg,Grundschulen,1998,all,Stundenweise beschäftigte Lehrkräfte,Pupils,944.449275,4659.0,1276629.0,0.830485
3,Baden-Württemberg,Grundschulen,1998,f,Vollzeitbeschäftigte Lehrkräfte,Pupils,401.840580,6484.0,625852.0,0.840116
4,Baden-Württemberg,Grundschulen,1998,f,Teilzeitbeschäftigte Lehrkräfte,Pupils,401.840580,15674.0,625852.0,0.840116
...,...,...,...,...,...,...,...,...,...,...
13177,Thüringen,Schularten mit mehreren Bildungsgängen,2020,f,Teilzeitbeschäftigte Lehrkräfte,Pupils,112.651163,700.0,96883.0,0.852018
13178,Thüringen,Schularten mit mehreren Bildungsgängen,2020,f,Stundenweise beschäftigte Lehrkräfte,Pupils,112.651163,101.0,96883.0,0.852018
13179,Thüringen,Schularten mit mehreren Bildungsgängen,2020,m,Vollzeitbeschäftigte Lehrkräfte,Pupils,163.511628,800.0,100611.0,0.842490
13180,Thüringen,Schularten mit mehreren Bildungsgängen,2020,m,Teilzeitbeschäftigte Lehrkräfte,Pupils,163.511628,109.0,100611.0,0.842490


In [4]:
# Pairwise correlation matrix of the value columns
merged_state.loc[:, value_cols].corr()

Unnamed: 0,Repeaters,Teachers,Children,Children (rel.)
Repeaters,1.0,0.201505,0.258294,-0.24476
Teachers,0.201505,1.0,0.529151,-0.032435
Children,0.258294,0.529151,1.0,0.044618
Children (rel.),-0.24476,-0.032435,0.044618,1.0


! I think the merge and correlation above make no sense :-/


## Students per Teacher


In [5]:
# Students per teacher: restrict the state-level table to pupils, full-time
# teachers ("Vollzeitbeschäftigte Lehrkräfte") and the gender totals on both
# the student and the teacher side.
st_row_filter = (
    (teachers_students_state["Type"] == "Pupils")
    & (teachers_students_state["Contract Type"] == "Vollzeitbeschäftigte Lehrkräfte")
    & (teachers_students_state["Gender_students"] == "all")
    & (teachers_students_state["Gender_teachers"] == "all")
)
st_state_c = (
    teachers_students_state[st_row_filter]
    .drop(columns=["Type", "Contract Type"])
    .rename(columns={"Number of Teachers": "Teachers"})
)

# Repeaters: use the gender totals only and average per state and year
is_total_row = repeaters["Gender"] == "all"
repeaters_c = (
    repeaters[is_total_row]
    .drop(columns=["Gender"])
    .groupby(["Federal State", "Year"])["Repeaters"]
    .mean()
    .reset_index()
)

# Goal: Merge on "Year", "Federal State"

In [6]:
# Join repeaters and students-per-teacher on state and year, discard
# incomplete rows and keep one row per state/year pair.
state_year_keys = ["Federal State", "Year"]
merged_state = (
    repeaters_c
    .merge(st_state_c, on=state_year_keys, suffixes=("", "_st"))
    .dropna()
    .drop_duplicates(state_year_keys)
)

merged_state

Unnamed: 0,Federal State,Year,Repeaters,Gender_students,Students,Gender_teachers,Teachers,Students per Teacher
0,Baden-Württemberg,1998,654.092269,all,1276629.0,all,47747.0,26.737366
1,Baden-Württemberg,1999,655.927861,all,1292216.0,all,47462.0,27.226328
2,Baden-Württemberg,2000,712.584810,all,1300629.0,all,47233.0,27.536447
3,Baden-Württemberg,2001,721.058376,all,1308504.0,all,47675.0,27.446335
4,Baden-Württemberg,2002,663.558974,all,1314489.0,all,48072.0,27.344171
...,...,...,...,...,...,...,...,...
340,Thüringen,2016,261.020690,all,190458.0,all,12328.0,15.449221
341,Thüringen,2017,271.838488,all,192621.0,all,12431.0,15.495214
342,Thüringen,2018,269.052632,all,194780.0,all,12460.0,15.632424
343,Thüringen,2019,260.903915,all,195612.0,all,12389.0,15.789168


In [7]:
# Value columns of the state/year table and their pairwise correlation
value_cols = [
    "Repeaters",
    "Teachers",
    "Students per Teacher",
    "Students",
]
merged_state.loc[:, value_cols].corr()

Unnamed: 0,Repeaters,Teachers,Students per Teacher,Students
Repeaters,1.0,0.034795,0.30973,0.113287
Teachers,0.034795,1.0,-0.026483,0.986145
Students per Teacher,0.30973,-0.026483,1.0,0.07446
Students,0.113287,0.986145,0.07446,1.0


## Average over federal states


In [8]:
# Children without degree: work on a separate frame without "Total students"
children_wo_degree_c = children_wo_degree.drop(columns=["Total students"])

In [9]:
# Collapse the state/year table to one row per year: the absolute numbers are
# summed over the federal states, the students-per-teacher ratio is averaged.
# The columns are listed explicitly (instead of being derived from the global
# `value_cols` through a set) so that the column order is deterministic and
# the cell can be re-run after `value_cols` has been reassigned further down.
# NOTE(review): "Repeaters" is already a per-state mean at this point, so the
# yearly figure is a sum of state means - confirm this is the intended measure.
merged_avg = (
    merged_state
    .groupby("Year")
    .agg({
        "Students": "sum",
        "Repeaters": "sum",
        "Teachers": "sum",
        "Students per Teacher": "mean",
    })
    .reset_index()
)

# Attach the yearly numbers of children without a degree, drop incomplete
# rows and keep one row per year.
# NOTE(review): the displayed result already has a "Without degree (rel.)"
# column, presumably supplied by the loaded table, so it is not recomputed
# here - confirm.
merged_avg = pd.merge(
    merged_avg,
    children_wo_degree_c,
    on=["Year"],
    suffixes=("", "_wo_degree")
).dropna().drop_duplicates(["Year"])
merged_avg

Unnamed: 0,Year,Students,Repeaters,Teachers,Students per Teacher,Without degree,Without degree (rel.)
0,1998,9693481.0,8798.431035,397352.0,24.156118,655234.0,0.029675
1,1999,9644719.0,8899.98087,391728.0,24.755593,677697.0,0.028672
2,2000,9568577.0,9815.450769,385555.0,24.706919,693118.0,0.029622
3,2001,9490139.0,9807.300019,387224.0,23.931955,661708.0,0.028552
4,2002,9409219.0,8903.1631,386432.0,23.876403,652647.0,0.028293
5,2003,9362322.0,8495.604834,384973.0,24.521486,632157.0,0.027349
6,2004,9268273.0,6122.159653,371840.0,25.408487,598069.0,0.026222
7,2005,9157927.0,6312.164046,370130.0,26.927937,581089.0,0.025718
8,2006,9016008.0,5941.497687,365986.0,27.314628,533897.0,0.024019
9,2007,8850784.0,5799.968846,365844.0,26.489617,492524.0,0.022622


In [10]:
# Correlate every column except "Year". The list comprehension keeps the
# DataFrame's own column order; the previous set difference produced an
# order that could change from one kernel session to the next.
value_cols = [col for col in merged_avg.columns if col != "Year"]
merged_avg[value_cols].corr()

Unnamed: 0,Without degree (rel.),Repeaters,Students,Teachers,Without degree,Students per Teacher
Without degree (rel.),1.0,0.956111,0.936134,-0.275901,0.994622,0.734157
Repeaters,0.956111,1.0,0.946839,-0.275346,0.967023,0.703862
Students,0.936134,0.946839,1.0,-0.466281,0.958685,0.843986
Teachers,-0.275901,-0.275346,-0.466281,1.0,-0.330617,-0.824569
Without degree,0.994622,0.967023,0.958685,-0.330617,1.0,0.768525
Students per Teacher,0.734157,0.703862,0.843986,-0.824569,0.768525,1.0
