# Parser Playground

This notebook is a playground for experimenting with different parsing techniques for the raw tables used in this project.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import school_analysis as sa
import os
from school_analysis.analysis.exploration import Exploration
from school_analysis.preprocessing.load import Loader
# Shared Loader instance; a later cell calls loader.load("zensus-age") on it.
loader = Loader()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Zensus


In [None]:

from io import StringIO
import re


def _parser_12411_0013(self, raw_data, *args, **kwargs) -> pd.DataFrame:
    """Parser for the # of children by school type of Germany"""
    df = pd.read_csv(StringIO(raw_data), sep=";", skiprows=4,
                     skipfooter=4, engine="python")

    # Rename columns
    last_state = ""
    for i in range(1, len(df.columns)):
        if not re.match("Unnamed: \d", df.columns[i]):
            last_state = df.columns[i]
        df = df.rename(
            columns={df.columns[i]: last_state + "." + df.iloc[0, i]})
    df = df.drop(df.index[0]).reset_index(drop=True)
    df = df.rename(columns={"Unnamed: 0": "Temp"})

    # Build own melted table --> may be done better
    temp = pd.DataFrame(
        columns=["Year", "Gender", "Value", "Federal State", "Age"])
    last_year = ""
    for i in df.index:
        if df.loc[i, "Temp"] is np.nan:
            continue
        elif re.match(r"\d{4}", df.loc[i, "Temp"]):
            last_year = df.loc[i, "Temp"].split("-")[0]
            continue

        age = df.loc[i, "Temp"]
        for c in df.columns[1:]:
            splitted = c.split(".")
            federal_state = splitted[0]
            gender = splitted[1]
            value = df.loc[i, c]
            temp.loc[len(temp.index)] = [int(last_year), gender,
                                         float(value), federal_state, age]

    # Convert age to int
    def convert_ages(x):
        if x == "under 1 year":
            return 0
        elif x == "90 years and over":
            return 90
        elif x == "Total":
            return -1
        else:
            return int(x.split(" ")[0])
    temp["Age"] = temp["Age"].apply(convert_ages)

    df = temp

    return df


# Read the raw export data/raw/genesis/zensus-3.csv (relative to
# sa.PROJECT_PATH) into memory as one string.
with open(os.path.join(sa.PROJECT_PATH, "data", "raw", "genesis", "zensus-3.csv"), "r") as f:
    raw_data = f.read()

# The parser does not use its first (`self`) parameter, so None is passed.
df = _parser_12411_0013(None, raw_data)
# Bare expression: the notebook renders the parsed frame as the cell output.
df

Unnamed: 0,Year,Gender,Value,Federal State,Age
0,1996,Male,59065.0,Baden-Württemberg,0
1,1996,Female,55919.0,Baden-Württemberg,0
2,1996,Total,114984.0,Baden-Württemberg,0
3,1996,Male,66703.0,Bayern,0
4,1996,Female,62871.0,Bayern,0
...,...,...,...,...,...
39739,2004,Female,1446229.0,Schleswig-Holstein,-1
39740,2004,Total,2828760.0,Schleswig-Holstein,-1
39741,2004,Male,1158456.0,Thüringen,-1
39742,2004,Female,1196824.0,Thüringen,-1


In [None]:
# Rebinds `df` to the "zensus-age" dataset returned by the shared Loader
# (replacing the frame parsed in the previous cell) and hands it to
# Exploration.analyse_min_max.
df = loader.load("zensus-age")
Exploration.analyse_min_max(df)

Highest Value: 
Year                            2022
Gender                           all
Value                     18139116.0
Federal State    Nordrhein-Westfalen
Age                               -1
Name: 39725, dtype: object
----------------------------------------------------------------------------------------------------
Lowest Value: 
Year               2006
Gender                m
Value             267.0
Federal State    Bremen
Age                  89
Name: 48444, dtype: object
----------------------------------------------------------------------------------------------------


In [None]:
# Structural summary of the frame currently bound to `df`.
# NOTE(review): `cols` appears to select the columns for the value-count
# section of the report - confirm against Exploration.analyse_structure.
Exploration.analyse_structure(
    df, cols=['Year', 'Gender', 'Federal State', 'Age'])

Shape:  (119232, 5)
Columns:  Index(['Year', 'Gender', 'Value', 'Federal State', 'Age'], dtype='object')
Data types:
 Year               int64
Gender            object
Value            float64
Federal State     object
Age                int64
dtype: object 

Missing values:
 Year             0
Gender           0
Value            0
Federal State    0
Age              0
dtype: int64 

Unique values:
 Year                27
Gender               3
Value            63069
Federal State       16
Age                 92
dtype: int64 

Value counts: 
  Year    Year_count  Gender      Gender_count  Federal State             Federal State_count    Age    Age_count
------  ------------  --------  --------------  ----------------------  ---------------------  -----  -----------
  2014          4416  m                  39744  Baden-Württemberg                        7452      0         1296
  2010          4416  f                  39744  Bayern                                   7452     58         12

In [None]:

from io import StringIO
import re


def _test_parser(self, raw_data, *args, **kwargs) -> pd.DataFrame:
    df = pd.read_csv(StringIO(raw_data), sep=";", skiprows=4,
                     skipfooter=3, engine="python")
    df = df.rename(columns={"Unnamed: 0": "Federal State"})
    df = df.melt(id_vars=["Federal State"],
                 var_name="Year", value_name="Budget")

    return df


# Read the raw export data/raw/genesis/budgets_by_child_by_state.csv
# (relative to sa.PROJECT_PATH) into memory as one string.
with open(os.path.join(sa.PROJECT_PATH, "data", "raw", "genesis", "budgets_by_child_by_state.csv"), "r") as f:
    raw_data = f.read()

# The parser does not use its first (`self`) parameter, so None is passed.
df = _test_parser(None, raw_data)
# Bare expression: the notebook renders the parsed frame as the cell output.
df

Unnamed: 0,Federal State,Year,Budget
0,Baden-Württemberg,2010,6100
1,Bayern,2010,6400
2,Berlin,2010,7000
3,Brandenburg,2010,6200
4,Bremen,2010,6100
...,...,...,...
199,Sachsen,2021,8700
200,Sachsen-Anhalt,2021,8600
201,Schleswig-Holstein,2021,8600
202,Thüringen,2021,9500


In [None]:
# Structural summary of the frame currently bound to `df`.
# NOTE(review): `cols` appears to select the columns for the value-count
# section of the report - confirm against Exploration.analyse_structure.
Exploration.analyse_structure(df, cols=['Year', 'Federal State'])

Shape:  (204, 3)
Columns:  Index(['Federal State', 'Year', 'Budget'], dtype='object')
Data types:
 Federal State    object
Year             object
Budget            int64
dtype: object 

Missing values:
 Federal State    0
Year             0
Budget           0
dtype: int64 

Unique values:
 Federal State    17
Year             12
Budget           53
dtype: int64 

Value counts: 
  Year    Year_count  Federal State             Federal State_count
------  ------------  ----------------------  ---------------------
  2010            17  Baden-Württemberg                          12
  2011            17  Nordrhein-Westfalen                        12
  2012            17  Thüringen                                  12
  2013            17  Schleswig-Holstein                         12
  2014            17  Sachsen-Anhalt                             12
  2015            17  Sachsen                                    12
  2016            17  Saarland                                   12
  2017

In [None]:

from io import StringIO
import re


def _test_parser(self, raw_data, *args, **kwargs) -> pd.DataFrame:
    df = pd.read_csv(StringIO(raw_data), sep=";", skiprows=4,
                     skipfooter=3, engine="python")
    df = df.rename(columns={"Unnamed: 0": "Year/Institution",
                   "Unnamed: 1": "Measure", "Unnamed: 2": "Unit"})
    df = df.drop(df.index[1]).reset_index(drop=True)
    df = df.replace(r"^.$", np.nan, regex=True)

    last_year = ""
    last_institution = ""
    temp = pd.DataFrame(
        columns=["Institution", "Year", "Measure", "Unit", "Value", "Federal State"])
    for i in df.index:
        if re.match(r"\d{4}", str(df.loc[i, "Year/Institution"])):
            last_year = df.loc[i, "Year/Institution"]
            continue
        if df.loc[i, "Year/Institution"] == "Länder":
            continue
        if df.loc[i, "Year/Institution"] is not np.nan:
            last_institution = df.loc[i, "Year/Institution"]

        measure = df.loc[i, "Measure"]
        unit = df.loc[i, "Unit"]

        for c in df.columns[3:]:
            federal_state = c
            value = df.loc[i, c]
            temp.loc[len(temp.index)] = [last_institution, int(
                last_year), measure, unit, float(value), federal_state]

    df = temp

    return df


# Read the raw export data/raw/genesis/budgets_by_state.csv (relative to
# sa.PROJECT_PATH) into memory as one string.
with open(os.path.join(sa.PROJECT_PATH, "data", "raw", "genesis", "budgets_by_state.csv"), "r") as f:
    raw_data = f.read()

# The parser does not use its first (`self`) parameter, so None is passed.
df = _test_parser(None, raw_data)
# Bare expression: the notebook renders the parsed frame as the cell output.
df

Unnamed: 0,Institution,Year,Measure,Unit,Value,Federal State
0,Child day care,1995,Public expenditure on education,EUR 1000,242182.09,Baden-Württemberg
1,Child day care,1995,Public expenditure on education,EUR 1000,377243.42,Bayern
2,Child day care,1995,Public expenditure on education,EUR 1000,,Berlin
3,Child day care,1995,Public expenditure on education,EUR 1000,206027.11,Brandenburg
4,Child day care,1995,Public expenditure on education,EUR 1000,48907.11,Bremen
...,...,...,...,...,...,...
39899,Total,2023,Prop. of ed. exp. of pub.budgets of overall bu...,percent,24.32,Saarland
39900,Total,2023,Prop. of ed. exp. of pub.budgets of overall bu...,percent,27.83,Sachsen
39901,Total,2023,Prop. of ed. exp. of pub.budgets of overall bu...,percent,25.89,Sachsen-Anhalt
39902,Total,2023,Prop. of ed. exp. of pub.budgets of overall bu...,percent,25.58,Schleswig-Holstein


In [None]:
# Structural summary of the frame currently bound to `df`.
# NOTE(review): `cols` appears to select the columns for the value-count
# section of the report - confirm against Exploration.analyse_structure.
Exploration.analyse_structure(
    df, cols=['Year', 'Federal State', 'Institution', 'Measure'])

Shape:  (39904, 6)
Columns:  Index(['Institution', 'Year', 'Measure', 'Unit', 'Value', 'Federal State'], dtype='object')
Data types:
 Institution       object
Year               int64
Measure           object
Unit              object
Value            float64
Federal State     object
dtype: object 

Missing values:
 Institution         0
Year                0
Measure           928
Unit              928
Value            5656
Federal State       0
dtype: int64 

Unique values:
 Institution          8
Year                29
Measure              4
Unit                 3
Value            14718
Federal State       16
dtype: int64 

Value counts: 
  Year    Year_count  Federal State             Federal State_count  Institution                         Institution_count  Measure                                               Measure_count
------  ------------  ----------------------  ---------------------  --------------------------------  -------------------  ------------------------------------

In [None]:
from io import StringIO
import re


def _test_parser(self, raw_data, *args, **kwargs) -> pd.DataFrame:
    df = pd.read_csv(StringIO(raw_data), sep=";", skiprows=4,
                     skipfooter=3, engine="python")
    df = df.rename(columns={"Unnamed: 0": "Year"})
    df = df.melt(id_vars=["Year"],
                 var_name="Federal State", value_name="Index")
    df = df.replace(r"^-$", np.nan, regex=True)
    df["Index"] = df["Index"].apply(lambda x: float(x) / 100)
    df["Year Relative"] = 2020

    return df


# Read the raw export data/raw/genesis/verbraucherpreisindex_state.csv
# (relative to sa.PROJECT_PATH) into memory as one string.
with open(os.path.join(sa.PROJECT_PATH, "data", "raw", "genesis", "verbraucherpreisindex_state.csv"), "r") as f:
    raw_data = f.read()

# The parser does not use its first (`self`) parameter, so None is passed.
df = _test_parser(None, raw_data)
# Bare expression: the notebook renders the parsed frame as the cell output.
df

Unnamed: 0,Year,Federal State,Index,Year Relative
0,1995,Baden-Württemberg,0.705,2020
1,1996,Baden-Württemberg,0.713,2020
2,1997,Baden-Württemberg,0.725,2020
3,1998,Baden-Württemberg,0.732,2020
4,1999,Baden-Württemberg,0.736,2020
...,...,...,...,...
459,2019,Thüringen,0.988,2020
460,2020,Thüringen,1.000,2020
461,2021,Thüringen,1.032,2020
462,2022,Thüringen,1.109,2020


In [None]:
from io import StringIO
import re
import pandas as pd
import os
import school_analysis as sa
import numpy as np


def _test_parser(self, raw_data, *args, **kwargs) -> pd.DataFrame:
    df = pd.read_csv(StringIO(raw_data), sep=";", skiprows=7,
                     skipfooter=3, engine="python")
    df = df.rename(
        columns={
            "Unnamed: 0": "School Type",
            "Unnamed: 1": "Grade"
        }
    )
    df = df.loc[1:].reset_index(drop=True)
    temp = pd.DataFrame(
        columns=["Year", "School Type", "Grade", "Students", "Gender"])

    last_school_type = ""
    # Loop over all rows
    for i in df.index[1:]:
        if df.loc[i, "School Type"] is not np.nan:
            last_school_type = df.loc[i, "School Type"]
            continue
        if df.loc[i, "Grade"] is np.nan or df.loc[i, "Grade"] == "Total":
            continue

        grade = df.loc[i, "Grade"]
        last_year = ""

        # Loop over all columns
        for c in df.columns[2:]:
            if re.match(r"\d{4}", str(c)):
                last_year = c.split("/")[0]
            gender = df.loc[0, c]
            students = df.loc[i, c]
            temp.loc[len(temp)] = [last_year, last_school_type,
                                   grade, students, gender]

    df = temp

    # Right types and columns
    df = df[df["School Type"] != "Total"]
    df["Year"] = df["Year"].astype(int)
    df["Students"] = df["Students"].replace("-", np.nan).astype(float)

    return df


# Read the raw export data/raw/genesis/school_children_by_type.csv
# (relative to sa.PROJECT_PATH) into memory as one string.
with open(os.path.join(sa.PROJECT_PATH, "data", "raw", "genesis", "school_children_by_type.csv"), "r") as f:
    raw_data = f.read()

# The parser does not use its first (`self`) parameter, so None is passed.
df = _test_parser(None, raw_data)
# Bare expression: the notebook renders the parsed frame as the cell output.
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Year"] = df["Year"].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Students"] = df["Students"].replace("-", np.nan).astype(float)


Unnamed: 0,Year,School Type,Grade,Students,Gender
0,1998,Pre-school classes,Grade 2,,Male
1,1998,Pre-school classes,Grade 2,,Female
2,1998,Pre-school classes,Grade 2,,Total
3,1999,Pre-school classes,Grade 2,,Male
4,1999,Pre-school classes,Grade 2,,Female
...,...,...,...,...,...
16570,2021,Assignment to the type of school is not possible,No data provided,4438.0,Female
16571,2021,Assignment to the type of school is not possible,No data provided,9482.0,Total
16572,2022,Assignment to the type of school is not possible,No data provided,12447.0,Male
16573,2022,Assignment to the type of school is not possible,No data provided,11302.0,Female


In [50]:
from io import StringIO
import re
import pandas as pd
import os
import school_analysis as sa
import numpy as np


def _test_parser(self, raw_data, *args, **kwargs) -> pd.DataFrame:
    """Parser for repeaters"""
    df = pd.read_csv(StringIO(raw_data), sep=";", skiprows=5,
                     skipfooter=14, engine="python")
    df.replace("b'", "", inplace=True, regex=True)
    years = [int(item.split("/")[0])
             for item in df.iloc[0].dropna().tolist() if item != '' and item != '\'']

    df.rename(columns={df.columns[0]: 'state'}, inplace=True)
    df.rename(columns={df.columns[1]: 'grade'}, inplace=True)
    df.rename(columns={df.columns[3]: 'male'}, inplace=True)
    df.rename(columns={df.columns[5]: 'female'}, inplace=True)
    df.rename(columns={df.columns[7]: 'total'}, inplace=True)

    df['state'] = df['state'].ffill()
    df['grade'] = df['grade'].ffill()
    df.to_csv("test.csv")

    years = [int(item.split("/")[0])
             for item in df.iloc[0].dropna().tolist() if item != '' and item != '\'']

    states = [
        "Baden-Württemberg",
        "Bayern",
        "Berlin",
        "Brandenburg",
        "Bremen",
        "Hamburg",
        "Hessen",
        "Mecklenburg-Vorpommern",
        "Niedersachsen",
        "Nordrhein-Westfalen",
        "Rheinland-Pfalz",
        "Saarland",
        "Sachsen",
        "Sachsen-Anhalt",
        "Schleswig-Holstein",
        "Thüringen"
    ]

    data = []

    # for idx, state in enumerate(states):
    #     part = df.iloc[4+85*idx:]
    #     for row in part.iterrows():
    #         for y_idx, year in enumerate(years):
    #             male = row[1][2+y_idx*3]
    #             female = row[1][3+y_idx*3]
    #             total = row[1][4+y_idx*3]
    #             grade = row[1]['grade']
    #             school = row[1]['state']
    #             record = {
    #                 'state': state,
    #                 'school': school,  # state and schol have the same column
    #                 'year': year,
    #                 'male': male,
    #                 'female': female,
    #                 'grade': grade,
    #                 'total': total
    #             }
    #             data.append(record)

    last_state = ""
    for idx in df.index[3:]:
        row = df.loc[idx]
        if row["state"] in states:
            last_state = row["state"]
            continue
        if row["state"] == "Total":
            last_state = "Total"
            continue
        school = row["state"]

        last_year = ""
        for col in df.columns[2:]:
            if re.match(r"\d{4}/\d{2}", str(df.loc[0][col])):
                last_year = str(df.loc[0][col]).split("/")[0]

            gender = df.loc[2][col]
            grade = row["grade"]
            school = row["state"]
            record = {
                'state': last_state,
                'school': school,  # state and school have the same column
                'year': last_year,
                'grade': grade,
                'gender': gender,
                'repeaters': row[col]
            }
            data.append(record)

    df_melted = pd.DataFrame(data)
    df_melted = df_melted.replace("-", np.nan)
    df_melted = df_melted.astype({"year": int, "repeaters": float})
    # df_melted = df_melted.pivot_table(values='repeaters', columns="gender", index=[
    #     "state", "school", "year", "grade"]).reset_index()
    df_melted = df_melted[(df_melted['state'] != 'Total') &
                          (df_melted['school'] != 'Total') &
                          (df_melted['grade'] != 'Total')]
    return df_melted


# Read the raw export data/raw/genesis/number_of_repeaters_1.csv (relative
# to sa.PROJECT_PATH) into memory as one string.
with open(os.path.join(sa.PROJECT_PATH, "data", "raw", "genesis", "number_of_repeaters_1.csv"), "r") as f:
    raw_data = f.read()

# The parser does not use its first (`self`) parameter, so None is passed.
df = _test_parser(None, raw_data)
# Bare expression: the notebook renders the parsed frame as the cell output.
df

Unnamed: 0,state,school,year,grade,gender,repeaters
0,Baden-Württemberg,Primary schools,1998,Grade 1,Male,1024.0
1,Baden-Württemberg,Primary schools,1998,Grade 1,Female,835.0
2,Baden-Württemberg,Primary schools,1998,Grade 1,Total,1859.0
3,Baden-Württemberg,Primary schools,1999,Grade 1,Male,924.0
4,Baden-Württemberg,Primary schools,1999,Grade 1,Female,780.0
...,...,...,...,...,...,...
31663,Thüringen,Evening schools and adult education colleges,2000,No data provided,Female,
31664,Thüringen,Evening schools and adult education colleges,2000,No data provided,Total,
31665,Thüringen,Evening schools and adult education colleges,2001,No data provided,Male,
31666,Thüringen,Evening schools and adult education colleges,2001,No data provided,Female,
