# Parser Playground
The goal of this project is to provide a playground for experimenting with different parsing techniques for the tables used in this project.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import school_analysis as sa
import os
from school_analysis.analysis.exploration import  Exploration
from school_analysis.preprocessing.load import Loader
loader = Loader()

## Zensus

In [3]:

from io import StringIO
import re


def _parser_12411_0042(self, raw_data, *args, **kwargs) -> pd.DataFrame:
        """Parser for the # of children by school type of Germany"""
        df = pd.read_csv(StringIO(raw_data), sep=";", skiprows=4, skipfooter=4, engine="python")
        df = df.rename(columns={"Unnamed: 0": "Temp", "Germans": "Germans m", "Unnamed: 2": "Germans f", "Unnamed: 3": "Germans all", "Foreigners": "Foreigners m", "Unnamed: 5": "Foreigners f", "Unnamed: 6": "Foreigners all", "Total": "Total m", "Unnamed: 8": "Total f", "Unnamed: 9": "Total all"})
        
        # Build own melted table --> may be done better
        temp = pd.DataFrame(columns=["Year", "Gender", "Value", "Federal State", "Origin"])
        last_year = ""
        for i in df.index:
            if df.loc[i, "Temp"] is np.nan:
                continue
            elif re.match(r"\d{4}", df.loc[i, "Temp"]):
                last_year = df.loc[i, "Temp"].split("-")[0]
                continue
        
            fs = df.loc[i, "Temp"]
            for g in ["m", "f", "all"]:
                for t in ["Germans", "Foreigners", "Total"]:
                    temp.loc[len(temp.index)] = [last_year, g, df.loc[i, t + " " + g], fs, t]
        df = temp
        
        return df
    
with open(os.path.join(sa.PROJECT_PATH, "data", "raw", "genesis", "zensus_ages.csv"), "r") as f:
    raw_data = f.read()
    
df = _parser_12411_0042(None, raw_data)
df

FileNotFoundError: [Errno 2] No such file or directory: '/home/tomf/projects/DataLiteracyWS23/src/school_analysis/../../data/raw/genesis/zensus_ages.csv'

In [None]:
Exploration.analyse_min_max(df)

Highest Value: 
Year                 2011
Gender                all
Value              999867
Federal State    Saarland
Origin              Total
Name: 1691, dtype: object
----------------------------------------------------------------------------------------------------
Lowest Value: 
Year                            2005
Gender                             m
Value                        1000291
Federal State    Nordrhein-Westfalen
Origin                    Foreigners
Name: 802, dtype: object
----------------------------------------------------------------------------------------------------


In [None]:
Exploration.analyse_structure(df, ['Year', 'Gender', 'Federal State', 'Origin'])

Shape:  (3312, 5)
Columns:  Index(['Year', 'Gender', 'Value', 'Federal State', 'Origin'], dtype='object')
Data types:
 Year             object
Gender           object
Value            object
Federal State    object
Origin           object
dtype: object 

Missing values:
 Year             0
Gender           0
Value            0
Federal State    0
Origin           0
dtype: int64 

Unique values:
 Year               23
Gender              3
Value            3312
Federal State      16
Origin              3
dtype: int64 

Value counts: 
  Year    Year_count  Gender      Gender_count  Federal State             Federal State_count  Origin        Origin_count
------  ------------  --------  --------------  ----------------------  ---------------------  ----------  --------------
  2000           144  m                   1104  Baden-Württemberg                         207  Germans               1104
  2012           144  f                   1104  Bayern                                    207  F

## Zensus - Ages

In [None]:

from io import StringIO
import re


def _parser_12411_0013(self, raw_data, *args, **kwargs) -> pd.DataFrame:
        """Parser for the # of children by school type of Germany"""
        df = pd.read_csv(StringIO(raw_data), sep=";", skiprows=4, skipfooter=4, engine="python")
        
        # Rename columns
        last_state = ""
        for i in range(1, len(df.columns)):
            if not re.match("Unnamed: \d", df.columns[i]):
                last_state = df.columns[i]
            df = df.rename(columns={df.columns[i]: last_state + "." + df.iloc[0, i]})
        df = df.drop(df.index[0]).reset_index(drop=True)
        df = df.rename(columns={"Unnamed: 0": "Temp"})
        
        # Build own melted table --> may be done better
        temp = pd.DataFrame(columns=["Year", "Gender", "Value", "Federal State", "Age"])
        last_year = ""
        for i in df.index:
            if df.loc[i, "Temp"] is np.nan:
                continue
            elif re.match(r"\d{4}", df.loc[i, "Temp"]):
                last_year = df.loc[i, "Temp"].split("-")[0]
                continue
        
            age = df.loc[i, "Temp"]
            for c in df.columns[1:]:
                splitted = c.split(".")
                federal_state = splitted[0]
                gender = splitted[1]
                value = df.loc[i, c]
                temp.loc[len(temp.index)] = [int(last_year), gender, float(value), federal_state, age]                
        
        # Convert age to int
        def convert_ages(x):
            if x == "under 1 year":
                return 0
            elif x == "90 years and over":
                return 90
            elif x == "Total":
                return -1
            else:
                return int(x.split(" ")[0])
        temp["Age"] = temp["Age"].apply(convert_ages)
        
        df = temp
        
        return df
    
with open(os.path.join(sa.PROJECT_PATH, "data", "raw", "genesis", "zensus-3.csv"), "r") as f:
    raw_data = f.read()
    
df = _parser_12411_0013(None, raw_data)
df

Unnamed: 0,Year,Gender,Value,Federal State,Age
0,1996,Male,59065.0,Baden-Württemberg,0
1,1996,Female,55919.0,Baden-Württemberg,0
2,1996,Total,114984.0,Baden-Württemberg,0
3,1996,Male,66703.0,Bayern,0
4,1996,Female,62871.0,Bayern,0
...,...,...,...,...,...
39739,2004,Female,1446229.0,Schleswig-Holstein,-1
39740,2004,Total,2828760.0,Schleswig-Holstein,-1
39741,2004,Male,1158456.0,Thüringen,-1
39742,2004,Female,1196824.0,Thüringen,-1


In [None]:
df = loader.load("zensus-age")
Exploration.analyse_min_max(df)

Highest Value: 
Year                            2022
Gender                           all
Value                     18139116.0
Federal State    Nordrhein-Westfalen
Age                               -1
Name: 39725, dtype: object
----------------------------------------------------------------------------------------------------
Lowest Value: 
Year               2006
Gender                m
Value             267.0
Federal State    Bremen
Age                  89
Name: 48444, dtype: object
----------------------------------------------------------------------------------------------------


In [None]:
Exploration.analyse_structure(df, cols=['Year', 'Gender', 'Federal State', 'Age'])

Shape:  (119232, 5)
Columns:  Index(['Year', 'Gender', 'Value', 'Federal State', 'Age'], dtype='object')
Data types:
 Year               int64
Gender            object
Value            float64
Federal State     object
Age                int64
dtype: object 

Missing values:
 Year             0
Gender           0
Value            0
Federal State    0
Age              0
dtype: int64 

Unique values:
 Year                27
Gender               3
Value            63069
Federal State       16
Age                 92
dtype: int64 

Value counts: 
  Year    Year_count  Gender      Gender_count  Federal State             Federal State_count    Age    Age_count
------  ------------  --------  --------------  ----------------------  ---------------------  -----  -----------
  2014          4416  m                  39744  Baden-Württemberg                        7452      0         1296
  2010          4416  f                  39744  Bayern                                   7452     58         12

In [7]:

from io import StringIO
import re


def _test_parser(self, raw_data, *args, **kwargs) -> pd.DataFrame:
        df = pd.read_csv(StringIO(raw_data), sep=";", skiprows=4, skipfooter=3, engine="python")
        df = df.rename(columns={"Unnamed: 0": "Federal State"})
        df = df.melt(id_vars=["Federal State"], var_name="Year", value_name="Budget")
        
        return df
    
with open(os.path.join(sa.PROJECT_PATH, "data", "raw", "genesis", "budgets_by_child_by_state.csv"), "r") as f:
    raw_data = f.read()
    
df = _test_parser(None, raw_data)
df

Unnamed: 0,Federal State,Year,Budget
0,Baden-Württemberg,2010,6100
1,Bayern,2010,6400
2,Berlin,2010,7000
3,Brandenburg,2010,6200
4,Bremen,2010,6100
...,...,...,...
199,Sachsen,2021,8700
200,Sachsen-Anhalt,2021,8600
201,Schleswig-Holstein,2021,8600
202,Thüringen,2021,9500


In [8]:
Exploration.analyse_structure(df, cols=['Year', 'Federal State'])

Shape:  (204, 3)
Columns:  Index(['Federal State', 'Year', 'Budget'], dtype='object')
Data types:
 Federal State    object
Year             object
Budget            int64
dtype: object 

Missing values:
 Federal State    0
Year             0
Budget           0
dtype: int64 

Unique values:
 Federal State    17
Year             12
Budget           53
dtype: int64 

Value counts: 
  Year    Year_count  Federal State             Federal State_count
------  ------------  ----------------------  ---------------------
  2010            17  Baden-Württemberg                          12
  2011            17  Nordrhein-Westfalen                        12
  2012            17  Thüringen                                  12
  2013            17  Schleswig-Holstein                         12
  2014            17  Sachsen-Anhalt                             12
  2015            17  Sachsen                                    12
  2016            17  Saarland                                   12
  2017

In [24]:

from io import StringIO
import re


def _test_parser(self, raw_data, *args, **kwargs) -> pd.DataFrame:
        df = pd.read_csv(StringIO(raw_data), sep=";", skiprows=4, skipfooter=3, engine="python")
        df = df.rename(columns={"Unnamed: 0": "Year/Institution", "Unnamed: 1": "Measure", "Unnamed: 2": "Unit"})
        df = df.drop(df.index[1]).reset_index(drop=True)
        df = df.replace(r"^.$", np.nan, regex=True)
        
        last_year = ""
        last_institution = ""
        temp = pd.DataFrame(columns=["Institution", "Year", "Measure", "Unit", "Value", "Federal State"])
        for i in df.index:
            if re.match(r"\d{4}", str(df.loc[i, "Year/Institution"])):
                last_year = df.loc[i, "Year/Institution"]
                continue
            if df.loc[i, "Year/Institution"] == "Länder":
                continue
            if df.loc[i, "Year/Institution"] is not np.nan:
                last_institution = df.loc[i, "Year/Institution"]
                
            measure = df.loc[i, "Measure"]
            unit = df.loc[i, "Unit"]
                    
            for c in df.columns[3:]:
                federal_state = c
                value = df.loc[i, c]
                temp.loc[len(temp.index)] = [last_institution, int(last_year), measure, unit, float(value), federal_state]
                
        df = temp
        
        
        return df
    
with open(os.path.join(sa.PROJECT_PATH, "data", "raw", "genesis", "budgets_by_state.csv"), "r") as f:
    raw_data = f.read()
    
df = _test_parser(None, raw_data)
df

Unnamed: 0,Institution,Year,Measure,Unit,Value,Federal State
0,Child day care,1995,Public expenditure on education,EUR 1000,242182.09,Baden-Württemberg
1,Child day care,1995,Public expenditure on education,EUR 1000,377243.42,Bayern
2,Child day care,1995,Public expenditure on education,EUR 1000,,Berlin
3,Child day care,1995,Public expenditure on education,EUR 1000,206027.11,Brandenburg
4,Child day care,1995,Public expenditure on education,EUR 1000,48907.11,Bremen
...,...,...,...,...,...,...
39899,Total,2023,Prop. of ed. exp. of pub.budgets of overall bu...,percent,24.32,Saarland
39900,Total,2023,Prop. of ed. exp. of pub.budgets of overall bu...,percent,27.83,Sachsen
39901,Total,2023,Prop. of ed. exp. of pub.budgets of overall bu...,percent,25.89,Sachsen-Anhalt
39902,Total,2023,Prop. of ed. exp. of pub.budgets of overall bu...,percent,25.58,Schleswig-Holstein


In [25]:
Exploration.analyse_structure(df, cols=['Year', 'Federal State', 'Institution', 'Measure'])

Shape:  (39904, 6)
Columns:  Index(['Institution', 'Year', 'Measure', 'Unit', 'Value', 'Federal State'], dtype='object')
Data types:
 Institution       object
Year               int64
Measure           object
Unit              object
Value            float64
Federal State     object
dtype: object 

Missing values:
 Institution         0
Year                0
Measure           928
Unit              928
Value            5656
Federal State       0
dtype: int64 

Unique values:
 Institution          8
Year                29
Measure              4
Unit                 3
Value            14718
Federal State       16
dtype: int64 

Value counts: 
  Year    Year_count  Federal State             Federal State_count  Institution                         Institution_count  Measure                                               Measure_count
------  ------------  ----------------------  ---------------------  --------------------------------  -------------------  ------------------------------------

In [33]:
from io import StringIO
import re


def _test_parser(self, raw_data, *args, **kwargs) -> pd.DataFrame:
        df = pd.read_csv(StringIO(raw_data), sep=";", skiprows=4, skipfooter=3, engine="python")
        df = df.rename(columns={"Unnamed: 0": "Year"})
        df = df.melt(id_vars=["Year"], var_name="Federal State", value_name="Index")
        df = df.replace(r"^-$", np.nan, regex=True)
        df["Index"] = df["Index"].apply(lambda x: float(x) / 100)
        df["Year Relative"] = 2020
        
        return df
    
with open(os.path.join(sa.PROJECT_PATH, "data", "raw", "genesis", "verbraucherpreisindex_state.csv"), "r") as f:
    raw_data = f.read()
    
df = _test_parser(None, raw_data)
df

Unnamed: 0,Year,Federal State,Index,Year Relative
0,1995,Baden-Württemberg,0.705,2020
1,1996,Baden-Württemberg,0.713,2020
2,1997,Baden-Württemberg,0.725,2020
3,1998,Baden-Württemberg,0.732,2020
4,1999,Baden-Württemberg,0.736,2020
...,...,...,...,...
459,2019,Thüringen,0.988,2020
460,2020,Thüringen,1.000,2020
461,2021,Thüringen,1.032,2020
462,2022,Thüringen,1.109,2020
