# Parser Playground
The goal of this notebook is to provide a playground for experimenting with different parsing techniques for the tables used in this project.

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import school_analysis as sa
import os
from school_analysis.analysis.exploration import  Exploration

## Zensus

In [15]:

from io import StringIO
import re


def _parser_12411_0042(self, raw_data, *args, **kwargs) -> pd.DataFrame:
        """Parser for the # of children by school type of Germany"""
        df = pd.read_csv(StringIO(raw_data), sep=";", skiprows=4, skipfooter=4, engine="python")
        df = df.rename(columns={"Unnamed: 0": "Temp", "Germans": "Germans m", "Unnamed: 2": "Germans f", "Unnamed: 3": "Germans all", "Foreigners": "Foreigners m", "Unnamed: 5": "Foreigners f", "Unnamed: 6": "Foreigners all", "Total": "Total m", "Unnamed: 8": "Total f", "Unnamed: 9": "Total all"})
        
        # Build own melted table --> may be done better
        temp = pd.DataFrame(columns=["Year", "Gender", "Value", "Federal State", "Origin"])
        last_year = ""
        for i in df.index:
            if df.loc[i, "Temp"] is np.nan:
                continue
            elif re.match(r"\d{4}", df.loc[i, "Temp"]):
                last_year = df.loc[i, "Temp"].split("-")[0]
                continue
        
            fs = df.loc[i, "Temp"]
            for g in ["m", "f", "all"]:
                for t in ["Germans", "Foreigners", "Total"]:
                    temp.loc[len(temp.index)] = [last_year, g, df.loc[i, t + " " + g], fs, t]
        df = temp
        
        return df
    
# Locate the raw Zensus export inside the project's data directory and read it as text.
zensus_csv_path = os.path.join(sa.PROJECT_PATH, "data", "raw", "genesis", "zensus_ages.csv")
with open(zensus_csv_path, "r") as f:
    raw_data = f.read()

# The parser never touches `self`, so None stands in for the instance here.
df = _parser_12411_0042(None, raw_data)
df

Unnamed: 0,Year,Gender,Value,Federal State,Origin
0,2000,m,4456947,Baden-Württemberg,Germans
1,2000,m,684993,Baden-Württemberg,Foreigners
2,2000,m,5141940,Baden-Württemberg,Total
3,2000,f,4740243,Baden-Württemberg,Germans
4,2000,f,610438,Baden-Württemberg,Foreigners
...,...,...,...,...,...
3307,2022,f,63680,Thüringen,Foreigners
3308,2022,f,1070146,Thüringen,Total
3309,2022,all,1975636,Thüringen,Germans
3310,2022,all,142219,Thüringen,Foreigners


In [16]:
# Print the rows that the project helper reports as highest / lowest "Value".
# NOTE(review): presumably compares the "Value" column directly; the result is
# only meaningful when that column is numeric -- on an object (string) column
# the ordering would be lexicographic. Confirm the dtype before trusting it.
Exploration.analyse_min_max(df)

Highest Value: 
Year                 2011
Gender                all
Value              999867
Federal State    Saarland
Origin              Total
Name: 1691, dtype: object
----------------------------------------------------------------------------------------------------
Lowest Value: 
Year                            2005
Gender                             m
Value                        1000291
Federal State    Nordrhein-Westfalen
Origin                    Foreigners
Name: 802, dtype: object
----------------------------------------------------------------------------------------------------


In [19]:
# Columns passed to the structure report; presumably the ones whose value
# counts are tabulated -- verify against Exploration.analyse_structure.
categorical_columns = ['Year', 'Gender', 'Federal State', 'Origin']
Exploration.analyse_structure(df, categorical_columns)

Shape:  (3312, 5)
Columns:  Index(['Year', 'Gender', 'Value', 'Federal State', 'Origin'], dtype='object')
Data types:
 Year             object
Gender           object
Value            object
Federal State    object
Origin           object
dtype: object 

Missing values:
 Year             0
Gender           0
Value            0
Federal State    0
Origin           0
dtype: int64 

Unique values:
 Year               23
Gender              3
Value            3312
Federal State      16
Origin              3
dtype: int64 

Value counts: 
  Year    Year_count  Gender      Gender_count  Federal State             Federal State_count  Origin        Origin_count
------  ------------  --------  --------------  ----------------------  ---------------------  ----------  --------------
  2000           144  m                   1104  Baden-Württemberg                         207  Germans               1104
  2012           144  f                   1104  Bayern                                    207  F