# Parser Playground
This notebook is a playground for experimenting with different parsing techniques for the raw tables used in this project.

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import school_analysis as sa
import os
from school_analysis.analysis.exploration import  Exploration
from school_analysis.preprocessing.load import Loader
loader = Loader()

## Zensus

In [2]:

from io import StringIO
import re


def _parser_12411_0042(self, raw_data, *args, **kwargs) -> pd.DataFrame:
        """Parser for the # of children by school type of Germany"""
        df = pd.read_csv(StringIO(raw_data), sep=";", skiprows=4, skipfooter=4, engine="python")
        df = df.rename(columns={"Unnamed: 0": "Temp", "Germans": "Germans m", "Unnamed: 2": "Germans f", "Unnamed: 3": "Germans all", "Foreigners": "Foreigners m", "Unnamed: 5": "Foreigners f", "Unnamed: 6": "Foreigners all", "Total": "Total m", "Unnamed: 8": "Total f", "Unnamed: 9": "Total all"})
        
        # Build own melted table --> may be done better
        temp = pd.DataFrame(columns=["Year", "Gender", "Value", "Federal State", "Origin"])
        last_year = ""
        for i in df.index:
            if df.loc[i, "Temp"] is np.nan:
                continue
            elif re.match(r"\d{4}", df.loc[i, "Temp"]):
                last_year = df.loc[i, "Temp"].split("-")[0]
                continue
        
            fs = df.loc[i, "Temp"]
            for g in ["m", "f", "all"]:
                for t in ["Germans", "Foreigners", "Total"]:
                    temp.loc[len(temp.index)] = [last_year, g, df.loc[i, t + " " + g], fs, t]
        df = temp
        
        return df
    
# Read the raw file from data/raw/genesis and run it through the playground parser.
# The encoding is given explicitly so that umlauts in the federal-state names do
# not depend on the platform default (assumes the file is stored as UTF-8 — TODO confirm).
# NOTE(review): the recorded run of this cell ended in FileNotFoundError for this
# path — confirm the file name before relying on the outputs below.
with open(os.path.join(sa.PROJECT_PATH, "data", "raw", "genesis", "zensus_ages.csv"), "r", encoding="utf-8") as f:
    raw_data = f.read()

df = _parser_12411_0042(None, raw_data)
df

FileNotFoundError: [Errno 2] No such file or directory: '/home/tomf/projects/DataLiteracyWS23/src/school_analysis/../../data/raw/genesis/zensus_ages.csv'

In [None]:
# Show the rows holding the highest and the lowest entry of the parsed table.
# NOTE(review): the recorded output ranks 999867 above 1000291, which suggests
# Value was compared as text rather than as a number — confirm the column dtype.
Exploration.analyse_min_max(df)

Highest Value: 
Year                 2011
Gender                all
Value              999867
Federal State    Saarland
Origin              Total
Name: 1691, dtype: object
----------------------------------------------------------------------------------------------------
Lowest Value: 
Year                            2005
Gender                             m
Value                        1000291
Federal State    Nordrhein-Westfalen
Origin                    Foreigners
Name: 802, dtype: object
----------------------------------------------------------------------------------------------------


In [None]:
# Print shape, dtypes, missing/unique counts and per-column value counts
# for the categorical columns of the parsed table.
Exploration.analyse_structure(df, ['Year', 'Gender', 'Federal State', 'Origin'])

Shape:  (3312, 5)
Columns:  Index(['Year', 'Gender', 'Value', 'Federal State', 'Origin'], dtype='object')
Data types:
 Year             object
Gender           object
Value            object
Federal State    object
Origin           object
dtype: object 

Missing values:
 Year             0
Gender           0
Value            0
Federal State    0
Origin           0
dtype: int64 

Unique values:
 Year               23
Gender              3
Value            3312
Federal State      16
Origin              3
dtype: int64 

Value counts: 
  Year    Year_count  Gender      Gender_count  Federal State             Federal State_count  Origin        Origin_count
------  ------------  --------  --------------  ----------------------  ---------------------  ----------  --------------
  2000           144  m                   1104  Baden-Württemberg                         207  Germans               1104
  2012           144  f                   1104  Bayern                                    207  F

## Zensus - Ages

In [16]:

from io import StringIO
import re


def _parser_12411_0013(self, raw_data, *args, **kwargs) -> pd.DataFrame:
        """Parser for the # of children by school type of Germany"""
        df = pd.read_csv(StringIO(raw_data), sep=";", skiprows=4, skipfooter=4, engine="python")
        
        # Rename columns
        last_state = ""
        for i in range(1, len(df.columns)):
            if not re.match("Unnamed: \d", df.columns[i]):
                last_state = df.columns[i]
            df = df.rename(columns={df.columns[i]: last_state + "." + df.iloc[0, i]})
        df = df.drop(df.index[0]).reset_index(drop=True)
        df = df.rename(columns={"Unnamed: 0": "Temp"})
        
        # Build own melted table --> may be done better
        temp = pd.DataFrame(columns=["Year", "Gender", "Value", "Federal State", "Age"])
        last_year = ""
        for i in df.index:
            if df.loc[i, "Temp"] is np.nan:
                continue
            elif re.match(r"\d{4}", df.loc[i, "Temp"]):
                last_year = df.loc[i, "Temp"].split("-")[0]
                continue
        
            age = df.loc[i, "Temp"]
            for c in df.columns[1:]:
                splitted = c.split(".")
                federal_state = splitted[0]
                gender = splitted[1]
                value = df.loc[i, c]
                temp.loc[len(temp.index)] = [int(last_year), gender, float(value), federal_state, age]                
            
        df = temp
        
        return df
    
# Read the raw file from data/raw/genesis and run it through the playground parser.
# The encoding is given explicitly so that umlauts in the federal-state names
# (e.g. "Baden-Württemberg" in the output below) do not depend on the platform
# default (assumes the file is stored as UTF-8 — TODO confirm).
with open(os.path.join(sa.PROJECT_PATH, "data", "raw", "genesis", "zensus-3.csv"), "r", encoding="utf-8") as f:
    raw_data = f.read()

df = _parser_12411_0013(None, raw_data)
df

Unnamed: 0,Year,Gender,Value,Federal State,Age
0,1996,Male,59065.0,Baden-Württemberg,under 1 year
1,1996,Female,55919.0,Baden-Württemberg,under 1 year
2,1996,Total,114984.0,Baden-Württemberg,under 1 year
3,1996,Male,66703.0,Bayern,under 1 year
4,1996,Female,62871.0,Bayern,under 1 year
...,...,...,...,...,...
39739,2004,Female,1446229.0,Schleswig-Holstein,Total
39740,2004,Total,2828760.0,Schleswig-Holstein,Total
39741,2004,Male,1158456.0,Thüringen,Total
39742,2004,Female,1196824.0,Thüringen,Total


In [21]:
# Load the table registered as "zensus-age" through the project Loader
# (this rebinds df, replacing the frame parsed in the cell above), then show
# the rows holding its highest and lowest entry.
df = loader.load("zensus-age")
Exploration.analyse_min_max(df)

Highest Value: 
Year                            2022
Gender                         Total
Value                     18139116.0
Federal State    Nordrhein-Westfalen
Age                            Total
Name: 39725, dtype: object
----------------------------------------------------------------------------------------------------
Lowest Value: 
Year                 2006
Gender               Male
Value               267.0
Federal State      Bremen
Age              89 years
Name: 48444, dtype: object
----------------------------------------------------------------------------------------------------


In [24]:
# Print shape, dtypes, missing/unique counts and per-column value counts
# for the categorical columns of the loaded table.
Exploration.analyse_structure(df, cols=['Year', 'Gender', 'Federal State', 'Age'])

Shape:  (119232, 5)
Columns:  Index(['Year', 'Gender', 'Value', 'Federal State', 'Age'], dtype='object')
Data types:
 Year               int64
Gender            object
Value            float64
Federal State     object
Age               object
dtype: object 

Missing values:
 Year             0
Gender           0
Value            0
Federal State    0
Age              0
dtype: int64 

Unique values:
 Year                27
Gender               3
Value            63069
Federal State       16
Age                 92
dtype: int64 

Value counts: 
  Year    Year_count  Gender      Gender_count  Federal State             Federal State_count  Age                  Age_count
------  ------------  --------  --------------  ----------------------  ---------------------  -----------------  -----------
  2014          4416  Male               39744  Baden-Württemberg                        7452  under 1 year              1296
  2010          4416  Female             39744  Bayern                     