In [41]:
from datetime import date
from pathlib import Path
import argparse
import networkx as nx
import pandas as pd

# Directory the input workbooks are read from.
SOURCE_PATH = Path('../bfs_source_data/')
# Directory for derived workbooks (read and written by the cells below).
TARGET_PATH = Path('../processed_data/')
# NOTE(review): INIT_GMDE_FILE is not referenced in the visible cells - confirm
# it is still needed.
INIT_GMDE_FILE = 'Gemeindestand_31_05_1981.xlsx'
# File name of the mutation workbook.
MUTATION_FILE = 'Mutationen_1981_2023.xlsx'

# Used as the column label of the target municipality numbers and as part of
# the mapping file name.
END = '2023-01-01'

In [42]:
def read_files():
    """Read the two workbooks the notebook starts from.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``gemeindestaende``: content of 'Anzahl_Gemeinden_pro_Gemeindestand.xlsx'
        in TARGET_PATH.
        ``mutations``: content of the mutation workbook (MUTATION_FILE) in
        SOURCE_PATH, using the second spreadsheet row as header.
    """
    gemeindestaende = pd.read_excel(
        TARGET_PATH / 'Anzahl_Gemeinden_pro_Gemeindestand.xlsx',
        engine='openpyxl',
    )
    # Use the module-level constant instead of repeating the file name.
    mutations = pd.read_excel(SOURCE_PATH / MUTATION_FILE, header=1, engine='openpyxl')

    return gemeindestaende, mutations


def process_column_names(df, suffix=False):
    """Normalise the column names of a DataFrame.

    The column 'Mutationsnummer' is dropped if present. All remaining names
    are lower-cased and spaces and hyphens are replaced by underscores.

    Parameters
    ----------
    df: pd.DataFrame
        Input frame; it is never modified, a new frame is returned.
    suffix: bool
        if True, columns whose name ends in '.1' (the label pandas gives to
        the second occurrence of a duplicated header) lose that marker and
        get the suffix '_new'; every other column gets '_old', except the
        last column, which keeps its name.

    Returns
    -------
    pd.DataFrame
        Frame with the same data and the processed column names.
    """
    # Always work on a new object so the caller's frame keeps its labels,
    # regardless of whether 'Mutationsnummer' had to be dropped.
    if 'Mutationsnummer' in df.columns:
        df = df.drop(columns='Mutationsnummer')
    else:
        df = df.copy()

    names = [str(c) for c in df.columns]

    if suffix:
        last_pos = len(names) - 1  # -1 for an empty frame: loop body never runs
        suffixed = []
        for pos, name in enumerate(names):
            if name.endswith('.1'):
                # Only a trailing '.1' marks the duplicated ("new") column.
                suffixed.append(name[:-2] + '_new')
            elif pos == last_pos:
                # The last column is shared by old and new side: no suffix.
                suffixed.append(name)
            else:
                suffixed.append(name + '_old')
        names = suffixed

    df.columns = [name.lower().replace(' ', '_').replace('-', '_')
                  for name in names]

    return df


def build_graphs(initial_list, change_matrices):
    """Build one directed graph per initial municipality that records the
    changes (merging, splitting, renaming) reachable from it.

    Parameters
    ----------
    initial_list: list
        Municipality numbers; each becomes the root of its own graph.
    change_matrices: list of list of tuple
        One list of ``(old_number, new_number)`` edges per Gemeindestand,
        applied in the given order.

    Returns
    -------
    dict
        Maps every root of ``initial_list`` to its ``nx.DiGraph``.
    """
    # Materialise each edge list once and pre-compute which numbers have an
    # explicit self-loop edge in that Gemeindestand. This replaces a linear
    # scan of the edge list for every single edge and every root.
    stands = []
    for edges in change_matrices:
        edge_list = list(edges)
        kept_self_loops = {old for old, new in edge_list if old == new}
        stands.append((edge_list, kept_self_loops))

    graphs = {}

    for root in initial_list:
        G = nx.DiGraph()
        G.add_node(root)

        for edge_list, kept_self_loops in stands:
            for original_value, new_value in edge_list:
                # Only edges that start at a node already reachable from the
                # root belong to this graph.
                if original_value not in G:
                    continue

                # add_edge creates the target node if it does not exist yet.
                G.add_edge(original_value, new_value)

                # A self-loop survives only if the current Gemeindestand
                # lists it explicitly; otherwise it is dropped.
                if (original_value not in kept_self_loops
                        and G.has_edge(original_value, original_value)):
                    G.remove_edge(original_value, original_value)

        graphs[root] = G

    return graphs

In [43]:
# Load the Gemeindestand overview and the unprocessed mutation table.
gemeindestaende, mutation_df = read_files()

  warn("Workbook contains no default style, apply openpyxl's default")


In [44]:
# Read the vote workbook; the header is on the third spreadsheet row and only
# the first 2396 data rows are used.
df = pd.read_excel(SOURCE_PATH / "Gleiche Rechte für Mann und Frau.xlsx", header=2, nrows=2396)

# The first two columns hold number and name, columns 3 and 4 are dropped,
# all remaining columns keep their header from the workbook.
col_names = ['gmde_stand_old', 'gmde_name_old'] + df.columns[4:].to_list()
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3'])
df.columns = col_names

# Keep only rows whose name contains the literal '...' marker; rows without a
# name are excluded instead of breaking the boolean mask.
df = df[df['gmde_name_old'].str.contains('...', regex=False, na=False)]

# Convert the numbers once (int() already ignores leading zeros, so no manual
# stripping is needed) and keep numbers below 7000.
# NOTE(review): presumably 7000+ are non-municipality entries - confirm.
df = df.assign(gmde_stand_old=df['gmde_stand_old'].astype(int))
df = df[df['gmde_stand_old'] < 7000]

# Remove the dots surrounding the name. assign() returns a new frame, so no
# column is written into a filtered view.
df = df.assign(gmde_name_old=df['gmde_name_old'].str.strip('.'))

In [45]:
# Identify the Gemeindestand of the vote data by its number of distinct
# municipalities. If several Gemeindestaende share that count, the first one
# listed is used.
n_gmde = df['gmde_stand_old'].nunique()
matching_stands = gemeindestaende.loc[gemeindestaende['anz_gmde'] == n_gmde, 'gemeindestand']
if matching_stands.empty:
    raise ValueError(
        f'No Gemeindestand with exactly {n_gmde} municipalities found; '
        'cannot determine the Gemeindestand of the vote data.'
    )
gmde_stand = matching_stands.values[0]

In [46]:
# Display the Gemeindestand selected for the vote data.
gmde_stand

'2022-05-01'

In [47]:
# Load the municipality list of the selected Gemeindestand from the
# 'gmde_stde' folder below TARGET_PATH.
start_df = pd.read_excel(TARGET_PATH / 'gmde_stde' / f'Gemeindestand_{gmde_stand}.xlsx')

In [48]:
# Normalise the mutation columns (old/new suffixes, snake_case) and rename
# 'datum_der_aufnahme' to 'gemeindestand'.
mutation_df = (
    process_column_names(mutation_df, suffix=True)
    .rename(columns={'datum_der_aufnahme': 'gemeindestand'})
)

In [49]:
# Keep only the mutations recorded at or after the Gemeindestand of the vote data.
mutation_df = mutation_df.loc[mutation_df['gemeindestand'] >= gmde_stand]

In [50]:
# GEMEINDESTAND-CREATOR

# Municipality numbers valid at `gmde_stand`; each one is the root of a graph.
l_init = start_df[gmde_stand].to_list()

# One edge list (old number -> new number) per Gemeindestand, in the order in
# which the Gemeindestaende first appear in mutation_df.
change_matrices = [
    list(zip(stand_df.bfs_gde_nummer_old, stand_df.bfs_gde_nummer_new))
    for _, stand_df in mutation_df.groupby('gemeindestand', sort=False)
]

# Build the graphs once with all mutations applied. This also works when
# there are no mutations at all: every root then maps to itself.
graphs = build_graphs(l_init, change_matrices)

# A root maps to the end points of its graph: nodes without outgoing edge
# plus nodes that still carry a self-loop.
gemeinde_mapping = {}
for root, graph in graphs.items():
    leaves = (
        [node for node, out_degree in graph.out_degree() if out_degree == 0]
        + list(nx.nodes_with_selfloops(graph))
    )
    gemeinde_mapping[root] = leaves


# GEMEINDEMAPPING-TABLE

# One row per (root, leaf) pair.
mapping = pd.DataFrame(
    [(root, leaf) for root, leaves in gemeinde_mapping.items() for leaf in leaves],
    columns=[gmde_stand, END],
)
mapping.to_excel(TARGET_PATH / f'Gemeindemapping_{gmde_stand}_{END}.xlsx', index=False)

In [51]:
# Attach the mapping to the vote data. The right join keeps every row of
# `mapping`, including municipalities without a matching vote row.
df = (
    df.rename(columns={'gmde_stand_old': gmde_stand})
    .merge(mapping, how='right', on=gmde_stand)
)

In [52]:
# Load the municipality list used for the names of the END state and rename
# its number and name columns to the labels used for merging.
gmde_names = (
    pd.read_excel(SOURCE_PATH / 'Gemeindestand_01_01_2023.xlsx')
    .rename(columns={'BFS Gde-nummer': END, 'Gemeindename': 'gmde_name_new'})
)

  warn("Workbook contains no default style, apply openpyxl's default")


In [53]:
# Add the new municipality name for every END number; rows without a match
# keep a missing name (left join).
df = pd.merge(df, gmde_names[[END, 'gmde_name_new']], on=END, how='left')

In [55]:
# Write the vote results with the mapped municipality numbers and names.
# NOTE(review): this writes into SOURCE_PATH although the mapping file above
# goes to TARGET_PATH - confirm the output location is intended.
df.to_excel(SOURCE_PATH / 'Abstimmungsergebnisse_gmde_2023.xlsx', index=False)