# Explorative Datenanalyse - CDA2/2Da

In [55]:
import pandas as pd
import numpy as np
# pip install pygal_maps_ch
from pygal import maps

In [66]:
!pip install pygal




[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
def import_swiss_votes_data(filepath):
    """
    Import Swiss votes data from a semicolon-separated CSV file.

    The first row of the file is used as the header (pandas default), and a
    fixed selection of columns is kept: general vote attributes plus, for
    every canton, the yes votes, no votes, votes cast and eligible voters.

    Parameters:
    filepath (str): Path to the CSV file (anything pandas.read_csv accepts)

    Returns:
    pandas.DataFrame: independent copy containing only the selected columns

    Raises:
    KeyError: if one of the selected columns is missing from the file
    """

    # The file uses semicolons as separators; column names come from its header row.
    df = pd.read_csv(filepath, sep=';')

    # Vote-level attributes.
    general_columns = [
        'datum', 'titel_kurz_d', 'titel_off_d', 'stichwort', 'swissvoteslink', 'rechtsform', 'd1e1', 'd1e2', 'd1e3',
        'dep', 'br-pos', 'bv-pos', 'nr-pos', 'sr-pos', 'annahme', 'berecht', 'stimmen', 'bet', 'leer', 'ungultig', 'gultig', 'volkja',
        'volknein', 'bfsdash-de', 'bfsmap-de', 'nach_cockpit_d',
    ]

    # Cantonal parameters: one column per canton and measure, named '<canton>-<measure>'.
    canton_codes = [
        'zh', 'be', 'zg', 'so', 'ag', 'ti', 'vs', 'bs', 'bl',
        'ju', 'ne', 'ow', 'nw', 'ai', 'sg', 'tg', 'ar', 'gl',
        'gr', 'sh', 'vd', 'ge', 'fr', 'ur', 'lu', 'sz',
    ]
    canton_measures = ['ja', 'nein', 'stimmen', 'berecht']
    # Measure-major order: all '-ja' columns first, then '-nein', '-stimmen', '-berecht'.
    canton_columns = [
        f'{canton}-{measure}'
        for measure in canton_measures
        for canton in canton_codes
    ]

    columns_to_use = general_columns + canton_columns
    # Further candidates that are not selected yet: srja, srnein, nrja, nrnein.
    # Party recommendations could be interesting too: p-fdp, p-sps, p-svp, p-mitte, etc.

    # Explicit copy so later modifications never refer back to the full frame.
    df_subset = df[columns_to_use].copy()

    # Report the shape of what is actually returned, next to the size of the source file.
    print(
        f"Imported data: {df_subset.shape[0]} rows, "
        f"{df_subset.shape[1]} of {df.shape[1]} columns kept"
    )
    return df_subset

# Location of the raw Swissvotes CSV, relative to the notebook's working directory.
swissvotes_data_path = 'data/raw/swissvotes_data.csv'

# Load the file and keep only the selected columns. The loader defined in this
# notebook is import_swiss_votes_data; the previous name raised NameError.
swissvotes_data_subset = import_swiss_votes_data(swissvotes_data_path)

Cleaned data: 696 rows, 874 columns


In [12]:
# Preview the first rows of the imported subset.
swissvotes_data_subset.head()

Unnamed: 0,datum,titel_kurz_d,titel_off_d,stichwort,swissvoteslink,rechtsform,d1e1,d1e2,d1e3,dep,...,ar-berecht,gl-berecht,gr-berecht,sh-berecht,vd-berecht,ge-berecht,fr-berecht,ur-berecht,lu-berecht,sz-berecht
0,12.09.1848,Bundesverfassung der schweizerischen Eidgenoss...,Totalrevision vom 12. September 1848,.,https://swissvotes.ch/vote/1.00,1,1,1.2,1.21,.,...,.,.,.,.,.,.,.,.,.,.
1,14.01.1866,Mass und Gewicht,Festsetzung von Mass und Gewicht,.,https://swissvotes.ch/vote/2.00,1,1,1.5,1.52,.,...,.,.,.,.,.,.,.,.,.,.
2,14.01.1866,Gleichstellung der Juden und Naturalisierten m...,Gleichstellung der Juden und Naturalisierten m...,.,https://swissvotes.ch/vote/3.00,1,12,12.3,.,3,...,.,.,.,.,.,.,.,.,.,.
3,14.01.1866,Stimmrecht der Niedergelassenen in Gemeindeang...,Stimmrecht der Niedergelassenen in Gemeindeang...,.,https://swissvotes.ch/vote/4.00,1,1,1.4,1.43,.,...,.,.,.,.,.,.,.,.,.,.
4,14.01.1866,Besteuerung und zivilrechtliche Verhältnisse d...,Besteuerung und zivilrechtliche Verhältnisse d...,.,https://swissvotes.ch/vote/5.00,1,6,6.1,6.12,.,...,.,.,.,.,.,.,.,.,.,.


In [18]:

def _is_binary_flag_column(series):
    """Return True if the series has non-missing values and each of them is 0 or 1."""
    present = series.dropna()
    # An all-missing column carries no evidence of being a flag; `.all()` on an
    # empty series would otherwise be vacuously True.
    if present.empty:
        return False
    return bool(present.astype(str).str.match(r'^[01]$').all())


def _strip_if_text(value):
    """Strip surrounding whitespace from strings; return any other value unchanged."""
    return value.strip() if isinstance(value, str) else value


def clean_and_prepare_data(df_to_clean):
    """
    Return a cleaned copy of the Swiss votes data; the input frame is not modified.

    Steps:
    1. Parse the 'datum' column (DD.MM.YYYY) into datetimes; unparseable values become NaT.
    2. Replace the placeholder strings '.', '9999' and '' with NaN.
    3. Convert columns whose non-missing values are all 0/1 to True/False.
    4. Strip surrounding whitespace from text values.
    5. Warn (print) about values in URL/link columns that do not start with http(s)://.
    6. Drop rows and columns that are completely empty.

    Parameters:
    df_to_clean (pandas.DataFrame): frame containing at least a 'datum' column

    Returns:
    pandas.DataFrame: cleaned frame

    Raises:
    KeyError: if the 'datum' column is missing
    """
    # Work on a copy so the caller's frame stays intact and re-running the cell is safe.
    cleaned = df_to_clean.copy()

    # 1. Convert date strings (DD.MM.YYYY) to datetime; invalid entries are coerced to NaT.
    cleaned['datum'] = pd.to_datetime(cleaned['datum'], format='%d.%m.%Y', errors='coerce')

    # 2. Replace placeholder values with NaN.
    # In the data, '.' is used as unknown and '9999' appears to mean not applicable
    # (organisation doesn't exist). Only the string forms are matched here.
    placeholder_values = ['.', '9999', '']
    cleaned = cleaned.replace(placeholder_values, np.nan)

    # 3. Convert boolean-like columns (0/1) to booleans; missing values stay missing.
    boolean_cols = [col for col in cleaned.columns if _is_binary_flag_column(cleaned[col])]
    flag_values = {0: False, '0': False, 1: True, '1': True}
    for col in boolean_cols:
        cleaned[col] = cleaned[col].map(flag_values)

    # 4. Strip whitespace from text values. Only real strings are touched, so booleans,
    # numbers and NaN stored in object columns keep their type instead of being
    # turned into text such as 'True' or 'nan'.
    text_cols = [
        col for col in cleaned.columns
        if pd.api.types.is_object_dtype(cleaned[col].dtype)
        or pd.api.types.is_string_dtype(cleaned[col].dtype)
    ]
    for col in text_cols:
        stripped = cleaned[col].map(_strip_if_text)
        # A value that was only whitespace is now '', i.e. one of the placeholders.
        cleaned[col] = stripped.replace('', np.nan)

    # 5. Validate URLs: every present value should start with http:// or https://.
    # Columns are picked by name; 'link' is included so 'swissvoteslink'-style names are covered.
    url_markers = ('url', 'link')
    url_cols = [
        col for col in cleaned.columns
        if any(marker in str(col).lower() for marker in url_markers)
    ]
    for col in url_cols:
        values = cleaned[col]
        mask = values.notna() & ~values.astype(str).str.startswith(('http://', 'https://'))
        if mask.any():
            print(f"Warning: Found {mask.sum()} invalid URLs in column {col}")

    # 6. Remove completely empty rows, then completely empty columns
    # (e.g. extra columns created by trailing semicolons).
    cleaned = cleaned.dropna(how='all')
    cleaned = cleaned.dropna(axis=1, how='all')

    return cleaned

In [19]:
# Apply the cleaning step to the imported subset.
swissvotes_data_subset_cleaned = clean_and_prepare_data(swissvotes_data_subset)

# Preview the first rows of the cleaned frame.
swissvotes_data_subset_cleaned.head()

Unnamed: 0,datum,titel_kurz_d,titel_off_d,stichwort,swissvoteslink,rechtsform,d1e1,d1e2,d1e3,dep,...,ar-berecht,gl-berecht,gr-berecht,sh-berecht,vd-berecht,ge-berecht,fr-berecht,ur-berecht,lu-berecht,sz-berecht
0,1848-09-12,Bundesverfassung der schweizerischen Eidgenoss...,Totalrevision vom 12. September 1848,,https://swissvotes.ch/vote/1.00,1,1,1.2,1.21,,...,,,,,,,,,,
1,1866-01-14,Mass und Gewicht,Festsetzung von Mass und Gewicht,,https://swissvotes.ch/vote/2.00,1,1,1.5,1.52,,...,,,,,,,,,,
2,1866-01-14,Gleichstellung der Juden und Naturalisierten m...,Gleichstellung der Juden und Naturalisierten m...,,https://swissvotes.ch/vote/3.00,1,12,12.3,,3.0,...,,,,,,,,,,
3,1866-01-14,Stimmrecht der Niedergelassenen in Gemeindeang...,Stimmrecht der Niedergelassenen in Gemeindeang...,,https://swissvotes.ch/vote/4.00,1,1,1.4,1.43,,...,,,,,,,,,,
4,1866-01-14,Besteuerung und zivilrechtliche Verhältnisse d...,Besteuerung und zivilrechtliche Verhältnisse d...,,https://swissvotes.ch/vote/5.00,1,6,6.1,6.12,,...,,,,,,,,,,


## Test Visualizations for later

In [65]:
# Grab the `ch` attribute of pygal's `maps` (imported at the top of the notebook).
test = maps.ch

# List the names it exposes, to see what is available for plotting.
dir(test)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'title']

In [52]:
#https://www.bfs.admin.ch/bfs/de/home/statistiken/bevoelkerung/erhebungen/esrk.html