In [1]:
import ast
from collections import defaultdict
import pandas as pd
from pocketknife.database import connect_database, read_from_database, save_to_database
from config import EXTERNAL_DATA_DIR

[32m2025-03-22 18:03:49.718[0m | [1mINFO    [0m | [36mconfig[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/manuelbolz/Documents/git/for_work/company_success_prediction[0m


# LOAD DATA

In [2]:
# db queries
query_shab = """
    SELECT
        ehraid,
        shab_id,
        shab_date,
        registry_office_canton,
        message AS message_raw
    FROM zefix.shab
"""

query_shab_mutation = """
    SELECT * 
    FROM zefix.shab_mutation
"""

query_gender_mapping = """
    SELECT *
    FROM zefix.founders_gender_mapping
"""

In [3]:
with connect_database() as con:
    raw_shab = read_from_database(con, query_shab)
    raw_shab_mutation = read_from_database(con, query_shab_mutation)

In [4]:
with connect_database() as con:
    zefix_gender_mapping = read_from_database(con, query_gender_mapping)

In [5]:
LANGUAGE = 'de'

In [6]:
parsed_shab_messages = pd.read_csv(EXTERNAL_DATA_DIR / f'final_{LANGUAGE}.csv')

In [7]:
parsed_shab_messages['parsed_variables'] = parsed_shab_messages.parsed_variables.fillna('{}')
parsed_shab_messages['parsed_variables'] = parsed_shab_messages.parsed_variables.str.replace("'null'", "''", regex=False)
parsed_shab_messages['parsed_variables'] = parsed_shab_messages.parsed_variables.str.replace("'n/a'", "''", regex=False)

In [8]:
parsed_shab_messages['parsed_variables'] = parsed_shab_messages.parsed_variables.apply(ast.literal_eval)
parsed_shab_messages['parsed_variables'] = parsed_shab_messages.parsed_variables.fillna({})

In [9]:
parsed_shab_messages['text_slices'] = parsed_shab_messages['text_slices'].fillna('[]')
parsed_shab_messages['text_slices'] = parsed_shab_messages['text_slices'].apply(ast.literal_eval)
parsed_shab_messages.head()

Unnamed: 0,shab_id,keyword,text_slices,parsed_variables,category,language
0,2636073,BEGINNING,"[Rubigenhof Fischzucht AG , in Rubigen , CHE-2...",{},undetermined,de
1,2636073,adresse neu,"[Alte Belpstrasse 5 , 3113 Rubigen]","{'addresses_until_now': [], 'addresses_deleted...",firm and address changes,de
2,2636075,BEGINNING,"[Käsereigenossenschaft Rüegsbach , in Rüegsau ...",{},undetermined,de
3,2636075,adresse neu,"[c/o Peter Stalder , Lehn 265 , 3418 Rüegsbach]","{'addresses_until_now': [], 'addresses_deleted...",firm and address changes,de
4,2636075,mitteilungen,[Mitteilungen an die Genossenschafter : schrif...,{},undetermined,de


In [None]:
print(parsed_shab_messages['shab_id'].nunique())
print(raw_shab['shab_id'].nunique())

636019
2476325


In [10]:
# Filter out the companies where the shab entries have been parsed
raw_shab_sub = raw_shab[raw_shab.shab_id.isin(parsed_shab_messages['shab_id'].unique())].copy()
raw_shab_mutation = raw_shab_mutation[raw_shab_mutation.shab_id.isin(parsed_shab_messages['shab_id'].unique())].copy()
raw_shab_mutation_grouped = (
    raw_shab_mutation
    .groupby('shab_id')
    .agg(codes=pd.NamedAgg(column='description', aggfunc=lambda x: [v for v in x]))
    .reset_index()
)

In [11]:
# Merge all the dataframes
shab_merged = (
    raw_shab_sub
    .merge(raw_shab_mutation_grouped, on='shab_id', how='left')
    .merge(parsed_shab_messages, on='shab_id', how='left')
)

# Sort values in the correct temporal order
shab_merged = shab_merged.sort_values(['ehraid', 'shab_date', 'shab_id'], ascending=[True, True, True]).reset_index(drop=True)
shab_merged = shab_merged.drop(columns=['language'])

In [11]:
# shab_merged_temp = shab_merged[shab_merged.ehraid.isin([905876, 905843, 905844])]
# shab_merged_temp = shab_merged_temp.sort_values(['ehraid', 'shab_date', 'shab_id'], ascending=[True, True, True]).reset_index()

In [18]:
def create_raw_json_history(df: pd.DataFrame) -> dict:
    json_structure = defaultdict(lambda: {'history': []})
    for _, row in df.iterrows():
        # Extract the fields to identify a company and its entries
        ehraid = row['ehraid']
        shab_date = row['shab_date'] if isinstance(row['shab_date'], str) else row['shab_date'].strftime('%Y-%d-%m')
        shab_id = row['shab_id']
        category = row['category']
        keyword = row['keyword']

        # Extract main information
        message_info = {
            'registry_office_canton': row['registry_office_canton'],
            'codes': row['codes'],
            'message_raw': row['message_raw'],
            'extracted_content': {
                category: {
                    keyword: {
                        'text_slices': row['text_slices'],
                        'variables': row['parsed_variables']
                    }
                }
            }
        }

        # Search if shab_date already exists in the history
        date_entry = next((entry for entry in json_structure[ehraid]['history'] if shab_date in entry), None)

        if date_entry is None:
            # If the date does not exist, create a new entry
            date_entry = {shab_date: {shab_id: message_info}}
            json_structure[ehraid]['history'].append(date_entry)
        else:
            # If the date exists, check if the shab_id already exists
            id_entry = date_entry[shab_date].get(shab_id, None)
            if id_entry is None:
                # If the shab_id does not exists, we can simply add it to the shab_date
                date_entry[shab_date][shab_id] = message_info
            else:
                # If the shab_id exists, we need to check if the category already exists
                category_entry = date_entry[shab_date][shab_id]['extracted_content'].get(category, None)
                if category_entry is None:
                    # If it does not exist, we add it to the extracted content
                    date_entry[shab_date][shab_id]['extracted_content'][category] = message_info['extracted_content'][category]
                else:
                    # If it does, we add the keyword to the category, since a keyword can only appear once within the category
                    date_entry[shab_date][shab_id]['extracted_content'][category][keyword] = message_info['extracted_content'][category][keyword]      

    return dict(json_structure)

In [13]:
# history_json = create_raw_json_history(shab_merged)

# CREATE REGISTERED PEOPLE AND FIRMS TABLE

In [12]:
df_people = shab_merged[shab_merged.category == 'natural persons and legal entities'].copy()

In [13]:
# Make sure only the expected fields are there
def validate_person_and_firms(x: dict):    
    schema = {'firms': [], 'people': []}
    for firm in x.get('firms', []):
        if isinstance(firm, dict):
            schema['firms'].append({
                'firm_name': firm.get('firm_name'),
                'firm_uid': firm.get('id'),
                'firm_seat': firm.get('location'),
                'firm_type': firm.get('type'),
                'firm_shares': firm.get('shares')
            })
    for person in x.get('people', []):
        if isinstance(person, dict):
            schema['people'].append({
                'first_name': person.get('first_name'),
                'last_name': person.get('last_name'),
                'hometown': person.get('hometown'),
                'place_of_residence': person.get('place_of_residence'),
                'nationality': person.get('nationality'),
                'job_title': person.get('job_title'),
                'signing_rights': person.get('signing_rights'),
                'shares': person.get('shares')
            })
    return schema

df_people['validated_variables'] = df_people['parsed_variables'].apply(validate_person_and_firms)  
df_people['firms'] = df_people['validated_variables'].apply(lambda x: x.get('firms', []))
df_people['people'] = df_people['validated_variables'].apply(lambda x: x.get('people', []))

# Split individual firm dictionaries into individual rows
df_firms_exploded = df_people.explode(column=['firms']).dropna()
df_people_exploded = df_people.explode(column=['people']).dropna()

# Create individual columns from the dictionary
df_firms_norm = pd.json_normalize(
    df_firms_exploded['firms'],
    errors='raise'
)
df_firms_concat = pd.concat([df_firms_exploded[['ehraid', 'message_raw', 'shab_date', 'shab_id', 'codes', 'keyword']].reset_index(drop=True), df_firms_norm], axis=1)

df_people_norm = pd.json_normalize(
    df_people_exploded['people'],
    errors='raise'
)
df_people_concat = pd.concat([df_people_exploded[['ehraid', 'message_raw', 'shab_date', 'shab_id', 'codes', 'keyword']].reset_index(drop=True), df_people_norm], axis=1)

# Add Gender and Nationality to the data

In [14]:
import re
import spacy
from unidecode import unidecode
from zefix_processing.country_mapping import country_names_to_alpha2
from zefix_processing.gender_mapping import german2gender, french2gender, italian2gender

nlp_models = {
    "de": spacy.load("de_core_news_sm"),
    "fr": spacy.load("fr_core_news_sm"),
    "it": spacy.load("it_core_news_sm")
}

In [15]:
def normalize_words(string: str) -> str:
    replacements = {
        'ä': 'ae',
        'ö': 'oe',
        'ü': 'ue'
    }
    for char, replacement in replacements.items():
        string = string.replace(char, replacement)
    
    return unidecode(string.lower())


def remove_articles(string: str) -> str:
    articles = [
        # German
        r'\bder\b', r'\bdie\b', r'\bdas\b', r'\bdes\b', r'\bdem\b', r'\bden\b',
        r'\bdessen\b', r'\bderen\b',
        r'\bein\b', r'\beine\b', r'\beiner\b', r'\beines\b', r'\beinem\b', r'\beinen\b',

        # French
        r'\ble\b', r'\bla\b', r'\bles\b', r"\bl'", r'\bdu\b', r'\bdes\b',
        r'\bau\b', r'\baux\b', r'\bun\b', r'\bune\b', r"\bd'", r'\bde\b', r'\bde la\b', r"\bde l'",

        # Italian
        r'\bdi\b', r'\bil\b', r'\blo\b', r'\bla\b', r"\bl'", r'\bi\b', r'\bgli\b', r'\ble\b',
        r'\bun\b', r'\buno\b', r'\buna\b', r"\bun'",
        r'\bdel\b', r'\bdello\b', r'\bdella\b', r'\bdei\b', r'\bdegli\b', r'\bdelle\b',
        r'\bdal\b', r'\bdallo\b', r'\bdalla\b', r'\bdai\b', r'\bdagli\b', r'\bdalle\b',
        r'\bal\b', r'\ballo\b', r'\balla\b', r'\bai\b', r'\bagli\b', r'\balle\b',
        r'\bnel\b', r'\bnello\b', r'\bnella\b', r'\bnei\b', r'\bnegli\b', r'\bnelle\b',
        r'\bsul\b', r'\bsullo\b', r'\bsulla\b', r'\bsui\b', r'\bsugli\b', r'\bsulle\b',
        r'\bcol\b', r'\bcoi\b'
    ]
    articles.sort(key=len, reverse=True)
    pattern = re.compile('|'.join(articles), flags=re.IGNORECASE)
    string = pattern.sub('', string)
    return re.sub(r'\s+', ' ', string).strip()

In [16]:
nationality_mapping = {normalize_words(k): v for k, v in country_names_to_alpha2.items()}
nationality_mapping_norm = {remove_articles(k): v for k, v in nationality_mapping.items()}

gender_mapping = {
    'de': {normalize_words(k): v for k, v in german2gender.items()},
    'fr': {normalize_words(k): v for k, v in french2gender.items()},
    'it': {normalize_words(k): v for k, v in italian2gender.items()},
}

### Clean the hometown, place of residence, and nationality column

In [17]:
# 1. Split multiple nationalities into individual columns
def clean_location(language: str, string: str) -> str:
    """Removes artifacts at the beginning of a location such as 'à Zurich' -> 'Zurich'"""
    mapping = {
        'de': ['von', 'in'],
        'fr': ['de', 'du', "d'", 'des', 'à'],
        'it': ['da', "d'", 'in']
    }
    for word in mapping.get(language, []):
        string = re.sub(rf'^{word}\s+', ' ', string)
    return string.strip()

def split_locations(df: pd.DataFrame, orig_col: str = 'nationality'):
    withand2noand = {
        'Antigua und Barbuda': 'Antigua Barbuda',
        'Bosnien und Herzegowina': 'Bosnien Herzegowina',
        'Bonaire, Sint Eustatius und Saba': 'Bonaire, Sint Eustatius Saba',
        'Südgeorgien und die Südlichen Sandwichinseln': 'Südgeorgien die Südlichen Sandwichinseln',
        'Heard und McDonaldinseln': 'Heard McDonaldinseln',
        'Saint Kitts und Nevis': 'Saint Kitts Nevis',
        'Saint-Pierre und Miquelon': 'Saint-Pierre Miquelon',
        'Spitzbergen und Jan Mayen': 'Spitzbergen Jan Mayen',
        'São Tomé und Príncipe': 'São Tomé Príncipe',
        'Turks- und Caicosinseln': 'Turks- Caicosinseln',
        'Französische Süd- und Antarktisgebiete': 'Französische Süd- Antarktisgebiete',
        'Trinidad und Tobago': 'Trinidad Tobago',
        'Saint Vincent und die Grenadinen': 'Saint Vincent die Grenadinen',
        'Wallis und Futuna': 'Wallis Futuna',
        'Antigua et Barbuda': 'Antigua Barbuda',
        'Géorgie du Sud et les îles Sandwich du Sud': 'Géorgie du Sud les îles Sandwich du Sud',
        'Îles Heard et MacDonald': 'Îles Heard MacDonald',
        'Saint-Christophe et Niévès': 'Saint-Christophe Niévès',
        'Saint-Christophe et Nevis': 'Saint-Christophe Nevis',
        'Saint-Pierre et Miquelon': 'Saint-Pierre Miquelon',
        'Sao Tomé et Principe': 'Sao Tomé Principe',
        'Îles Turques et Caïques': 'Îles Turques Caïques',
        'Trinité et Tobago': 'Trinité Tobago',
        'Saint-Vincent et les-Grenadines': 'Saint-Vincent les-Grenadines',
        'Wallis et Futuna': 'Wallis et Futuna',
        'Bosnie et Herzégovine': 'Bosnie Herzégovine',
        'Bonaire, Saint-Eustache et Saba': 'Bonaire, Saint-Eustache Saba',
        'Svalbard et Jan Mayen': 'Svalbard Jan Mayen',
        'Terres australes et antarctiques françaises': 'Terres australes antarctiques françaises',
        'Antigua e Barbuda': 'Antigua Barbuda',
        'Bonaire, Sint Eustatius e Saba': 'Bonaire, Sint Eustatius Saba',
        'Georgia del Sud e isole Sandwich meridionali': 'Georgia del Sud isole Sandwich meridionali',
        'Isole Heard e McDonald': 'Isole Heard McDonald',
        'Saint Kitts e Nevis': 'Saint Kitts Nevis',
        'serbo e montenegrino': 'serbo montenegrino',
        'serba e montenegrina': 'serba montenegrina',
        'Saint-Pierre e Miquelon': 'Saint-Pierre Miquelon',
        'Svalbard e Jan Mayen': 'Svalbard Jan Mayen',
        'São Tomé e Príncipe': 'São Tomé Príncipe',
        'Isole Turks e Caicos': 'Isole Turks Caicos',
        'Trinidad e Tobago': 'Trinidad Tobago',
        'Saint Vincent e Grenadine': 'Saint Vincent Grenadine',
        'Wallis e Futuna': 'Wallis Futuna',
        'Bosnia ed Erzegovina': 'Bosnia Erzegovina',
    }
    for original_name, replacement in withand2noand.items():
        df[orig_col] = df[orig_col].str.replace(original_name, replacement, regex=False)
    if orig_col == 'nationality':
        loc_split = df[orig_col].str.split(r'\sund\s|\set\s|\se\s|\sed\s|\s,\s', regex=True, expand=True)  # Also split by comma
    else:
        loc_split = df[orig_col].str.split(r'\sund\s|\set\s|\se\s|\sed\s', regex=True, expand=True)
    loc_split.columns = [f'{orig_col}_{i+1}' for i in range(loc_split.shape[1])]
    loc_split.fillna('', inplace=True)
    for col in loc_split.columns:
        for replacement, original_name in withand2noand.items():
            loc_split[col] = loc_split[col].str.replace(original_name, replacement, regex=False)
    df = pd.concat([df, loc_split], axis=1)
    return df.drop(columns=[orig_col])

# Sometimes 'de et à' or 'von und in' was not correctly parsed by the LLM, so we ensure that it is the same value
df_people_concat.loc[df_people_concat['hometown'] == 'de et', 'hometown'] = df_people_concat['place_of_residence']
df_people_concat.loc[df_people_concat['hometown'].isin(['von', 'de', 'da']), 'hometown'] = df_people_concat['place_of_residence'].str.replace(r'in |à |au ', '')

df_people_concat.loc[df_people_concat['hometown'].str.contains(r'de et à |du et au |von und in '), 'place_of_residence'] = df_people_concat['hometown'].str.replace(r'de et à |du et au |von und in ', '')
df_people_concat.loc[df_people_concat['place_of_residence'].str.contains(r'de et à |du et au |von und in |und in '), 'hometown'] = df_people_concat['place_of_residence'].str.replace(r'de et à |du et au |von und in |und in ', '')

df_people_concat['hometown'] = df_people_concat['hometown'].str.replace(r'de et à |du et au |von und in ', '')
df_people_concat['place_of_residence'] = df_people_concat['place_of_residence'].str.replace(r'de et à |du et au |von und in ', '')

# Here we want to correct entries like 'de Bagnes, à Londres , GBR' to move GBR into the place of residence col because it is not the nationality of the person
mask = (df_people_concat['nationality'].str.isupper() & (df_people_concat['nationality'] != 'CH'))
df_people_concat.loc[mask, 'place_of_residence'] = (
    df_people_concat.loc[mask, 'place_of_residence'] + 
    ' ( ' + df_people_concat.loc[mask, 'nationality'] + ' )'
)
df_people_concat.loc[mask, 'nationality'] = ''  # remove nationality

# First, split the hometown and place of residence column
df_people_concat = split_locations(df_people_concat, 'hometown')
df_people_concat = split_locations(df_people_concat, 'place_of_residence')

# Remove artifacts from the parsing to clean the location names
for col in [c for c in df_people_concat.columns if c.startswith('hometown_')]:
    df_people_concat[col] = df_people_concat[col].apply(lambda x: clean_location(LANGUAGE, x))

for col in [c for c in df_people_concat.columns if c.startswith('place_of_residence_')]:
    df_people_concat[col] = df_people_concat[col].apply(lambda x: clean_location(LANGUAGE, x))

df_people_concat['nationality'] = df_people_concat['nationality'].apply(lambda x: clean_location(LANGUAGE, x))

In [18]:
# 2. Move nationalities that are in the wrong column
def contains_target_word(text):
    pattern = r'\bstaatsangehoerige\b|\bstaatsbuergerin\b|\bbuergerin\b|\bcittadina\b|\bressortissante\b|\bcitoyenne\b|\bstaatsangehoeriger\b|\bstaatsbuerger\b|\bbuerger\b|\bcittadino\b|\bressortissant\b|\bcitoyen\b'
    return bool(re.search(pattern, text))

def move_nationalities(language: str, entries: list[str], nat: str, country_names: set[str]):
    """
    Checks if any of the hometown columns contains a country name
    """
    mapping = {
        'de': 'und',
        'fr': 'et',
        'it': 'e'
    }
    nat_norm = normalize_words(nat)        
    for i, entry in enumerate(entries):
        if entry:
            entry_norm = normalize_words(entry)
            if (entry_norm in country_names or contains_target_word(entry_norm)) and not entry_norm in ['mex', 'sur']:
                if not re.match(rf'\b{entry_norm}\b', nat_norm):
                    nat = f"{nat} {mapping.get(language, 'und')} {entry}" if nat else entry
                entries[i] = ''
            else:
                # If the name is not a country name and the nationality does not include Swiss yet,
                # we want to add 'CH' to the nationalities, since the hometown is with high probability a Swiss municipality
                if 'CH' not in nat:
                    nat = f"CH {mapping.get(language, 'und')} {nat}" if nat else 'CH'
    return entries + [nat]

countries_norm = nationality_mapping.keys()
hometown_cols = [col for col in df_people_concat.columns if 'hometown' in col]
result_cols = hometown_cols + ['nationality']
df_people_concat[result_cols] = df_people_concat.apply(lambda x: pd.Series(move_nationalities(LANGUAGE, [x[col] for col in hometown_cols], x['nationality'], countries_norm)), axis=1)

In [19]:
# Finally, split the nationality column
df_people_concat = split_locations(df_people_concat, 'nationality')

### Clean the authorization and shares column

In [20]:
def switch_auth_and_shares(language: str, auth: str, shares: str):
    """
    Checks if any of the hometown columns contains a country name
    """
    keyword_mapping = {
        'de': 'unterschrift',
        'fr': 'signature',
        'it': 'firma'
    }
    and_mapping = {
        'de': 'und',
        'fr': 'et',
        'it': 'e'
    }

    # Base Case: no value in both
    if not (auth or shares):
        return [auth, shares]  # no switch
    
    match_auth = re.search(r'\bchf\b', auth.lower()) if auth else None
    match_shares = re.search(keyword_mapping[language], shares.lower()) if shares else None

    # Case 0: no match in both
    if not (match_auth or match_shares):
        return [auth, shares]  # no switch

    # Case 1: match in auth and no match in shares
    elif match_auth and not match_shares:
        if shares:
            return ['', f"{auth} {and_mapping[language]} {shares}"]  # Add auth infront of shares
        else:
            return ['', auth]  # switch columns: auth, share
    
    # Case 2: match in shares and no value in auth
    elif match_shares and not match_auth:
        if shares:
            return [f"{auth} {and_mapping[language]} {shares}", '']  # Add shares after auth
        else:
            return [shares, '']  # switch columns: auth, share
    
    # Case 3: match in auth and shares
    elif match_auth and not match_shares:
        return [shares, auth]  # switch both
    
    else:
        return [auth, shares]

In [21]:
df_people_concat[['signing_rights', 'shares']] = df_people_concat.apply(lambda x: pd.Series(switch_auth_and_shares(LANGUAGE, x['signing_rights'], x['shares'])), axis=1)

### Normalize the nationality and add the iso-3166-1 alpha 2 codes

In [22]:
def map_country(nationality: str, mapping: dict, mapping_norm: dict) -> str:
    if nationality:
        nationality = re.sub(r'\bstaatsangehoeriger\b|\bstaatsbuerger\b|\bbuerger\b|\bcittadino\b|\bressortissant\b|\bcitoyen\b|\bstaatsangehoerige\b|\bstaatsbuergerin\b|\bbuergerin\b|\bcittadina\b|\bressortissante\b|\bcitoyenne\b', '', nationality)
        match = mapping.get(nationality.strip())
        if match:
            return match
        else:
            return mapping_norm.get(remove_articles(nationality), '')
    return ''

In [23]:
mapped = {country: map_country(normalize_words(country), nationality_mapping, nationality_mapping_norm) for country in df_people_concat.nationality_1.unique()}
{k: v for k, v in mapped.items() if v == ''}

{'': ''}

In [23]:
nat_cols = [col for col in df_people_concat.columns if re.match(r'^nationality\_\d{1}$', col)]
for nat_col in nat_cols:
    df_people_concat[f"{nat_col}_norm"] = df_people_concat[nat_col].fillna('').apply(normalize_words)
    df_people_concat[f'{nat_col}_iso_3166_1_alpha_2'] = df_people_concat[f"{nat_col}_norm"].apply(lambda x: map_country(x, nationality_mapping, nationality_mapping_norm))

In [24]:
def custom_sort(entries: list) -> list:
    """Sorts the unique nationality codes such that CH always comes first if present"""
    if len(entries) > 0 and 'CH' in entries:
        remainder = entries.remove('CH')
        return ['CH'] + (remainder if remainder else [])
    return entries


nat_cols = [col for col in df_people_concat.columns if col.endswith('_iso_3166_1_alpha_2')]
for i in range(len(nat_cols)-1):
    for n in range(i+1,len(nat_cols)):
        df_people_concat.loc[df_people_concat[nat_cols[i]] == '', nat_cols[i]] = df_people_concat[nat_cols[n]]

unique_codes = df_people_concat[nat_cols].apply(lambda row: custom_sort(list(pd.unique(row))), axis=1)
result_df = pd.DataFrame(unique_codes.to_list())
result_df = result_df.replace('', None).dropna(how='all', axis=1).fillna(pd.NA)
result_df = result_df.rename(columns={i: f'nationality_{i+1}_iso_3166_1_alpha_2' for i in range(result_df.shape[1])})
df_people_concat = pd.concat([df_people_concat.drop(columns=nat_cols), result_df], axis=1)

ht_cols = [col for col in df_people_concat.columns if col.startswith('hometown_')]
for i in range(len(ht_cols)-1):
    for n in range(i+1,len(ht_cols)):
        df_people_concat.loc[df_people_concat[ht_cols[i]] == '', ht_cols[i]] = df_people_concat[ht_cols[n]]

unique_codes = df_people_concat[ht_cols].apply(lambda row: list(pd.unique(row)), axis=1)
result_df = pd.DataFrame(unique_codes.to_list())
result_df = result_df.replace('', None).dropna(how='all', axis=1).fillna(pd.NA)
result_df = result_df.rename(columns={i: f'hometown_{i+1}' for i in range(result_df.shape[1])})
df_people_concat = pd.concat([df_people_concat.drop(columns=ht_cols), result_df], axis=1)

pr_cols = [col for col in df_people_concat.columns if col.startswith('place_of_residence_')]
for i in range(len(pr_cols)-1):
    for n in range(i+1,len(pr_cols)):
        df_people_concat.loc[df_people_concat[pr_cols[i]] == '', pr_cols[i]] = df_people_concat[pr_cols[n]]

unique_codes = df_people_concat[pr_cols].apply(lambda row: list(pd.unique(row)), axis=1)
result_df = pd.DataFrame(unique_codes.to_list())
result_df = result_df.replace('', None).dropna(how='all', axis=1).fillna(pd.NA)
result_df = result_df.rename(columns={i: f'place_of_residence_{i+1}' for i in range(result_df.shape[1])})
df_people_concat = pd.concat([df_people_concat.drop(columns=pr_cols), result_df], axis=1)

### Find gendered job titles and/or determine gender

In [25]:
gendered_endings = {
    'de': {
        'female': ['in'],
        'male': [],  # no specific ending for male words in German
    },
    'fr': {
        'female': ['euse', 'ienne', 'onne', 'ane', 'trice', 'esse'],
        'male': ['eur', 'ien', 'on', 'an'],
    },
    'it': {
        'female': ['a', 'trice', 'essa'],
        'male': ['o', 'ore']
    }   
}


def extract_nouns(text: str, language: str) -> list:
    """
    Extracts nouns from a given text using spaCy for German, French, and Italian.
    """
    nlp = nlp_models[language]
    return [token.text for token in nlp(text) if token.pos_ == 'NOUN']


def create_gendered_job_names(language: str, df: pd.DataFrame, col: str = 'job_title_norm') -> tuple[list]:
    nouns = set([title.lower() for title in df[col].unique() for title in extract_nouns(title, language)])
       
    female_words = []
    male_words = []
    undetermined = []

    for word in nouns:
        if any(word.endswith(ending) for ending in gendered_endings[language]['female']):
            female_words.append(word)
        elif any(word.endswith(ending) for ending in gendered_endings[language]['male']):
            male_words.append(word)
        else:
            undetermined.append(word)

    if language == 'de':
        for word in female_words:
            male_version = word.removesuffix('in')
            if male_version in undetermined:
                male_words.append(male_version)
        
    undetermined = [w for w in undetermined if w not in male_words]

    return female_words, male_words, undetermined

In [26]:
# Normalize job title
df_people_concat['job_title_norm'] = df_people_concat['job_title'].apply(normalize_words)
df_people_concat['job_title_norm'] = df_people_concat['job_title_norm'].str.replace(r'[^a-zA-Z]', ' ', regex=True).apply(lambda x: ' '.join(x.split()))

In [27]:
CREATE_GENDERED_WORDS = False

if CREATE_GENDERED_WORDS:
    female_words, male_words, undetermined = create_gendered_job_names('fr', df_people_concat)

In [28]:
def clean_names(name: str) -> str:
    name = re.sub(r' genannt | dit | dite | detto | detta ', ' ', name)
    name = re.sub(r'\b\w+\.\s?', '', name)  # Remove things like 'Dr.', 'Prof.', etc.
    name = re.sub(r'\(.*?\)', '', name)
    name = re.sub(r'\[.*?\]', '', name)
    name = re.sub(r'[^\w\s\-.]', '', name)
    name = re.sub(r'\b\d+\b', '', name)
    name = re.sub(r'\b\w{1,4}\.(?=\s|$)', '', name)
    return ' '.join(name.strip().split())


def prepare_country_code(code: str) -> str:
    code = code if code != 'XK' else 'RS'  # gender API does not support Kosovo (XK)
    return code if len(code) <= 2 else None


def determine_gender(
    mapping: dict,
    nationalities: list[str],
    job_title: str
) -> str|None:
    """
    """
    # Try to infer gender via nationality
    for nationality in nationalities:
        if re.search(r'\bstaatsangehoeriger\b|\bstaatsbuerger\b|\bbuerger\b|\bcittadino\b|\bressortissant\b|\bcitoyen\b', nationality):
            return 'm'
        elif re.search(r'\bstaatsangehoerige\b|\bstaatsbuergerin\b|\bbuergerin\b|\bcittadina\b|\bressortissante\b|\bcitoyenne\b', nationality):
            return 'f'

    # Try to infer gender via job title
    genders = [mapping[w] for w in mapping.keys() if re.search(rf'(?<!\w){w}(?!\w)', job_title)]
    if genders:
        # Check if list only contains one gender
        if genders.count(genders[0]) == len(genders):
            return genders[0]

    return None

In [29]:
nat_norm_cols = [col for col in df_people_concat.columns if re.match(r'\bnationality_\d{1}_norm\b', col, flags=re.IGNORECASE)]
df_people_concat['gender'] = df_people_concat.apply(lambda x: determine_gender(gender_mapping[LANGUAGE], [x[col] for col in nat_norm_cols], x['job_title_norm']), axis=1)

In [30]:
zefix_gender_mapping = zefix_gender_mapping.rename(columns={'name': 'first_name_norm', 'country_of_origin': 'nationality_1_iso_3166_1_alpha_2', 'gender': 'gender_name'})
zefix_gender_mapping['first_name_norm'] = zefix_gender_mapping['first_name_norm'].apply(lambda x: clean_names(normalize_words(x)))
zefix_gender_mapping = zefix_gender_mapping.drop_duplicates(subset=['first_name_norm', 'nationality_1_iso_3166_1_alpha_2'], keep='first')

In [31]:
zefix_gender_mapping['split_first_names'] = zefix_gender_mapping[(zefix_gender_mapping.probability > 0.6) & (zefix_gender_mapping.request_type == 'first_name')]['first_name_norm'].str.split()

In [None]:
import numpy as np
from sklearn.metrics import jaccard_score

jaccard_score()

In [32]:
individual_names = zefix_gender_mapping.explode(column=['split_first_names']).dropna(subset=['split_first_names'])

# First drop duplicate male names and duplicate female names by only keeping the first names
individual_names = individual_names.drop_duplicates(subset=['split_first_names', 'gender_name'], keep='first')

# Then drop duplicate names that are both male and female completely
individual_names = individual_names.drop_duplicates(subset=['split_first_names'], keep=False)

# Only keep names with at least 3 letters
individual_names = individual_names[individual_names.split_first_names.str.len() > 2]

In [33]:
print(len(df_people_concat))

2148214


In [34]:
df_people_concat['first_name_norm'] = df_people_concat['first_name'].apply(lambda x: clean_names(normalize_words(x)))
df_people_concat = df_people_concat.merge(zefix_gender_mapping[['first_name_norm', 'nationality_1_iso_3166_1_alpha_2', 'gender_name']], on=['first_name_norm', 'nationality_1_iso_3166_1_alpha_2'], how='left')
df_people_concat.loc[df_people_concat.gender.isna(), 'gender'] = df_people_concat['gender_name']

In [35]:
male_names = individual_names[individual_names.gender_name == 'm']['split_first_names']
female_names = individual_names[individual_names.gender_name == 'f']['split_first_names']

# Compile regex patterns once (efficient)
male_pattern = re.compile(r'\b(' + '|'.join(map(re.escape, male_names)) + r')\b')
female_pattern = re.compile(r'\b(' + '|'.join(map(re.escape, female_names)) + r')\b')

# Mask for rows with gender NaN
mask = df_people_concat['gender'].isna()

# Apply patterns efficiently using pandas string methods
df_people_concat.loc[
    mask & df_people_concat['first_name_norm'].str.contains(male_pattern) &
    ~df_people_concat['first_name_norm'].str.contains(female_pattern),
    'gender'
] = 'm'

df_people_concat.loc[
    mask & df_people_concat['first_name_norm'].str.contains(female_pattern) &
    ~df_people_concat['first_name_norm'].str.contains(male_pattern),
    'gender'
] = 'f'

  mask & df_people_concat['first_name_norm'].str.contains(male_pattern) &
  ~df_people_concat['first_name_norm'].str.contains(female_pattern),
  mask & df_people_concat['first_name_norm'].str.contains(female_pattern) &
  ~df_people_concat['first_name_norm'].str.contains(male_pattern),


In [36]:
print(f'Missing Gender: {len(df_people_concat[df_people_concat.gender.isna()])}')

Missing Gender: 23781


In [37]:
df_people_concat['founders'] = [int('status.neu' in codes) for codes in df_people_concat['codes']]

In [38]:
def ensure_dataframe_columns(df: pd.DataFrame) -> pd.DataFrame:
    required_columns = [
        "ehraid", "shab_date", "shab_id", "keyword", "first_name", "last_name",
        "job_title", "signing_rights", "shares",
        "hometown_1", "hometown_2", "hometown_3", "hometown_4", "hometown_5",
        "place_of_residence_1", "place_of_residence_2",
        "nationality_1_iso_3166_1_alpha_2", "nationality_2_iso_3166_1_alpha_2", "nationality_3_iso_3166_1_alpha_2",
        "gender", "founders"
    ]

    # Add missing columns with NaN values
    for col in required_columns:
        if col not in df.columns:
            df[col] = pd.NA

    # Ensure correct column order
    df = df[required_columns]

    return df

In [39]:
df_people_concat = ensure_dataframe_columns(df_people_concat)

In [40]:
df_firms_concat = df_firms_concat.drop(columns=['message_raw', 'codes'], errors='ignore')

In [41]:
df_people_concat.head()

Unnamed: 0,ehraid,shab_date,shab_id,keyword,first_name,last_name,job_title,signing_rights,shares,hometown_1,...,hometown_3,hometown_4,hometown_5,place_of_residence_1,place_of_residence_2,nationality_1_iso_3166_1_alpha_2,nationality_2_iso_3166_1_alpha_2,nationality_3_iso_3166_1_alpha_2,gender,founders
0,2,2019-09-06,1004711015,ausgeschiedene personen und erloschene untersc...,Philippe,Nappez,Mitglied des Verwaltungsrates,mit Kollektivunterschrift zu zweien,,Grandfontaine,...,,,,Binningen,,CH,,,m,0
1,2,2019-09-06,1004711015,eingetragene personen neu oder mutierend,Daniel,Ebneter,Mitglied des Verwaltungsrates,mit Kollektivunterschrift zu zweien,,Häggenschwil,...,,,,Rheinfelden,,CH,,,m,0
2,2,2020-02-07,1004825123,eingetragene personen neu oder mutierend,Gabriella,Karger Travella,Mitglied des Verwaltungsrates,mit Kollektivunterschrift zu zweien,,Basel,...,,,,Basel,,CH,,,f,0
3,2,2020-02-07,1004825123,eingetragene personen neu oder mutierend,Julien,Orsini,Liquidator,mit Einzelunterschrift,,Basel,...,,,,Reinach ( BL ),,CH,,,m,0
4,15,2022-04-19,1005453034,eingetragene personen neu oder mutierend,Urs,Antener,Präsident des Verwaltungsrates,mit Einzelunterschrift,,Eggiwil,...,,,,Sarnen,,CH,,,m,0


In [42]:
INIT_HISTORY_INSCRIBED_PEOPLE = f"""CREATE TABLE IF NOT EXISTS zefix.history_inscribed_people (
                ehraid INT,
                shab_date DATE,
                shab_id INT,
                keyword TEXT,
                first_name TEXT,
                last_name TEXT,
                job_title TEXT,
                signing_rights TEXT,
                shares TEXT,
                hometown_1 TEXT,
                hometown_2 TEXT,
                hometown_3 TEXT,
                hometown_4 TEXT,
                hometown_5 TEXT,
                place_of_residence_1 TEXT,
                place_of_residence_2 TEXT,
                nationality_1_iso_3166_1_alpha_2 TEXT,
                nationality_2_iso_3166_1_alpha_2 TEXT,
                nationality_3_iso_3166_1_alpha_2 TEXT,
                gender TEXT,
                founders BOOLEAN
            )"""

INIT_HISTORY_INSCRIBED_FIRMS = f"""CREATE TABLE IF NOT EXISTS zefix.history_inscribed_firms (
                ehraid INT,
                shab_date DATE,
                shab_id INT,
                keyword TEXT,
                firm_name TEXT,
                firm_uid TEXT,
                firm_seat TEXT,
                firm_type TEXT,
                firm_shares TEXT
            )"""

In [43]:
name2table = {
    'history_inscribed_people': (INIT_HISTORY_INSCRIBED_PEOPLE, df_people_concat),
    'history_inscribed_firms': (INIT_HISTORY_INSCRIBED_FIRMS, df_firms_concat),
}
for table_name, (query, df) in name2table.items():
    with connect_database() as con:
        con.execute(query)
        save_to_database(con, df, table_name, 'zefix')

In [39]:
# df_people_concat.to_csv(PROCESSED_DATA_DIR / f'people_and_firms_{LANGUAGE}.csv', index=False)

# CREATE HISTORICAL PURPOSE FRAME

In [40]:
purpose_df = shab_merged[shab_merged.category == 'purpose'].copy()

In [41]:
purpose_lists = [l for l in purpose_df.text_slices if l and len(l) > 1]
assert len(purpose_lists) == 0

In [42]:
purpose_df['purpose_raw'] = [' . '.join(purpose) if purpose else '' for purpose in purpose_df['text_slices']]

In [43]:
purpose_df

Unnamed: 0,ehraid,shab_id,shab_date,registry_office_canton,message_raw,codes,keyword,text_slices,parsed_variables,category,purpose_raw
107,54,1005740573,2023-05-08,SO,"Aare Finanz- und Holding-AG, in Olten, CHE-102...","[zweckaenderung, kapitalaenderung, kapitalaend...",zweck neu,[Übernahme und Verwaltung von Beteiligungen so...,{},purpose,Übernahme und Verwaltung von Beteiligungen sow...
110,55,1005420418,2022-03-04,AG,"Aaraucar AG, in Aarau, CHE-101.892.069, Aktien...",[zweckaenderung],zweck neu,"[Zweck der Gesellschaft ist der Erwerb , die V...",{},purpose,"Zweck der Gesellschaft ist der Erwerb , die Ve..."
117,57,1005770142,2023-06-16,BE,"GASSER CERAMIC AG, in Rapperswil (BE), CHE-102...","[zweckaenderung, aenderungkapitalband]",zweck neu,[Die Gesellschaft bezweckt das Halten und Verw...,{},purpose,Die Gesellschaft bezweckt das Halten und Verwa...
161,72,1004882905,2020-05-05,BE,"Aare-Kies AG, in Kirchdorf (BE), CHE-104.079.6...","[zweckaenderung, aenderungorgane]",zweck neu,"[Die Gesellschaft bezweckt die Gewinnung , Auf...",{},purpose,"Die Gesellschaft bezweckt die Gewinnung , Aufb..."
169,74,1004675877,2019-07-15,AG,"Aare-Taxi AG, in Brugg, CHE-107.123.578, Aktie...","[zweckaenderung, aenderungorgane]",zweck neu,[Führen eines Taxigeschäftes mit Festangestell...,{},purpose,Führen eines Taxigeschäftes mit Festangestellt...
...,...,...,...,...,...,...,...,...,...,...,...
5274311,1682040,1006272424,2025-03-04,LU,"Hooshyarsangari, in Luzern, CHE-455.021.858, L...","[status, status.neu]",zweck neu,[Durchführung von Umzügen und Reinigungsarbeit...,{},purpose,Durchführung von Umzügen und Reinigungsarbeite...
5274319,1682041,1006272425,2025-03-04,LU,"LA Capital AG, in Beromünster, CHE-343.494.505...","[status, status.neu]",zweck neu,"[Die Gesellschaft bezweckt den Erwerb , das Ha...",{},purpose,"Die Gesellschaft bezweckt den Erwerb , das Hal..."
5274325,1682042,1006272426,2025-03-04,LU,"Mentor Gerüst GmbH, in Wauwil, CHE-483.874.731...","[status, status.neu]",zweck neu,"[Die Gesellschaft bezweckt die Vermietung , Mo...",{},purpose,"Die Gesellschaft bezweckt die Vermietung , Mon..."
5274328,1682043,1006272427,2025-03-04,LU,"REINIGUNG - SARACENO, in Rickenbach (LU), CHE-...","[status, status.neu]",zweck neu,[Reinigung von Privatwohnungen ; gewerbliche B...,{},purpose,Reinigung von Privatwohnungen ; gewerbliche Bü...


In [44]:
purpose_df['branch'] = [int(any(w in keyword for w in ['succursale', 'zweigniederlassung'])) for keyword in purpose_df['keyword']]
purpose_df['main_seat'] = [int(any(w in keyword for w in ['principal', 'hauptsitz'])) for keyword in purpose_df['keyword']]
purpose_df['founding_purpose'] = [int('status.neu' in codes) if isinstance(codes, list) else 0 for codes in purpose_df['codes']]

In [45]:
purpose_df.keyword.unique()

array(['zweck neu', 'zweck hauptsitz neu', 'zweck zweigniederlassung neu'],
      dtype=object)

In [46]:
purpose_df = purpose_df.drop(columns=['registry_office_canton', 'message_raw', 'codes', 'parsed_variables', 'category', 'text_slices'])

In [47]:
INIT_HISTORY_PURPOSE = f"""
    CREATE TABLE IF NOT EXISTS zefix.history_purpose (
    ehraid INT,
    shab_date DATE,
    shab_id INT,
    keyword TEXT,
    purpose_raw TEXT,
    branch BOOLEAN,
    main_seat BOOLEAN,
    founding_purpose BOOLEAN
);"""

name2table = {
    'history_purpose': (INIT_HISTORY_PURPOSE, purpose_df),
}
for table_name, (query, df) in name2table.items():
    with connect_database() as con:
        con.execute(query)
        save_to_database(con, df, table_name, 'zefix')

# PROCESS HISTORICAL FIRM CHANGES

In [48]:
firm_changes_df = shab_merged[shab_merged.category == 'firm and address changes'].copy()

In [49]:
codes = [codes for codes in firm_changes_df.codes if isinstance(codes, list)]

In [50]:
{c for code in codes for c in code}

{'adressaenderung',
 'aenderungkapitalband',
 'aenderungorgane',
 'aenderunguid',
 'firmenaenderung',
 'fusion',
 'kapitalaenderung',
 'kapitalaenderung.libriert',
 'kapitalaenderung.nominell',
 'kapitalaenderung.stueckelung',
 'rechtsformaenderung',
 'spaltung',
 'status',
 'status.aufl',
 'status.aufl.konk',
 'status.aufl.liq',
 'status.loeschung',
 'status.neu',
 'status.wiedereintrag',
 'status.wiederrufliq',
 'vermoegenstransfer',
 'zweckaenderung'}

In [51]:
mapping = {
    'de': [
        ['firma neu'],
        ['adresse neu', 'weitere adressen', 'weitere adressen gestrichen', 'liquidationsadresse', 'postadresse neu'],
        ['zweigniederlassung neu', 'zweigniederlassung gestrichen'],
    ],
    'fr': [
        ['nouvelle raison sociale'],
        ['nouvelle adresse', 'autres adresses', 'autres adresses radiées', 'adresse de liquidation', 'nouvelle adresse postale'],
        ['nouvelle succursale', 'succursale radiée'],
    ],
    'it': [
        ['nuova ditta'],
        ['nuovo recapito', 'altri indirizzi', 'indirizzo della liquidazione', 'nuovo indirizzo postale'],
        ['nuova succursale', 'succursale radiata'],
    ]
}
firm_name_df = firm_changes_df[firm_changes_df['keyword'].isin(mapping[LANGUAGE][0])].copy()
firm_address_df = firm_changes_df[firm_changes_df['keyword'].isin(mapping[LANGUAGE][1])].copy()
branches_df = firm_changes_df[firm_changes_df['keyword'].isin(mapping[LANGUAGE][2])].copy()

### Prepare dissolution table

In [52]:
dissolution_df = shab_merged[shab_merged.codes.apply(lambda x: ('status.aufl' in x) if isinstance(x, list) else False)]

ehraids = []
shab_dates = []
shab_ids = []
keywords = []
codes_list = []
reasons_for_dissolution_list = []

lang2dissolution = {
    'de': ['gerichtspräsidium', 'einzelgericht', 'zivilgericht', 'kreisgericht', 'bezirksgericht', 'kantonsgericht', 'gemäss verfügung', 'einzelrichter', 'konkursrichter', 'konkursverfahren', 'auflösung', 'aufgelöst', 'aufgehoben', 'konkurs eröffnet', 'gestorben', 'verstorben'],
    'fr': ['tribunal', 'assemblée générale', 'assemblée des associés', 'juge unique compétent', 'cour de justice', 'selon décision des associés', 'déclarée dissoute', 'par décision', 'selon décision', 'est dissoute', 'déclarée dissoute', 'la dissolution', 'mort', 'décédé'],
    'it': ['tribunale', 'pretura del distretto', 'assemblea generale', 'assemblea dei soci', 'assemblea sociale', 'autorità federale', 'ordinata la liquidazione', 'è sciolta', 'è dichiarata sciolta', 'con decisione', 'con decreto', 'morto', 'deceduto']
}

# 'è sciolta', 'con decreto', 'con decisione', "sciolta d'ufficio", 
for i, row in dissolution_df.iterrows():
    shab_id = row['shab_id']
    ehraid = row['ehraid']
    shab_date = row['shab_date']
    keyword = row['keyword']
    codes = row['codes']

    text_slices = row['text_slices']
    reasons_for_dissolution = []
    for slice in text_slices:
        for s in slice.split(' . '):
            words = lang2dissolution[LANGUAGE]
            if any(word in s.lower() for word in words):
                reasons_for_dissolution.append(s)
    if len(reasons_for_dissolution) > 0:
        reasons_for_dissolution_list.append(reasons_for_dissolution)
        shab_ids.append(shab_id)
        ehraids.append(ehraid)
        shab_dates.append(shab_date)
        keywords.append(keyword)
        codes_list.append(codes)

return_df = pd.DataFrame({
    'ehraid': ehraids,
    'shab_date': shab_dates,
    'shab_id': shab_ids,
    'keyword': keywords,
    'codes': codes_list,
    'reason_for_dissolution': reasons_for_dissolution_list
})
missing_df = dissolution_df[~dissolution_df.shab_id.isin(return_df.shab_id)][['ehraid', 'shab_date', 'shab_id', 'codes']].drop_duplicates(subset=['shab_id'], keep='first')
missing_df['keyword'] = ''
missing_df['reason_for_dissolution'] = '[]'
missing_df['reason_for_dissolution'] = missing_df['reason_for_dissolution'].apply(ast.literal_eval)

return_df = pd.concat([return_df, missing_df])
return_df['reason_for_dissolution'] = return_df['reason_for_dissolution'].apply(lambda x: '{' + ','.join(map(str, x)) + '}' if x else '{}')
return_df['liquidation'] = [int('status.aufl.liq' in codes) if isinstance(codes, list) else False for codes in return_df['codes']]
return_df['bankruptcy'] = [int('status.aufl.konk' in codes) if isinstance(codes, list) else False for codes in return_df['codes']]
return_df = return_df.drop(columns=['codes'])
return_df.head()

Unnamed: 0,ehraid,shab_date,shab_id,keyword,reason_for_dissolution,liquidation,bankruptcy
0,2,2020-02-07,1004825123,uebersetzungen der firma neu,{Die Gesellschaft ist mit Beschluss der Genera...,1,0
1,38,2020-08-10,1004954893,firma neu,{Die Gesellschaft ist mit Beschluss der Genera...,1,0
2,93,2017-11-16,3873587,adresse neu,{Die Genossenschaft ist mit Beschluss der Gene...,1,0
3,140,2024-04-12,1006007698,liquidationsadresse,{Die Gesellschaft ist mit Beschluss der Genera...,1,0
4,185,2016-02-17,2672921,firma neu,{Mit Verfügung des Gerichtspräsidiums Rheinfel...,0,1


In [53]:
INIT_HISTORY_DISSOLUTIONS = f"""
    CREATE TABLE IF NOT EXISTS zefix.history_dissolutions (
    ehraid INT,
    shab_date DATE,
    shab_id INT,
    keyword TEXT,
    reason_for_dissolution TEXT[],
    liquidation BOOLEAN,
    bankruptcy BOOLEAN
);"""

name2table = {
    'history_dissolutions': (INIT_HISTORY_DISSOLUTIONS, return_df),
}
for table_name, (query, df) in name2table.items():
    with connect_database() as con:
        con.execute(query)
        save_to_database(con, df, table_name, 'zefix')

### Prepare firm name changes table

In [54]:
firm_name_df['firm_name'] = firm_name_df['parsed_variables'].apply(lambda x: x.get('firm_name', [''])[0])
firm_name_df = firm_name_df.drop(columns=['registry_office_canton', 'message_raw', 'codes', 'parsed_variables', 'category', 'text_slices'])
firm_name_df.head()

Unnamed: 0,ehraid,shab_id,shab_date,keyword,firm_name
8,2,1004825123,2020-02-07,firma neu,AA-Annoncen Agentur AG in Liquidation
79,38,1004954893,2020-08-10,firma neu,AAFC Financial Consult Ltd
178,77,1005402464,2022-02-10,firma neu,ATG Aare Touring Garage AG
193,83,1006046071,2024-06-03,firma neu,Aareschlucht AG
214,93,3873587,2017-11-16,firma neu,Genossenschaft PRO BON AARGAU in Liq


In [55]:
INIT_HISTORY_FIRM_NAMES = f"""
    CREATE TABLE IF NOT EXISTS zefix.history_firm_names (
    ehraid INT,
    shab_date DATE,
    shab_id INT,
    keyword TEXT,
    firm_name TEXT
);"""

name2table = {
    'history_firm_names': (INIT_HISTORY_FIRM_NAMES, firm_name_df),
}
for table_name, (query, df) in name2table.items():
    with connect_database() as con:
        con.execute(query)
        save_to_database(con, df, table_name, 'zefix')

### Prepare firm address changes table

In [56]:
shab_ids = []
keywords = []
ehraids = []
shab_dates = []

care_ofs = []
streets = []
postal_codes = []
towns = []

new = []
until_now = []
deleted = []

for i, row in firm_address_df.iterrows():
    shab_id = row['shab_id']
    keyword = row['keyword']
    ehraid = row['ehraid']
    shab_date = row['shab_date']
    
    parsed_variables = row['parsed_variables']
    address_new = parsed_variables.get('addresses_new', [])
    address_until = parsed_variables.get('addresses_until_now', [])
    address_deleted = parsed_variables.get('addresses_deleted', [])

    for address in address_new:

        shab_ids.append(shab_id)
        keywords.append(keyword)
        ehraids.append(ehraid)
        shab_dates.append(shab_date)

        care_ofs.append(address.get('care_of', ''))
        streets.append(address.get('street', ''))
        postal_codes.append(address.get('postal_code', -1))
        towns.append(address.get('town', ''))
        new.append(1)
        until_now.append(0)
        deleted.append(0)

    for address in address_until:

        shab_ids.append(shab_id)
        keywords.append(keyword)
        ehraids.append(ehraid)
        shab_dates.append(shab_date)

        care_ofs.append(address.get('care_of', ''))
        streets.append(address.get('street', ''))
        postal_codes.append(address.get('postal_code', -1))
        towns.append(address.get('town', ''))
        new.append(0)
        until_now.append(1)
        deleted.append(0)

    for address in address_deleted:

        shab_ids.append(shab_id)
        keywords.append(keyword)
        ehraids.append(ehraid)
        shab_dates.append(shab_date)

        care_ofs.append(address.get('care_of', ''))
        streets.append(address.get('street', ''))
        postal_codes.append(address.get('postal_code', -1))
        towns.append(address.get('town', ''))
        new.append(0)
        until_now.append(0)
        deleted.append(1)

firm_address_df = pd.DataFrame({
    'ehraid': ehraid,
    'shab_id': shab_ids,
    'shab_date': shab_dates,
    'keyword': keywords,
    'care_of': care_ofs,
    'street': streets,
    'postal_code': postal_codes,
    'town': towns,
    'new': new,
    'until_now': until_now,
    'deleted': deleted
})
firm_address_df.head()

Unnamed: 0,ehraid,shab_id,shab_date,keyword,care_of,street,postal_code,town,new,until_now,deleted
0,1681790,1005867319,2023-10-24,weitere adressen,,Postfach 1589,8027,Zürich,0,0,1
1,1681790,1005249079,2021-07-14,adresse neu,,Tellistrasse 114,5000,Aarau,1,0,0
2,1681790,1005249078,2021-07-14,adresse neu,,Tellistrasse 114,5000,Aarau,1,0,0
3,1681790,3533349,2017-05-19,adresse neu,,Hafenstrasse 50 D,8280,Kreuzlingen,1,0,0
4,1681790,3540613,2017-05-24,adresse neu,,Sigelwiesstrasse 21,8451,Kleinandelfingen,1,0,0


In [57]:
INIT_HISTORY_FIRM_ADDRESSES = f"""
    CREATE TABLE IF NOT EXISTS zefix.history_firm_addresses (
    ehraid INT,
    shab_date DATE,
    shab_id INT,
    keyword TEXT,
    care_of TEXT,
    street TEXT,
    postal_code INT,
    town TEXT,
    new BOOLEAN,
    until_now BOOLEAN,
    deleted BOOLEAN
);"""

name2table = {
    'history_firm_addresses': (INIT_HISTORY_FIRM_ADDRESSES, firm_address_df),
}
for table_name, (query, df) in name2table.items():
    with connect_database() as con:
        con.execute(query)
        save_to_database(con, df, table_name, 'zefix')

### Prepare firm branches table

In [58]:
shab_ids = []
keywords = []
ehraids = []
shab_dates = []

locations = []
ids = []

new = []
until_now = []
deleted = []

for i, row in branches_df.iterrows():
    shab_id = row['shab_id']
    keyword = row['keyword']
    ehraid = row['ehraid']
    shab_date = row['shab_date']
    
    parsed_variables = row['parsed_variables']
    branches_new = parsed_variables.get('branches_new', [])
    branches_until = parsed_variables.get('branches_until_now', [])
    branches_deleted = parsed_variables.get('branches_deleted', [])

    for branch in branches_new:

        shab_ids.append(shab_id)
        keywords.append(keyword)
        ehraids.append(ehraid)
        shab_dates.append(shab_date)

        locations.append(branch.get('location', ''))
        ids.append(branch.get('id', ''))

        new.append(1)
        until_now.append(0)
        deleted.append(0)

    for branch in branches_until:

        shab_ids.append(shab_id)
        keywords.append(keyword)
        ehraids.append(ehraid)
        shab_dates.append(shab_date)

        locations.append(branch.get('location', ''))
        ids.append(branch.get('id', ''))

        new.append(0)
        until_now.append(1)
        deleted.append(0)

    for branch in branches_deleted:

        shab_ids.append(shab_id)
        keywords.append(keyword)
        ehraids.append(ehraid)
        shab_dates.append(shab_date)

        locations.append(branch.get('location', ''))
        ids.append(branch.get('id', ''))

        new.append(0)
        until_now.append(0)
        deleted.append(1)

branches_df = pd.DataFrame({
    'ehraid': ehraid,
    'shab_id': shab_ids,
    'shab_date': shab_dates,
    'keyword': keywords,
    'location': locations,
    'id': ids,
    'new': new,
    'until_now': until_now,
    'deleted': deleted
})
branches_df.loc[branches_df.id.str.contains(r'radiat|radié|gestrichen'), 'id'] = ''
branches_df.head()

Unnamed: 0,ehraid,shab_id,shab_date,keyword,location,id,new,until_now,deleted
0,1680346,1004515888,2018-12-10,zweigniederlassung neu,Frauenfeld,CHE-443.103.739,0,0,1
1,1680346,1004745179,2019-10-25,zweigniederlassung neu,Schwyz,,0,0,1
2,1680346,1004745179,2019-10-25,zweigniederlassung neu,Zürich,,0,0,1
3,1680346,1005473054,2022-05-13,zweigniederlassung neu,Affoltern am Albis,CHE-413.995.453,1,0,0
4,1680346,1006068778,2024-06-27,zweigniederlassung neu,Affoltern am Albis,CHE-413.995.453,0,0,1


In [59]:
INIT_HISTORY_BRANCHES = f"""
    CREATE TABLE IF NOT EXISTS zefix.history_branches (
    ehraid INT,
    shab_date DATE,
    shab_id INT,
    keyword TEXT,
    location TEXT,
    id TEXT,
    new BOOLEAN,
    until_now BOOLEAN,
    deleted BOOLEAN
);"""

name2table = {
    'history_branches': (INIT_HISTORY_BRANCHES, branches_df),
}
for table_name, (query, df) in name2table.items():
    with connect_database() as con:
        con.execute(query)
        save_to_database(con, df, table_name, 'zefix')

# PROCESS CAPITAL CHANGES

In [60]:
import numpy as np

In [61]:
capital_changes = shab_merged[shab_merged.category == 'capital and legal changes'].copy()

In [62]:
# Correct wrong values like '446.001.000.00'
def correct_number(number_str: str) -> str:
    parts = number_str.rsplit('.', 1)
    return parts[0].replace('.', '') + '.' + parts[1] if len(parts) > 1 else parts[0].replace('.', '')

In [63]:
shab_ids = [[], []]
keywords = [[], []]
ehraids = [[], []]
shab_dates = [[], []]

capital_new = []
capital_until_now = []
num_shares_new = []
val_shares_new = []
typ_shares_new = []

for i, row in capital_changes.iterrows():
    shab_id = row['shab_id']
    keyword = row['keyword']
    ehraid = row['ehraid']
    shab_date = row['shab_date']

    parsed_variables = row['parsed_variables']
    cap_new = parsed_variables.get('capital_new', [])
    cap_unt = parsed_variables.get('capital_until_now', [])
    srs_new = parsed_variables.get('shares_new', [])

    if cap_new or cap_unt:
        capital_new.append(cap_new[0] if len(cap_new) > 0 else None)
        capital_until_now.append(cap_unt[0] if len(cap_unt) > 0 else None)
        shab_ids[0].append(shab_id)
        keywords[0].append(keyword)
        ehraids[0].append(ehraid)
        shab_dates[0].append(shab_date)
    if srs_new:
        for s in srs_new:
            num_shares_new.append(s.get('number'))
            val_shares_new.append(s.get('value'))
            typ_shares_new.append(s.get('type'))
            shab_ids[1].append(shab_id)
            keywords[1].append(keyword)
            ehraids[1].append(ehraid)
            shab_dates[1].append(shab_date)

In [64]:
cap_new = pd.DataFrame({
    'ehraid': ehraids[0],
    'shab_date': shab_dates[0],
    'shab_id': shab_ids[0],
    'keyword': keywords[0],
    'capital_new': capital_new,
    'capital_until_now': capital_until_now})

cap_new['capital_new'] = cap_new['capital_new'].fillna('').str.replace("'", "", regex=False)
cap_new['capital_until_now'] = cap_new['capital_until_now'].fillna('').str.replace("'", "", regex=False)

# Extract Währung
cap_new['currency_new'] = cap_new['capital_new'].str.extract(r'^([^\d\s]+)')
cap_new['currency_new'] = cap_new['currency_new'].fillna('')

cap_new['currency_until_now'] = cap_new['capital_until_now'].str.extract(r'^([^\d\s]+)')
cap_new['currency_until_now'] = cap_new['currency_until_now'].fillna('')

# Extract Kapital
cap_new['capital_new'] = cap_new['capital_new'].str.extract(r'([\d.,]+)').astype(str)
cap_new['capital_until_now'] = cap_new['capital_until_now'].str.extract(r'([\d.,]+)').astype(str)

# Apply correction to remove unneccessary punctuations
cap_new['capital_new'] = cap_new['capital_new'].apply(correct_number)
cap_new['capital_until_now'] = cap_new['capital_until_now'].apply(correct_number)

cap_new['capital_new'] = [v.replace(',', '.') if not '.' in v else v.replace(',', '') for v in cap_new['capital_new']]
cap_new.loc[cap_new['capital_new'] == '.', 'capital_new'] = np.nan

cap_new['capital_until_now'] = [v.replace(',', '.') if not '.' in v else v.replace(',', '') for v in cap_new['capital_until_now']]
cap_new.loc[cap_new['capital_until_now'] == '.', 'capital_until_now'] = np.nan

# Ensure correct types
cap_new['capital_new'] = cap_new['capital_new'].astype(float)
cap_new['capital_until_now'] = cap_new['capital_until_now'].astype(float)

In [65]:
stocks_new = pd.DataFrame({
    'ehraid': ehraids[1],
    'shab_date': shab_dates[1],
    'shab_id': shab_ids[1],
    'keyword': keywords[1],
    'num_shares_new': num_shares_new,
    'val_shares_new': val_shares_new,
    'typ_shares_new': typ_shares_new,})

stocks_new['val_shares_new'] = stocks_new['val_shares_new'].fillna('').str.replace("'", "", regex=False)

# Extract Währung
stocks_new['currency_shares_new'] = stocks_new['val_shares_new'].str.extract(r'^([^\d\s]+)')
stocks_new['currency_shares_new'] = stocks_new['currency_shares_new'].fillna('')

# Extract number of Stocks, etc.
stocks_new['val_shares_new'] = stocks_new['val_shares_new'].str.extract(r'([\d.,]+)').astype(str)

# Apply correction to remove unneccessary punctuations
stocks_new['val_shares_new'] = stocks_new['val_shares_new'].apply(correct_number)
stocks_new['val_shares_new'] = [v.replace(',', '.') if not '.' in v else v.replace(',', '') for v in stocks_new['val_shares_new']]
stocks_new.loc[stocks_new['val_shares_new'] == '.', 'val_shares_new'] = np.nan

# Ensure correct types
stocks_new['val_shares_new'] = stocks_new['val_shares_new'].astype(float)
stocks_new['num_shares_new'] = stocks_new['num_shares_new'].astype(int)

In [66]:
# Calculate total value of the capital by multiplying the number of shares with their individual value
stocks_new['value_total'] = stocks_new['num_shares_new'] * stocks_new['val_shares_new']

# Calculate new capital for shab ids where kapital_neu variable is not given, but scheine_neu is
missing_ids = set(stocks_new.shab_id).difference(set(cap_new.shab_id))
stocks_new_missing = stocks_new[stocks_new.shab_id.isin(missing_ids)]

cap_new_missing = stocks_new_missing.groupby(['ehraid', 'shab_date', 'shab_id', 'keyword']).agg(
    capital_new=pd.NamedAgg(column='value_total', aggfunc='sum'),
    currency_new=pd.NamedAgg(column='currency_shares_new', aggfunc=lambda x: list(set([currency for currency in x if currency != ''])))).reset_index()

In [67]:
# Check if there are mixed currencies
assert len([cur_set for cur_set in cap_new_missing['currency_new'] if len(cur_set) > 1]) == 0

cap_new_missing['currency_new'] = [v[0] if len(v) > 0 else '' for v in cap_new_missing['currency_new']]
cap_new_missing['currency_until_now'] = ''
cap_new_missing['capital_until_now'] = np.nan


In [68]:
# Combine the two dataframes to get all capital changes
cap_new_concat = pd.concat([cap_new, cap_new_missing])

In [69]:
cap_new_concat.head()

Unnamed: 0,ehraid,shab_date,shab_id,keyword,capital_new,capital_until_now,currency_new,currency_until_now
0,54,2023-05-08,1005740573,aktienkapital neu,1350000.0,,CHF,
1,54,2023-05-08,1005740573,liberierung aktienkapital neu,1350000.0,,CHF,
2,149,2020-09-11,1004976547,aktienkapital neu,100000.0,,CHF,
3,149,2020-09-11,1004976547,liberierung aktienkapital neu,100000.0,,CHF,
4,158,2019-08-20,1004698965,aktienkapital neu,150000.0,,CHF,


In [70]:
INIT_HISTORY_REGISTERED_CAPITAL = f"""
    CREATE TABLE IF NOT EXISTS zefix.history_registered_capital (
    ehraid INT,
    shab_date DATE,
    shab_id INT,
    keyword TEXT,
    capital_new NUMERIC,
    capital_until_now NUMERIC,
    currency_new TEXT,
    currency_until_now TEXT
)"""

name2table = {
    'history_registered_capital': (INIT_HISTORY_REGISTERED_CAPITAL, cap_new_concat),
}
for table_name, (query, df) in name2table.items():
    with connect_database() as con:
        con.execute(query)
        save_to_database(con, df, table_name, 'zefix')

In [71]:
# cap_new_concat.to_csv(EXTERNAL_DATA_DIR / 'capital_changes.csv', index=False)

# PROCESS MERGERS AND ACQUISIITONS

In [72]:
mergers_and_acquisitions = shab_merged[shab_merged.category == 'mergers and separations'].copy()

In [73]:
mergers_and_acquisitions

Unnamed: 0,ehraid,shab_id,shab_date,registry_office_canton,message_raw,codes,keyword,text_slices,parsed_variables,category
19,11,1004788634,2019-12-19,VS,"AA'S. AG, in Zermatt, CHE-106.369.697, Aktieng...","[aenderungorgane, fusion]",fusion,[Übernahme der Aktiven und Passiven der AA 'S....,"{'balance_sheet_date': ['30.06.2019'], 'firms_...",mergers and separations
98,54,1005740573,2023-05-08,SO,"Aare Finanz- und Holding-AG, in Olten, CHE-102...","[zweckaenderung, kapitalaenderung, kapitalaend...",abspaltung,[Ein Teil der Aktiven und Passiven geht gemäss...,{},mergers and separations
203,87,2693405,2016-02-29,AG,"Aarfim AG, in Aarburg, CHE-102.588.119, Aktien...","[adressaenderung, aenderungorgane, fusion]",fusion,[Übernahme der Aktiven und Passiven der Mininv...,"{'balance_sheet_date': ['31.12.2015'], 'firms_...",mergers and separations
281,133,1004662690,2019-06-28,AG,"Aarolac AG Lack- und Farbenfabrik, in Oberentf...",[fusion],fusion,[Übernahme der Aktiven und Passiven der MMA Ho...,"{'balance_sheet_date': ['31.12.2018'], 'firms_...",mergers and separations
650,261,1005484711,2022-05-31,ZH,"ABB Asea Brown Boveri Ltd, in Zürich, CHE-106....",[fusion],fusion,[Übernahme der Aktiven und Passiven der ABB In...,"{'balance_sheet_date': ['31.12.2021'], 'firms_...",mergers and separations
...,...,...,...,...,...,...,...,...,...,...
5238855,1672008,1006216881,2024-12-27,SG,"mirame AG, in Wil (SG), CHE-395.572.325, c/o R...","[status, status.neu]",abspaltung,[Die Gesellschaft entsteht aus der Abspaltung ...,{},mergers and separations
5238950,1672023,1006216892,2024-12-27,SG,"Stephan Wenger AG, in Wil (SG), CHE-496.436.41...","[status, status.neu]",abspaltung,[Die Gesellschaft entsteht aus der Abspaltung ...,{},mergers and separations
5240219,1672396,1006218724,2024-12-30,GR,"Krol-Active Company GmbH, in Trun, CHE-471.878...","[status, status.neu]",abspaltung,[Die Gesellschaft entsteht aus der Abspaltung ...,{},mergers and separations
5248376,1674791,1006234147,2025-01-21,AG,"PARU Finanz Holding AG, in Neuenhof, CHE-147.4...","[status, status.neu]",abspaltung,[Die Gesellschaft entsteht aus der Abspaltung ...,{},mergers and separations


In [74]:
shab_ids = []
keywords = []
ehraids = []
shab_dates = []

contract_dates = []
balance_sheet_dates = []

firm_taken_over = []
location_taken_over = []
id_taken_over = []
assets_taken_over = []
liabilities_taken_over = []

for i, row in mergers_and_acquisitions.iterrows():
    shab_id = row['shab_id']
    keyword = row['keyword']
    shab_date = row['shab_date']
    ehraid = row['ehraid']

    parsed_variables = row['parsed_variables']

    if parsed_variables:
        contract_date = parsed_variables.get('contract_date', [])
        balance_sheet_date = parsed_variables.get('balance_sheet_date', [])
        firms_taken_over = parsed_variables.get('firms_taken_over', [])
        for firm in firms_taken_over:
            firm_name = firm.get('firm_name', '')
            location = firm.get('location', '')
            id = firm.get('id', '')
            capital_taken_over = firm.get('capital_taken_over', {})
            assets = capital_taken_over.get('assets', '') if capital_taken_over else ''
            liabilities = capital_taken_over.get('liabilities', '') if capital_taken_over else ''

            contract_dates.append(contract_date[0] if len(contract_date) > 0 else '')
            balance_sheet_dates.append(balance_sheet_date[0] if len(balance_sheet_date) > 0 else '')
            firm_taken_over.append(firm_name)
            location_taken_over.append(location)
            id_taken_over.append(id)
            assets_taken_over.append(assets)
            liabilities_taken_over.append(liabilities)
            shab_ids.append(shab_id)
            keywords.append(keyword)
            ehraids.append(ehraid)
            shab_dates.append(shab_date)
    else:
        firm_taken_over.append('')
        location_taken_over.append('')
        id_taken_over.append('')
        assets_taken_over.append('')
        liabilities_taken_over.append('')
        shab_ids.append(shab_id)
        keywords.append(keyword)
        ehraids.append(ehraid)
        shab_dates.append(shab_date)


In [75]:
assert len(shab_ids) == len(keywords) == len(firm_taken_over) == len(location_taken_over) == len(id_taken_over) == len(assets_taken_over) == len(liabilities_taken_over)

In [76]:
processed_mergers = pd.DataFrame({
    'ehraid': ehraids,
    'shab_date': shab_dates,
    'shab_id': shab_ids,
    'keyword': keywords,
    'firm_taken_over': firm_taken_over,
    'location_taken_over': location_taken_over,
    'id_taken_over': id_taken_over,
    'assets_taken_over': assets_taken_over,
    'liabilities_taken_over': liabilities_taken_over
})

processed_mergers['assets_taken_over'] = processed_mergers['assets_taken_over'].fillna('').str.replace("'", "", regex=False)
processed_mergers['liabilities_taken_over'] = processed_mergers['liabilities_taken_over'].fillna('').str.replace("'", "", regex=False)

# Extract Währung of Aktiven/Passiven
processed_mergers['currency_assets_taken_over'] = processed_mergers['assets_taken_over'].str.extract(r'^([^\d\s]+)')
processed_mergers['currency_assets_taken_over'] = processed_mergers['currency_assets_taken_over'].fillna('')

processed_mergers['currency_liabilities_taken_over'] = processed_mergers['liabilities_taken_over'].str.extract(r'^([^\d\s]+)')
processed_mergers['currency_liabilities_taken_over'] = processed_mergers['currency_liabilities_taken_over'].fillna('')

# Extract value of Aktiven/Passiven
processed_mergers['assets_taken_over'] = processed_mergers['assets_taken_over'].str.extract(r'([\d.,]+)').astype(str)

processed_mergers['liabilities_taken_over'] = processed_mergers['liabilities_taken_over'].str.extract(r'([\d.,]+)').astype(str)

# Apply correction to remove unneccessary punctuations
processed_mergers['assets_taken_over'] = processed_mergers['assets_taken_over'].apply(correct_number)
processed_mergers['assets_taken_over'] = [v.replace(',', '.') if not '.' in v else v.replace(',', '') for v in processed_mergers['assets_taken_over']]

processed_mergers['liabilities_taken_over'] = processed_mergers['liabilities_taken_over'].apply(correct_number)
processed_mergers['liabilities_taken_over'] = [v.replace(',', '.') if not '.' in v else v.replace(',', '') for v in processed_mergers['liabilities_taken_over']]

# Ensure correct types
processed_mergers['assets_taken_over'] = processed_mergers['assets_taken_over'].astype(float)
processed_mergers['liabilities_taken_over'] = processed_mergers['liabilities_taken_over'].astype(float)

In [77]:
processed_mergers.head()

Unnamed: 0,ehraid,shab_date,shab_id,keyword,firm_taken_over,location_taken_over,id_taken_over,assets_taken_over,liabilities_taken_over,currency_assets_taken_over,currency_liabilities_taken_over
0,11,2019-12-19,1004788634,fusion,AA 'S. Immobilien AG,Grächen,CHE-101.430.817,2847237.0,2269431.7,CHF,CHF
1,54,2023-05-08,1005740573,abspaltung,,,,,,,
2,87,2016-02-29,2693405,fusion,Mininvest AG,Aarburg,CHE-103.927.388,2510449.0,106923.0,CHF,CHF
3,133,2019-06-28,1004662690,fusion,MMA Holding AG,Oberentfelden,CHE-114.610.317,3495312.0,0.0,CHF,CHF
4,261,2022-05-31,1005484711,fusion,ABB Investment Holding 2 GmbH,Zürich,CHE-203.422.841,971946100.0,43080.0,CHF,CHF


In [78]:
INIT_HISTORY_MERGER = f"""
    CREATE TABLE IF NOT EXISTS zefix.history_merger (
    ehraid INT,
    shab_date DATE,
    shab_id INT,
    keyword TEXT,
    firm_taken_over TEXT,
    location_taken_over TEXT,
    id_taken_over TEXT,
    assets_taken_over NUMERIC,
    liabilities_taken_over NUMERIC,
    currency_assets_taken_over TEXT,
    currency_liabilities_taken_over TEXT
);"""

name2table = {
    'history_merger': (INIT_HISTORY_MERGER, processed_mergers),
}
for table_name, (query, df) in name2table.items():
    with connect_database() as con:
        con.execute(query)
        save_to_database(con, df, table_name, 'zefix')

In [79]:
# processed_mergers.to_csv(EXTERNAL_DATA_DIR / 'merger_sizes.csv', index=False)

# PROCESS LEGAL FORM CHANGES

In [80]:
legal_form_changes = shab_merged[shab_merged.category == 'capital and legal changes'].copy()

shab_ids = []
keywords = []
ehraids = []
shab_dates = []

legal_forms_new = []
legal_forms_until_now = []
legal_forms_deleted = []
conversion_assets = []
conversion_liabilities = []

for i, row in legal_form_changes.iterrows():
    shab_id = row['shab_id']
    keyword = row['keyword']
    shab_date = row['shab_date']
    ehraid = row['ehraid']

    parsed_variables = row['parsed_variables']

    if parsed_variables:
        legal_form_new = parsed_variables.get('legal_form_new', [])
        legal_form_until_now = parsed_variables.get('legal_form_until_now', [])
        legal_form_deleted = parsed_variables.get('legal_form_deleted', [])
        assets = parsed_variables.get('assets', [])
        liabilities = parsed_variables.get('liabilities', [])
        if legal_form_new or legal_form_until_now or legal_form_deleted:
            legal_forms_new.append(legal_form_new[0] if len(legal_form_new) > 0 else '')
            legal_forms_until_now.append(legal_form_until_now[0] if len(legal_form_until_now) > 0 else '')
            legal_forms_deleted.append(legal_form_deleted[0] if len(legal_form_deleted) > 0 else '')
            conversion_assets.append(assets[0] if len(assets) > 0 else '')
            conversion_liabilities.append(liabilities[0] if len(liabilities) > 0 else '')
            shab_ids.append(shab_id)
            keywords.append(keyword)
            ehraids.append(ehraid)
            shab_dates.append(shab_date)

return_df = pd.DataFrame({
    'ehraid': ehraids,
    'shab_date': shab_dates,
    'shab_id': shab_ids,
    'keyword': keywords,
    'legal_form_new': legal_forms_new,
    'legal_form_until_now': legal_forms_until_now,
    'legal_form_deleted': legal_forms_deleted,
    'assets': conversion_assets,
    'liabilities': conversion_liabilities
})

In [81]:
return_df

Unnamed: 0,ehraid,shab_date,shab_id,keyword,legal_form_new,legal_form_until_now,legal_form_deleted,assets,liabilities
0,181,2016-03-07,2706811,rechtsform hauptsitz neu,Aktiengesellschaft,,,,
1,308,2019-12-12,1004781481,rechtsform neu,Gesellschaft mit beschränkter Haftung,,,,
2,308,2019-12-12,1004781481,umwandlung,Gesellschaft mit beschränkter Haftung,,,CHF 499'207'508.00,CHF 169'631'206.00
3,7073,2018-11-26,1004505486,rechtsform neu,Aktiengesellschaft,,,,
4,7073,2018-11-26,1004505486,umwandlung,Aktiengesellschaft,Gesellschaft mit beschränkter Haftung,,CHF 3'692'400,CHF 1'569'412
...,...,...,...,...,...,...,...,...,...
18331,1681739,2025-03-03,1006271202,rechtsform hauptsitz neu,Aktiengesellschaft,,,,
18332,1681782,2025-03-03,1006271600,rechtsform hauptsitz neu,Aktiengesellschaft,,,,
18333,1681808,2025-03-03,1006271284,rechtsform hauptsitz neu,Aktiengesellschaft,,,,
18334,1681915,2025-03-04,1006272100,rechtsform hauptsitz neu,Aktiengesellschaft,,,,


In [82]:
return_df['assets'] = return_df['assets'].fillna('').str.replace("'", "", regex=False)
return_df['liabilities'] = return_df['liabilities'].fillna('').str.replace("'", "", regex=False)

# Extract Währung of Aktiven/Passiven
return_df['currency_assets'] = return_df['assets'].str.extract(r'^([^\d\s]+)')
return_df['currency_assets'] = return_df['currency_assets'].fillna('')

return_df['currency_liabilities'] = return_df['liabilities'].str.extract(r'^([^\d\s]+)')
return_df['currency_liabilities'] = return_df['currency_liabilities'].fillna('')

# Extract value of Aktiven/Passiven
return_df['assets'] = return_df['assets'].str.extract(r'([\d.,]+)').astype(str)

return_df['liabilities'] = return_df['liabilities'].str.extract(r'([\d.,]+)').astype(str)

# Apply correction to remove unneccessary punctuations
return_df['assets'] = return_df['assets'].apply(correct_number)
return_df['assets'] = [v.replace(',', '.') if not '.' in v else v.replace(',', '') for v in return_df['assets']]

return_df['liabilities'] = return_df['liabilities'].apply(correct_number)
return_df['liabilities'] = [v.replace(',', '.') if not '.' in v else v.replace(',', '') for v in return_df['liabilities']]

# Ensure correct types
return_df['assets'] = return_df['assets'].astype(float)
return_df['liabilities'] = return_df['liabilities'].astype(float)

In [83]:
return_df

Unnamed: 0,ehraid,shab_date,shab_id,keyword,legal_form_new,legal_form_until_now,legal_form_deleted,assets,liabilities,currency_assets,currency_liabilities
0,181,2016-03-07,2706811,rechtsform hauptsitz neu,Aktiengesellschaft,,,,,,
1,308,2019-12-12,1004781481,rechtsform neu,Gesellschaft mit beschränkter Haftung,,,,,,
2,308,2019-12-12,1004781481,umwandlung,Gesellschaft mit beschränkter Haftung,,,499207508.0,169631206.0,CHF,CHF
3,7073,2018-11-26,1004505486,rechtsform neu,Aktiengesellschaft,,,,,,
4,7073,2018-11-26,1004505486,umwandlung,Aktiengesellschaft,Gesellschaft mit beschränkter Haftung,,3692400.0,1569412.0,CHF,CHF
...,...,...,...,...,...,...,...,...,...,...,...
18331,1681739,2025-03-03,1006271202,rechtsform hauptsitz neu,Aktiengesellschaft,,,,,,
18332,1681782,2025-03-03,1006271600,rechtsform hauptsitz neu,Aktiengesellschaft,,,,,,
18333,1681808,2025-03-03,1006271284,rechtsform hauptsitz neu,Aktiengesellschaft,,,,,,
18334,1681915,2025-03-04,1006272100,rechtsform hauptsitz neu,Aktiengesellschaft,,,,,,


In [84]:
INIT_HISTORY_LEGAL_FORMS= f"""
    CREATE TABLE IF NOT EXISTS zefix.history_legal_forms (
    ehraid INT,
    shab_date DATE,
    shab_id INT,
    keyword TEXT,
    legal_form_new TEXT,
    legal_form_until_now TEXT,
    legal_form_deleted TEXT,
    assets NUMERIC,
    liabilities NUMERIC,
    currency_assets TEXT,
    currency_liabilities TEXT
);"""

name2table = {
    'history_legal_forms': (INIT_HISTORY_LEGAL_FORMS, return_df),
}
for table_name, (query, df) in name2table.items():
    with connect_database() as con:
        con.execute(query)
        save_to_database(con, df, table_name, 'zefix')