In [1]:
import ast
from collections import defaultdict
import pandas as pd
from pocketknife.database import connect_database, read_from_database
from config import EXTERNAL_DATA_DIR, PROCESSED_DATA_DIR

[32m2025-03-07 22:33:58.647[0m | [1mINFO    [0m | [36mconfig[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/manuelbolz/Documents/git/for_work/company_success_prediction[0m


# LOAD DATA

In [2]:
# db queries
query_shab = """
    SELECT
        ehraid,
        shab_id,
        shab_date,
        registry_office_canton,
        message AS message_raw
    FROM zefix.shab
"""

query_shab_mutation = """
    SELECT * 
    FROM zefix.shab_mutation
"""

In [3]:
with connect_database() as con:
    raw_shab = read_from_database(con, query_shab)
    raw_shab_mutation = read_from_database(con, query_shab_mutation)

In [4]:
LANGUAGE = 'it'

In [5]:
parsed_shab_messages = pd.read_csv(EXTERNAL_DATA_DIR / f'final_{LANGUAGE}.csv')

In [6]:
parsed_shab_messages['parsed_variables'] = parsed_shab_messages.parsed_variables.fillna('{}')
parsed_shab_messages['parsed_variables'] = parsed_shab_messages.parsed_variables.apply(ast.literal_eval)
parsed_shab_messages['parsed_variables'] = parsed_shab_messages.parsed_variables.fillna({})

In [7]:
parsed_shab_messages['text_slice'] = parsed_shab_messages['text_slice'].fillna('[]')
parsed_shab_messages['text_slices'] = parsed_shab_messages['text_slice'].apply(ast.literal_eval)
parsed_shab_messages = parsed_shab_messages.drop(columns=['text_slice'])
parsed_shab_messages.head()

Unnamed: 0,shab_id,keyword,parsed_variables,main_group,text_slices
0,2636071,BEGINNING,{},undetermined,"[BFC REI AG , Balzers , succursale di Mendrisi..."
1,2636071,capitale sociale della sede principale,"{'capital_new': ['CHF 50'000.00'], 'capital_un...",capital and legal changes,[CHF 50'000.00 diviso in 500 azioni nominative...
2,2636071,numero di identificazione della sede principale,"{'id_new': ['FL-0002.487.430-2'], 'id_until_no...",firm and address changes,[FL-0002.487.430-2]
3,2636071,nuova natura giuridica della sede principale,"{'legal_form_deleted': [''], 'legal_form_until...",capital and legal changes,[Società anonima di diritto lussemburghese . S...
4,2636071,nuove osservazioni della sede principale,{},undetermined,[Iscritta il 29.10.2014 al Fürstentum Liechten...


In [8]:
print(parsed_shab_messages['shab_id'].nunique())
print(raw_shab['shab_id'].nunique())

1647629
2464967


In [72]:
1625920
2457792

2457792

In [8]:
# Filter out the companies where the shab entries have been parsed
raw_shab = raw_shab[raw_shab.shab_id.isin(parsed_shab_messages['shab_id'].unique())].copy()
raw_shab_mutation = raw_shab_mutation[raw_shab_mutation.shab_id.isin(parsed_shab_messages['shab_id'].unique())].copy()
raw_shab_mutation_grouped = (
    raw_shab_mutation
    .groupby('shab_id')
    .agg(codes=pd.NamedAgg(column='description', aggfunc=lambda x: [v for v in x]))
    .reset_index()
)

In [9]:
# Merge all the dataframes
shab_merged = (
    raw_shab
    .merge(raw_shab_mutation_grouped, on='shab_id', how='left')
    .merge(parsed_shab_messages, on='shab_id', how='left')
)

# Sort values in the correct temporal order
shab_merged = shab_merged.sort_values(['ehraid', 'shab_date', 'shab_id'], ascending=[True, True, True]).reset_index(drop=True)

In [None]:
shab_merged_temp = shab_merged[shab_merged.ehraid.isin([905876, 905843, 905844])]
shab_merged_temp = shab_merged_temp.sort_values(['ehraid', 'shab_date', 'shab_id'], ascending=[True, True, True]).reset_index()

In [None]:
def create_raw_json_history(df: pd.DataFrame) -> dict:
    json_structure = defaultdict(lambda: {'history': []})
    for _, row in df.iterrows():
        # Extract the fields to identify a company and its entries
        ehraid = row['ehraid']
        shab_date = row['shab_date'] if isinstance(row['shab_date'], str) else row['shab_date'].strftime('%Y-%d-%m')
        shab_id = row['shab_id']
        main_group = row['main_group']
        keyword = row['keyword']

        # Extract main information
        message_info = {
            'registry_office_canton': row['registry_office_canton'],
            'codes': row['codes'],
            'message_raw': row['message_raw'],
            'extracted_content': {
                main_group: {
                    keyword: {
                        'text_slices': row['text_slices'],
                        'variables': row['parsed_variables']
                    }
                }
            }
        }

        # Search if shab_date already exists in the history
        date_entry = next((entry for entry in json_structure[ehraid]['history'] if shab_date in entry), None)

        if date_entry is None:
            # If the date does not exist, create a new entry
            date_entry = {shab_date: {shab_id: message_info}}
            json_structure[ehraid]['history'].append(date_entry)
        else:
            # If the date exists, check if the shab_id already exists
            id_entry = date_entry[shab_date].get(shab_id, None)
            if id_entry is None:
                # If the shab_id does not exists, we can simply add it to the shab_date
                date_entry[shab_date][shab_id] = message_info
            else:
                # If the shab_id exists, we need to check if the main_group already exists
                main_group_entry = date_entry[shab_date][shab_id]['extracted_content'].get(main_group, None)
                if main_group_entry is None:
                    # If it does not exist, we add it to the extracted content
                    date_entry[shab_date][shab_id]['extracted_content'][main_group] = message_info['extracted_content'][main_group]
                else:
                    # If it does, we add the keyword to the main_group, since a keyword can only appear once within the main_group
                    date_entry[shab_date][shab_id]['extracted_content'][main_group][keyword] = message_info['extracted_content'][main_group][keyword]      

    return dict(json_structure)

In [None]:
history_json = create_raw_json_history(shab_merged)

# CREATE REGISTERED PEOPLE AND FIRMS TABLE

In [11]:
df_people = shab_merged[shab_merged.main_group == 'natural persons and legal entities'].copy()

In [12]:
# Make sure only the expected fields are there
def validate_person_and_firms(x: dict):    
    schema = {'firms': [], 'people': []}
    for firm in x.get('firms', []):
        if isinstance(firm, dict):
            schema['firms'].append({
                'firm_name': firm.get('firm_name'),
                'firm_uid': firm.get('id'),
                'firm_seat': firm.get('location'),
                'firm_type': firm.get('type'),
                'firm_shares': firm.get('shares')
            })
    for person in x.get('people', []):
        if isinstance(person, dict):
            schema['people'].append({
                'first_name': person.get('first_name'),
                'last_name': person.get('last_name'),
                'hometown': person.get('hometown'),
                'place_of_residence': person.get('place_of_residence'),
                'nationality': person.get('nationality'),
                'job_title': person.get('job_title'),
                'authorization': person.get('authorization'),
                'shares': person.get('shares')
            })
    return schema

df_people['validated_variables'] = df_people['parsed_variables'].apply(validate_person_and_firms)  
df_people['firms'] = df_people['validated_variables'].apply(lambda x: x.get('firms', []))
df_people['people'] = df_people['validated_variables'].apply(lambda x: x.get('people', []))

# Split individual firm dictionaries into individual rows
df_firms_exploded = df_people.explode(column=['firms']).dropna()
df_people_exploded = df_people.explode(column=['people']).dropna()

# Create individual columns from the dictionary
df_firms_norm = pd.json_normalize(
    df_firms_exploded['firms'],
    errors='raise'
)
df_firms_concat = pd.concat([df_firms_exploded[['ehraid', 'message_raw', 'shab_date', 'shab_id', 'codes', 'keyword']].reset_index(drop=True), df_firms_norm], axis=1)

df_people_norm = pd.json_normalize(
    df_people_exploded['people'],
    errors='raise'
)
df_people_concat = pd.concat([df_people_exploded[['ehraid', 'message_raw', 'shab_date', 'shab_id', 'codes', 'keyword']].reset_index(drop=True), df_people_norm], axis=1)

In [13]:
df_people_concat

Unnamed: 0,ehraid,message_raw,shab_date,shab_id,codes,keyword,first_name,last_name,hometown,place_of_residence,nationality,job_title,authorization,shares
0,2,"AA-Annoncen Agentur AG, in Basel, CHE-102.721....",2019-09-06,1004711015,[aenderungorgane],ausgeschiedene personen und erloschene untersc...,Philippe,Nappez,Grandfontaine,Binningen,CH,Mitglied des Verwaltungsrates,mit Kollektivunterschrift zu zweien,
1,2,"AA-Annoncen Agentur AG, in Basel, CHE-102.721....",2019-09-06,1004711015,[aenderungorgane],eingetragene personen neu oder mutierend,Daniel,Ebneter,Häggenschwil,Rheinfelden,CH,Mitglied des Verwaltungsrates,mit Kollektivunterschrift zu zweien,
2,2,"AA-Annoncen Agentur AG, in Basel, CHE-102.721....",2020-02-07,1004825123,"[status, status.aufl, status.aufl.liq, aenderu...",eingetragene personen neu oder mutierend,Gabriella,Karger Travella,Basel,Basel,CH,Mitglied des Verwaltungsrates,mit Kollektivunterschrift zu zweien,
3,2,"AA-Annoncen Agentur AG, in Basel, CHE-102.721....",2020-02-07,1004825123,"[status, status.aufl, status.aufl.liq, aenderu...",eingetragene personen neu oder mutierend,Julien,Orsini,Basel,Reinach ( BL ),CH,Liquidator,mit Einzelunterschrift,
4,15,"AAA EDV Software AG, in Aarau, CHE-106.307.377...",2022-04-19,1005453034,[aenderungorgane],eingetragene personen neu oder mutierend,Urs,Antener,Eggiwil,Sarnen,CH,Präsident des Verwaltungsrates,mit Einzelunterschrift,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2148376,1682041,"LA Capital AG, in Beromünster, CHE-343.494.505...",2025-03-04,1006272425,"[status, status.neu]",eingetragene personen,Andreas,Tresch,Silenen,Beromünster,CH,Präsident des Verwaltungsrates,mit Einzelunterschrift,
2148377,1682041,"LA Capital AG, in Beromünster, CHE-343.494.505...",2025-03-04,1006272425,"[status, status.neu]",eingetragene personen,Leila,Tresch,Malters,Beromünster,CH,Mitglied des Verwaltungsrates,mit Einzelunterschrift,
2148378,1682042,"Mentor Gerüst GmbH, in Wauwil, CHE-483.874.731...",2025-03-04,1006272426,"[status, status.neu]",eingetragene personen,Mentor,Berisha,Sursee,Wauwil,CH,Gesellschafter und Geschäftsführer,mit Einzelunterschrift,mit 20 Stammanteilen zu je CHF 1000.00
2148379,1682043,"REINIGUNG - SARACENO, in Rickenbach (LU), CHE-...",2025-03-04,1006272427,"[status, status.neu]",eingetragene personen,Mirko,Saraceno,,Rickenbach ( LU ),italienischer Staatsangehöriger,Inhaber,mit Einzelunterschrift,


# Add Gender and Nationality to the data

In [14]:
import re
import spacy
from unidecode import unidecode
from zefix_processing.country_mapping import german2alpha2, french2alpha2, italian2alpha2
from zefix_processing.gender_mapping import german2gender, french2gender, italian2gender

nlp_models = {
    "de": spacy.load("de_core_news_sm"),
    "fr": spacy.load("fr_core_news_sm"),
    "it": spacy.load("it_core_news_sm")
}

In [15]:
def normalize_words(string: str) -> str:
    replacements = {
        'ä': 'ae',
        'ö': 'oe',
        'ü': 'ue'
    }
    for char, replacement in replacements.items():
        string = string.replace(char, replacement)
    
    return unidecode(string.lower())

In [16]:
nationality_mapping = {
    'de': {normalize_words(k): v for k, v in german2alpha2.items()},
    'fr': {normalize_words(k): v for k, v in french2alpha2.items()},
    'it': {normalize_words(k): v for k, v in italian2alpha2.items()}
}

gender_mapping = {
    'de': {normalize_words(k): v for k, v in german2gender.items()},
    'fr': {normalize_words(k): v for k, v in french2gender.items()},
    'it': {normalize_words(k): v for k, v in italian2gender.items()},
}

### Clean the hometown, place of residence, and nationality column

In [17]:
countries_containing_and = [
    'bosnie et herzégovine',
    'svalbard et île jan mayen',
    'îles turques et caïques',
    'géorgie du sud et îles sandwich du sud',
    'bonaire, saint eustatius et saba',
    'terres australes et antarctiques françaises',
    'de são tomé e príncipe',
    'serbo e montenegrino',
    'serba e montenegrina',
    'di saint christopher e nevis',
    'de são tomé e príncipe'
]

# 1. Split multiple nationalities into individual columns
def clean_location(language: str, string: str) -> str:
    mapping = {
        'de': ['von', 'in'],
        'fr': ['de', 'du', "d'", 'des', 'à'],
        'it': ['da', "d'", 'in']
    }
    for word in mapping[language]:
        string = re.sub(rf'\b{word}\b', '', string)
    return string.strip()

def split_locations(df: pd.DataFrame, orig_col: str = 'nationality'):
    loc_split = df[orig_col].str.split(r'\sund\s|\set\s|\se\s', regex=True, expand=True)
    loc_split.columns = [f'{orig_col}_{i+1}' for i in range(loc_split.shape[1])]
    loc_split.fillna('', inplace=True)
    df = pd.concat([df, loc_split], axis=1)
    return df.drop(columns=[orig_col])

df_people_concat['hometown'] = df_people_concat['hometown'].apply(lambda x: clean_location(LANGUAGE, x))
df_people_concat['place_of_residence'] = df_people_concat['place_of_residence'].apply(lambda x: clean_location(LANGUAGE, x))
df_people_concat['nationality'] = df_people_concat['nationality'].apply(lambda x: clean_location(LANGUAGE, x))

# First, split the hometown and place of residence column
df_people_concat = split_locations(df_people_concat, 'hometown')
df_people_concat = split_locations(df_people_concat, 'place_of_residence')

In [18]:
# 2. Move nationalities that are in the wrong column
def contains_target_word(text):
    pattern = r'\bstaatsangehoerige\b|\bcittadina\b|\bressortissante\b|\bcitoyenne\b|\bstaatsangehoeriger\b|\bcittadino\b|\bressortissant\b|\bcitoyen\b'
    return bool(re.search(pattern, text))

def move_nationalities(language: str, entries: list[str], nat: str, country_names: set[str]):
    """
    Checks if any of the hometown columns contains a country name
    """
    mapping = {
        'de': 'und',
        'fr': 'et',
        'it': 'e'
    }
    nat_norm = normalize_words(nat)
    for i, entry in enumerate(entries):
        if entry:
            entry_norm = normalize_words(entry)
            if entry_norm in country_names or contains_target_word(entry_norm):
                if not re.match(rf'\b{entry_norm}\b', nat_norm):
                    nat = f"{nat} {mapping.get(language, ' und ')} {entry}" if nat else entry
                entries[i] = ''
            else:
                # If the name is not a country name and the nationality does not include Swiss yet,
                # we want to add 'CH' to the nationalities, since the hometown is with high probability a Swiss municipality
                if 'CH' not in nat:
                    nat = f"{nat} {mapping.get(language, ' und ')} CH" if nat else 'CH'
    return entries + [nat]

countries_norm = french2alpha2.keys()
hometown_cols = [col for col in df_people_concat.columns if 'hometown' in col]
result_cols = hometown_cols + ['nationality']
df_people_concat[result_cols] = df_people_concat.apply(lambda x: pd.Series(move_nationalities(LANGUAGE, [x[col] for col in hometown_cols], x['nationality'], countries_norm)), axis=1)

In [19]:
# Finally, split the nationality column
df_people_concat = split_locations(df_people_concat, 'nationality')

### Clean the authorization and shares column

In [20]:
def switch_auth_and_shares(language: str, auth: str, shares: str):
    """
    Checks if any of the hometown columns contains a country name
    """
    keyword_mapping = {
        'de': 'unterschrift',
        'fr': 'signature',
        'it': 'firma'
    }
    and_mapping = {
        'de': 'und',
        'fr': 'et',
        'it': 'e'
    }

    # Base Case: no value in both
    if not (auth or shares):
        return [auth, shares]  # no switch
    
    match_auth = re.search(r'\bchf\b', auth.lower()) if auth else None
    match_shares = re.search(keyword_mapping[language], shares.lower()) if shares else None

    # Case 0: no match in both
    if not (match_auth or match_shares):
        return [auth, shares]  # no switch

    # Case 1: match in auth and no match in shares
    elif match_auth and not match_shares:
        if shares:
            return ['', f"{auth} {and_mapping[language]} {shares}"]  # Add auth infront of shares
        else:
            return ['', auth]  # switch columns: auth, share
    
    # Case 2: match in shares and no value in auth
    elif match_shares and not match_auth:
        if shares:
            return [f"{auth} {and_mapping[language]} {shares}", '']  # Add shares after auth
        else:
            return [shares, '']  # switch columns: auth, share
    
    # Case 3: match in auth and shares
    elif match_auth and not match_shares:
        return [shares, auth]  # switch both
    
    else:
        return [auth, shares]

In [21]:
df_people_concat[['authorization', 'shares']] = df_people_concat.apply(lambda x: pd.Series(switch_auth_and_shares(LANGUAGE, x['authorization'], x['shares'])), axis=1)

### Normalize the nationality and add the iso-3166-1 alpha 2 codes

In [22]:
def map_country(language: str, nationality: str, mapping: dict) -> str:
    if nationality:
        if language == 'de':
            nationality = nationality.split()[0].strip()
        else:
            nationality = nationality.split()[-1].strip()
        return mapping[language].get(nationality)
    return ''

In [23]:
nat_cols = [col for col in df_people_concat.columns if re.match(r'^nationality\_\d{1}$', col)]
for nat_col in nat_cols:
    df_people_concat[f"{nat_col}_norm"] = df_people_concat[nat_col].fillna('').apply(normalize_words)
    df_people_concat[f'{nat_col}_iso_3166_1_alpha_2'] = df_people_concat[f"{nat_col}_norm"].apply(lambda x: map_country(LANGUAGE, x, nationality_mapping))

### Find gendered job titles and/or determine gender

In [24]:
gendered_endings = {
    'de': {
        'female': ['in'],
        'male': [],  # no specific ending for male words in German
    },
    'fr': {
        'female': ['euse', 'ienne', 'onne', 'ane', 'trice', 'esse'],
        'male': ['eur', 'ien', 'on', 'an'],
    },
    'it': {
        'female': ['a', 'trice', 'essa'],
        'male': ['o', 'ore']
    }   
}


def extract_nouns(text: str, language: str) -> list:
    """
    Extracts nouns from a given text using spaCy for German, French, and Italian.
    """
    nlp = nlp_models[language]
    return [token.text for token in nlp(text) if token.pos_ == 'NOUN']


def create_gendered_job_names(language: str, df: pd.DataFrame, col: str = 'job_title_norm') -> tuple[list]:
    nouns = set([title.lower() for title in df[col].unique() for title in extract_nouns(title, language)])
       
    female_words = []
    male_words = []
    undetermined = []

    for word in nouns:
        if any(word.endswith(ending) for ending in gendered_endings[language]['female']):
            female_words.append(word)
        elif any(word.endswith(ending) for ending in gendered_endings[language]['male']):
            male_words.append(word)
        else:
            undetermined.append(word)

    if language == 'de':
        for word in female_words:
            male_version = word.removesuffix('in')
            if male_version in undetermined:
                male_words.append(male_version)
        
    undetermined = [w for w in undetermined if w not in male_words]

    return female_words, male_words, undetermined

In [25]:
# Normalize job title
df_people_concat['job_title_norm'] = df_people_concat['job_title'].apply(normalize_words)
df_people_concat['job_title_norm'] = df_people_concat['job_title_norm'].str.replace(r'[^a-zA-Z]', ' ', regex=True).apply(lambda x: ' '.join(x.split()))

In [26]:
CREATE_GENDERED_WORDS = False

if CREATE_GENDERED_WORDS:
    female_words, male_words, undetermined = create_gendered_job_names('fr', df_people_concat)

In [27]:
def clean_names(name: str) -> str:
    name = re.sub(r' genannt | dit | dite | detto | detta ', ' ', name)
    name = re.sub(r'\(.*?\)', '', name)
    name = re.sub(r'\[.*?\]', '', name)
    name = re.sub(r'[^\w\s\-.]', '', name)
    name = re.sub(r'\b\d+\b', '', name)
    name = re.sub(r'\b\w{1,4}\.(?=\s|$)', '', name)
    return ' '.join(name.strip().split())


def prepare_country_code(code: str) -> str:
    code = code if code != 'XK' else 'RS'  # gender API does not support Kosovo (XK)
    return code if len(code) <= 2 else None


def determine_gender(
    mapping: dict,
    nationalities: list[str],
    job_title: str
) -> str|None:
    """
    """
    # Try to infer gender via nationality
    for nationality in nationalities:
        if re.search(r'\bstaatsangehoeriger\b|\bcittadino\b|\bressortissant\b|\bcitoyen\b', nationality):
            return 'm'
        elif re.search(r'\bstaatsangehoerige\b|\bcittadina\b|\bressortissante\b|\bcitoyenne\b', nationality):
            return 'f'

    # Try to infer gender via job title
    genders = [mapping[w] for w in mapping.keys() if re.match(rf'\b{w}\b', job_title)]
    if genders:
        # Check if list only contains one gender
        if genders.count(genders[0]) == len(genders):
            return genders[0]

    return None

In [28]:
nat_norm_cols = [col for col in df_people_concat.columns if re.match(r'\bnationality_\d{1}_norm\b', col)]
df_people_concat['gender'] = df_people_concat.apply(lambda x: determine_gender(gender_mapping[LANGUAGE], [x[col] for col in nat_norm_cols], x['job_title_norm']), axis=1)

In [29]:
df_upload = df_people_concat.drop(columns=['nationality_1_norm', 'nationality_2_norm', 'nationality_3_norm', 'job_title_norm'])
df_upload.head()

Unnamed: 0,ehraid,message_raw,shab_date,shab_id,codes,keyword,first_name,last_name,job_title,authorization,...,nationality_1,nationality_2,nationality_3,nationality_4,nationality_1_iso_3166_1_alpha_2,nationality_2_iso_3166_1_alpha_2,nationality_3_iso_3166_1_alpha_2,nationality_4_norm,nationality_4_iso_3166_1_alpha_2,gender
0,2,"AA-Annoncen Agentur AG, in Basel, CHE-102.721....",2019-09-06,1004711015,[aenderungorgane],ausgeschiedene personen und erloschene untersc...,Philippe,Nappez,Mitglied des Verwaltungsrates,mit Kollektivunterschrift zu zweien,...,CH,,,,CH,,,,,
1,2,"AA-Annoncen Agentur AG, in Basel, CHE-102.721....",2019-09-06,1004711015,[aenderungorgane],eingetragene personen neu oder mutierend,Daniel,Ebneter,Mitglied des Verwaltungsrates,mit Kollektivunterschrift zu zweien,...,CH,,,,CH,,,,,
2,2,"AA-Annoncen Agentur AG, in Basel, CHE-102.721....",2020-02-07,1004825123,"[status, status.aufl, status.aufl.liq, aenderu...",eingetragene personen neu oder mutierend,Gabriella,Karger Travella,Mitglied des Verwaltungsrates,mit Kollektivunterschrift zu zweien,...,CH,,,,CH,,,,,
3,2,"AA-Annoncen Agentur AG, in Basel, CHE-102.721....",2020-02-07,1004825123,"[status, status.aufl, status.aufl.liq, aenderu...",eingetragene personen neu oder mutierend,Julien,Orsini,Liquidator,mit Einzelunterschrift,...,CH,,,,CH,,,,,m
4,15,"AAA EDV Software AG, in Aarau, CHE-106.307.377...",2022-04-19,1005453034,[aenderungorgane],eingetragene personen neu oder mutierend,Urs,Antener,Präsident des Verwaltungsrates,mit Einzelunterschrift,...,CH,,,,CH,,,,,m


In [30]:
df_upload.to_csv(PROCESSED_DATA_DIR / f'people_and_firms_{LANGUAGE}.csv', index=False)

In [27]:
shab_merged.main_group.unique()

array(['undetermined', 'natural persons and legal entities', 'purpose',
       'firm and address changes', 'capital and legal changes',
       'mergers and separations'], dtype=object)

# CREATE HISTORICAL PURPOSE FRAME

In [28]:
purpose_df = shab_merged[shab_merged.main_group == 'purpose'].copy()

In [30]:
purpose_df.keyword.unique()

array(['nuovo scopo', 'nuovo scopo della sede principale',
       'nuovo scopo della succursale'], dtype=object)

In [31]:
purpose_lists = [l for l in purpose_df.text_slices if l and len(l) > 1]
assert len(purpose_lists) == 0

In [32]:
purpose_df['purpose_raw'] = [l[0] if l else '' for l in purpose_df['text_slices']]

In [46]:
purpose_df['branch'] = [int('succursale' in keyword) for keyword in purpose_df['keyword']]
purpose_df['main_seat'] = [int('principale' in keyword) for keyword in purpose_df['keyword']]
purpose_df['founding_purpose'] = [int('status.neu' in codes) for codes in purpose_df['codes']]

In [39]:
purpose_df = purpose_df.drop(columns=['registry_office_canton', 'message_raw', 'parsed_variables', 'main_group', 'text_slices'])

In [47]:
purpose_df

Unnamed: 0,ehraid,shab_id,shab_date,codes,keyword,purpose_raw,branch,main_seat,founding_purpose
2,157,3204479,2016-12-06,"[zweckaenderung, aenderungorgane]",nuovo scopo,"La società si propone la progettazione , la pr...",0,0,0
48,441,1005401458,2022-02-09,"[zweckaenderung, aenderungorgane]",nuovo scopo,"L'acquisto , la vendita , la costruzione , la ...",0,0,0
60,441,1005905406,2023-12-08,[zweckaenderung],nuovo scopo,"L'acquisto , la vendita , la costruzione , la ...",0,0,0
63,441,1005956417,2024-02-08,[zweckaenderung],nuovo scopo,"L'acquisto , la vendita , la costruzione , la ...",0,0,0
194,1038,1005214878,2021-06-11,[zweckaenderung],nuovo scopo,"La società ha per scopo l'importazione , l'esp...",0,0,0
...,...,...,...,...,...,...,...,...,...
545499,1681756,1006271807,2025-03-03,"[status, status.neu]",nuovo scopo,"Lo scopo della società è : sviluppo , progetta...",0,0,1
545505,1681757,1006271809,2025-03-03,"[status, status.neu]",nuovo scopo,"La gestione di bar , tea room , ristoranti e e...",0,0,1
545509,1681758,1006271810,2025-03-03,"[status, status.neu]",nuovo scopo,Gestione cure del benessere personale unitamen...,0,0,1
545517,1681777,1006271808,2025-03-03,"[status, status.neu, vermoegenstransfer]",nuovo scopo,La società ha per scopo la gestione di centri ...,0,0,1


# PROCESS HISTORICAL FIRM CHANGES

In [63]:
firm_changes_df = shab_merged[shab_merged.main_group == 'firm and address changes'].copy()

In [64]:
firm_changes_df.keyword.unique()

array(['statuti modificati', 'nuovo recapito', 'altri indirizzi',
       'indirizzo della liquidazione', 'nuova ditta', 'nuova sede',
       'nuova succursale', 'statuti originari', 'nuovo indirizzo postale',
       'nuova sede principale',
       'numero di identificazione della sede principale',
       'data dello statuto',
       'nuovo nome della ditta della sede principale', 'sede principale',
       'atto pubblico modificato', 'nuova ide', 'succursale radiata',
       'atto pubblico originario', 'inizio'], dtype=object)

In [65]:
firm_name_df = firm_changes_df[firm_changes_df['keyword'] == 'nuova ditta']
firm_address_df = firm_changes_df[firm_changes_df['keyword'].isin(['nuovo recapito', 'altri indirizzi', 'indirizzo della liquidazione', 'nuovo indirizzo postale'])]
firm_seat_df = firm_changes_df[firm_changes_df['keyword'].isin(['nuova sede', 'nuova sede principale', 'sede principale'])]
branches_df = firm_changes_df[firm_changes_df['keyword'].isin(['nuova succursale', 'succursale radiata'])]

In [66]:
branches_df

Unnamed: 0,ehraid,shab_id,shab_date,registry_office_canton,message_raw,codes,keyword,parsed_variables,main_group,text_slices
224,1311,2707803,2016-03-07,TI,"Adelheid AG, finora in Samedan, CHE-101.463.26...","[zweckaenderung, adressaenderung, aenderungorg...",nuova succursale,"{'branches_until_now': [], 'branches_new': [],...",firm and address changes,[[ radiati : Lugano ]]
2060,8433,1005483263,2022-05-27,TI,"ARGOR-HERAEUS SA, in Mendrisio, CHE-102.670.49...",,nuova succursale,"{'branches_until_now': [], 'branches_new': [{'...",firm and address changes,[Yverdon-les-Bains ( CHE-389.686.914 )]
2960,10845,2883205,2016-06-10,TI,"Audio-Video G + M AG, in Lamone, CHE-106.048.0...",,nuova succursale,{'branches_until_now': [{'location': 'Yverdon-...,firm and address changes,[Yverdon-les-Bains ( CHE-279.466.641 ) [ finor...
4098,14130,3794931,2017-10-06,GR,"Barella SA, in Mesocco, CHE-101.430.409, socie...",,nuova succursale,"{'branches_until_now': [], 'branches_new': [{'...",firm and address changes,[Roveredo ( GR ) ( CHE-279.082.542 )]
4108,14130,1005730560,2023-04-24,GR,"Barella SA, in Mesocco, CHE-101.430.409, socie...",,nuova succursale,"{'branches_until_now': [], 'branches_new': [],...",firm and address changes,[[ radiati : Roveredo ( GR ) ( CHE-279.082.542...
...,...,...,...,...,...,...,...,...,...,...
531406,1631773,1006217211,2024-12-27,TI,"Studio ICARA SA, in Lugano, CHE-461.344.790, s...",,nuova succursale,"{'branches_until_now': [], 'branches_new': [{'...",firm and address changes,[Mendrisio ( CHE-145.530.944 )]
531580,1632745,1006018052,2024-04-25,TI,"MC Grecof SA, in Lugano, CHE-160.298.683, soci...",,nuova succursale,"{'branches_until_now': [], 'branches_new': [{'...",firm and address changes,[Diepoldsau ( CHE-109.795.820 )]
540646,1663701,1006234069,2025-01-21,GR,"SEF Associati GmbH, in St. Moritz, CHE-462.298...",,nuova succursale,"{'branches_until_now': [], 'branches_new': [{'...",firm and address changes,[Zug ( CHE-135.660.396 )]
542027,1668815,1006229074,2025-01-15,TI,"Yamamay Suisse SA, in Lugano, CHE-316.062.103,...",,nuova succursale,"{'branches_until_now': [], 'branches_new': [{'...",firm and address changes,[Collina d'Oro ( CHE-342.328.896 )]


# PROCESS CAPITAL CHANGES

In [None]:
capital_changes = shab_merged[shab_merged.main_group == 'capital and legal changes'].copy()

In [None]:
# Correct wrong values like '446.001.000.00'
def correct_number(number_str: str) -> str:
    parts = number_str.rsplit('.', 1)
    return parts[0].replace('.', '') + '.' + parts[1] if len(parts) > 1 else parts[0].replace('.', '')

In [None]:
shab_ids = [[], []]
keywords = [[], []]
main_groups = [[], []]

capital_new = []
capital_until_now = []
num_shares_new = []
val_shares_new = []
typ_shares_new = []

for i, row in capital_changes.iterrows():
    shab_id = row['shab_id']
    keyword = row['keyword']
    main_group = row['main_group']

    parsed_variables = row['parsed_variables']
    cap_new = parsed_variables.get('capital_new', [])
    cap_unt = parsed_variables.get('capital_until_now', [])
    srs_new = parsed_variables.get('shares_new', [])

    if cap_new or cap_unt:
        capital_new.append(cap_new[0] if len(cap_new) > 0 else None)
        capital_until_now.append(cap_unt[0] if len(cap_unt) > 0 else None)
        shab_ids[0].append(shab_id)
        keywords[0].append(keyword)
        main_groups[0].append(main_group)
    if srs_new:
        for s in srs_new:
            num_shares_new.append(s.get('number'))
            val_shares_new.append(s.get('value'))
            typ_shares_new.append(s.get('type'))
            shab_ids[1].append(shab_id)
            keywords[1].append(keyword)
            main_groups[1].append(main_group)

In [None]:
cap_new = pd.DataFrame({
    'shab_id': shab_ids[0],
    'keyword': keywords[0],
    'main_group': main_groups[0],
    'capital_new': capital_new,
    'capital_until_now': capital_until_now})

cap_new['capital_new'] = cap_new['capital_new'].fillna('').str.replace("'", "", regex=False)
cap_new['capital_until_now'] = cap_new['capital_until_now'].fillna('').str.replace("'", "", regex=False)

# Extract Währung
cap_new['currency_new'] = cap_new['capital_new'].str.extract(r'^([^\d\s]+)')
cap_new['currency_new'] = cap_new['currency_new'].fillna('')

cap_new['currency_until_now'] = cap_new['capital_until_now'].str.extract(r'^([^\d\s]+)')
cap_new['currency_until_now'] = cap_new['currency_until_now'].fillna('')

# Extract Kapital
cap_new['capital_new'] = cap_new['capital_new'].str.extract(r'([\d.,]+)').astype(str)
cap_new['capital_until_now'] = cap_new['capital_until_now'].str.extract(r'([\d.,]+)').astype(str)

# Apply correction to remove unneccessary punctuations
cap_new['capital_new'] = cap_new['capital_new'].apply(correct_number)
cap_new['capital_until_now'] = cap_new['capital_until_now'].apply(correct_number)

cap_new['capital_new'] = [v.replace(',', '.') if not '.' in v else v.replace(',', '') for v in cap_new['capital_new']]
cap_new.loc[cap_new['capital_new'] == '.', 'capital_new'] = np.nan

cap_new['capital_until_now'] = [v.replace(',', '.') if not '.' in v else v.replace(',', '') for v in cap_new['capital_until_now']]
cap_new.loc[cap_new['capital_until_now'] == '.', 'capital_until_now'] = np.nan

# Ensure correct types
cap_new['capital_new'] = cap_new['capital_new'].astype(float)
cap_new['capital_until_now'] = cap_new['capital_until_now'].astype(float)

In [None]:
stocks_new = pd.DataFrame({
    'shab_id': shab_ids[1],
    'keyword': keywords[1],
    'hauptkategorie': main_groups[1],
    'num_shares_new': num_shares_new,
    'val_shares_new': val_shares_new,
    'typ_shares_new': typ_shares_new,})

stocks_new['val_shares_new'] = stocks_new['val_shares_new'].fillna('').str.replace("'", "", regex=False)

# Extract Währung
stocks_new['currency_shares_new'] = stocks_new['val_shares_new'].str.extract(r'^([^\d\s]+)')
stocks_new['currency_shares_new'] = stocks_new['currency_shares_new'].fillna('')

# Extract number of Stocks, etc.
stocks_new['val_shares_new'] = stocks_new['val_shares_new'].str.extract(r'([\d.,]+)').astype(str)

# Apply correction to remove unneccessary punctuations
stocks_new['val_shares_new'] = stocks_new['val_shares_new'].apply(correct_number)
stocks_new['val_shares_new'] = [v.replace(',', '.') if not '.' in v else v.replace(',', '') for v in stocks_new['val_shares_new']]
stocks_new.loc[stocks_new['val_shares_new'] == '.', 'val_shares_new'] = np.nan

# Ensure correct types
stocks_new['val_shares_new'] = stocks_new['val_shares_new'].astype(float)
stocks_new['num_shares_new'] = stocks_new['num_shares_new'].astype(int)

In [None]:
# Calculate total value of the capital by multiplying the number of shares with their individual value
stocks_new['value_total'] = stocks_new['num_shares_new'] * stocks_new['val_shares_new']

# Calculate new capital for shab ids where kapital_neu variable is not given, but scheine_neu is
missing_ids = set(stocks_new.shab_id).difference(set(cap_new.shab_id))
stocks_new_missing = stocks_new[stocks_new.shab_id.isin(missing_ids)]

cap_new_missing = stocks_new_missing.groupby(['shab_id', 'keyword', 'main_group']).agg(
    capital_new=pd.NamedAgg(column='value_total', aggfunc='sum'),
    currency_new=pd.NamedAgg(column='currency_shares_new', aggfunc=lambda x: list(set([currency for currency in x if currency != ''])))).reset_index()

In [None]:
# Check if there are mixed currencies
assert len([cur_set for cur_set in cap_new_missing['currency_new'] if len(cur_set) > 1]) == 0

cap_new_missing['currency_new'] = [v[0] if len(v) > 0 else '' for v in cap_new_missing['currency_new']]
cap_new_missing['currency_until_now'] = ''
cap_new_missing['capital_new'] = np.nan

In [None]:
# Combine the two dataframes to get all capital changes
cap_new_concat = pd.concat([cap_new, cap_new_missing])

In [None]:
cap_new_concat.head()

In [None]:
cap_new_concat.to_csv(EXTERNAL_DATA_DIR / 'capital_changes.csv', index=False)

# PROCESS MERGERS AND ACQUISIITONS

In [None]:
mergers_and_acquisitions = shab_merged[shab_merged.main_group == 'mergers and separations'].copy()

In [None]:
shab_ids = []
keywords = []
main_groups = []
text_passages = []

contract_dates = []
bilanzdaten = []

firm_taken_over = []
location_taken_over = []
id_taken_over = []
assets_taken_over = []
liabilities_taken_over = []

for i, row in mergers_and_acquisitions.iterrows():
    shab_id = row['shab_id']
    keyword = row['keyword']
    main_group = row['main_group']
    text_passage = row['text_slice']

    parsed_variables = row['parsed_variables']

    if parsed_variables:
        contract_date = parsed_variables.get('contract_date', [])
        bilanzdatum = parsed_variables.get('bilanzdatum', [])
        firms_taken_over = parsed_variables.get('firms_taken_over', [])
        for firm in firms_taken_over:
            firm_name = firm.get('firm_name', '')
            location = firm.get('location', '')
            id = firm.get('id', '')
            capital_taken_over = firm.get('capital_taken_over', {})
            assets = capital_taken_over.get('aktiven', '') if capital_taken_over else ''
            liabilities = capital_taken_over.get('passiven', '') if capital_taken_over else ''

            contract_dates.append(contract_date[0] if len(contract_date) > 0 else '')
            bilanzdaten.append(bilanzdatum[0] if len(bilanzdatum) > 0 else '')
            firm_taken_over.append(firm_name)
            location_taken_over.append(location)
            id_taken_over.append(id)
            assets_taken_over.append(assets)
            liabilities_taken_over.append(liabilities)
            shab_ids.append(shab_id)
            keywords.append(keyword)
            main_groups.append(main_group)
            text_passages.append(text_passage)
    else:
        firm_taken_over.append('')
        location_taken_over.append('')
        id_taken_over.append('')
        assets_taken_over.append('')
        liabilities_taken_over.append('')
        shab_ids.append(shab_id)
        keywords.append(keyword)
        main_groups.append(main_group)
        text_passages.append(text_passage)

In [None]:
assert len(shab_ids) == len(keywords) == len(main_groups) == len(firm_taken_over) == len(location_taken_over) == len(id_taken_over) == len(assets_taken_over) == len(liabilities_taken_over)

In [None]:
processed_mergers = pd.DataFrame({
    'shab_id': shab_ids,
    'keyword': keywords,
    'hauptkategorie': main_groups,
    'texte': text_passages,
    'firm_taken_over': firm_taken_over,
    'location_taken_over': location_taken_over,
    'id_taken_over': id_taken_over,
    'assets_taken_over': assets_taken_over,
    'liabilities_taken_over': liabilities_taken_over
})

processed_mergers['assets_taken_over'] = processed_mergers['assets_taken_over'].fillna('').str.replace("'", "", regex=False)
processed_mergers['liabilities_taken_over'] = processed_mergers['liabilities_taken_over'].fillna('').str.replace("'", "", regex=False)

# Extract Währung of Aktiven/Passiven
processed_mergers['currency_assets_taken_over'] = processed_mergers['assets_taken_over'].str.extract(r'^([^\d\s]+)')
processed_mergers['currency_assets_taken_over'] = processed_mergers['currency_assets_taken_over'].fillna('')

processed_mergers['currency_liabilities_taken_over'] = processed_mergers['liabilities_taken_over'].str.extract(r'^([^\d\s]+)')
processed_mergers['currency_liabilities_taken_over'] = processed_mergers['currency_liabilities_taken_over'].fillna('')

# Extract value of Aktiven/Passiven
processed_mergers['assets_taken_over'] = processed_mergers['assets_taken_over'].str.extract(r'([\d.,]+)').astype(str)

processed_mergers['liabilities_taken_over'] = processed_mergers['liabilities_taken_over'].str.extract(r'([\d.,]+)').astype(str)

# Apply correction to remove unneccessary punctuations
processed_mergers['assets_taken_over'] = processed_mergers['assets_taken_over'].apply(correct_number)
processed_mergers['assets_taken_over'] = [v.replace(',', '.') if not '.' in v else v.replace(',', '') for v in processed_mergers['assets_taken_over']]

processed_mergers['liabilities_taken_over'] = processed_mergers['liabilities_taken_over'].apply(correct_number)
processed_mergers['liabilities_taken_over'] = [v.replace(',', '.') if not '.' in v else v.replace(',', '') for v in processed_mergers['liabilities_taken_over']]

# Ensure correct types
processed_mergers['assets_taken_over'] = processed_mergers['assets_taken_over'].astype(float)
processed_mergers['liabilities_taken_over'] = processed_mergers['liabilities_taken_over'].astype(float)

In [None]:
processed_mergers.head()

In [None]:
processed_mergers.to_csv(EXTERNAL_DATA_DIR / 'merger_sizes.csv', index=False)