# **PROCESS FIRM-LEVEL FEATURES**

In [1]:
import os
import dotenv
import geopandas as gpd
import numpy as np
import pandas as pd
from ftlangdetect import detect
from geopy.geocoders import Nominatim, GoogleV3

from pocketknife.database import (
    connect_database, read_from_database)

from success_prediction.config import (
    PROJ_ROOT, RAW_DATA_DIR, EXTERNAL_DATA_DIR, PROCESSED_DATA_DIR)

# Load environment variables from the project's .env file (os.getenv is used further down for the geocoding API key)
dotenv_path = os.path.join(PROJ_ROOT, '.env')
dotenv.load_dotenv(dotenv_path)

[32m2025-05-21 16:40:01.682[0m | [1mINFO    [0m | [36msuccess_prediction.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/manuelbolz/Documents/git/for_work/company_success_prediction[0m


True

In [2]:
# Lookup table from legal-form id to an English label.
# Each trailing comment gives the German name and whether that legal form is part of the sample (INCLUDE / EXCLUDE).
id2legalform = {
    1: 'Sole proprietorship',  # Einzelunternehmen  ->  EXCLUDE
    2: 'General Partnership',  # Kollektivgesellschaft  ->  INCLUDE
    3: 'Corporation',  # Aktiengesellschaft  ->  INCLUDE
    4: 'Limited Liability Company',  # Gesellschaft mit beschränkter Haftung  ->  INCLUDE
    5: 'Cooperative',  # Genossenschaft  ->  EXCLUDE
    6: 'Association',  # Verein  ->  EXCLUDE
    7: 'Foundation',  # Stiftung  ->  EXCLUDE
    8: 'Public sector institution',  # Institut des öffentlichen Rechts  ->  EXCLUDE
    9: 'Branch',  # Zweigniederlassung  ->  EXCLUDE
    10: 'Limited Partnership',  # Kommanditgesellschaft  ->  INCLUDE
    11: 'Foreign branch',  # Zweigniederlassung einer ausl. Gesellschaft  ->  EXCLUDE
    12: 'Corporation with unlimited partners',  # Kommanditaktiengesellschaft  ->  INCLUDE
    13: 'Special legal form',  # Besondere Rechtsform  ->  EXCLUDE
    14: 'Ownership in undivided shares',  # Gemeinderschaft  ->  EXCLUDE
    15: 'Limited Partnership for collective investment schemes with a fixed capital',  # Investmentgesellschaft mit festem Kapital  ->  INCLUDE
    16: 'Limited Partnership for collective investment schemes with a variable capital',  # Investmentgesellschaft mit variablem Kapital  ->  INCLUDE
    17: 'Limited Partnership for collective investment schemes',  # Kommanditgesellschaft für kollektive Kapitalanlagen  ->  INCLUDE
    18: 'Non commercial power of attorney',  # Nichtkaufmännische Prokura  ->  EXCLUDE
    19: '(unknown)',  # (unbekannt)  ->  EXCLUDE
}

# Ids of the legal forms marked INCLUDE above; the same list is hard-coded in the SQL filter of the founding-firm query
growth_oriented_legal_forms = [2, 3, 4, 10, 12, 15, 16, 17]

In [703]:
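# This query builds the prediction sample: growth-oriented firms (legal forms 2, 3, 4, 10, 12, 15, 16, 17; branches excluded)
# whose founding SHAB entry ('status.neu') is dated before 2024-01-01.
# NOTE(review): the query has no explicit lower bound on the founding date; the 2016 start presumably comes from the coverage of the source tables - confirm.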

query_founded_firms = """ 
    SELECT
        base.ehraid,
        base.uid,

        -- Dissolution information
        base.delete_date,
        dissolution.shab_date as dissolution_date,
        dissolution.reason_for_dissolution,
        dissolution.liquidation,
        dissolution.bankruptcy,
        dissolution.other_exit,

        -- Names
        base.name AS current_name,
        founding_name.firm_name AS founding_name,

        -- Legal forms
        base.legal_form_id AS current_legal_form,
        legal_form.legal_form_id AS founding_legal_form,

        -- Purpose
        base.purpose_raw AS current_purpose,
        founding_purpose.purpose_raw AS founding_purpose,

        -- Current address
        COALESCE(address.street, '') || ' ' || COALESCE(address.house_number, '') AS current_street,
        address.town AS current_town,
        address.swiss_zip_code AS current_zip_code,
        address.country AS current_country,

        -- Founding address
        founding_address.street AS founding_street,
        founding_address.town AS founding_town,
        founding_address.postal_code AS founding_zip_code,
        founding_address.town_bfs_gmde_code_latest AS founding_bfs_code,

        -- Founding SHAB entry
        shab.shab_id,
        shab.shab_date AS founding_date,
        shab.message AS founding_message

    FROM zefix_release_159.base base

    -- Founding SHAB messages
    INNER JOIN (
        SELECT s.ehraid, s.shab_id, s.shab_date, s.message
        FROM zefix_release_159.shab s
        INNER JOIN zefix_release_159.shab_mutation sm ON s.shab_id = sm.shab_id
        WHERE sm.description = 'status.neu'
    ) AS shab ON base.ehraid = shab.ehraid

    -- Current address
    LEFT JOIN zefix_release_159.address address ON base.ehraid = address.ehraid

    -- Founding address
    LEFT JOIN (
        SELECT DISTINCT hfa.ehraid, hfa.street, hfa.postal_code, hfa.town, hfa.town_bfs_gmde_code_latest
        FROM zefix.history_firm_addresses hfa
        WHERE founding = TRUE
    ) AS founding_address ON base.ehraid = founding_address.ehraid

    -- Founding name
    LEFT JOIN (
        SELECT DISTINCT hfn.ehraid, hfn.firm_name
        FROM zefix.history_firm_names hfn
        WHERE hfn.founding = TRUE
    ) AS founding_name ON base.ehraid = founding_name.ehraid

    -- Founding purpose
    LEFT JOIN (
        SELECT DISTINCT hp.ehraid, hp.purpose_raw
        FROM zefix.history_purpose hp
        WHERE hp.founding_purpose = TRUE
    ) AS founding_purpose ON base.ehraid = founding_purpose.ehraid

    -- Founding legal form
    LEFT JOIN (
        SELECT DISTINCT hlf.ehraid, hlf.legal_form_id
        FROM zefix.history_founding_legal_form hlf
    ) AS legal_form ON base.ehraid = legal_form.ehraid

    -- Dissolution information
    LEFT JOIN (
        -- Only keep the last dissolution message as the final dissolution
        SELECT hd.ehraid, hd.shab_date, hd.reason_for_dissolution, hd.liquidation, hd.bankruptcy, hd.other_exit
        FROM (
            SELECT *,
                ROW_NUMBER() OVER (PARTITION BY ehraid ORDER BY shab_date DESC) AS rn
            FROM zefix.history_dissolutions
        ) hd
        WHERE hd.rn = 1
    ) AS dissolution ON base.ehraid = dissolution.ehraid

    -- Filter out irrelevant records
    WHERE
        NOT base.is_branch
        AND shab.shab_date < '2024-01-01'
        AND base.legal_form_id IN (2, 3, 4, 10, 12, 15, 16, 17)
        AND LOWER(base.name) NOT LIKE '%zweigniederlassung%'
        AND LOWER(base.name) NOT LIKE '%succursale%';
"""

In [704]:
# Run the founding-firm query and materialise the result as a DataFrame
with connect_database() as db_connection:
    df_startups = read_from_database(connection=db_connection, query=query_founded_firms)

# Parse the founding date (SHAB date of the founding entry) into pandas datetimes
df_startups = df_startups.assign(founding_date=pd.to_datetime(df_startups['founding_date']))

In [706]:
# Observed duplicates stem from entries having multiple new inscriptions in Zefix.
# They are removed from the sample because their history seems to contain errors.
# NOTE(review): the diagnostic below checks ('ehraid', 'founding_town') while the drop uses 'ehraid' only - confirm this is intended.
duplicate_rows = df_startups.duplicated(subset=['ehraid', 'founding_town'], keep=False)
display(df_startups.loc[duplicate_rows, 'uid'].unique())

# keep=False removes every occurrence of a duplicated ehraid, not just the repeats
df_startups = df_startups.drop_duplicates(subset=['ehraid'], keep=False)
display(df_startups.shape)

array([], dtype=object)

(226559, 25)

### GEO ENCODE ADDRESS INFORMATION

In [None]:
# Use the current information if the founding address is missing
for founding_col, current_col in [
    ('founding_street', 'current_street'),
    ('founding_zip_code', 'current_zip_code'),
    ('founding_town', 'current_town'),
]:
    df_startups[founding_col] = df_startups[founding_col].fillna(df_startups[current_col])

In [116]:
# After the fallback above, no founding address component may be missing
for address_col in ['founding_street', 'founding_zip_code', 'founding_town']:
    assert df_startups[df_startups[address_col].isna()].empty

In [None]:
# Primary geocoder: a Nominatim instance addressed at localhost:8080 over plain HTTP
nominatim_geolocator = Nominatim(
    user_agent="local_geocoder",
    domain="localhost:8080",
    scheme="http"
)
# Fallback geocoder: Google geocoding, with the API key read from the GOOGLE_GEOCODE_API_KEY environment variable
google_geolocator = GoogleV3(api_key=os.getenv('GOOGLE_GEOCODE_API_KEY'))


def geocode_address(nominatim_geolocator, google_geolocator, row):
    """Geocode a firm's founding address, trying Nominatim first and Google second.

    Args:
        nominatim_geolocator: Geocoder whose ``geocode`` accepts a structured
            query dict (street / city / postalcode / country).
        google_geolocator: Fallback geocoder whose ``geocode`` takes a
            free-text address string.
        row: Mapping-like record providing ``founding_street``,
            ``founding_town`` and ``founding_zip_code``.

    Returns:
        pd.Series: ``[address, latitude, longitude]`` of the first match, or
        three ``None`` values when neither geocoder finds the address or an
        error occurs (errors are printed, not raised, so that a row-wise
        ``apply`` over the whole sample keeps going).
    """
    try:
        street = row['founding_street']
        town = row['founding_town']
        zip_code = int(row['founding_zip_code'])

        location = nominatim_geolocator.geocode({
            'street': street,
            'city': town,
            'postalcode': zip_code,
            'country': 'Schweiz'
        }, timeout=2)

        if not location:
            # The structured dict is a Nominatim feature; the Google geocoder
            # expects the address as a single free-text string.
            location = google_geolocator.geocode(
                f"{street}, {zip_code} {town}, Schweiz", timeout=1)

        if location:
            return pd.Series([location.address, location.latitude, location.longitude])
        return pd.Series([None, None, None])
    except Exception as e:
        # Deliberate best-effort: one failing address must not abort the whole run
        print(f"Error: {e}")
        return pd.Series([None, None, None])

In [None]:
# Geocode every firm row by row (one or two geocoder requests per row) and store the matched address and coordinates
df_startups[['geocoded_address', 'latitude', 'longitude']] = df_startups.apply(lambda row: geocode_address(nominatim_geolocator, google_geolocator, row), axis=1)

In [285]:
# Municipality (BFS) code and zip code are handled as plain integers from here on
df_startups = df_startups.astype({'founding_bfs_code': int, 'founding_zip_code': int})

### DETERMINE BFS MUNICIPALITY CODE BY COORDINATES WHERE MISSING

In [372]:
# Load the TLM_HOHEITSGEBIET layer of the swissBOUNDARIES3D geodatabase and reproject it to EPSG:4326
# so that it matches the latitude/longitude coordinates of the geocoded firms
gdf = gpd.read_file(EXTERNAL_DATA_DIR / 'geo_data' / 'swissBOUNDARIES3D_1_5_LV95_LN02.gdb', layer="TLM_HOHEITSGEBIET")
gdf = gdf.to_crs("EPSG:4326")
gdf = gdf[['geometry', 'BFS_NUMMER', 'EINWOHNERZAHL']]

# Turn the firm table into a GeoDataFrame with one point per firm
df_startups = gpd.GeoDataFrame(
    df_startups,
    geometry=gpd.points_from_xy(df_startups['longitude'], df_startups['latitude']),
    crs="EPSG:4326"
)

# Left spatial join: attach BFS_NUMMER / EINWOHNERZAHL of the polygon each point lies within (NaN when no polygon matches)
df_startups = gpd.sjoin(df_startups, gdf, how="left", predicate="within")

# Replace where code is 0 (unmatched) or where it does not match the coordinates
df_startups.loc[df_startups['founding_bfs_code'] == 0, 'founding_bfs_code'] = pd.NA
df_startups.loc[(df_startups['founding_bfs_code'].astype(float) != df_startups['BFS_NUMMER']) & (~df_startups['BFS_NUMMER'].isna()), 'founding_bfs_code'] = pd.NA
# Blanked codes are refilled from the polygon match; codes without a polygon match keep their original value
df_startups['founding_bfs_code'] = df_startups['founding_bfs_code'].fillna(df_startups['BFS_NUMMER'])

In [374]:
# Back to integer codes; this raises if any founding_bfs_code is still missing after the fill above
df_startups['founding_bfs_code'] = df_startups['founding_bfs_code'].astype(int)

In [375]:
# Inspect firms whose final code differs from the polygon match (in the output below these are rows without a polygon match)
# NOTE(review): `combined_address` is not created in any cell visible here - presumably built in a removed cell; confirm
df_startups[df_startups.founding_bfs_code.astype(float) != df_startups.BFS_NUMMER][['founding_town', 'combined_address', 'founding_bfs_code', 'BFS_NUMMER']]

Unnamed: 0,founding_town,combined_address,founding_bfs_code,BFS_NUMMER
223603,Chiasso,"Via Henry Dunant 1, 6830 Chiasso",5250,
223613,Morcote,"Via Isella 11, 6922 Morcote",5203,
223639,Brusino Arsizio,"Via Lungolago 83, 6827 Brusino Arsizio",5160,
224506,San Bernardino,"Residenza Mons Avium , appartamento 25, 6565 S...",3822,
224785,La Tène,"route de Bellevue 7, 2074 La Tène",6513,
225829,Roggwil TG,"Im Pünst 1, 9325 Roggwil TG",4431,
225873,Bassins,"Chemin de Raulan 24, 1269 Bassins",5703,
226046,Warth,"Kartause Ittingen, 8532 Warth",4621,


In [None]:
# Remove the helper columns of the spatial join
# NOTE(review): `Unnamed: 0` is not created by any visible cell (presumably left over from a CSV reload); the drop raises KeyError if a listed column is absent - confirm
df_startups.drop(columns=['geometry', 'index_right', 'Unnamed: 0', 'BFS_NUMMER'], inplace=True)

### ADD MUNICIPALITY TYPOLOGY

In [None]:
# Municipality typology: drop the name columns and rename the remaining columns to the English feature names
typology_column_names = {
    'BFS Gde-nummer': 'founding_bfs_code',
    'Bezirks-nummer': 'district_id',
    'Kantons-nummer': 'canton_id',
    'Stadt/Land-Typologie': 'urban_rural',
    'Gemeindetypologie (9 Typen)': 'typology_9c',
    'Gemeindetypologie (25 Typen)': 'typology_25c',
}
df_typology = (
    pd.read_excel(EXTERNAL_DATA_DIR / 'geo_data' / 'Raumgliederungen.xlsx')
    .drop(columns=['Gemeindename', 'Bezirksname', 'Kanton'])
    .rename(columns=typology_column_names)
)

In [384]:
# Attach the typology columns via the municipality code; firms without a matching code keep NaN in those columns
df_startups = df_startups.merge(df_typology, on='founding_bfs_code', how='left')

In [None]:
# EINWOHNERZAHL comes from the boundary layer joined above and is exposed under the name `population`
df_startups.rename(columns={'EINWOHNERZAHL': 'population'}, inplace=True)

In [389]:
# Firms for which the typology merge found no matching municipality code
df_startups[df_startups.canton_id.isna()]

Unnamed: 0,ehraid,uid,delete_date,reason_for_dissolution,liquidation,bankruptcy,other_exit,current_name,founding_name,current_legal_form,...,combined_address,geocoded_address,latitude,longitude,population,canton_id,Bezirks-nummer,urban_rural,typology_9c,typology_25c
2469,1255845,CHE395917849,2020-06-19,['Nachdem kein begründeter Einspruch gegen die...,False,False,True,Same Same GmbH in Liquidation,Same Same GmbH,4,...,"Seeplatz 1, 8820 Wädenswil","1, Seeplatz, Wädenswil, Bezirk Horgen, Zürich,...",47.228758,8.676404,0.0,,,,,
80542,1389236,CHE291873431,,,,,,MS Glärnisch AG,MS Glärnisch AG,3,...,"Seeplatz 1, 8820 Wädenswil","1, Seeplatz, Wädenswil, Bezirk Horgen, Zürich,...",47.228758,8.676404,0.0,,,,,
223147,1310452,CHE338654358,,['Mit Entscheid vom 07.01.2025 hat der Einzelr...,False,True,False,Peter Jegen GmbH in Liquidation,Peter Jegen GmbH,4,...,"Sagastrasse 3, 7214 Grüsch","Sägastrasse 3, 9495 Triesen, Liechtenstein",47.088149,9.522204,5532.0,,,,,


In [392]:
# Preview the sample after geocoding and typology enrichment
df_startups.head()

Unnamed: 0,ehraid,uid,delete_date,reason_for_dissolution,liquidation,bankruptcy,other_exit,current_name,founding_name,current_legal_form,...,combined_address,geocoded_address,latitude,longitude,population,canton_id,district_id,urban_rural,typology_9c,typology_25c
0,1251325,CHE153193257,,['Mit Urteil des Gerichtspräsidenten des Zivil...,True,False,False,Arlez Carrosserie GmbH in Liquidation,Arlez Carrosserie GmbH,4,...,"Bleichiweg 4, 4460 Gelterkinden","4, Bleichiweg, Gelterkinden, Bezirk Sissach, B...",47.460137,7.86118,6296.0,13.0,1304.0,2.0,21.0,217.0
1,1251326,CHE392024369,2020-11-11,[],False,False,True,Vista Coaching GmbH in Liquidation,Vista Coaching GmbH,4,...,"Ergolzstrasse 13, 4414 Füllinsdorf","13, Ergolzstrasse, Füllinsdorf, Bezirk Liestal...",47.504062,7.724207,4700.0,13.0,1303.0,1.0,11.0,113.0
2,1251327,CHE473646370,,,,,,Wissler Consulting GmbH,Wissler Consulting GmbH,4,...,"Wintersingerstrasse 18a, 4464 Maisprach","18a, Wintersingerstrasse, Maisprach, Bezirk Si...",47.523075,7.845789,941.0,13.0,1304.0,3.0,23.0,236.0
3,1251328,CHE205344235,,,,,,Wolf Regio GmbH,Wolf Regio GmbH,4,...,"Grellingerstrasse 32, 4142 Münchenstein","32, Grellingerstrasse, Münchenstein, Bezirk Ar...",47.518085,7.603328,12304.0,13.0,1301.0,1.0,11.0,112.0
4,1251329,CHE190527339,,"['Mit Entscheid vom 27.09.2022 , 9.15 Uhr , ha...",False,True,False,AHAS GmbH in Liquidation,AHAS GmbH,4,...,"Luzernstrasse 60, 6102 Malters","60, Luzernstrasse, Bühl, Malters, Luzern, 6102...",47.0363,8.177812,7771.0,3.0,312.0,2.0,21.0,216.0


In [None]:
# Persist the geocoded sample; the next cell reloads this file so later stages can start from it
df_startups.to_csv(RAW_DATA_DIR / 'company_sample' / 'geo_coded_company_sample.csv', index=False)

In [2]:
# Reload the cached geocoded sample.
# `founding_date` is stored as text in the CSV and must be parsed back to datetime64:
# the registered-capital merge further down joins on it against a datetime column.
df_startups = pd.read_csv(
    RAW_DATA_DIR / 'company_sample' / 'geo_coded_company_sample.csv',
    parse_dates=['founding_date'],
)

### ADD STARTING CAPITAL TO COMPANY DATA

In [None]:
# Get historical exchange rates
import yfinance as yf

# One price history per foreign currency against CHF (ticker '<symbol>CHF=X'), tagged with its currency symbol
exchange_rate_dfs = []
for symbol in ['EUR', 'GBP', 'USD']:
    ticker = yf.Ticker(f'{symbol}CHF=X')
    df_ticker = ticker.history(start='2016-01-01', end='2024-01-01')
    df_ticker['symbol'] = symbol
    exchange_rate_dfs.append(df_ticker)

In [None]:
# Keep only the opening rate per day and currency
df_exchange_rates = pd.concat(exchange_rate_dfs).reset_index()[['Date', 'symbol', 'Open']]

df_exchange_rates = df_exchange_rates.rename(columns={'Date': 'founding_date'})
# Reduce the timestamps to plain calendar dates so they line up with the daily grid below
df_exchange_rates['founding_date'] = pd.to_datetime(df_exchange_rates['founding_date']).dt.date
df_exchange_rates['founding_date'] = pd.to_datetime(df_exchange_rates['founding_date'])

# Full calendar grid: one row per day and currency, so that days without a quote also get a rate
df_temp = pd.DataFrame({'founding_date': pd.date_range(start='2016-01-01', end='2024-01-01').tolist() * 3})
df_temp['symbol'] = ['EUR'] * int(len(df_temp) / 3) + ['GBP'] * int(len(df_temp) / 3) + ['USD'] * int(len(df_temp) / 3)

df_exchange_rates = df_temp.merge(df_exchange_rates, on=['founding_date', 'symbol'], how='left')

# Fill gaps strictly within each currency. A plain ffill over the stacked frame would carry the
# last rate of one currency into the leading gap of the next one. The bfill covers days before the
# first available quote of a currency, which would otherwise stay NaN.
df_exchange_rates['Open'] = df_exchange_rates.groupby('symbol')['Open'].ffill()
df_exchange_rates['Open'] = df_exchange_rates.groupby('symbol')['Open'].bfill()

df_exchange_rates.to_csv(EXTERNAL_DATA_DIR / 'exchange_rates' / 'exchange_rates.csv', index=False)

In [439]:
# All registered-capital history entries published before 2024-01-01
query_capital = """ 
    SELECT * FROM zefix.history_registered_capital WHERE shab_date < '2024-01-01';
"""

In [663]:
# Load the registered-capital history into a DataFrame
with connect_database() as con:
    df_capital = read_from_database(connection=con, query=query_capital)

In [664]:
# Align the column names with the firm sample / exchange-rate table so they can serve as merge keys
df_capital = df_capital.rename(columns={'shab_date': 'founding_date', 'currency_new': 'symbol'})
df_capital['founding_date'] = pd.to_datetime(df_capital['founding_date'])

# Harmonise the currency labels found in the raw data to ISO-style symbols
mapping = {
    'Euro': 'EUR',
    'Eur': 'EUR',
    'EURO': 'EUR',
    '€': 'EUR',
    'fr': 'CHF',
    'Fr.': 'CHF',
    'CHE': 'CHF',
    '£': 'GBP',
    'US': 'USD'
}
df_capital['symbol'] = df_capital['symbol'].replace(mapping)

# Drop duplicate entries where we have liberation information to avoid double counting before aggregation:
# a row is removed only if it shares (ehraid, founding_date) with another row AND its keyword marks a liberation entry.
# na=False treats a missing keyword as "not a liberation entry" instead of propagating NaN into the mask.
is_duplicated = df_capital.duplicated(subset=['ehraid', 'founding_date'], keep=False)
is_liberation = df_capital['keyword'].str.contains('liberierung|liberato|libéré', regex=True, na=False)
df_capital = df_capital[~is_duplicated | ~is_liberation]

# Drop entries where the currency is not a common currency
df_capital = df_capital[df_capital.symbol.isin(['CHF', 'EUR', 'USD', 'GBP'])]

# Add exchange rates and convert registered capital
df_capital = df_capital.merge(df_exchange_rates, on=['symbol', 'founding_date'], how='left')
# Only CHF has a rate of 1 by definition. A foreign-currency row without a rate (e.g. a date outside
# the exchange-rate table) keeps NaN rather than being converted at par.
df_capital.loc[df_capital['symbol'] == 'CHF', 'Open'] = 1.0
df_capital['capital_chf'] = df_capital['capital_new'].astype(float) * df_capital['Open'].astype(float)

# Aggregate capital into one value for registered capital.
# min_count=1 keeps the total at NaN (instead of 0) when none of the entries could be converted.
df_capital_agg = (
    df_capital
    .groupby(['ehraid', 'founding_date'])['capital_chf']
    .sum(min_count=1)
    .reset_index()
)

In [None]:
# Attach the capital registered on the founding date; both `founding_date` key columns must share the same dtype for the merge
df_startups = df_startups.merge(df_capital_agg[['ehraid', 'founding_date', 'capital_chf']], on=['ehraid', 'founding_date'], how='left')

In [694]:
# Preview the sample after adding the registered capital
df_startups.head()

Unnamed: 0,ehraid,uid,delete_date,reason_for_dissolution,liquidation,bankruptcy,other_exit,current_name,founding_name,current_legal_form,...,latitude,longitude,population,canton_id,district_id,urban_rural,typology_9c,typology_25c,capital_chf,company_url
0,1251325,CHE153193257,,['Mit Urteil des Gerichtspräsidenten des Zivil...,True,False,False,Arlez Carrosserie GmbH in Liquidation,Arlez Carrosserie GmbH,4,...,47.460137,7.86118,6296.0,13.0,1304.0,2.0,21.0,217.0,20000.0,http://www.arlez-carrosserie.ch/
1,1251326,CHE392024369,2020-11-11,[],False,False,True,Vista Coaching GmbH in Liquidation,Vista Coaching GmbH,4,...,47.504062,7.724207,4700.0,13.0,1303.0,1.0,11.0,113.0,20000.0,http://vista-coaching.ch/
2,1251327,CHE473646370,,,,,,Wissler Consulting GmbH,Wissler Consulting GmbH,4,...,47.523075,7.845789,941.0,13.0,1304.0,3.0,23.0,236.0,20000.0,no website available
3,1251328,CHE205344235,,,,,,Wolf Regio GmbH,Wolf Regio GmbH,4,...,47.518085,7.603328,12304.0,13.0,1301.0,1.0,11.0,112.0,20000.0,http://wolfregio.ch/
4,1251329,CHE190527339,,"['Mit Entscheid vom 27.09.2022 , 9.15 Uhr , ha...",False,True,False,AHAS GmbH in Liquidation,AHAS GmbH,4,...,47.0363,8.177812,7771.0,3.0,312.0,2.0,21.0,216.0,20000.0,no website available


### ENCODE BPS FEATURES

In [26]:
from collections import Counter
from unidecode import unidecode
import nltk
from nltk.corpus import stopwords
from transformers import pipeline

# Fetch the NLTK resources used below (tokenizer models and stop-word lists)
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Shared stemmer instance used by stem_text
porter_stemmer = nltk.stem.PorterStemmer()

# German umlauts are spelled out before the generic ASCII transliteration
UMLAUT_REPLACEMENTS = {
    'ä': 'ae',
    'ö': 'oe',
    'ü': 'ue',
}

def normalize_words(text: str) -> str:
    """Lower-case a text and reduce it to ASCII.

    Umlauts are replaced by their two-letter spelling (ae / oe / ue); every
    other non-ASCII character is transliterated with ``unidecode``.

    Args:
        text: Text to normalize. Non-string input (e.g. NaN) yields ''.

    Returns:
        The normalized, lower-case ASCII text.
    """
    if not isinstance(text, str):
        return ''
    # Lower-case first so that upper-case umlauts take the same replacement as lower-case ones;
    # otherwise 'Ä' would fall through to unidecode and be transliterated differently from 'ä'
    text = text.lower()
    for char, replacement in UMLAUT_REPLACEMENTS.items():
        text = text.replace(char, replacement)
    return unidecode(text)

def get_language(text: str) -> str:
    """Detect the language code of a text.

    Args:
        text: Text to classify.

    Returns:
        The detected language code. Falls back to 'de' when the input is not
        a string, is blank, or the detector result carries no 'lang' entry.
    """
    if not isinstance(text, str) or not text.strip():
        return 'de'
    # The detector processes a single line at a time, so line breaks are folded into spaces
    language = detect(' '.join(text.splitlines()))
    return language.get('lang', 'de')

# Stop-word sets per language, filled lazily so the word lists are loaded once instead of once per row
_STOP_WORDS_CACHE: dict = {}


def _get_stop_words(language: str) -> set:
    """Return the (cached) NLTK stop-word set for a language name."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = set(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def stem_text(text: str, lang_code: str) -> list[str]:
    """Tokenize a text, remove stop words and stem the remaining tokens.

    Args:
        text: Text to process.
        lang_code: Two-letter language code ('de', 'en', 'fr' or 'it'); any
            other code is treated as German.

    Returns:
        List of stemmed tokens in their original order.
    """
    code2lang = {
        'de': 'german',
        'en': 'english',
        'fr': 'french',
        'it': 'italian'
    }
    language = code2lang.get(lang_code, 'german')
    stop_words = _get_stop_words(language)
    tokens = nltk.tokenize.word_tokenize(text, language=language)
    # NOTE(review): the same Porter stemmer is applied to every language - consider a language-specific stemmer; confirm intent
    return [porter_stemmer.stem(w) for w in tokens if w.lower() not in stop_words]

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/manuelbolz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/manuelbolz/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/manuelbolz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# Reference lists of male / female first names, normalized the same way as the names extracted from the purpose texts
names_dir = EXTERNAL_DATA_DIR / 'newborn_names'
male_names = pd.read_csv(names_dir / 'maennliche_vornamen.csv', encoding='ISO-8859-15', usecols=['Vorname'])
female_names = pd.read_csv(names_dir / 'weibliche_vornamen.csv', encoding='ISO-8859-15', usecols=['Vorname'])

male_first_names = list(map(normalize_words, male_names['Vorname']))
female_first_names = list(map(normalize_words, female_names['Vorname']))

In [None]:
df_startups.loc[df_startups.founding_purpose.isna(), 'founding_purpose'] = df_startups['current_purpose']  # Use current purpose if founding purpose is missing
# Detect the language of every business purpose statement (bps)
df_startups['bps_language'] = df_startups['founding_purpose'].apply(get_language)

# Manual remapping of detected language codes onto 'de' / 'fr' / 'it'
# NOTE(review): 'en' is remapped to 'de' as well, so English purposes are later processed as German - confirm this is intended
corrections = {
    'cs': 'de',
    'en': 'de',
    'sv': 'de',
    'es': 'fr',
    'pt': 'de',
    'pl': 'de',
    'nl': 'de',
    'ca': 'it',
    'et': 'de'
}

df_startups['bps_language'] = df_startups['bps_language'].replace(corrections)

In [20]:
# 1. Apply stop-word removal and stemming
# `bps_normalized` holds one list of stemmed tokens per firm
df_startups['bps_normalized'] = df_startups.apply(lambda row: stem_text(row['founding_purpose'], lang_code=row['bps_language']), axis=1)

In [21]:
# 2. Encode basic features
df_startups['bps_length'] = df_startups['bps_normalized'].apply(lambda x: len(' '.join(x)))  # bps length
# Average token length per bps; an empty token list yields 0.0 instead of raising ZeroDivisionError
df_startups['bps_mean_word_length'] = [
    sum(len(w) for w in word_list) / len(word_list) if word_list else 0.0
    for word_list in df_startups['bps_normalized']
]

In [22]:
# 3. Get quantiles of length metrics
def get_quantiles(df: pd.DataFrame, column: str, q: list[float] = [.2, .4, .6, .8]) -> pd.DataFrame:
    """Add a bucket column that ranks each value of ``column`` against its quantiles.

    A value is placed in bucket ``i`` (1-based) for the first quantile
    threshold it does not exceed; values above every threshold fall into the
    last bucket ``len(q) + 1``.

    Args:
        df: Frame holding ``column``. It is modified in place.
        column: Name of the numeric column to bucket.
        q: Quantile levels used as bucket boundaries (never mutated here).

    Returns:
        The same ``df`` object with the additional column
        ``f'{column}_quantiles_{len(q) + 1}'``.
    """
    thresholds = df[column].quantile(q).tolist()

    def assign_bucket(value) -> int:
        for bucket, tau in enumerate(thresholds, start=1):
            if value <= tau:
                return bucket
        return len(thresholds) + 1

    df[f'{column}_quantiles_{len(thresholds) + 1}'] = df[column].apply(assign_bucket)
    return df

# Bucket the bps length with the default quantile levels of get_quantiles
df_startups = get_quantiles(df_startups, 'bps_length')

In [24]:
# 4. Calculate LIX
def calculate_lix(word_list: list[str]) -> float:
    """Compute a LIX-style readability score for one tokenized text.

    The score is the number of tokens plus the percentage of tokens longer
    than six characters, i.e. the whole text is treated as a single sentence.

    Args:
        word_list: Tokens of the text.

    Returns:
        The score, or 0.0 for an empty token list (which would otherwise
        divide by zero).
    """
    if not word_list:
        return 0.0
    long_word_count = sum(1 for w in word_list if len(w) > 6)
    pct_above_six = long_word_count / len(word_list) * 100
    return len(word_list) + pct_above_six

# One LIX-style score per firm, computed on the stemmed token list
df_startups['bps_lix'] = df_startups['bps_normalized'].apply(calculate_lix)

In [None]:
# 5. Word-frequency Features
def _collect_tokens(lang_code: str) -> list[str]:
    """Collect all alphabetic tokens longer than four characters from the bps of one language."""
    token_lists = df_startups.loc[df_startups['bps_language'] == lang_code, 'bps_normalized']
    return [token for sublist in token_lists for token in sublist if token.isalpha() and len(token) > 4]


word_dictionary_de = _collect_tokens('de')
word_freqencies_de = Counter(word_dictionary_de)

word_dictionary_fr = _collect_tokens('fr')
word_freqencies_fr = Counter(word_dictionary_fr)

word_dictionary_it = _collect_tokens('it')
word_freqencies_it = Counter(word_dictionary_it)

# Per-language lookup plus corpus sizes, computed once instead of once per row
_WORD_FREQUENCIES_BY_LANG = {'de': word_freqencies_de, 'fr': word_freqencies_fr, 'it': word_freqencies_it}
_TOTAL_TOKENS_BY_LANG = {lang: sum(freqs.values()) for lang, freqs in _WORD_FREQUENCIES_BY_LANG.items()}


def compute_specificity_features(bps_tokens: list[str], lang_code: str) -> tuple[float, float, float]:
    """Relate the rarest and the most common token of a bps to the language corpus.

    Args:
        bps_tokens: Stemmed tokens of one business purpose statement.
        lang_code: Language of the statement; anything other than 'fr' or
            'it' uses the German frequencies.

    Returns:
        ``(min_freq_norm, max_freq_norm, ratio)``: the smallest and largest
        corpus frequency among the statement's known tokens, each divided by
        the corpus size, and their ratio. ``(0, 0, 0.0)`` when the input is
        not a list or none of its tokens occurs in the corpus.
    """
    if not isinstance(bps_tokens, list):
        return (0, 0, 0.0)

    frequency_lang = lang_code if lang_code in ('fr', 'it') else 'de'
    word_freqencies = _WORD_FREQUENCIES_BY_LANG[frequency_lang]
    token_freqs = [word_freqencies[token] for token in bps_tokens if token in word_freqencies]
    if not token_freqs:
        return (0, 0, 0.0)

    total_tokens = _TOTAL_TOKENS_BY_LANG[frequency_lang]
    # Normalize it by total number of tokens to account for differences between languages
    min_freq_norm, max_freq_norm = min(token_freqs) / total_tokens, max(token_freqs) / total_tokens

    ratio = min_freq_norm / max_freq_norm if max_freq_norm > 0 else 0.0
    return (min_freq_norm, max_freq_norm, ratio)

df_startups[['bps_min_word_freq_norm', 'bps_max_word_freq_norm', 'bps_freq_ratio_norm']] = df_startups.apply(lambda row: compute_specificity_features(row['bps_normalized'], row['bps_language']), axis=1).apply(pd.Series)

In [None]:
# 6. Encode geographic and name features
# Named-entity recognition pipeline; "simple" aggregation merges word pieces into whole entities.
# NOTE(review): device='mps' is hard-coded - presumably needs changing on machines without that backend; confirm
token_classifier = pipeline(
  model="ZurichNLP/swissbert-ner",
  aggregation_strategy="simple",
  device='mps'
)

def ner_tag_bps(text, token_classifier: pipeline):
    """Run the NER pipeline on a text and split the hits into people and locations.

    Returns a tuple ``(people, locations)`` holding the matched words of the
    entities tagged 'PER' and 'LOC' respectively, in order of appearance.
    """
    people, locations = [], []
    for entity in token_classifier(text):
        group = entity['entity_group']
        if group == 'PER':
            people.append(entity['word'])
        elif group == 'LOC':
            locations.append(entity['word'])
    return people, locations

def contains_male_or_female_name(names, gendered_first_names):
    """Flag whether any extracted person name contains a known first name.

    Args:
        names: Person names (strings), each possibly consisting of several
            whitespace-separated parts.
        gendered_first_names: Collection of first names to look for.

    Returns:
        1 if at least one part of at least one name is contained in
        ``gendered_first_names``, otherwise 0.
    """
    known_first_names = set(gendered_first_names)
    for name in names:
        # Compare the individual name parts; the split result itself is a list and never equals a single first name
        if any(part in known_first_names for part in name.split()):
            return 1
    return 0
    
# Tag each language subset with the matching model language and derive the location / name flags.
# NOTE(review): rows whose bps_language is not 'de', 'fr' or 'it' are not part of the concatenated result - confirm this is intended.
dfs = []
for language in ['de', 'fr', 'it']:
    token_classifier.model.set_default_language(f"{language}_CH")
    # Explicit copy so the new columns are written to an independent frame rather than a slice of df_startups
    df_lang = df_startups[df_startups['bps_language'] == language].copy()

    # ner_tag_bps returns a (people, locations) tuple per row; unpack it into two columns
    ner_results = df_lang['founding_purpose'].apply(lambda x: ner_tag_bps(x, token_classifier))
    df_lang['people'] = ner_results.apply(lambda result: result[0])
    df_lang['locations'] = ner_results.apply(lambda result: result[1])

    df_lang['has_location'] = df_lang['locations'].apply(lambda x: int(len(x) > 0))

    df_lang['people'] = df_lang['people'].apply(lambda names: [normalize_words(name) for name in names])
    df_lang['has_male_name'] = df_lang['people'].apply(lambda x: contains_male_or_female_name(x, male_first_names))
    df_lang['has_female_name'] = df_lang['people'].apply(lambda x: contains_male_or_female_name(x, female_first_names))

    dfs.append(df_lang)

df_startups = pd.concat(dfs)

### ENCODE FIRM NAME FEATURES

### ADD WEBSITE URLS TO COMPANY DATA

In [720]:
# Website URLs per firm, keyed by ehraid
websites = pd.read_csv(RAW_DATA_DIR / 'company_urls' / 'urls.csv')
websites['ehraid'] = websites['ehraid'].astype(int)

# Each firm may appear only once, otherwise the merge below would duplicate firms
assert websites[websites.duplicated(subset='ehraid', keep=False)].empty

In [721]:
# Attach the website URL to every firm.
# NOTE(review): firms missing from urls.csv get NaN here, which the `!= 'no website available'` checks below count as "has website" - confirm every firm is covered
df_startups = df_startups.merge(websites[['ehraid', 'company_url']], on='ehraid', how='left')

In [722]:
# Share of firms whose URL entry is anything other than the "no website" marker.
# The mask is built outside the f-string so the literal's quotes do not nest inside it.
has_website = df_startups.company_url != 'no website available'
print(f'Percentage of companies with found website: {len(df_startups[has_website]) / len(df_startups) * 100}')

Percentage of companies with found website: 47.308415505080376


In [723]:
# Write the sample as two files: firms with a website entry and firms marked 'no website available'
df_startups[df_startups.company_url != 'no website available'].to_csv(RAW_DATA_DIR / 'company_sample' / 'company_sample_website.csv', index=False)
df_startups[df_startups.company_url == 'no website available'].to_csv(RAW_DATA_DIR / 'company_sample' / 'company_sample_no_website.csv', index=False)

In [732]:
# Masks shared by all counts: website availability and any kind of exit (liquidation, bankruptcy or other)
has_website = df_startups.company_url != 'no website available'
has_exited = df_startups.liquidation | df_startups.bankruptcy | df_startups.other_exit

total = len(df_startups)
num_website = len(df_startups[has_website])
num_no_website = len(df_startups[~has_website])

num_exits = len(df_startups[has_exited])
num_exits_website = len(df_startups[has_website & has_exited])
num_exits_no_website = len(df_startups[~has_website & has_exited])

num_survival = len(df_startups[~has_exited])
num_survival_website = len(df_startups[has_website & ~has_exited])
num_survival_no_website = len(df_startups[~has_website & ~has_exited])

In [None]:
# The shares are formatted as percentages so that the printed values match their "Percentage" labels
print(f'Percentage of firms having website: {num_website / total:.2%}')
print(f'Percentage of exited firms having website: {num_exits_website / num_exits:.2%}')
print(f'Percentage of survived firms having website: {num_survival_website / num_survival:.2%}')


print(f'Ratio of survival to exit: {num_survival / num_exits:.2f}')
print(f'Ratio of survival with website to exit with website: {num_survival_website / num_exits_website:.2f}')

Percentage of firms having website: 0.4731
Percentage of exited firms having website: 0.2702
Percentage of survived firms having website: 0.5143
Ratio of survival to exit: 4.92
Ratio of survival with website to exit with website: 9.36


In [727]:
# Firms without any exit flag (no liquidation, bankruptcy or other exit)
df_startups[~(df_startups.liquidation | df_startups.bankruptcy | df_startups.other_exit)]

Unnamed: 0,ehraid,uid,delete_date,dissolution_date,reason_for_dissolution,liquidation,bankruptcy,other_exit,current_name,founding_name,...,latitude,longitude,population,canton_id,district_id,urban_rural,typology_9c,typology_25c,capital_chf,company_url
2,1251327,CHE473646370,,,,,,,Wissler Consulting GmbH,Wissler Consulting GmbH,...,47.523075,7.845789,941.0,13.0,1304.0,3.0,23.0,236.0,20000.0,no website available
3,1251328,CHE205344235,,,,,,,Wolf Regio GmbH,Wolf Regio GmbH,...,47.518085,7.603328,12304.0,13.0,1301.0,1.0,11.0,112.0,20000.0,http://wolfregio.ch/
6,1251331,CHE226355598,,,,,,,J. Uebelmann Gartengestaltung AG,J. Uebelmann Gartengestaltung AG,...,47.033009,8.176346,7771.0,3.0,312.0,2.0,21.0,216.0,100000.0,http://www.uebelmann-garten.ch/
7,1251333,CHE285667805,,,,,,,KERAS GmbH,KERAS GmbH,...,47.128558,8.192858,4160.0,3.0,314.0,3.0,32.0,326.0,20000.0,http://keras.ch/
8,1251335,CHE428921587,,,,,,,DISU's Fahrschule AG,L-Simulator AG,...,47.047219,8.307857,85534.0,3.0,311.0,1.0,12.0,121.0,100000.0,http://www.disus-fahrschule.ch/
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226553,1618578,CHE349470093,,,,,,,SIRAP Holding SA,SIRAP Holding SA,...,46.560951,6.521709,462.0,22.0,2227.0,3.0,23.0,236.0,100000.0,http://www.sirap.ch/
226554,1618579,CHE306057827,,,,,,,SIRAP Immobilier SA,SIRAP Immobilier SA,...,46.560951,6.521709,462.0,22.0,2227.0,3.0,23.0,236.0,102000.0,no website available
226555,1618768,CHE130017661,,,,,,,Ambiens Estates SA,Ambiens Estates SA,...,46.311858,7.482353,10488.0,23.0,2311.0,1.0,13.0,134.0,300000.0,no website available
226556,1618773,CHE275237254,,,,,,,Heritage Estates SA,Heritage Estates SA,...,46.311858,7.482353,10488.0,23.0,2311.0,1.0,13.0,134.0,300000.0,no website available


In [728]:
# Firms without any exit flag that have a website entry
df_startups[(df_startups.company_url != 'no website available') & ~(df_startups.liquidation | df_startups.bankruptcy | df_startups.other_exit)]

Unnamed: 0,ehraid,uid,delete_date,dissolution_date,reason_for_dissolution,liquidation,bankruptcy,other_exit,current_name,founding_name,...,latitude,longitude,population,canton_id,district_id,urban_rural,typology_9c,typology_25c,capital_chf,company_url
3,1251328,CHE205344235,,,,,,,Wolf Regio GmbH,Wolf Regio GmbH,...,47.518085,7.603328,12304.0,13.0,1301.0,1.0,11.0,112.0,20000.0,http://wolfregio.ch/
6,1251331,CHE226355598,,,,,,,J. Uebelmann Gartengestaltung AG,J. Uebelmann Gartengestaltung AG,...,47.033009,8.176346,7771.0,3.0,312.0,2.0,21.0,216.0,100000.0,http://www.uebelmann-garten.ch/
7,1251333,CHE285667805,,,,,,,KERAS GmbH,KERAS GmbH,...,47.128558,8.192858,4160.0,3.0,314.0,3.0,32.0,326.0,20000.0,http://keras.ch/
8,1251335,CHE428921587,,,,,,,DISU's Fahrschule AG,L-Simulator AG,...,47.047219,8.307857,85534.0,3.0,311.0,1.0,12.0,121.0,100000.0,http://www.disus-fahrschule.ch/
10,1251337,CHE208273206,,,,,,,Lio Ko Sàrl,Book2Cook Sàrl,...,46.605823,7.097012,2810.0,10.0,1003.0,2.0,22.0,226.0,21000.0,http://www.book2cook.ch/
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226538,1615399,CHE410263402,,,,,,,Grafanova Sàrl,Grafanova Sàrl,...,46.353632,6.930815,1949.0,22.0,2221.0,2.0,22.0,226.0,20000.0,http://www.grafanova.ch/
226541,1616166,CHE392221591,,,,,,,Nicomatic SA,Nicomatic SA,...,46.226536,6.104038,26882.0,25.0,2500.0,1.0,11.0,112.0,100000.0,http://www.nicomatic.com/
226544,1616650,CHE313606780,,,,,,,Brügglifeld Catering AG,Brügglifeld Catering AG,...,47.383334,8.059803,11340.0,19.0,1901.0,1.0,12.0,123.0,100000.0,http://fcaarau.ch/
226548,1617797,CHE474535922,,,,,,,Piercing Gaby GmbH,Piercing Gaby GmbH,...,47.411037,9.625626,8481.0,17.0,1723.0,1.0,12.0,122.0,20000.0,http://www.piercinggaby.ch/


In [700]:
# Firms with at least one exit flag that have a website entry
df_startups[(df_startups.company_url != 'no website available') & (df_startups.liquidation | df_startups.bankruptcy | df_startups.other_exit)]

Unnamed: 0,ehraid,uid,delete_date,reason_for_dissolution,liquidation,bankruptcy,other_exit,current_name,founding_name,current_legal_form,...,latitude,longitude,population,canton_id,district_id,urban_rural,typology_9c,typology_25c,capital_chf,company_url
0,1251325,CHE153193257,,['Mit Urteil des Gerichtspräsidenten des Zivil...,True,False,False,Arlez Carrosserie GmbH in Liquidation,Arlez Carrosserie GmbH,4,...,47.460137,7.861180,6296.0,13.0,1304.0,2.0,21.0,217.0,20000.0,http://www.arlez-carrosserie.ch/
1,1251326,CHE392024369,2020-11-11,[],False,False,True,Vista Coaching GmbH in Liquidation,Vista Coaching GmbH,4,...,47.504062,7.724207,4700.0,13.0,1303.0,1.0,11.0,113.0,20000.0,http://vista-coaching.ch/
5,1251330,CHE218480994,2022-10-31,['Die Gesellschaft hat sich aufgelöst'],False,False,True,City Martial Arts - Berisha & Colic,City Martial Arts - Berisha & Colic,2,...,47.046432,8.313085,85534.0,3.0,311.0,1.0,12.0,121.0,,http://www.citymartialarts.ch/
38,1251395,CHE432339864,2021-07-28,[],False,False,True,IdeeTransfer Region Bern-Thun GmbH in Liquidation,IdeeTransfer Region Bern-Thun GmbH,4,...,46.872344,7.547907,13113.0,2.0,246.0,1.0,11.0,113.0,20000.0,http://ideetransfer.ch/
45,1251404,CHE461778266,2023-12-28,[],False,False,True,hbs-consulting GmbH in Liquidation,hbs-consulting GmbH,4,...,46.923657,7.411052,42958.0,2.0,246.0,1.0,11.0,113.0,20000.0,http://hbs-consulting.ch/
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226274,1547270,CHE280958589,,"[""Par décision du Tribunal de l'arrondissement...",False,True,False,HA Conseils SA en liquidation,HA Conseils SA,3,...,46.528576,6.594157,21408.0,22.0,2229.0,1.0,11.0,112.0,100000.0,http://www.ha-conseils.ch/
226294,1551774,CHE311587812,,['Die Gesellschaft ist mit Beschluss der Gesel...,True,False,False,M & B Logistik GmbH in Liquidation,M & B Logistik GmbH,4,...,47.674132,9.019238,947.0,20.0,2012.0,3.0,32.0,327.0,20000.0,http://www.mb-logistik.ch/
226315,1558171,CHE296632509,,"[""La société est dissoute par décision de l'as...",True,False,False,"J.R. Edwards Financial Sàrl, en liquidation",J.R. Edwards Financial Sàrl,4,...,46.190684,6.146015,206635.0,25.0,2500.0,1.0,11.0,111.0,20000.0,http://jredwardsfinancial.com/
226401,1580186,CHE264305615,,"[""La société est dissoute par décision de l'as...",True,False,False,Soft & Clean SA en liquidation,Soft & Clean SA ( Soft & Clean AG ) ( Soft & C...,3,...,47.136778,7.246791,55932.0,2.0,242.0,1.0,12.0,121.0,100000.0,http://www.softandclean.ch/


### ENCODE FIRM_LEVEL FEATURES

# **PROCESS INSCRIBED PEOPLE/FIRMS FEATURES**

In [396]:
from success_prediction.zefix_processing.clustering import PersonClustering

pd.set_option('future.no_silent_downcasting', True)

In [397]:
# Both queries keep only rows whose shab_date lies before this cutoff.
# The value is a trusted literal defined here, not user input.
SHAB_CUTOFF_DATE = '2024-01-01'

# Inscribed people flagged as founders, up to the cutoff.
query_inscribed_people = (
    " \n"
    "    SELECT * FROM zefix.history_inscribed_people"
    f" WHERE founders = TRUE AND shab_date < '{SHAB_CUTOFF_DATE}';\n"
)

# Inscribed firms, up to the cutoff.
query_inscribed_firms = (
    " \n"
    "    SELECT * FROM zefix.history_inscribed_firms"
    f" WHERE shab_date < '{SHAB_CUTOFF_DATE}';\n"
)

In [398]:
# Run both queries over a single database connection; the generator is
# consumed by the tuple unpacking while the connection is still open.
with connect_database() as con:
    df_insc_people, df_insc_firms = (
        read_from_database(connection=con, query=sql)
        for sql in (query_inscribed_people, query_inscribed_firms)
    )

In [400]:
# Preview the rows loaded from zefix.history_inscribed_firms.
df_insc_firms

Unnamed: 0,ehraid,shab_date,shab_id,keyword,firm_name,firm_uid,firm_seat,firm_type,firm_shares
0,256,2017-03-02,3380321,nouvel organe de révision,Fiprom S.A. Fiduciaire de Prométerre,CHE-108.474.342,Lausanne,,
1,256,2017-03-02,3380321,personnes inscrites special,Hervest Fiduciaire SA,CHE-107.877.252,,organe de révision,
2,283,2018-07-12,4353769,nouvel organe de révision,KPMG AG,CHE-106.084.881,Zurich,,
3,283,2020-07-14,1004936962,nouvel organe de révision,Ernst & Young AG,CHE-491.907.686,Zurich,succursale,
4,371,2020-03-13,1004852619,personnes inscrites special,CO1 LLC,7553319,"Lewes, USA",associée,50 parts de CHF 1'000
...,...,...,...,...,...,...,...,...,...
172971,1619464,2023-12-29,1005922598,eingetragene personen,Aircon Holding AG,CHE-385.982.271,Otelfingen,Gesellschafterin,mit 10 Stammanteilen zu je CHF 1000.00
172972,1619490,2023-12-29,1005922617,eingetragene personen,Mäder + Baumgartner Treuhand AG,CHE-103.815.364,Neuhausen am Rheinfall,Revisionsstelle,
172973,1619494,2023-12-29,1005922621,eingetragene personen,KBT Revisions AG,CHE-102.663.608,Zürich,Revisionsstelle,
172974,1619495,2023-12-29,1005922622,eingetragene personen,KPMG AG,CHE-106.084.881,Zürich,Revisionsstelle,


In [658]:
# Spot check: all loaded founder rows of the company with ehraid 1603889.
df_insc_people.loc[df_insc_people['ehraid'] == 1603889]

Unnamed: 0,ehraid,shab_date,shab_id,keyword,first_name,first_name_norm,last_name,last_name_norm,job_title,signing_rights,...,place_of_residence_1_bfs_stand_origin,place_of_residence_2_bfs_gmde_code_origin,place_of_residence_2_bfs_stand_origin,hometown_1_bfs_gmde_code_latest,hometown_2_bfs_gmde_code_latest,hometown_3_bfs_gmde_code_latest,hometown_4_bfs_gmde_code_latest,hometown_5_bfs_gmde_code_latest,place_of_residence_1_bfs_gmde_code_latest,place_of_residence_2_bfs_gmde_code_latest
560526,1603889,2023-09-13,1005837221,eingetragene personen,Christian,christian,Bosshard,bosshard,Präsident des Vorstandes,mit Kollektivunterschrift zu zweien,...,01-01-2025,,,261.0,,,,,,
560527,1603889,2023-09-13,1005837221,eingetragene personen,Thomas,thomas,Hessler,hessler,Mitglied des Vorstandes,mit Kollektivunterschrift zu zweien,...,01-01-2025,,,1031.0,,,,,,
560528,1603889,2023-09-13,1005837221,eingetragene personen,Claudia,claudia,Hössbacher,hoessbacher,Mitglied des Vorstandes,mit Kollektivunterschrift zu zweien,...,01-01-2025,,,,,,,,,
560529,1603889,2023-09-13,1005837221,eingetragene personen,Kirsten,kirsten,Moselund,moselund,Mitglied des Vorstandes,mit Kollektivunterschrift zu zweien,...,01-01-2025,,,139.0,,,,,,


In [403]:
# Pre-process dataframe
# Cast every BFS municipality-code column to string and mark the placeholder '0' as missing.
bfs_code_cols = [col for col in df_insc_people.columns if 'bfs_gmde_code_' in col]
bfs_codes = df_insc_people[bfs_code_cols]
df_insc_people[bfs_code_cols] = (
    bfs_codes.astype(str)
    # astype(str) turns NaN/None into the literal strings 'nan'/'None'; mask
    # with the original missingness so those cells stay real NaN. This also
    # makes the cell safe to re-run (NaNs from a first run are not stringified).
    .where(bfs_codes.notna())
    .replace('0', np.nan)
)

In [None]:
# test_df = people_df[people_df.ehraid.isin([1600448, 1251490, 1260743, 1328630])].reset_index(drop=True).copy()

In [404]:
# Run the project's PersonClustering over the pre-processed inscribed-people frame.
# NOTE(review): the stored progress bar shows roughly 6 minutes for this step;
# consider caching `clustered_df` so Restart & Run All stays practical.
clustering = PersonClustering(df_insc_people)
clustered_df = clustering.cluster()

Cluster people within company: 100%|██████████| 357357/357357 [05:53<00:00, 1011.56it/s]


In [405]:
# Inspect the frame returned by PersonClustering.cluster().
clustered_df

Unnamed: 0,ehraid,shab_date,shab_id,keyword,first_name,first_name_norm,last_name,last_name_norm,job_title,signing_rights,...,place_of_residence_2_bfs_stand_origin,hometown_1_bfs_gmde_code_latest,hometown_2_bfs_gmde_code_latest,hometown_3_bfs_gmde_code_latest,hometown_4_bfs_gmde_code_latest,hometown_5_bfs_gmde_code_latest,place_of_residence_1_bfs_gmde_code_latest,place_of_residence_2_bfs_gmde_code_latest,heuristic,fid
178772,1251436,2016-02-03,2637391,eingetragene personen,Hans-Peter Gunnar,hans-peter gunnar,Lennhag,lennhag,Gesellschafter und Geschäftsführer,mit Einzelunterschrift,...,,4195,,,,,,,,1
178798,1251492,2016-02-03,2637449,eingetragene personen,Roman,roman,Schleiss,schleiss,Mitglied des Stiftungsrates,mit Kollektivunterschrift zu zweien,...,,1402,,,,,,,,2
178799,1251456,2016-02-03,2636785,eingetragene personen,Peter,peter,von Gunten,von gunten,Gesellschafter und Geschäftsführer,mit Einzelunterschrift,...,,938,,,,,,,,3
178800,1251457,2016-02-03,2636787,eingetragene personen,Roland,roland,Kalt,kalt,Gesellschafter und Geschäftsführer,mit Einzelunterschrift,...,,261,,,,,,,,4
178801,1251459,2016-02-03,2636791,eingetragene personen,Raffaele,raffaele,Nardone,nardone,Präsident des Verwaltungsrates,mit Kollektivunterschrift zu zweien,...,,3395,,,,,,,,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
576773,1619418,2023-12-29,1005923336,eingetragene personen,Sabrina,sabrina,Marbet,marbet,Mitglied des Vorstandes,mit Einzelunterschrift,...,,612,,,,,,,,384812
576772,1619418,2023-12-29,1005923336,eingetragene personen,Martin,martin,Marbet,marbet,Präsident des Vorstandes,mit Einzelunterschrift,...,,2404,,,,,,,,463418
576771,1619416,2023-12-29,1005923335,eingetragene personen,Robyn Brayan,robyn brayan,Nobs,nobs,Gesellschafter und Geschäftsführer,mit Einzelunterschrift,...,,360,,,,,,,,463419
576769,1619396,2023-12-29,1005923331,eingetragene personen,Marco Andreas,marco andreas,Zühlke,zuehlke,Gesellschafter,Einzelunterschrift,...,,2939,,,,,,,,225405


# **PROCESS ADDITIONAL OUTPUT FEATURES**

### INVOLUNTARY EXIT TARGET

### ACQUISITION TARGET

In [None]:
# Merger history rows with shab_date before 2024-01-01, the same cutoff as
# the inscribed people/firms queries earlier in the notebook.
query_merger = (
    " \n"
    "    SELECT * FROM zefix.history_merger"
    " WHERE shab_date < '2024-01-01';\n"
)

In [None]:
# Execute `query_merger` and keep the result as `df_merger`.
with connect_database() as con:
    df_merger = read_from_database(connection=con, query=query_merger)

### FUNDING TARGET

### NEW PATENT TARGET