In [2]:
import dotenv
import os
import re
from collections import Counter
from unidecode import unidecode
from tqdm import tqdm

import geopandas as gpd
import nltk
from nltk.corpus import stopwords
import numpy as np
import pandas as pd

from ftlangdetect import detect
from geopy.geocoders import Nominatim, GoogleV3
from transformers import pipeline

from pocketknife.database import (
    connect_database, read_from_database)

from success_prediction.config import (
    PROJ_ROOT, RAW_DATA_DIR, EXTERNAL_DATA_DIR, PROCESSED_DATA_DIR)
from success_prediction.zefix_processing.clustering import PersonClustering

dotenv_path = os.path.join(PROJ_ROOT, '.env')
dotenv.load_dotenv(dotenv_path)

pd.set_option('future.no_silent_downcasting', True)

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[32m2025-06-14 12:30:36.398[0m | [1mINFO    [0m | [36msuccess_prediction.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/manuelbolz/Documents/git/for_work/company_success_prediction[0m
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/manuelbolz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/manuelbolz/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/manuelbolz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
CUTOFF_DATE = '2020-04-01'

# **PREPARE INSCRIBED PEOPLE/FIRMS DATA**

In [None]:
query_inscribed_people = f""" 
    SELECT * FROM zefix.history_inscribed_people WHERE founders = TRUE AND shab_date <= '{CUTOFF_DATE}';
"""

query_inscribed_firms = f""" 
    SELECT * FROM zefix.history_inscribed_firms WHERE shab_date <= '{CUTOFF_DATE}';
"""

with connect_database() as con:
    df_insc_people = read_from_database(connection=con, query=query_inscribed_people)
    df_insc_firms = read_from_database(connection=con, query=query_inscribed_firms)

In [30]:
def group_fids_with_combine_first(df, group_cols=['ehraid', 'fid']):
    """
    Combines instances of the same fid within a company and fills missing
    to create a complete representation for each founder
    """
    filled_rows = []
    for _, group in df.groupby(group_cols, sort=False):
        # Use first entry as base instance of fid
        combined = group.iloc[0]
        # Iteratively combine_first with the next row(s)
        for i in range(1, len(group)):
            combined = combined.combine_first(group.iloc[i])
        filled_rows.append(combined)
    return pd.DataFrame(filled_rows).reset_index(drop=True)

def build_founder_dict(df):
    founder_dict = {}
    for _, row in df.iterrows():
        ehraid = row['ehraid']
        fid = row['fid']
        
        # Handle BFS code lists for hometown (1–5)
        hometown_bfs_codes_latest = [
            int(code) for code in [
                row.get('hometown_1_bfs_gmde_code_latest'),
                row.get('hometown_2_bfs_gmde_code_latest'),
                row.get('hometown_3_bfs_gmde_code_latest'),
                row.get('hometown_4_bfs_gmde_code_latest'),
                row.get('hometown_5_bfs_gmde_code_latest'),
            ] if pd.notnull(code)
        ]
        
        # Handle BFS code lists for places of residence (1–2)
        place_of_residence_bfs_codes_latest = [
            int(code) for code in [
                row.get('place_of_residence_1_bfs_gmde_code_latest'),
                row.get('place_of_residence_2_bfs_gmde_code_latest'),
            ] if pd.notnull(code)
        ]
        
        # Handle nationality codes (1–3)
        nationality_iso_codes = [
            str(code) for code in [
                row.get('nationality_1_iso_3166_1_alpha_2'),
                row.get('nationality_2_iso_3166_1_alpha_2'),
                row.get('nationality_3_iso_3166_1_alpha_2'),
            ] if pd.notnull(code) and code != np.nan
        ]
        
        # Prepare entry
        founder_data = {
            'first_name': row.get('first_name') or '',
            'last_name': row.get('last_name') or '',
            'first_name_norm': row.get('first_name_norm') or '',
            'last_name_norm': row.get('last_name_norm') or '',
            'gender': row.get('gender') or '',
            'job_title': row.get('job_title') or '',
            'dr_title': row.get('founder_with_academic_title'),
            'signing_rights': row.get('signing_rights') or '',
            'shares': row.get('shares') or '',
            'hometown_bfs_codes_latest': hometown_bfs_codes_latest,
            'place_of_residence_bfs_codes_latest': place_of_residence_bfs_codes_latest,
            'nationality_iso_codes': nationality_iso_codes,
        }
        
        # Insert into nested dictionary
        if ehraid not in founder_dict:
            founder_dict[ehraid] = {}
        founder_dict[ehraid][fid] = founder_data
    
    return founder_dict

def count_founders(founder_dict):
    stats = {}
    for ehraid, founders in founder_dict.items():
        total = 0
        male = 0
        female = 0
        swiss = 0
        foreign = 0
        dr = 0
        for fdata in founders.values():
            total += 1
            gender = fdata.get('gender', '')
            if gender == 'm':
                male += 1
            elif gender == 'f':
                female += 1
            nationalities = fdata.get('nationality_iso_codes', [])
            if 'CH' in nationalities:
                swiss += 1
            else:
                foreign += 1
            dr += fdata.get('dr_title', 0)

        stats[ehraid] = {
            'n_founders': total,
            'n_female_founders': female,
            'n_male_founders': male,
            'n_swiss_founders': swiss,
            'n_foreign_founders': foreign,
            'n_dr_titles': dr
        }
    return stats

def get_founder_lists(founder_dict):
    stats = {}
    for ehraid, founders in founder_dict.items():
        names = [(fdata.get('first_name_norm', ''), fdata.get('last_name_norm', '')) for fdata in founders.values()]
        fids = [k for k in founders.keys()]
        nationalities = [fdata.get('nationality_iso_codes', []) for fdata in founders.values()]
        hometowns = [fdata.get('hometown_bfs_codes_latest', []) for fdata in founders.values()]
        residencies = [fdata.get('place_of_residence_bfs_codes_latest', []) for fdata in founders.values()]

        stats[ehraid] = {
            'founder_names': names,
            'founder_fids': fids,
            'founder_nationalities': nationalities,
            'founder_hometowns': hometowns,
            'founder_residencies': residencies
        }
    return stats

In [31]:
# Pre-process and cluster the dataframe
bfs_code_cols = [col for col in df_insc_people.columns if 'bfs_gmde_code_' in col]
df_insc_people[bfs_code_cols] = df_insc_people[bfs_code_cols].astype(str).replace('0', np.nan)
df_insc_people[bfs_code_cols] = df_insc_people[bfs_code_cols].replace('None', np.nan).replace('', np.nan)

# Cluster the people
clustering = PersonClustering(df_insc_people)
clustered_df = clustering.cluster()

Cluster people within company: 100%|██████████| 357357/357357 [09:28<00:00, 629.04it/s]


In [32]:
# Group fids within companies
filled_df = group_fids_with_combine_first(clustered_df)
for col in bfs_code_cols:
    filled_df[col] = filled_df[col].astype(float)

In [33]:
doctor_titles = [
    r"dr\.?",
    r"doctor",
    r"doktor",
    r"prof\.?",
    r"ph\.?\s*d\.?",         # PhD / Ph.D.
    r"dphil\.?",             # DPhil (Oxford)
    r"sc\.?\s*d\.?",         # ScD / Sc.D.
    r"dsc\.?", r"drsc\.?",   # DSc / DrSc
    r"dr\.?\s*-?\s*ing\.?",  # Dr-Ing.
    r"dott\.?",  # Italian variants
    r"dottore",
    r"hdr", # French short forms
]
pattern = re.compile(rf"\b(?:{'|'.join(doctor_titles)})\b", flags=re.IGNORECASE)
filled_df['founder_with_academic_title'] = filled_df['first_name'].fillna('').str.contains(pattern) | filled_df['last_name'].fillna('').str.contains(pattern)
filled_df['founder_with_academic_title'] = filled_df['founder_with_academic_title'].astype(int)

In [34]:
# Build dictionary with founders
founder_dict = build_founder_dict(filled_df)

# Count the total founders, female foundes, and male founders
count_stats = count_founders(founder_dict)

# Get the names, nationalities, hometowns, and place of residency of all founders as flat lists
founder_lists = get_founder_lists(founder_dict)

In [35]:
# Prepare the number of inscribed firms at founding
grouped_insc_firms = df_insc_firms.groupby(['ehraid', 'shab_date']).agg({'shab_id': 'count'}).reset_index().rename(columns={'shab_id': 'n_inscribed_firms', 'shab_date': 'firm_inscription_date'})
grouped_insc_firms = grouped_insc_firms.sort_values('firm_inscription_date').drop_duplicates(subset=['ehraid'])

# **PREPARE FIRM-LEVEL DATA**

In [36]:
id2legalform = {
    1: 'Sole proprietorship',  # Einzelunternehmen  ->  EXCLUDE
    2: 'General Partnership',  # Kollektivgesellschaft  ->  INCLUDE
    3: 'Corporation',  # Aktiengesellschaft  ->  INCLUDE
    4: 'Limited Liability Company',  # Gesellschaft mit beschränkter Haftung  ->  INCLUDE
    5: 'Cooperative',  # Genossenschaft  ->  EXCLUDE
    6: 'Association',  # Verein  ->  EXCLUDE
    7: 'Foundation',  # Stiftung  ->  EXCLUDE
    8: 'Public sector institution',  # Institut des öffentlichen Rechts  ->  EXCLUDE
    9: 'Branch',  # Zweigniederlassung  ->  EXCLUDE
    10: 'Limited Partnership',  # Kommanditgesellschaft  ->  INCLUDE
    11: 'Foreign branch',  # Zweigniederlassung einer ausl. Gesellschaft  ->  EXCLUDE
    12: 'Corporation with unlimited partners',  # Kommanditaktiengesellschaft  ->  INCLUDE
    13: 'Special legal form',  # Besondere Rechtsform  ->  EXCLUDE
    14: 'Ownership in undivided shares',  # Gemeinderschaft  ->  EXCLUDE
    15: 'Limited Partnership for collective investment schemes with a fixed capital',  # Investmentgesellschaft mit festem Kapital  ->  INCLUDE
    16: 'Limited Partnership for collective investment schemes with a variable capital',  # Investmentgesellschaft mit variablem Kapital  ->  INCLUDE
    17: 'Limited Partnership for collective investment schemes',  # Kommanditgesellschaft für kollektive Kapitalanlagen  ->  INCLUDE
    18: 'Non commercial power of attorney',  # Nichtkaufmännische Prokure  ->  EXCLUDE
    19: '(unknown)',  # (unbekannt)  ->  EXCLUDE
}

growth_oriented_legal_forms = [2, 3, 4, 10, 12, 15, 16, 17]

In [None]:
# This query gets the sample of growth oriented firms that were founded between 2016 and current for the prediction sample

query_founded_firms = f""" 
    SELECT
        base.ehraid,
        base.uid,

        -- Names
        base.name AS current_name,
        founding_name.firm_name AS founding_name,

        -- Legal forms
        base.legal_form_id AS current_legal_form,
        legal_form.legal_form_id AS founding_legal_form,

        -- Purpose
        base.purpose_raw AS current_purpose,
        founding_purpose.purpose_raw AS founding_purpose,
        founding_purpose.purpose_clean AS founding_purpose_clean,

        -- Founding NOGA code
        founding_purpose.section_1_label AS founding_noga_section_1,
        founding_purpose.class_1_label AS founding_noga_class_1,
        founding_purpose.prediction_1_score AS founding_noga_score_1,
        founding_purpose.section_2_label AS founding_noga_section_2,
        founding_purpose.class_2_label AS founding_noga_class_2,
        founding_purpose.prediction_2_score AS founding_noga_score_2,
        founding_purpose.section_2_label AS founding_noga_section_3,
        founding_purpose.class_2_label AS founding_noga_class_3,
        founding_purpose.prediction_2_score AS founding_noga_score_3,

        -- Current address
        COALESCE(address.street, '') || ' ' || COALESCE(address.house_number, '') AS current_street,
        address.town AS current_town,
        address.swiss_zip_code AS current_zip_code,
        address.country AS current_country,

        -- Founding address
        founding_address.street AS founding_street,
        founding_address.town AS founding_town,
        founding_address.postal_code AS founding_zip_code,
        founding_address.town_bfs_gmde_code_latest AS founding_bfs_code,

        -- Founding SHAB entry
        shab.shab_id,
        shab.shab_date AS founding_date,
        shab.message AS founding_message

    FROM zefix_release_159.base base

    -- Founding SHAB messages
    INNER JOIN (
        SELECT s.ehraid, s.shab_id, s.shab_date, s.message
        FROM zefix_release_159.shab s
        INNER JOIN zefix_release_159.shab_mutation sm ON s.shab_id = sm.shab_id
        WHERE sm.description = 'status.neu'
    ) AS shab ON base.ehraid = shab.ehraid

    -- Current address
    LEFT JOIN zefix_release_159.address address ON base.ehraid = address.ehraid

    -- Founding address
    LEFT JOIN (
        SELECT DISTINCT hfa.ehraid, hfa.street, hfa.postal_code, hfa.town, hfa.town_bfs_gmde_code_latest
        FROM zefix.history_firm_addresses hfa
        WHERE founding = TRUE
    ) AS founding_address ON base.ehraid = founding_address.ehraid

    -- Founding name
    LEFT JOIN (
        SELECT DISTINCT hfn.ehraid, hfn.firm_name
        FROM zefix.history_firm_names hfn
        WHERE hfn.founding = TRUE
    ) AS founding_name ON base.ehraid = founding_name.ehraid

    -- Founding purpose
    LEFT JOIN (
        SELECT DISTINCT 
            hp.ehraid, hp.purpose_raw, sec.purpose as purpose_clean,
            sec.section_1_label, sec.class_1_label, sec.section_1_label, sec.prediction_1_score,
            sec.section_2_label, sec.class_2_label, sec.section_2_label, sec.prediction_2_score,
            sec.section_3_label, sec.class_3_label, sec.section_3_label, sec.prediction_3_score
        FROM zefix.history_purpose hp
        LEFT JOIN zefix.history_sector sec
        ON hp.ehraid = sec.ehraid AND hp.shab_id = sec.shab_id
        WHERE hp.founding_purpose = TRUE
    ) AS founding_purpose ON base.ehraid = founding_purpose.ehraid

    -- Founding legal form
    LEFT JOIN (
        SELECT DISTINCT hlf.ehraid, hlf.legal_form_id
        FROM zefix.history_founding_legal_form hlf
    ) AS legal_form ON base.ehraid = legal_form.ehraid

    -- Filter out irrelevant records
    WHERE
        NOT base.is_branch
        AND shab.shab_date < '{CUTOFF_DATE}'
        AND base.legal_form_id IN (2, 3, 4, 10, 12, 15, 16, 17)
        AND LOWER(base.name) NOT LIKE '%zweigniederlassung%'
        AND LOWER(base.name) NOT LIKE '%succursale%';
"""

In [704]:
with connect_database() as con:
    df_startups = read_from_database(connection=con, query=query_founded_firms)

df_startups['founding_date'] = pd.to_datetime(df_startups['founding_date'])

In [706]:
# Observed duplicates stem from entries having multiple new inscriptions in Zefix. -> Remove them from the sample because history seems to contain errors
display(df_startups[df_startups.duplicated(subset=['ehraid', 'founding_town'], keep=False)].uid.unique())
df_startups = df_startups.drop_duplicates(subset=['ehraid'], keep=False)
display(df_startups.shape)

array([], dtype=object)

(226559, 25)

### GEO ENCODE ADDRESS INFORMATION

In [None]:
# Use the current information if the founding address is missing
df_startups['founding_street'] = df_startups['founding_street'].fillna(df_startups['current_street'])
df_startups['founding_zip_code'] = df_startups['founding_zip_code'].fillna(df_startups['current_zip_code'])
df_startups['founding_town'] = df_startups['founding_town'].fillna(df_startups['current_town'])

In [116]:
assert df_startups[df_startups['founding_street'].isna()].empty
assert df_startups[df_startups['founding_zip_code'].isna()].empty
assert df_startups[df_startups['founding_town'].isna()].empty

In [None]:
nominatim_geolocator = Nominatim(
    user_agent="local_geocoder",
    domain="localhost:8080",
    scheme="http"
)
google_geolocator = GoogleV3(api_key=os.getenv('GOOGLE_GEOCODE_API_KEY'))


def geocode_address(nominatim_geolocator, google_geolocator, row):
    try:
        location = nominatim_geolocator.geocode({
            'street': row['founding_street'],
            'city': row['founding_town'],
            'postalcode': int(row['founding_zip_code']),
            'country': 'Schweiz'
        }, timeout=2)
        if location:
            return pd.Series([location.address, location.latitude, location.longitude])
        else:
            location = google_geolocator.geocode({
                'street': row['founding_street'],
                'city': row['founding_town'],
                'postalcode': int(row['founding_zip_code']),
                'country': 'Schweiz'
            }, timeout=1)
            if location:
                return pd.Series([location.address, location.latitude, location.longitude])
            return pd.Series([None, None, None])
    except Exception as e:
        print(f"Error: {e}")
        return pd.Series([None, None, None])

In [None]:
df_startups[['geocoded_address', 'latitude', 'longitude']] = df_startups.apply(lambda row: geocode_address(nominatim_geolocator, google_geolocator, row), axis=1)

In [285]:
df_startups['founding_bfs_code'] = df_startups['founding_bfs_code'].astype(int)
df_startups['founding_zip_code'] = df_startups['founding_zip_code'].astype(int)

### DETERMINE BFS MUNICIPALITY CODE BY COORDINATES WHERE MISSING

In [372]:
gdf = gpd.read_file(EXTERNAL_DATA_DIR / 'geo_data' / 'swissBOUNDARIES3D_1_5_LV95_LN02.gdb', layer="TLM_HOHEITSGEBIET")
gdf = gdf.to_crs("EPSG:4326")
gdf = gdf[['geometry', 'BFS_NUMMER', 'EINWOHNERZAHL']]

df_startups = gpd.GeoDataFrame(
    df_startups,
    geometry=gpd.points_from_xy(df_startups['longitude'], df_startups['latitude']),
    crs="EPSG:4326"
)

df_startups = gpd.sjoin(df_startups, gdf, how="left", predicate="within")

# Replace where code is 0 (unmatched) or where it does not match the coordinates
df_startups.loc[df_startups['founding_bfs_code'] == 0, 'founding_bfs_code'] = pd.NA
df_startups.loc[(df_startups['founding_bfs_code'].astype(float) != df_startups['BFS_NUMMER']) & (~df_startups['BFS_NUMMER'].isna()), 'founding_bfs_code'] = pd.NA
df_startups['founding_bfs_code'] = df_startups['founding_bfs_code'].fillna(df_startups['BFS_NUMMER'])

In [374]:
df_startups['founding_bfs_code'] = df_startups['founding_bfs_code'].astype(int)

In [375]:
df_startups[df_startups.founding_bfs_code.astype(float) != df_startups.BFS_NUMMER][['founding_town', 'combined_address', 'founding_bfs_code', 'BFS_NUMMER']]

Unnamed: 0,founding_town,combined_address,founding_bfs_code,BFS_NUMMER
223603,Chiasso,"Via Henry Dunant 1, 6830 Chiasso",5250,
223613,Morcote,"Via Isella 11, 6922 Morcote",5203,
223639,Brusino Arsizio,"Via Lungolago 83, 6827 Brusino Arsizio",5160,
224506,San Bernardino,"Residenza Mons Avium , appartamento 25, 6565 S...",3822,
224785,La Tène,"route de Bellevue 7, 2074 La Tène",6513,
225829,Roggwil TG,"Im Pünst 1, 9325 Roggwil TG",4431,
225873,Bassins,"Chemin de Raulan 24, 1269 Bassins",5703,
226046,Warth,"Kartause Ittingen, 8532 Warth",4621,


In [None]:
df_startups.drop(columns=['geometry', 'index_right', 'Unnamed: 0', 'BFS_NUMMER'], inplace=True)

### ADD MUNICIPALITY TYPOLOGY

In [None]:
df_typology = pd.read_excel(EXTERNAL_DATA_DIR / 'geo_data' / 'Raumgliederungen.xlsx')
df_typology.drop(columns=['Gemeindename', 'Bezirksname', 'Kanton'], inplace=True)
df_typology = df_typology.rename(columns={'BFS Gde-nummer': 'founding_bfs_code', 'Bezirks-nummer': 'district_id', 'Kantons-nummer': 'canton_id', 'Stadt/Land-Typologie': 'urban_rural', 'Gemeindetypologie (9 Typen)': 'typology_9c', 'Gemeindetypologie (25 Typen)': 'typology_25c'})

In [None]:
df_startups = df_startups.merge(df_typology, on='founding_bfs_code', how='left')
df_startups.rename(columns={'EINWOHNERZAHL': 'population'}, inplace=True)

In [389]:
df_startups[df_startups.canton_id.isna()]

Unnamed: 0,ehraid,uid,delete_date,reason_for_dissolution,liquidation,bankruptcy,other_exit,current_name,founding_name,current_legal_form,...,combined_address,geocoded_address,latitude,longitude,population,canton_id,Bezirks-nummer,urban_rural,typology_9c,typology_25c
2469,1255845,CHE395917849,2020-06-19,['Nachdem kein begründeter Einspruch gegen die...,False,False,True,Same Same GmbH in Liquidation,Same Same GmbH,4,...,"Seeplatz 1, 8820 Wädenswil","1, Seeplatz, Wädenswil, Bezirk Horgen, Zürich,...",47.228758,8.676404,0.0,,,,,
80542,1389236,CHE291873431,,,,,,MS Glärnisch AG,MS Glärnisch AG,3,...,"Seeplatz 1, 8820 Wädenswil","1, Seeplatz, Wädenswil, Bezirk Horgen, Zürich,...",47.228758,8.676404,0.0,,,,,
223147,1310452,CHE338654358,,['Mit Entscheid vom 07.01.2025 hat der Einzelr...,False,True,False,Peter Jegen GmbH in Liquidation,Peter Jegen GmbH,4,...,"Sagastrasse 3, 7214 Grüsch","Sägastrasse 3, 9495 Triesen, Liechtenstein",47.088149,9.522204,5532.0,,,,,


### ADD STARTING CAPITAL TO COMPANY DATA

In [None]:
# Get historical exchange rates
import yfinance as yf

exchange_rate_dfs = []
for symbol in ['EUR', 'GBP', 'USD']:
    ticker = yf.Ticker(f'{symbol}CHF=X')
    df_ticker = ticker.history(start='2016-01-01', end='2024-01-01')
    df_ticker['symbol'] = symbol
    exchange_rate_dfs.append(df_ticker)

In [None]:
df_exchange_rates = pd.concat(exchange_rate_dfs).reset_index()[['Date', 'symbol', 'Open']]

df_exchange_rates = df_exchange_rates.rename(columns={'Date': 'founding_date'})
df_exchange_rates['founding_date'] = pd.to_datetime(df_exchange_rates['founding_date']).dt.date
df_exchange_rates['founding_date'] = pd.to_datetime(df_exchange_rates['founding_date'])

df_temp = pd.DataFrame({'founding_date': pd.date_range(start='2016-01-01', end='2024-01-01').tolist() * 3})
df_temp['symbol'] = ['EUR'] * int(len(df_temp) / 3) + ['GBP'] * int(len(df_temp) / 3) + ['USD'] * int(len(df_temp) / 3)

df_exchange_rates = df_temp.merge(df_exchange_rates, on=['founding_date', 'symbol'], how='left')
df_exchange_rates['Open'] = df_exchange_rates['Open'].ffill()
df_exchange_rates['symbol'] = df_exchange_rates['symbol'].ffill()

df_exchange_rates.to_csv(EXTERNAL_DATA_DIR / 'exchange_rates' / 'exchange_rates.csv', index=False)

In [439]:
query_capital = """ 
    SELECT * FROM zefix.history_registered_capital WHERE shab_date < '2024-01-01';
"""

In [663]:
with connect_database() as con:
    df_capital = read_from_database(connection=con, query=query_capital)

In [664]:
df_capital = df_capital.rename(columns={'shab_date': 'founding_date', 'currency_new': 'symbol'})
df_capital['founding_date'] = pd.to_datetime(df_capital['founding_date'])

mapping = {
    'Euro': 'EUR',
    'Eur': 'EUR',
    'EURO': 'EUR',
    '€': 'EUR',
    'fr': 'CHF',
    'Fr.': 'CHF',
    'CHE': 'CHF',
    '£': 'GBP',
    'US': 'USD'
}
df_capital['symbol'] = df_capital['symbol'].replace(mapping)

# Drop duplicate entries where we have libaration information do avoid duplicates before aggregation
df_capital = df_capital[~df_capital.duplicated(subset=['ehraid', 'founding_date'], keep=False) | (df_capital.duplicated(subset=['ehraid', 'founding_date'], keep=False) & ~(df_capital['keyword'].str.contains('liberierung|liberato|libéré', regex=True)))]

# Drop entries where the currency is not a common currency
df_capital = df_capital[df_capital.symbol.isin(['CHF', 'EUR', 'USD', 'GBP'])]

# Add exchange rates and convert registered capital
df_capital = df_capital.merge(df_exchange_rates, on=['symbol', 'founding_date'], how='left')
df_capital['Open'] = df_capital['Open'].fillna(1.0)
df_capital['capital_chf'] = df_capital['capital_new'].astype(float) * df_capital['Open'].astype(float)

# Aggregate capital into one value for registered capital
df_capital_agg = df_capital.groupby(['ehraid', 'founding_date']).agg({'capital_chf': 'sum'}).reset_index()

In [None]:
df_startups = df_startups.merge(df_capital_agg[['ehraid', 'founding_date', 'capital_chf']], on=['ehraid', 'founding_date'], how='left')

In [None]:
# Map from legal form to required minimum capital (at registration)
legalform_min_capital = {
    2: 0,        # General Partnership
    3: 100_000,  # Corporation (AG)
    4: 20_000,   # GmbH
    10: 0,       # Limited Partnership
    12: 100_000, # Corporation with unlimited partners
    15: 500_000, # Investmentgesellschaft mit festem Kapital
    16: 5_000_000,  # SICAV (see note)
    17: 100_000, # Kommanditgesellschaft für kollektive Kapitalanlagen
}

# Clean up legal form column
df_startups['founding_legal_form'] = df_startups['founding_legal_form'].fillna(2).astype(int)  # Assume missings are General Partnerships

# Only update capital where it is missing
mask = df_startups['capital_chf'].isna()
df_startups.loc[mask, 'capital_chf'] = (
    df_startups.loc[mask, 'founding_legal_form']
    .map(legalform_min_capital)
    .fillna(0)
)

In [694]:
df_startups.head()

Unnamed: 0,ehraid,uid,delete_date,reason_for_dissolution,liquidation,bankruptcy,other_exit,current_name,founding_name,current_legal_form,...,latitude,longitude,population,canton_id,district_id,urban_rural,typology_9c,typology_25c,capital_chf,company_url
0,1251325,CHE153193257,,['Mit Urteil des Gerichtspräsidenten des Zivil...,True,False,False,Arlez Carrosserie GmbH in Liquidation,Arlez Carrosserie GmbH,4,...,47.460137,7.86118,6296.0,13.0,1304.0,2.0,21.0,217.0,20000.0,http://www.arlez-carrosserie.ch/
1,1251326,CHE392024369,2020-11-11,[],False,False,True,Vista Coaching GmbH in Liquidation,Vista Coaching GmbH,4,...,47.504062,7.724207,4700.0,13.0,1303.0,1.0,11.0,113.0,20000.0,http://vista-coaching.ch/
2,1251327,CHE473646370,,,,,,Wissler Consulting GmbH,Wissler Consulting GmbH,4,...,47.523075,7.845789,941.0,13.0,1304.0,3.0,23.0,236.0,20000.0,no website available
3,1251328,CHE205344235,,,,,,Wolf Regio GmbH,Wolf Regio GmbH,4,...,47.518085,7.603328,12304.0,13.0,1301.0,1.0,11.0,112.0,20000.0,http://wolfregio.ch/
4,1251329,CHE190527339,,"['Mit Entscheid vom 27.09.2022 , 9.15 Uhr , ha...",False,True,False,AHAS GmbH in Liquidation,AHAS GmbH,4,...,47.0363,8.177812,7771.0,3.0,312.0,2.0,21.0,216.0,20000.0,no website available


### ADD WEBSITE URLS TO COMPANY DATA

In [19]:
websites = pd.read_csv(RAW_DATA_DIR / 'company_urls' / 'urls.csv')
current_website_stats = pd.read_csv(PROCESSED_DATA_DIR / 'summary_stats' / 'current_website_stats_grouped.csv')
current_website_stats.columns = [f'current_{col}' if col != 'ehraid' else col for col in current_website_stats.columns]

founding_website_stats = pd.read_csv(PROCESSED_DATA_DIR / 'summary_stats' / 'wayback_website_stats_grouped.csv')
founding_website_stats.columns = [f'founding_{col}' if col != 'ehraid' else col for col in founding_website_stats.columns ]

websites['ehraid'] = websites['ehraid'].astype(int)

assert websites[websites.duplicated(subset='ehraid', keep=False)].empty

In [20]:
df_startups = df_startups.merge(websites[['ehraid', 'company_url']], on='ehraid', how='left')
df_startups = df_startups.merge(current_website_stats[['ehraid', 'current_n_pages', 'current_total_text_len', 'current_mean_text_len', 'current_n_internal_links_mean', 'current_n_external_links_mean', 'current_n_languages', 'current_dominant_language']], on='ehraid', how='left')
df_startups = df_startups.merge(founding_website_stats[['ehraid', 'founding_n_pages', 'founding_total_text_len', 'founding_mean_text_len', 'founding_n_internal_links_mean', 'founding_n_external_links_mean', 'founding_n_languages', 'founding_dominant_language']], on='ehraid', how='left')

df_startups['company_url'] = df_startups['company_url'].fillna('no website available')
print(f'Percentage of companies with found website: {len(df_startups[df_startups.company_url != 'no website available']) / len(df_startups) * 100}')

Percentage of companies with found website: 48.409195526793205


In [23]:
df_startups[df_startups.company_url != 'no website available'].to_csv(RAW_DATA_DIR / 'company_sample' / 'company_sample_website.csv', index=False)
df_startups[df_startups.company_url == 'no website available'].to_csv(RAW_DATA_DIR / 'company_sample' / 'company_sample_no_website.csv', index=False)

In [None]:
total = df_startups.shape[0]
num_website = df_startups[(df_startups.company_url != 'no website available')].shape[0]
num_no_website = df_startups[(df_startups.company_url == 'no website available')].shape[0]

num_exits = df_startups[(df_startups.liquidation | df_startups.bankruptcy | df_startups.other_exit)].shape[0]
num_exits_website = df_startups[(df_startups.company_url != 'no website available') & (df_startups.liquidation | df_startups.bankruptcy | df_startups.other_exit)].shape[0]
num_exits_no_website = df_startups[(df_startups.company_url == 'no website available') & (df_startups.liquidation | df_startups.bankruptcy | df_startups.other_exit)].shape[0]

In [None]:
num_survival = df_startups[~(df_startups.liquidation | df_startups.bankruptcy | df_startups.other_exit)].shape[0]
num_survival_website = df_startups[(df_startups.company_url != 'no website available') & ~(df_startups.liquidation | df_startups.bankruptcy | df_startups.other_exit)].shape[0]
num_survival_no_website = df_startups[(df_startups.company_url == 'no website available') & ~(df_startups.liquidation | df_startups.bankruptcy | df_startups.other_exit)].shape[0]
print(f'Percentage of firms having website: {num_website / total:.4f}')
print(f'Percentage of exited firms having website: {num_exits_website / num_exits:.4f}')
print(f'Percentage of survived firms having website: {num_survival_website / num_survival:.4f}')

In [None]:
print(f'Ratio of survival to exit: {num_survival / num_exits:.2f}')
print(f'Ratio of survival with website to exit with website: {num_survival_website / num_exits_website:.2f}')

In [None]:
df_startups[~(df_startups.liquidation | df_startups.bankruptcy | df_startups.other_exit)]
df_startups[(df_startups.company_url != 'no website available') & ~(df_startups.liquidation | df_startups.bankruptcy | df_startups.other_exit)]
df_startups[(df_startups.company_url != 'no website available') & (df_startups.liquidation | df_startups.bankruptcy | df_startups.other_exit)]

### ADD WEBSITE EMBEDDING-BASED SCORES

In [None]:
current_responsibility = pd.read_csv(PROCESSED_DATA_DIR / 'current_websites_responsibility_scores.csv')
current_differantiation = pd.read_csv(PROCESSED_DATA_DIR / 'current_strat2vec_differentiation_scores.csv')
current_doc2vec = pd.read_csv(PROCESSED_DATA_DIR / 'current_doc2vec_differentiation_scores.csv')

founding_responsibility = pd.read_csv(PROCESSED_DATA_DIR / 'wayback_websites_responsibility_scores.csv')
founding_differentiation = pd.read_csv(PROCESSED_DATA_DIR / 'wayback_strat2vec_differentiation_scores.csv')
founding_doc2vec = pd.read_csv(PROCESSED_DATA_DIR / 'wayback_doc2vec_differentiation_scores.csv')

In [None]:
# Prepare responsibility scores
current_responsibility.drop(columns=['date'], inplace=True)
current_responsibility.columns = [f'current_{col}' if col != 'ehraid' else 'ehraid' for col in current_responsibility.columns]

founding_responsibility = founding_responsibility.drop(columns=['date'])
founding_responsibility.columns = [f'founding_{col}' if col != 'ehraid' else 'ehraid' for col in founding_responsibility.columns]

# Prepare doc2vec differentiation scores
current_doc2vec.drop(columns=['competitors', 'score_type', 'field'], inplace=True)
current_doc2vec = current_doc2vec.rename(columns={'score': 'current_doc2vec_diff'})

founding_doc2vec.drop(columns=['competitors', 'score_type', 'field'], inplace=True)
founding_doc2vec = founding_doc2vec.rename(columns={'score': 'founding_doc2vec_diff'})

# Prepare contextual differentiation scores
current_differantiation = current_differantiation.pivot(index=['ehraid'], columns='field', values='score').reset_index()
current_differantiation.columns = [f'current_{col}' if col != 'ehraid' else 'ehraid' for col in current_differantiation.columns]

founding_differentiation = founding_differentiation.pivot(index=['ehraid'], columns='field', values='score').reset_index()
founding_differentiation.columns = [f'founding_{col}' if col != 'ehraid' else 'ehraid' for col in founding_differentiation.columns]

In [None]:
for df in [current_responsibility, founding_responsibility, current_doc2vec, founding_doc2vec, current_differantiation, founding_differentiation]:
    df_startups = df_startups.merge(df, on='ehraid', how='left')

# **FEATURE ENCODING**

In [485]:
df_startups = pd.read_csv(RAW_DATA_DIR / 'company_sample' / 'until_2020' / '2020_sample_base_data.csv')
df_startups['founding_date'] = pd.to_datetime(df_startups['founding_date'])

  df_startups = pd.read_csv(RAW_DATA_DIR / 'company_sample' / 'until_2020' / '2020_sample_base_data.csv')


In [486]:
df_startups.head()

Unnamed: 0,ehraid,uid,current_name,founding_name,current_legal_form,founding_legal_form,current_purpose,founding_purpose,current_street,current_town,...,dominant_language,class_1_label,section_1_label,prediction_1_score,class_2_label,section_2_label,prediction_2_score,class_3_label,section_3_label,prediction_3_score
0,1251327,CHE473646370,Wissler Consulting GmbH,Wissler Consulting GmbH,4,4.0,Die Gesellschaft bezweckt die Erbringung von B...,Die Gesellschaft bezweckt die Erbringung von B...,Wintersingerstrasse 18a,Maisprach,...,,7022.0,M,0.318912,7021.0,M,0.096343,7490.0,M,0.082247
1,1251329,CHE190527339,AHAS GmbH in Liquidation,AHAS GmbH,4,4.0,"Führung einer Baufirma, speziell Ausführung sä...","Führung einer Baufirma , speziell Ausführung s...",Bösch 21,Hünenberg,...,,4333.0,F,0.186411,4399.0,F,0.125374,4391.0,F,0.070668
2,1251336,CHE350451441,Roof & Terrace AG,Roof & Terrace AG,3,3.0,"Führung von Hotels, Restaurants und Barbetrieb...","Führung von Hotels , Restaurants und Barbetrie...",Pilatusstrasse 1,Luzern,...,,7022.0,M,0.267851,8299.0,N,0.093203,7021.0,M,0.091909
3,1251338,CHE497156719,SP Ventures GmbH,SP Ventures GmbH,4,4.0,Die Gesellschaft übt die Funktion einer Holdin...,Die Gesellschaft übt die Funktion einer Holdin...,Zürichstrasse 5,Luzern,...,,6420.0,K,0.131271,6820.0,L,0.109448,6430.0,K,0.103592
4,1251341,CHE256940465,"The Food Bus, Yanev & Co","The Food Bus , Yanev & Co",2,2.0,"exploitation d'un Food Truck, cuisine ambulante.","exploitation d'un Food Truck , cuisine ambulante",Route de Bourguillon 19,Marly,...,,5610.0,I,0.301125,5629.0,I,0.130845,5621.0,I,0.082071


In [487]:
assert len(df_startups) == df_startups.ehraid.nunique()
original_len = len(df_startups)

## PROCESS OUTPUT FEATURES


### INVOLUNTARY EXIT TARGET

In [488]:
with connect_database() as con:
    all_exits = read_from_database(con, "SELECT ehraid, shab_date AS exit_date FROM zefix.shab WHERE shab_id IN (SELECT shab_id FROM zefix.shab_mutation WHERE description = 'status.aufl' OR description = 'status.loeschung')")
    bankruptcies = read_from_database(con, "SELECT ehraid, shab_date AS exit_date FROM zefix.shab WHERE shab_id IN (SELECT shab_id FROM zefix.shab_mutation WHERE description = 'status.aufl.konk')")
    liquidations = read_from_database(con, "SELECT ehraid, shab_date AS exit_date FROM zefix.shab WHERE shab_id IN (SELECT shab_id FROM zefix.shab_mutation WHERE description = 'status.aufl.liq')")

In [490]:
# Remove duplicates, keep earliest

all_exits['exit_date'] = pd.to_datetime(all_exits['exit_date'])
bankruptcies['exit_date'] = pd.to_datetime(bankruptcies['exit_date'])
liquidations['exit_date'] = pd.to_datetime(liquidations['exit_date'])

all_exits = all_exits.sort_values('exit_date').drop_duplicates(subset=['ehraid'], keep='first')
bankruptcies = bankruptcies.sort_values('exit_date').drop_duplicates(subset=['ehraid'], keep='first')
liquidations = liquidations.sort_values('exit_date').drop_duplicates(subset=['ehraid'], keep='first')

In [491]:
# Add exit date
df_startups = df_startups.merge(all_exits, on='ehraid', how='left')

In [492]:
df_startups['target_inv_exit'] = 0

# Set target_inv_exit to 1 for all bankruptcies and liquidations that happened in within 5 years after founding
condition_a = (df_startups['ehraid'].isin(bankruptcies.ehraid) | df_startups['ehraid'].isin(liquidations.ehraid))  # Bankrupt or liquidated
condition_b = (df_startups['exit_date'] - df_startups['founding_date']) <= pd.Timedelta(days=5*365)  # Event within the first 5 years after founding
df_startups.loc[condition_a & condition_b, 'target_inv_exit'] = 1

In [493]:
len(df_startups[df_startups['target_inv_exit'] == 1]) / len(df_startups)

0.14187719440758892

### ACQUISITION TARGET

In [494]:
# Consolidation mergers are determined statistically via a name similarity index
# The cutoff is set to 0.7

query_merger = """ 
    SELECT ehraid_acquiree AS ehraid, merger_date FROM zefix.merger_relation WHERE merger_date > '2016-01-01' AND name_similarity < 0.7;
"""

with connect_database() as con:
    df_merger = read_from_database(connection=con, query=query_merger)

In [495]:
# Remove duplicates, keep earliest

df_merger['merger_date'] = pd.to_datetime(df_merger['merger_date'])
df_merger = df_merger.sort_values('merger_date').drop_duplicates(subset=['ehraid'], keep='first')

In [496]:
# Add merger date
df_startups = df_startups.merge(df_merger, on='ehraid', how='left')

In [497]:
df_startups['target_acquisition'] = 0

condition_a = df_startups['ehraid'].isin(df_merger.ehraid)  # Merged
condition_b = (df_startups['merger_date'] - df_startups['founding_date']) <= pd.Timedelta(days=5*365)  # Event within the first 5 years after founding
df_startups.loc[condition_a & condition_b, 'target_acquisition'] = 1

In [498]:
len(df_startups[df_startups['target_acquisition'] == 1]) / len(df_startups)

0.006444450461671766

### NON-GOV. INVESTMENT TARGET

In [499]:
df_funding = pd.read_csv(PROCESSED_DATA_DIR / 'funding_data' / 'startup-ch_funding.csv', usecols=['ehraid', 'date'])
df_funding = df_funding[~df_funding.ehraid.isna()].copy()
df_funding.rename(columns={'date': 'investment_date'}, inplace=True)
df_funding['ehraid'] = df_funding['ehraid'].astype(float).astype(int)

In [500]:
# Remove duplicates, keep earliest

df_funding['investment_date'] = pd.to_datetime(df_funding['investment_date'])
df_funding = df_funding.sort_values('investment_date').drop_duplicates(subset=['ehraid'], keep='first')

In [501]:
# Add investment date
df_startups = df_startups.merge(df_funding, on='ehraid', how='left')

In [502]:
df_startups['target_non_gov_investment'] = 0

condition_a = df_startups['ehraid'].isin(df_funding.ehraid)  # Received funding
condition_b = (df_startups['investment_date'] - df_startups['founding_date']) <= pd.Timedelta(days=5*365)  # Event within the first 5 years after founding
df_startups.loc[condition_a & condition_b, 'target_non_gov_investment'] = 1

In [503]:
len(df_startups[df_startups['target_non_gov_investment'] == 1]) / len(df_startups)

0.005126677678192666

### INNOVATION SUBSIDY TARGET

In [504]:
df_inno = pd.read_csv(PROCESSED_DATA_DIR / 'funding_data' / 'innosuisse_grants.csv', usecols=['ehraid', 'start_date'])
df_inno = df_inno[~df_inno.ehraid.isna()].copy()
df_inno.rename(columns={'start_date': 'subsidy_date'}, inplace=True)
df_inno['ehraid'] = df_inno['ehraid'].astype(float).astype(int)

In [505]:
# Remove duplicates, keep earliest

df_inno['subsidy_date'] = pd.to_datetime(df_inno['subsidy_date'])
df_inno = df_inno.sort_values('subsidy_date').drop_duplicates(subset=['ehraid'], keep='first')

In [506]:
# Add investment date
df_startups = df_startups.merge(df_inno, on='ehraid', how='left')

In [507]:
df_startups['target_inno_subsidy'] = 0

condition_a = df_startups['ehraid'].isin(df_inno.ehraid)  # Received innovation subsidy
condition_b = (df_startups['subsidy_date'] - df_startups['founding_date']) <= pd.Timedelta(days=5*365)  # Event within the first 5 years after founding
df_startups.loc[condition_a & condition_b, 'target_inno_subsidy'] = 1

In [508]:
len(df_startups[df_startups['target_inno_subsidy'] == 1]) / len(df_startups)

0.00817741193035661

In [512]:
assert len(df_startups) == original_len

## PROCESS INPUT FEATURES

### ENCODE BASIC FIRM FEATURES

In [None]:
# 1. Encode NOGA hierarchy levels
for i, col in enumerate(['class_1_label', 'class_2_label', 'class_3_label'], start=1):
    # Ensure all values are 4-digit strings with leading zeros if needed
    padded_str = df_startups[col].astype(int).astype(str).str.zfill(4)
    
    # Extract division (first 2 digits) and group (first 3 digits), then convert back to float
    df_startups[f'division_{i}_label'] = padded_str.str[:2].astype(float)
    df_startups[f'group_{i}_label'] = padded_str.str[:3].astype(float)

In [515]:
assert len(df_startups) == df_startups.ehraid.nunique()

### ENCODE ADDRESS FEATURES

In [516]:
def encode_spatial_features(df, lat_col='latitude', lon_col='longitude', founding_col='founding_date', exit_col='exit_date'):
    # Ensure date columns are datetime
    df = df.copy()
    df[founding_col] = pd.to_datetime(df[founding_col])
    df[exit_col] = pd.to_datetime(df[exit_col])
    
    # Convert to GeoDataFrame
    gdf = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df[lon_col], df[lat_col]),
        crs='EPSG:4326'
    )
    gdf = gdf.to_crs(epsg=3857)
    sindex = gdf.sindex

    same_address_counts = []
    firms_within_1km = []
    firms_within_2_5km = []
    firms_within_10km = []

    for idx, row in gdf.iterrows():
        founding_date_i = row[founding_col]

        # -- Same address (within 10m)
        for radius, store in [
            (10, same_address_counts),
            (1000, firms_within_1km),
            (2500, firms_within_2_5km),
            (10_000, firms_within_10km)
        ]:
            possible_matches_idx = list(sindex.intersection(row.geometry.buffer(radius).bounds))
            nearby = gdf.iloc[possible_matches_idx]
            candidates = nearby[
                (nearby[founding_col] <= founding_date_i) &
                (
                    nearby[exit_col].isna() |
                    (nearby[exit_col] > founding_date_i)
                )
            ]
            n_firms = candidates[candidates.geometry.distance(row.geometry) <= radius].shape[0] - 1  # exclude self
            store.append(n_firms)

    gdf['n_firms_within_10m'] = same_address_counts
    gdf['n_firms_within_1km'] = firms_within_1km
    gdf['n_firms_within_2.5km'] = firms_within_2_5km
    gdf['n_firms_within_10km'] = firms_within_10km

    return gdf.drop(columns='geometry')

In [517]:
# Add two spacial variables for
# 1. Number of firms (including the firm itself) at the same location (within 10 meters)
# 2. Number of firms (including the firm itself) within 1km distance
# Both are calculated only considering existing firms at the time of founding!

df_startups = encode_spatial_features(df_startups)

In [518]:
assert len(df_startups) == df_startups.ehraid.nunique()

### ENCODE FIRM NAME FEATUES

In [519]:
porter_stemmer = nltk.stem.PorterStemmer()

token_classifier = pipeline(
  model="ZurichNLP/swissbert-ner",
  aggregation_strategy="simple",
  device='mps'
)

UMLAUT_REPLACEMENTS = {
    'ä': 'ae',
    'ö': 'oe',
    'ü': 'ue',
}

Device set to use mps


In [520]:
def normalize_words(text: str) -> str:
    if not isinstance(text, str):
        return ''
    for char, replacement in UMLAUT_REPLACEMENTS.items():
        text = text.replace(char, replacement)
    return unidecode(text.lower())

def get_language(text: str) -> str:
    language = detect(text)
    return language.get('lang', 'de')

def stem_text(text: str, lang_code: str) -> str:
    code2lang = {
        'de': 'german',
        'en': 'english',
        'fr': 'french',
        'it': 'italian'
    }
    language = code2lang.get(lang_code, 'german')
    stop_words = set(stopwords.words(language))
    tokens = nltk.tokenize.word_tokenize(text, language=language)
    return [porter_stemmer.stem(w) for w in tokens if not w.lower() in stop_words]

def batch_ner_tag_bps(texts: list[str], token_classifier: pipeline):
    batch_outputs = token_classifier(texts, batch_size=32)
    results = []
    for output in batch_outputs:
        people = [entry['word'] for entry in output if entry['entity_group'] == 'PER']
        locations = [entry['word'] for entry in output if entry['entity_group'] == 'LOC']
        results.append((people, locations))
    return results

def process_df_with_ner(df_lang: pd.DataFrame, token_classifier: pipeline, batch_size: int):
    texts = df_lang['founding_purpose'].fillna("").tolist()
    results = []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i + batch_size]
        results.extend(batch_ner_tag_bps(batch, token_classifier))

    df_lang[['people', 'locations']] = pd.DataFrame(results, index=df_lang.index)
    return df_lang

In [521]:
raumgliederungen = pd.read_excel(EXTERNAL_DATA_DIR / 'geo_data' / 'Raumgliederungen.xlsx')
municipality_names = [re.sub(r'\(.*?\)', '', normalize_words(name)).strip() for name in raumgliederungen['Gemeindename']]

In [522]:
with connect_database() as con:
    gendered_names = read_from_database(con, "SELECT * FROM zefix.founders_gender_mapping WHERE gender != 'u' AND request_type = 'first_name' AND probability >= 0.95")

In [523]:
gendered_names['split_names'] = gendered_names['name'].str.split()

certain_gender = gendered_names[gendered_names['split_names'].apply(len) == 1]
certain_gender = certain_gender[certain_gender['name'].str.isalpha() & (certain_gender['name'].str.len() >= 4)].copy()

female_names = set(certain_gender[certain_gender['gender'] == 'f']['name'].apply(normalize_words))
male_names = set(certain_gender[certain_gender['gender'] == 'm']['name'].apply(normalize_words))

In [524]:
# Get external data to create firm name and BPS features (e.g. includes female name)
official_male_names = pd.read_csv(EXTERNAL_DATA_DIR / 'newborn_names' / 'maennliche_vornamen.csv', encoding='ISO-8859-15', usecols=['Vorname'])
official_female_names = pd.read_csv(EXTERNAL_DATA_DIR / 'newborn_names' / 'weibliche_vornamen.csv', encoding='ISO-8859-15', usecols=['Vorname'])

official_male_names = set([normalize_words(name) for name in official_male_names['Vorname']])
official_female_names = set([normalize_words(name) for name in official_female_names['Vorname']])

female_names = female_names.union(official_female_names)
male_names = male_names.union(official_male_names)

In [525]:
print(len(female_names))
print(len(male_names))

7217
10473


In [526]:
# 1. Firm name length
df_startups['firm_name_length'] = df_startups['founding_name'].str.len()

In [527]:
df_startups['founding_name_norm'] = df_startups['founding_name'].apply(normalize_words)

In [528]:
# 2. Contains swiss reference
swiss_terms = [
    "switzerland",
    "swiss", 
    "schweiz",  # Covers "schweiz", "schweizer", "schweizerische", etc.
    "swi",  # e.g. swica
    "sui",
    "suisse",
    "helvet",  # Covers Helvetia, Helvetica, etc.
    "confed",  # Covers Confederation, "confédération", "confederazione"
    "sviz",  # Covers "Svizzera", "Svizzero", "Svizzere", "Svizzeri", "svizra", etc.
    "eidgen",  # Covers "eidgenossenschaft", "eidgenössisch",
]

df_startups['firm_name_swiss_ref'] = df_startups['founding_name_norm'].str.contains('|'.join(swiss_terms)).astype(int)

In [529]:
# 3. Contains holding reference
holding_terms = [
    "holding",
    "beteiligung",
    "participation",
    "partecipazion",
    "anteil",
    "capital",
    "kapital",
    "invest",
    "share",
    "aktie",
    "action",
    "azion"
]

df_startups['firm_name_holding_ref'] = df_startups['founding_name_norm'].str.contains('|'.join(holding_terms)).astype(int)

In [530]:
# 4. Contains geographic term (municipality name)
municipality_set = set(municipality_names)
def has_geographic_term(firm_name, geo_terms):
    tokens = [t for t in firm_name.split() if len(t) > 2]
    return int(any(tok in geo_terms for tok in tokens))
    
df_startups['firm_name_geog_ref'] = df_startups['founding_name_norm'].apply(lambda x: has_geographic_term(x, municipality_set))

In [531]:
# 5. Contains founder names
def has_founder_name(firm_name, founders):
    if founders:
        founder_name_list = []
        for fn, ln in founders.get('founder_names', []):
            fn = [n for n in re.split(r'[- ]', fn) if len(n) > 2]  # Avoid really short names to decrease likelihood of false positive
            ln = [n for n in re.split(r'[- ]', ln) if len(n) > 2]
            founder_name_list.extend(fn + ln)
        return int(any(name in firm_name for name in founder_name_list))
    return 0

df_startups['firm_name_founder_match'] = df_startups.apply(lambda row: has_founder_name(row['founding_name_norm'], founder_lists.get(row['ehraid'])), axis=1)

In [532]:
# 6. Contains gendered name
def contains_male_or_female_name(names: list[str] | str, gendered_first_names):
    if isinstance(names, list):
        names = [n for name in names for n in name.split() if len(n) > 2]
    elif isinstance(names, str):
        names = names.split()
    for name in names:
        gender = name in gendered_first_names
        if gender:
            return 1
    return 0

df_startups['firm_name_male_match'] = df_startups['founding_name_norm'].apply(lambda x: contains_male_or_female_name(x, male_names))
df_startups['firm_name_female_match'] = df_startups['founding_name_norm'].apply(lambda x: contains_male_or_female_name(x, female_names))

In [533]:
assert len(df_startups) == df_startups.ehraid.nunique()

### ENCODE FOUNDER FEATURES

In [534]:
# 1. Encode number of founders
df_startups['n_founders'] = df_startups['ehraid'].apply(lambda x: count_stats.get(x, {}).get('n_founders', 1))
df_startups.loc[(df_startups['n_founders'] == 0) | (df_startups['n_founders'].isna()), 'n_founders'] = 1  # Firm must have at least one founder

# 2. Encode number of inscribed firms
df_startups = df_startups.merge(grouped_insc_firms, on='ehraid', how='left')
df_startups['founding_date'] = pd.to_datetime(df_startups['founding_date'])
df_startups['firm_inscription_date'] = pd.to_datetime(df_startups['firm_inscription_date'])

df_startups.loc[df_startups['firm_inscription_date'] > df_startups['founding_date'], 'n_inscribed_firms'] = 0
df_startups.drop(columns=['firm_inscription_date'], inplace=True)
df_startups['n_inscribed_firms'].fillna(0, inplace=True)

# 3. Percentage of female founders
df_startups['n_female_founders'] = df_startups['ehraid'].apply(lambda x: count_stats.get(x, {}).get('n_female_founders', np.nan))

# 4. Number of distinct nationalities
def get_distinct_nationalities(founder_data):
    if not founder_data:
        return 0
    return len(set([nat for nationalities in founder_data.get('founder_nationalities', []) for nat in nationalities]))

df_startups['n_distinct_nationalities'] = df_startups['ehraid'].apply(lambda ehraid: get_distinct_nationalities(founder_lists.get(ehraid)))

# 5. Number of Swiss founders
df_startups['n_swiss_founders'] = df_startups['ehraid'].apply(lambda x: count_stats.get(x, {}).get('n_swiss_founders', np.nan))

# 6. Number of foreign founders
df_startups['n_foreign_founders'] = df_startups['ehraid'].apply(lambda x: count_stats.get(x, {}).get('n_foreign_founders', np.nan))

# 7. Number of founders with Dr. PhD. Prof. in name
df_startups['n_dr_titles'] = df_startups['ehraid'].apply(lambda x: count_stats.get(x, {}).get('n_dr_titles', np.nan))

# 8. Founders with same municipality than firm
def get_residence_match(firm_bfs_code, founder_data):
    if not founder_data or pd.isna(firm_bfs_code):
        return 0
    count = 0
    for residencies in founder_data.get('founder_residencies', []):
        count += int(firm_bfs_code in residencies)
    return count

df_startups['n_founders_same_residence'] = df_startups.apply(lambda row: get_residence_match(row['founding_bfs_code'], founder_lists.get(row['ehraid'])), axis=1)

df_startups.fillna({
    'n_inscribed_firms': 0,
    'n_female_founders': 0,
    'n_swiss_founders': 0,
    'n_foreign_founders': 0,
    'n_dr_titles': 0,
    'n_founders_same_residence': 0
}, inplace=True)

df_startups['pct_female_founders'] = df_startups['n_female_founders'] / df_startups['n_founders']
df_startups['pct_swiss_founders'] = df_startups['n_swiss_founders'] / df_startups['n_founders']
df_startups['pct_foreign_founders'] = df_startups['n_foreign_founders'] / df_startups['n_founders']
df_startups['pct_dr_titles'] = df_startups['n_dr_titles'] / df_startups['n_founders']
df_startups['pct_founders_same_residence'] = df_startups['n_founders_same_residence'] / df_startups['n_founders']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_startups['n_inscribed_firms'].fillna(0, inplace=True)


In [535]:
# 9. Encode prior founding experience
df_startups['founder_fids'] = df_startups['ehraid'].apply(lambda x: founder_lists.get(x, {}).get('founder_fids', []))

prior_foundings = {}
for ehraid, data in founder_lists.items():
    founders = data.get('founder_fids', [])
    for fid in founders:
        if fid not in prior_foundings:
            prior_foundings[fid] = [ehraid]
        else:
            prior_foundings[fid].append(ehraid)

prior_founding = []
prior_failed = []
prior_existing = []
for i, row in tqdm(df_startups.iterrows(), total=df_startups.shape[0]):
    foundings = 0
    failed = set()
    existing = set()
    for fid in row['founder_fids']:
        matches = df_startups[
            df_startups['ehraid'].isin(prior_foundings[fid])
            & (df_startups['ehraid'] != row['ehraid'])
            & (df_startups['founding_date'] <= row['founding_date'])]
        if matches.empty:
            continue
        foundings += 1
        failed_ehraids = matches[matches['exit_date'] < row['founding_date']]['ehraid'].tolist()
        for eid in failed_ehraids:
            failed.add(eid)
        existing_ehraids = [ehraid for ehraid in matches['ehraid'] if ehraid not in failed_ehraids]
        for eid in existing_ehraids:
            existing.add(eid)

    prior_founding.append(foundings)
    prior_failed.append(len(failed))
    prior_existing.append(len(existing))

100%|██████████| 110793/110793 [02:33<00:00, 719.88it/s]


In [536]:
df_startups['n_founders_with_prior_founding'] = prior_founding
df_startups['pct_founders_with_prior_founding'] = df_startups['n_founders_with_prior_founding'] / df_startups['n_founders']
df_startups['n_dissolved_firms'] = prior_failed
df_startups['n_existing_firms'] = prior_existing

In [537]:
assert len(df_startups) == df_startups.ehraid.nunique()

### ENCODE BPS FEATURES

In [538]:
df_startups.loc[df_startups.founding_purpose.isna(), 'founding_purpose'] = df_startups['current_purpose']  # Use current purpose if founding purpose is missing
df_startups['bps_language'] = df_startups['founding_purpose'].apply(get_language)

corrections = {
    'cs': 'de',
    'en': 'de',
    'sv': 'de',
    'es': 'fr',
    'pt': 'de',
    'pl': 'de',
    'nl': 'de',
    'ca': 'it',
    'et': 'de'
}

df_startups['bps_language'] = df_startups['bps_language'].replace(corrections)
df_startups.loc[~df_startups['bps_language'].isin(['de', 'fr', 'it']), 'bps_language'] = 'de'  # If not de, fr, it assume it as the majority language (German)

In [539]:
# Apply stop-word removal and stemming
df_startups['bps_normalized'] = df_startups.apply(lambda row: stem_text(row['founding_purpose'], lang_code=row['bps_language']), axis=1)

In [540]:
# 1. Encode bps length
df_startups['bps_length'] = df_startups['bps_normalized'].apply(lambda x: len(' '.join(x)))  # bps length

# 2. Encode mean word length
df_startups['bps_mean_word_length'] = [(1 / len(word_list)) * np.array([len(w) for w in word_list]).sum() for word_list in df_startups['bps_normalized']]  # average word length per bps

In [None]:
# 3. Get quantiles of length metrics
def get_quantiles(df: pd.DataFrame, column: str, q: list[float] = [.2, .4, .6, .8]) -> int:
    quantiles = df[column].quantile(q)
    def get_numeric_quintile(target: float, quantiles: pd.Series):
        for i, tau in enumerate(quantiles, start=1):
            if target <= tau:
                return i
        return len(quantiles) + 1
    df[f'{column}_quantiles_{len(quantiles) + 1}'] = df[column].apply(lambda x: get_numeric_quintile(x, quantiles))
    return df


df_startups = get_quantiles(df_startups, 'bps_length')

In [542]:
# 4. Calculate LIX
def calculate_lix(word_list: list[str]):
    pct_above_six = len([w for w in word_list if len(w) > 6])  / len(word_list) * 100
    return len(word_list) + pct_above_six

df_startups['bps_lix'] = df_startups['bps_normalized'].apply(calculate_lix)

In [543]:
# 5. Word-frequency Features
word_dictionary_de = [token for sublist in df_startups[df_startups['bps_language'] == 'de']['bps_normalized'] for token in sublist if token.isalpha() and len(token) > 4]
word_freqencies_de = Counter(word_dictionary_de)

word_dictionary_fr = [token for sublist in df_startups[df_startups['bps_language'] == 'fr']['bps_normalized'] for token in sublist if token.isalpha() and len(token) > 4]
word_freqencies_fr = Counter(word_dictionary_fr)

word_dictionary_it = [token for sublist in df_startups[df_startups['bps_language'] == 'it']['bps_normalized'] for token in sublist if token.isalpha() and len(token) > 4]
word_freqencies_it = Counter(word_dictionary_it)

def compute_specificity_features(bps_tokens: list[str], lang_code: str) -> tuple[float, float, float]:
    if not isinstance(bps_tokens, list):
        print(bps_tokens)
    if lang_code == 'fr':
        word_freqencies = word_freqencies_fr
    elif lang_code == 'it':
        word_freqencies = word_freqencies_it
    else:
        word_freqencies = word_freqencies_de
    token_freqs = [word_freqencies[token] for token in bps_tokens if token in word_freqencies]
    if not token_freqs:
        return (0, 0, 0.0)

    total_tokens = sum(word_freqencies.values())
    # Normalize it by total number of tokens to account for differences between languages
    min_freq_norm, max_freq_norm = min(token_freqs) / total_tokens, max(token_freqs) / total_tokens

    ratio = min_freq_norm / max_freq_norm if max_freq_norm > 0 else 0.0
    return (min_freq_norm, max_freq_norm, ratio)

df_startups[['bps_min_word_freq_norm', 'bps_max_word_freq_norm', 'bps_freq_ratio_norm']] = df_startups.apply(lambda row: compute_specificity_features(row['bps_normalized'], row['bps_language']), axis=1).apply(pd.Series)

In [544]:
# 5. Encode geographic and name features
dfs = []
for language in ['de', 'fr', 'it']:
    token_classifier.model.set_default_language(f"{language}_CH")
    df_lang = df_startups[df_startups['bps_language'] == language].copy()
    df_lang = process_df_with_ner(df_lang, token_classifier, batch_size=128)

    df_lang['bps_geographic_term'] = df_lang['locations'].apply(lambda x: int(len(x) > 0))
    df_lang['people'] = df_lang['people'].apply(lambda names: [normalize_words(name) for name in names])
    df_lang['bps_male_name'] = df_lang['people'].apply(lambda x: contains_male_or_female_name(x, male_names))
    df_lang['bps_female_name'] = df_lang['people'].apply(lambda x: contains_male_or_female_name(x, female_names))

    dfs.append(df_lang)

100%|██████████| 580/580 [12:56<00:00,  1.34s/it]
100%|██████████| 233/233 [05:05<00:00,  1.31s/it]
100%|██████████| 54/54 [02:22<00:00,  2.63s/it]


In [545]:
df_startups = pd.concat(dfs)

In [546]:
df_startups[['bps_geographic_term', 'bps_male_name', 'bps_female_name']] = df_startups[[
    'bps_geographic_term', 'bps_male_name', 'bps_female_name'
]].fillna(0).astype(int)

In [547]:
assert len(df_startups) == df_startups.ehraid.nunique()

### ADD CONTROL VARIABLES

In [None]:
# Add the number of days that we have a history to inform the model
# about missing history for variables depending on the history (e.g. n-firms at address, prior founding experiance)

df_startups['founding_year'] = df_startups['founding_date'].dt.year

min_date = df_startups['founding_date'].agg('min')
df_startups['days_of_prior_observations'] = (df_startups['founding_date'] - min_date).dt.days

In [549]:
assert len(df_startups) == df_startups.ehraid.nunique()

## EXPORT SAMPLE

In [550]:
df_startups.to_csv(RAW_DATA_DIR / 'company_sample' / 'until_2020' / '2020_sample_encoded_features.csv', index=False)