# **LOAD INSCRIPTIONS FROM 2016 TO 2025**

In [1]:
import re
from datetime import datetime
from typing import Optional

import pandas as pd
from ftlangdetect import detect

from pocketknife.database import (connect_database, read_from_database)
from success_prediction.config import RAW_DATA_DIR, EXTERNAL_DATA_DIR, PROCESSED_DATA_DIR


[32m2025-04-23 11:58:34.670[0m | [1mINFO    [0m | [36msuccess_prediction.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/manuelbolz/Documents/git/for_work/company_success_prediction[0m


In [2]:
# Legal-form id -> English label. Each trailing comment gives the German term
# and whether the form belongs to the growth-oriented sample (INCLUDE) or not (EXCLUDE).
id2legalform = {
    1: "Sole proprietorship",                    # EXCLUDE | Einzelunternehmen
    2: "General Partnership",                    # INCLUDE | Kollektivgesellschaft
    3: "Corporation",                            # INCLUDE | Aktiengesellschaft
    4: "Limited Liability Company",              # INCLUDE | Gesellschaft mit beschränkter Haftung
    5: "Cooperative",                            # EXCLUDE | Genossenschaft
    6: "Association",                            # EXCLUDE | Verein
    7: "Foundation",                             # EXCLUDE | Stiftung
    8: "Public sector institution",              # EXCLUDE | Institut des öffentlichen Rechts
    9: "Branch",                                 # EXCLUDE | Zweigniederlassung
    10: "Limited Partnership",                   # INCLUDE | Kommanditgesellschaft
    11: "Foreign branch",                        # EXCLUDE | Zweigniederlassung einer ausl. Gesellschaft
    12: "Corporation with unlimited partners",   # INCLUDE | Kommanditaktiengesellschaft
    13: "Special legal form",                    # EXCLUDE | Besondere Rechtsform
    14: "Ownership in undivided shares",         # EXCLUDE | Gemeinderschaft
    # INCLUDE | Investmentgesellschaft mit festem Kapital
    15: "Limited Partnership for collective investment schemes with a fixed capital",
    # INCLUDE | Investmentgesellschaft mit variablem Kapital
    16: "Limited Partnership for collective investment schemes with a variable capital",
    # INCLUDE | Kommanditgesellschaft für kollektive Kapitalanlagen
    17: "Limited Partnership for collective investment schemes",
    18: "Non commercial power of attorney",      # EXCLUDE | Nichtkaufmännische Prokura
    19: "(unknown)",                             # EXCLUDE | (unbekannt)
}

# Ids of the legal forms marked INCLUDE above.
growth_oriented_legal_forms = [
    2, 3, 4,
    10, 12,
    15, 16, 17,
]

In [None]:
# Query for the prediction sample: non-branch firms whose current legal form is one of
# (2, 3, 4, 10, 12, 15, 16, 17) and that have a 'status.neu' SHAB entry dated before 2024-01-01.
# Current values come from zefix.base / zefix.address; founding values come from the history tables.
# The dissolution row is matched on base.delete_date = history_dissolutions.shab_date.
# NOTE(review): the WHERE clause has no lower date bound and stops at 2024-01-01, so
# "2016 to current" relies on what the SHAB table contains — confirm.
# NOTE(review): a later cell assigns `query_founded_firms` again; on a top-to-bottom run
# that later definition is the one sent to the database, so this string is effectively unused.
query_founded_firms = """
    SELECT
        base.ehraid,
        base.uid,

        base.delete_date,
        dissolution.reason_for_dissolution,
        dissolution.liquidation,
        dissolution.bankruptcy,

        base.name AS current_name,
        founding_name.firm_name AS founding_name,

        base.legal_form_id AS current_legal_form,
        legal_form.legal_form_id AS founding_legal_form,

        base.purpose_raw AS current_purpose,
        founding_purpose.purpose_raw AS founding_purpose,

        (COALESCE(address.street, '') || ' ' || COALESCE(address.house_number, '')) AS current_street,
        address.town AS current_town,
        address.swiss_zip_code AS current_zip_code,
        address.country AS current_country,

        founding_address.street AS founding_street,
        founding_address.town AS founding_town,
        founding_address.postal_code AS founding_zip_code,
        founding_address.town_bfs_gmde_code_latest AS founding_bfs_code,

        shab.shab_id AS founding_shab_id,
        shab.shab_date AS founding_date,
        shab.message AS founding_message

    FROM zefix.base base

    -- Get only companies where we have the full history from founding (2016-present)
    INNER JOIN (
        SELECT s.ehraid, s.shab_id, s.shab_date, s.message
        FROM zefix.shab s
        INNER JOIN zefix.shab_mutation sm ON s.shab_id = sm.shab_id
        WHERE sm.description = 'status.neu'
    ) AS shab ON base.ehraid = shab.ehraid

    -- Join the current addresses of the firms
    LEFT JOIN zefix.address address ON base.ehraid = address.ehraid

    -- Join the founding addresses of the firms
    LEFT JOIN (
        SELECT hfa.ehraid, hfa.street, hfa.postal_code, hfa.town, hfa.town_bfs_gmde_code_latest, hfa.founding
        FROM zefix.history_firm_addresses hfa
        WHERE hfa.founding
    ) AS founding_address
    ON base.ehraid = founding_address.ehraid

    -- Join the founding names of the firms
    LEFT JOIN (
        SELECT hfn.ehraid, hfn.firm_name, hfn.founding
        FROM zefix.history_firm_names hfn
        WHERE hfn.founding = TRUE
    ) AS founding_name
    ON base.ehraid = founding_name.ehraid

    -- Join the founding purpose of the firms
    LEFT JOIN (
        SELECT hp.ehraid, hp.purpose_raw
        FROM zefix.history_purpose hp
        WHERE hp.founding_purpose = TRUE
    ) AS founding_purpose
    ON base.ehraid = founding_purpose.ehraid

    -- Join the founding legal form of the firms
    LEFT JOIN (
        SELECT hlf.ehraid, hlf.legal_form_id
        FROM zefix.history_founding_legal_form hlf
    ) AS legal_form
    ON base.ehraid = legal_form.ehraid

    -- Join the reason for the dissolution
    LEFT JOIN (
        SELECT hd.ehraid, hd.shab_date, hd.reason_for_dissolution, hd.liquidation, hd.bankruptcy
        FROM zefix.history_dissolutions hd
    ) AS dissolution
    ON base.ehraid = dissolution.ehraid AND base.delete_date::date = dissolution.shab_date

    --Exclude all kind of branches
    WHERE
        NOT base.is_branch
        AND shab.shab_date < '2024-01-01'
        AND base.legal_form_id IN (2, 3, 4, 10, 12, 15, 16, 17)
        AND LOWER(base.name) NOT LIKE '%zweigniederlassung%'
        AND LOWER(base.name) NOT LIKE '%succursale%';
"""

In [66]:
# Prediction-sample query (this assignment replaces the earlier `query_founded_firms` string).
# Selects non-branch firms whose current legal form is one of (2, 3, 4, 10, 12, 15, 16, 17)
# and that have a 'status.neu' SHAB entry dated before 2024-01-01.
# - Founding address / name / purpose / legal form are LEFT JOINed from DISTINCT history subqueries.
# - Dissolution: only the most recent history_dissolutions row per ehraid (ROW_NUMBER ... rn = 1) is joined.
# NOTE(review): the dissolution subquery also selects hd.other_exit, but the outer SELECT never returns it.
# NOTE(review): DISTINCT removes exact duplicate rows only; a firm with several different founding
# rows (or several zefix.address rows) would still yield multiple result rows — verify uniqueness per ehraid.
query_founded_firms = """ 
    SELECT
        base.ehraid,
        base.uid,

        -- Dissolution information
        base.delete_date,
        dissolution.reason_for_dissolution,
        dissolution.liquidation,
        dissolution.bankruptcy,

        -- Names
        base.name AS current_name,
        founding_name.firm_name AS founding_name,

        -- Legal forms
        base.legal_form_id AS current_legal_form,
        legal_form.legal_form_id AS founding_legal_form,

        -- Purpose
        base.purpose_raw AS current_purpose,
        founding_purpose.purpose_raw AS founding_purpose,

        -- Current address
        COALESCE(address.street, '') || ' ' || COALESCE(address.house_number, '') AS current_street,
        address.town AS current_town,
        address.swiss_zip_code AS current_zip_code,
        address.country AS current_country,

        -- Founding address
        founding_address.street AS founding_street,
        founding_address.town AS founding_town,
        founding_address.postal_code AS founding_zip_code,
        founding_address.town_bfs_gmde_code_latest AS founding_bfs_code,

        -- Founding SHAB entry
        shab.shab_id,
        shab.shab_date AS founding_date,
        shab.message AS founding_message

    FROM zefix.base base

    -- Founding SHAB messages
    INNER JOIN (
        SELECT s.ehraid, s.shab_id, s.shab_date, s.message
        FROM zefix.shab s
        INNER JOIN zefix.shab_mutation sm ON s.shab_id = sm.shab_id
        WHERE sm.description = 'status.neu'
    ) AS shab ON base.ehraid = shab.ehraid

    -- Current address
    LEFT JOIN zefix.address address ON base.ehraid = address.ehraid

    -- Founding address
    LEFT JOIN (
        SELECT DISTINCT hfa.ehraid, hfa.street, hfa.postal_code, hfa.town, hfa.town_bfs_gmde_code_latest
        FROM zefix.history_firm_addresses hfa
        WHERE founding = TRUE
    ) AS founding_address ON base.ehraid = founding_address.ehraid

    -- Founding name
    LEFT JOIN (
        SELECT DISTINCT hfn.ehraid, hfn.firm_name
        FROM zefix.history_firm_names hfn
        WHERE hfn.founding = TRUE
    ) AS founding_name ON base.ehraid = founding_name.ehraid

    -- Founding purpose
    LEFT JOIN (
        SELECT DISTINCT hp.ehraid, hp.purpose_raw
        FROM zefix.history_purpose hp
        WHERE hp.founding_purpose = TRUE
    ) AS founding_purpose ON base.ehraid = founding_purpose.ehraid

    -- Founding legal form
    LEFT JOIN (
        SELECT DISTINCT hlf.ehraid, hlf.legal_form_id
        FROM zefix.history_founding_legal_form hlf
    ) AS legal_form ON base.ehraid = legal_form.ehraid

    -- Dissolution information
    LEFT JOIN (
        -- Only keep the last dissolution message as the final dissolution
        SELECT hd.ehraid, hd.shab_date, hd.reason_for_dissolution, hd.liquidation, hd.bankruptcy, hd.other_exit
        FROM (
            SELECT *,
                ROW_NUMBER() OVER (PARTITION BY ehraid ORDER BY shab_date DESC) AS rn
            FROM zefix.history_dissolutions
        ) hd
        WHERE hd.rn = 1
    ) AS dissolution ON base.ehraid = dissolution.ehraid

    -- Filter out irrelevant records
    WHERE
        NOT base.is_branch
        AND shab.shab_date < '2024-01-01'
        AND base.legal_form_id IN (2, 3, 4, 10, 12, 15, 16, 17)
        AND LOWER(base.name) NOT LIKE '%zweigniederlassung%'
        AND LOWER(base.name) NOT LIKE '%succursale%';
"""

In [62]:
# This query gets all firms (except branches) that were not deleted before 2016-01-01.
# - Legal forms 9, 11, 13, 14, 18 and 19 are excluded, as are firms flagged is_branch or whose
#   name contains 'zweigniederlassung' / 'succursale'.
# - The founding SHAB entry ('status.neu') is LEFT JOINed, so firms without such an entry are kept
#   with a NULL founding_date; firms with one are kept only if it is dated before 2024-01-01.
# NOTE(review): because of the LEFT JOIN plus the "IS NULL" branch, the in-SQL comment
# "Get only companies where we have the full history" does not describe what the query returns.
query_all_active_firms = """
    SELECT 
        base.name,
        base.ehraid,
        base.uid,
        base.legal_seat_id,
        base.legal_seat,
        address.street,
        address.house_number,
        address.town,
        address.swiss_zip_code,
        address.country,
        base.legal_form_id,
        base.status, 
        base.delete_date,
        base.purpose_raw,
        shab.shab_id,
        shab.shab_date AS founding_date
    FROM zefix.base base
    -- Get only companies where we have the full history from founding (2016-present)
    LEFT JOIN (
        SELECT s.ehraid, s.shab_id, s.shab_date
        FROM zefix.shab s
        WHERE s.shab_id IN (
            SELECT shab_id
            FROM zefix.shab_mutation
            WHERE description = 'status.neu'
        )
    ) AS shab
    ON base.ehraid = shab.ehraid
    -- Join the addresses of the firms
    LEFT JOIN zefix.address address
    ON base.ehraid = address.ehraid
    --Exclude all kind of branches
    WHERE
        (NOT base.delete_date < '2016-01-01' OR base.delete_date IS NULL)
        AND (shab.shab_date < '2024-01-01' OR shab.shab_date IS NULL)
        AND NOT base.legal_form_id IN (9, 11, 13, 14, 18, 19)
        AND NOT base.is_branch
        AND LOWER(base.name) NOT LIKE '%zweigniederlassung%'
        AND LOWER(base.name) NOT LIKE '%succursale%';
"""

In [67]:
# Run the founded-firms query and keep the result as the prediction sample.
with connect_database() as db_connection:
    df_sample = read_from_database(query=query_founded_firms, connection=db_connection)

In [37]:
# Run the all-active-firms query and keep the result as the reference population.
with connect_database() as db_connection:
    df_all_firms = read_from_database(query=query_all_active_firms, connection=db_connection)

In [None]:
# Parse both date columns to datetimes, then drop every firm founded in 2025 or later.
df_sample = df_sample.assign(
    founding_date=pd.to_datetime(df_sample['founding_date']),
    delete_date=pd.to_datetime(df_sample['delete_date']),
)
df_sample = df_sample.loc[df_sample['founding_date'] < '2025-01-01']

In [68]:
# Row and column count of the founded-firms sample.
df_sample.shape

(226726, 23)

In [38]:
# Row and column count of the all-active-firms frame.
df_all_firms.shape

(943328, 16)

In [70]:
# Preview the founded-firms sample (pandas truncates the rendering of large frames).
df_sample

Unnamed: 0,ehraid,uid,delete_date,reason_for_dissolution,liquidation,bankruptcy,current_name,founding_name,current_legal_form,founding_legal_form,...,current_town,current_zip_code,current_country,founding_street,founding_town,founding_zip_code,founding_bfs_code,shab_id,founding_date,founding_message
0,1251325,CHE153193257,,[Mit Urteil des Gerichtspräsidenten des Zivilk...,True,False,Arlez Carrosserie GmbH in Liquidation,Arlez Carrosserie GmbH,4,4.0,...,Gelterkinden,4460,CH,Bleichiweg 4,Gelterkinden,4460.0,2846,2636869,2016-02-03,"Arlez Carrosserie GmbH, in Gelterkinden, CHE-1..."
1,1251326,CHE392024369,2020-11-11,[],False,False,Vista Coaching GmbH in Liquidation,Vista Coaching GmbH,4,4.0,...,Füllinsdorf,4414,CH,Ergolzstrasse 13,Füllinsdorf,4414.0,2825,2636871,2016-02-03,"Vista Coaching GmbH, in Füllinsdorf, CHE-392.0..."
2,1251327,CHE473646370,,,,,Wissler Consulting GmbH,Wissler Consulting GmbH,4,4.0,...,Maisprach,4464,CH,Wintersingerstrasse 18a,Maisprach,4464.0,2853,2636873,2016-02-03,"Wissler Consulting GmbH, in Maisprach, CHE-473..."
3,1251328,CHE205344235,,,,,Wolf Regio GmbH,Wolf Regio GmbH,4,4.0,...,Münchenstein,4142,CH,Grellingerstrasse 32,Münchenstein,4142.0,2769,2636875,2016-02-03,"Wolf Regio GmbH, in Münchenstein, CHE-205.344...."
4,1251329,CHE190527339,,"[Mit Entscheid vom 27.09.2022 , 9.15 Uhr , hat...",False,True,AHAS GmbH in Liquidation,AHAS GmbH,4,4.0,...,Hünenberg,6331,CH,Luzernstrasse 60,Malters,6102.0,1062,2637249,2016-02-03,"AHAS GmbH, in Malters, CHE-190.527.339, Luzern..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226721,1619512,CHE463481400,,[Die Gesellschaft hat sich mit Beschluss der a...,True,False,LB & Partner Baumanagement GmbH in Liquidation,LB & Partner Baumanagement GmbH,4,4.0,...,Hildisrieden,6024,CH,Feldacher 2,Hildisrieden,6024.0,1088,1005923014,2023-12-29,"LB & Partner Baumanagement GmbH, in Hildisried..."
226722,1619513,CHE228622540,,,,,Rigi Flachdach GmbH,Rigi Flachdach GmbH,4,4.0,...,Inwil,6034,CH,Feldmattstrasse 19,Emmen,6032.0,1024,1005923015,2023-12-29,"Rigi Flachdach GmbH, in Emmen, CHE-228.622.540..."
226723,1619514,CHE188578474,,,,,Sozialbegleiterin Alessandra De Donno GmbH,Sozialbegleiterin Alessandra De Donno GmbH,4,4.0,...,Gisikon,6038,CH,Mühlematt 8,Gisikon,6038.0,1055,1005923016,2023-12-29,"Sozialbegleiterin Alessandra De Donno GmbH, in..."
226724,1619515,CHE421115361,,,,,Wärmeverbund Ettiswil AG,Wärmeverbund Ettiswil AG,3,3.0,...,Ettiswil,6218,CH,Surseestrasse 5,Ettiswil,6218.0,1128,1005923017,2023-12-29,"Wärmeverbund Ettiswil AG, in Ettiswil, CHE-421..."


In [23]:
# Load the scraped company URLs and keep one row per UID (the first occurrence wins).
websites = (
    pd.read_csv(RAW_DATA_DIR / 'company_urls' / 'scraped_company_urls.csv')
    .drop_duplicates(subset=['uid'], keep='first')
)

In [27]:
# Preview the de-duplicated URL table (columns: uid, noga, company_url).
websites

Unnamed: 0,uid,noga,company_url
0,CHE395937898,749000,no website available
1,CHE142825231,812100,no website available
2,CHE171766547,464700,no website available
3,CHE430662484,257300,no website available
4,CHE156125157,829900,no website available
...,...,...,...
618169,CHE452592018,855904,no website available
618170,CHE104902451,855904,no website available
618171,CHE103332769,855904,http://www.commercants-lausannois.ch/
618172,CHE390937135,561001,no website available


In [48]:
def detect_language(text: str) -> Optional[str]:
    """Return the language code that ``ftlangdetect.detect`` predicts for ``text``.

    Args:
        text: The text to classify. Values that are not strings (for example
            ``None`` or a float NaN coming from a pandas column) are accepted.

    Returns:
        The value stored under the ``'lang'`` key of the detector's result, or
        ``None`` when ``text`` is not a string or contains only whitespace.
    """
    # Missing values reach this function as None / NaN when it is applied to a
    # DataFrame column; there is nothing to classify in that case.
    if not isinstance(text, str):
        return None

    # fastText classifies one line at a time and rejects input containing a
    # newline, so line breaks are turned into spaces before the call.
    single_line = text.replace('\n', ' ')
    if not single_line.strip():
        return None

    return detect(single_line)['lang']

# Attach the scraped website info to every firm via its UID; the left join keeps firms
# that have no row in `websites` (their noga / company_url stay missing).
# FIXME(review): `df` is not assigned in any cell of this notebook, so this line only works
# with leftover kernel state and fails on Restart & Run All. The displayed result has both
# `status` and `founding_message` columns, which neither df_sample nor df_all_firms provides
# as loaded above — confirm which frame was intended.
df_merged = df.merge(websites, on='uid', how='left')
# Detect the language of each founding SHAB message; newlines are removed first.
df_merged['language'] = df_merged['founding_message'].str.replace('\n', '').apply(detect_language)

In [49]:
# Preview the merged frame (firm data + noga, company_url and detected language).
df_merged

Unnamed: 0,name,ehraid,uid,legal_seat_id,legal_seat,street,house_number,town,swiss_zip_code,country,legal_form_id,status,delete_date,purpose_raw,shab_id,founding_date,founding_message,noga,company_url,language
0,TS-Gebäudereinigung Savic,1258893,CHE142825231,3204,Wittenbach,St. Gallerstrasse,57,Wittenbach,9300,CH,1,GELOESCHT,2017-11-08,Gebäudereinigung,2763045,2016-04-06,"TS-Gebäudereinigung Savic + Co, in Wittenbach,...",812100,no website available,de
1,Coplan E. Kistler,1263962,CHE178876510,2773,Reinach (BL),Fontanaweg,2,Reinach BL,4153,CH,1,EXISTIEREND,NaT,"Planung, Projektierung und Ausführung von Anla...",2844775,2016-05-23,"Coplan E. Kistler, in Reinach BL, CHE-178.876....",711203,no website available,de
2,as technik SA,1264102,CHE307459275,2143,Morlon,La Croix,27,Morlon,1638,CH,3,EXISTIEREND,NaT,"fourniture, installation, maintenance et dépan...",2847491,2016-05-24,"as technik SA, à Morlon, La Croix 27, 1638 Mor...",432204,http://www.as-technik-sa.ch,fr
3,Fondation ALM,1264334,CHE137304452,6710,Courtételle,Rue des Chênes,9,Courtételle,2852,CH,7,EXISTIEREND,NaT,Promouvoir et encourager le développement et l...,2853935,2016-05-26,"Fondation ALM, à Courtételle, CHE-137.304.452,...",949901,http://fundraiso.ch,fr
4,Coiffure Am Leewasser GmbH,1267084,CHE156073193,1364,Ingenbohl,Bahnhofstrasse,28,Brunnen,6440,CH,4,EXISTIEREND,NaT,Die Gesellschaft bezweckt den Betrieb eines Co...,2895985,2016-06-17,"Coiffure Am Leewasser GmbH, in Ingenbohl, CHE-...",960201,http://www.coiffure-leewasser.ch/,de
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
407991,Anolis SA,1565902,CHE164708013,6621,Genève,Boulevard des Philosophes,17,Genève,1205,CH,3,EXISTIEREND,NaT,"tout conseil en stratégie et management, gesti...",1005632461,2022-12-19,"Anolis SA, à Genève, Boulevard des Philosophes...",011600,http://www.anolis-sas.com/,fr
407992,RMB-CARS Sàrl,1579131,CHE215949718,5852,Bursinel,Route du Village,20,Bursinel,1195,CH,4,EXISTIEREND,NaT,la société a pour but toutes activités d'achat...,1005703446,2023-03-17,"RMB-CARS Sàrl, à Bursinel, Route du Village 20...",,,fr
407993,PENTI BAU P. MEHMETI,1583263,CHE269339337,295,Horgen,Kalkofenstrasse,25,Horgen,8810,CH,1,EXISTIEREND,NaT,Maler; Gipser; Fassadenbau; Trockenbau; Tapetz...,1005723493,2023-04-14,"PENTI BAU P. MEHMETI, in Horgen, CHE-269.339.3...",,,de
407994,Amavet Sàrl,1640857,CHE338218307,6621,Genève,Route de Meyrin,49,Genève,1203,CH,4,EXISTIEREND,NaT,La société a pour but l'exécution de toutes pr...,1006039627,2024-05-24,"Amavet Sàrl, à Genève, Route de Meyrin 49, c/o...",750000,no website available,fr


In [33]:
# Firms whose register status is 'GELOESCHT' (deleted).
df_merged.loc[df_merged['status'].eq('GELOESCHT')]

Unnamed: 0,name,ehraid,uid,legal_seat_id,legal_seat,street,house_number,town,swiss_zip_code,country,legal_form_id,status,delete_date,purpose_raw,shab_id,founding_date,founding_message,noga,company_url
0,TS-Gebäudereinigung Savic,1258893,CHE142825231,3204,Wittenbach,St. Gallerstrasse,57,Wittenbach,9300,CH,1,GELOESCHT,2017-11-08,Gebäudereinigung,2763045,2016-04-06,"TS-Gebäudereinigung Savic + Co, in Wittenbach,...",812100,no website available
5,good food artisans T&M chicouri snc,1270620,CHE288520386,5606,Lutry,Chemin du Petit-Bochat,45,La Conversion,1093,CH,2,GELOESCHT,2023-07-12,commerce de produits artisanaux et locaux sans...,2951043,2016-07-13,"good food artisans T&M chicouri snc, à Lutry, ...",107100,http://www.goodfoodartisans.ch/
7,GE Energy STS Switzerland GmbH,1285073,CHE226642951,4021,Baden,Brown Boveri Strasse,7,Baden,5400,CH,4,GELOESCHT,2017-06-02,"Erwerb, Handel und Dienstleistungen im Zusamme...",3195847,2016-12-01,"GE Energy STS Switzerland GmbH, in Baden, CHE-...",691002,no website available
9,Catering Consulting di Fabi Fabrizio,1287821,CHE291004139,3787,St. Moritz,Via Signuria,10,St. Moritz,7500,CH,1,GELOESCHT,2024-10-16,"La consulenza, l'intermediazione e la prestazi...",3238139,2016-12-21,"Consulting Singles di Fabi Fabrizio, in Balern...",562100,no website available
15,"Ineffable&Mellifluous, Thirumoorthy",1336273,CHE457505305,1709,Unterägeri,Waldheimstrasse,5,Unterägeri,6314,CH,1,GELOESCHT,2018-08-30,"Onlineshop, insbesondere Handel und Verkauf vo...",4028969,2018-02-01,"Ineffable&Mellifluous, Thirumoorthy, in Unterä...",479100,no website available
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
407943,Heimetli GmbH in Liquidation,1354504,CHE376408760,4616,Uesslingen-Buch,Hauptstrasse,4,Buch b. Frauenfeld,8524,CH,4,GELOESCHT,2025-01-17,Die Gesellschaft bezweckt das Angebot für inte...,4321961,2018-06-28,"Heimetli GmbH, in Uesslingen-Buch, CHE-376.408...",,
407950,Zwyer Baum- & Gartenarbeiten,1435061,CHE315286545,1371,Sattel,Eggeli,5,Sattel,6417,CH,1,GELOESCHT,2023-11-08,"Baum- und Gartenarbeiten, Umgebungspflege, Hau...",1004883750,2020-05-06,"Zwyer Baum- & Gartenarbeiten, in Sattel, CHE-3...",,
407961,Šajn Reinigung Clean,1604777,CHE360924837,3203,St. Gallen,Friedhofstrasse,9,St. Gallen,9014,CH,1,GELOESCHT,2024-05-16,Erbringung von Dienstleistungen im Bereich Rei...,1005841804,2023-09-20,"Š ajn Reinigung Clean, in St. Gallen, CHE-360....",,
407968,Regalado Santana Music,1663051,CHE484499446,261,Zürich,Stettbacherrain,17,Zürich,8051,CH,1,GELOESCHT,2025-03-25,Die Firma bezweckt die Publikation und den Ver...,1006166811,2024-10-31,"Regalado Santana Music, in Zürich, CHE-484.499...",,


In [61]:
# Export a sample of 5000 firms that have a real website URL and a German founding message.
# "Real" means the URL is present and is not one of the scraper's placeholder values.
has_real_url = (
    df_merged['company_url'].notna()
    & ~df_merged['company_url'].isin(['no website available', 'no_match_found'])
)
is_german = df_merged['language'] == 'de'
(
    df_merged[has_real_url & is_german]
    # Fixed seed so that re-running the notebook exports the same firms.
    .sample(n=5000, random_state=42)
    .to_csv(RAW_DATA_DIR / 'company_urls' / 'urls_de.csv', index=False)
)

In [57]:
# Export a small sample of 20 firms that have a real website URL (any language).
# "Real" means the URL is present and is not one of the scraper's placeholder values.
has_real_url = (
    df_merged['company_url'].notna()
    & ~df_merged['company_url'].isin(['no website available', 'no_match_found'])
)
(
    df_merged[has_real_url]
    # Fixed seed so that re-running the notebook exports the same firms.
    .sample(n=20, random_state=42)
    .to_csv(RAW_DATA_DIR / 'company_urls' / 'urls.csv', index=False)
)

In [36]:
# Firms that were scraped but for which no website was found.
df_merged.loc[
    df_merged['company_url'].eq('no website available')
    & df_merged['company_url'].notna()
]

Unnamed: 0,name,ehraid,uid,legal_seat_id,legal_seat,street,house_number,town,swiss_zip_code,country,legal_form_id,status,delete_date,purpose_raw,shab_id,founding_date,founding_message,noga,company_url
0,TS-Gebäudereinigung Savic,1258893,CHE142825231,3204,Wittenbach,St. Gallerstrasse,57,Wittenbach,9300,CH,1,GELOESCHT,2017-11-08,Gebäudereinigung,2763045,2016-04-06,"TS-Gebäudereinigung Savic + Co, in Wittenbach,...",812100,no website available
1,Coplan E. Kistler,1263962,CHE178876510,2773,Reinach (BL),Fontanaweg,2,Reinach BL,4153,CH,1,EXISTIEREND,NaT,"Planung, Projektierung und Ausführung von Anla...",2844775,2016-05-23,"Coplan E. Kistler, in Reinach BL, CHE-178.876....",711203,no website available
6,Swiss Blade AG,1272772,CHE379732292,1708,Steinhausen,Bahnhofstrasse,63,Steinhausen,6312,CH,3,EXISTIEREND,NaT,Die Gesellschaft bezweckt die Erbringung von L...,2985219,2016-08-03,Medisantos AG (Medisantos SA) (Medisantos Ltd)...,282900,no website available
7,GE Energy STS Switzerland GmbH,1285073,CHE226642951,4021,Baden,Brown Boveri Strasse,7,Baden,5400,CH,4,GELOESCHT,2017-06-02,"Erwerb, Handel und Dienstleistungen im Zusamme...",3195847,2016-12-01,"GE Energy STS Switzerland GmbH, in Baden, CHE-...",691002,no website available
9,Catering Consulting di Fabi Fabrizio,1287821,CHE291004139,3787,St. Moritz,Via Signuria,10,St. Moritz,7500,CH,1,GELOESCHT,2024-10-16,"La consulenza, l'intermediazione e la prestazi...",3238139,2016-12-21,"Consulting Singles di Fabi Fabrizio, in Balern...",562100,no website available
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
407982,Maven Agency AG,1526459,CHE193960193,1701,Baar,Blegistrasse,7,Baar,6340,CH,3,EXISTIEREND,NaT,Der Hauptzweck der Gesellschaft ist die Erbrin...,1005419317,2022-03-03,"Maven Agency AG, in Kreuzlingen, CHE-193.960.1...",620100,no website available
407983,Brozen Impact SA,1529359,CHE467174776,6458,Neuchâtel,Place Pury,3,Neuchâtel,2000,CH,3,EXISTIEREND,NaT,"investir, en Suisse et à l'étranger, dans des ...",1005435074,2022-03-24,"Brozen Impact SA, à Neuchâtel, Place Pury 3, c...",631200,no website available
407984,MR Schreiner GmbH,1546402,CHE149617780,4666,Kemmental,Rütiweg,8,Dotnacht,8566,CH,4,EXISTIEREND,NaT,Die Gesellschaft bezweck die Erbringung von Di...,1005526302,2022-07-21,"MR Schreiner GmbH, in Kemmental, CHE-149.617.7...",683200,no website available
407994,Amavet Sàrl,1640857,CHE338218307,6621,Genève,Route de Meyrin,49,Genève,1203,CH,4,EXISTIEREND,NaT,La société a pour but l'exécution de toutes pr...,1006039627,2024-05-24,"Amavet Sàrl, à Genève, Route de Meyrin 49, c/o...",750000,no website available


In [30]:
# Firms with no entry at all in the scraped URL table (company_url is missing after the left join).
df_merged.loc[df_merged['company_url'].isna()]

Unnamed: 0,name,ehraid,uid,legal_seat_id,legal_seat,street,house_number,town,swiss_zip_code,country,legal_form_id,status,delete_date,purpose_raw,shab_id,founding_date,founding_message,noga,company_url
12101,CEGA Elektro GmbH in Liquidation,1267213,CHE484087210,177,Pfäffikon,Hermatswilerstrasse,62,Pfäffikon ZH,8330,CH,4,GELOESCHT,2021-02-08,Erbringung von handwerklichen Dienstleistungen...,2898729,2016-06-20,CEGA Elektro GmbH (CEGA Elektro Sàrl) (CEGA El...,,
12102,WENK KERAMIK Inhaber Bozkurt,1274305,CHE147679103,2762,Allschwil,Beim Lindenbaum,23,Allschwil,4123,CH,1,GELOESCHT,2020-05-20,Plattenleger-Geschäft.,3013709,2016-08-22,"WENK KERAMIK Inhaber Bozkurt, in Allschwil, CH...",,
12103,Hodapp Schweiz GmbH,1274405,CHE376020723,4280,Oftringen,Föhrenweg,4,Oftringen,4665,CH,4,EXISTIEREND,NaT,"Vertrieb, Herstellung, Elektrifizierung, Monta...",3016209,2016-08-23,"Hodapp Schweiz GmbH, in Oftringen, CHE-376.020...",,
12104,"Cafe&Bistro Piazza, Jacqueline Ryf",1281055,CHE204178796,4163,Frick,Hauptstrasse,35,Frick,5070,CH,1,EXISTIEREND,NaT,Betrieb eines Cafe's und Bistros.,3128493,2016-10-26,"Cafe&Bistro Piazza, Jacqueline Thommen, in Fri...",,
12105,Kieser Training Schweiz AG,1321139,CHE232109599,261,Zürich,Hardstrasse,223,Zürich,8005,CH,3,EXISTIEREND,NaT,Ausarbeitung und Durchführung von Krafttrainin...,3783261,2017-10-02,"exersuisse ag, in Zug, CHE-232.109.599, Bundes...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
407988,MAC Formation Sàrl,1559672,CHE364200394,2125,Bulle,Rue de Vevey,178,Bulle,1630,CH,4,EXISTIEREND,NaT,la société a pour but le formation de professi...,1005599305,2022-11-08,"MAC Formation Sàrl, à Bulle, Rue de Vevey 178,...",,
407989,Gartencharme Stettler,1563461,CHE396470785,902,Langnau im Emmental,Gmünden,984,Gohl,3553,CH,1,EXISTIEREND,NaT,Erbringung von Dienstleistungen im Bereich Unt...,1005619150,2022-12-05,"Gartencharme Stettler, in Langnau im Emmental,...",,
407990,Schnider Optik GmbH,1565260,CHE294799713,1054,Ebikon,Riedmattstrasse,12,Ebikon,6030,CH,4,EXISTIEREND,NaT,Die Gesellschaft bezweckt den Betrieb eines Op...,1005627716,2022-12-14,"Schnider Optik GmbH, in Ebikon, CHE-294.799.71...",,
407992,RMB-CARS Sàrl,1579131,CHE215949718,5852,Bursinel,Route du Village,20,Bursinel,1195,CH,4,EXISTIEREND,NaT,la société a pour but toutes activités d'achat...,1005703446,2023-03-17,"RMB-CARS Sàrl, à Bursinel, Route du Village 20...",,


In [4]:
import numpy as np
import pandas as pd
from pocketknife.database import connect_database, read_from_database
from success_prediction.zefix_processing.clustering import PersonClustering

# Opt in to pandas' future behaviour of not silently downcasting result dtypes
# (presumably to silence the FutureWarning raised by the .replace() call further down — confirm).
pd.set_option('future.no_silent_downcasting', True)

In [262]:
# Load every row of the inscribed-people history table.
query = """
    SELECT * FROM zefix.history_inscribed_people
"""

with connect_database() as con:
    people_df = read_from_database(con, query)
people_df['shab_date'] = pd.to_datetime(people_df['shab_date'])

# Filter out companies where we don't have at least one founder.
# .copy() makes the filtered frame independent of the unfiltered one, so the column
# assignment below does not write into a slice (avoids SettingWithCopyWarning).
people_df = people_df[people_df.ehraid.isin(people_df[people_df.founders].ehraid)].copy()

# Pre-process dataframe: cast all BFS municipality code columns to strings and
# treat the code '0' as missing.
bfs_code_cols = [col for col in people_df.columns if 'bfs_gmde_code_' in col]
# NOTE(review): astype(str) turns missing values into the literal string 'nan', and a
# float zero becomes '0.0', which the replace('0', ...) below does not match — confirm
# the column dtypes and that the downstream clustering expects this.
people_df[bfs_code_cols] = people_df[bfs_code_cols].astype(str).replace('0', np.nan)

In [None]:
# test_df = people_df[people_df.ehraid.isin([1600448, 1251490, 1260743, 1328630])].reset_index(drop=True).copy()

In [None]:
# Cluster the person records of the full people frame; `cluster()` returns the frame shown below.
# NOTE(review): the saved progress bar (4/4) and the displayed ehraids match the four companies
# in the commented-out `test_df` line above, so the stored outputs appear to come from that
# test subset rather than from `people_df` — re-run before relying on them.
clustering = PersonClustering(people_df)
clustered_df = clustering.cluster()

Cluster people within company: 100%|██████████| 4/4 [00:00<00:00, 272.57it/s]


In [None]:
# Preview the clustering result (the output includes `heuristic` and `fid` columns).
clustered_df

Unnamed: 0,ehraid,shab_date,shab_id,keyword,first_name,first_name_norm,last_name,last_name_norm,job_title,signing_rights,...,place_of_residence_2_bfs_stand_origin,hometown_1_bfs_gmde_code_latest,hometown_2_bfs_gmde_code_latest,hometown_3_bfs_gmde_code_latest,hometown_4_bfs_gmde_code_latest,hometown_5_bfs_gmde_code_latest,place_of_residence_1_bfs_gmde_code_latest,place_of_residence_2_bfs_gmde_code_latest,heuristic,fid
7,1251490,2016-02-03,2636489,personnes inscrites special,Daniel,daniel,Delisle,delisle,associé-gérant,avec signature individuelle,...,,5586.0,,,,,6458,,,1.0
8,1260743,2016-04-21,2792853,personnes inscrites special,Daniel,daniel,Delisle,delisle,associé-gérant,avec signature individuelle,...,,5586.0,,,,,6458,,,1.0
9,1328630,2017-12-04,3907923,personnes inscrites special,José,jose,Vilela,vilela,président,avec signature collective à deux,...,,,,,,,371,,,2.0
13,1328630,2017-12-04,3907923,personnes inscrites special,Joël,joel,Sprunger,sprunger,,avec signature collective à deux,...,,4726.0,,,,,690,,,3.0
12,1328630,2017-12-04,3907923,personnes inscrites special,Olivier,olivier,Sprunger,sprunger,trésorier,avec signature collective à deux,...,,4726.0,,,,,690,,,4.0
11,1328630,2017-12-04,3907923,personnes inscrites special,Daniel,daniel,Delisle,delisle,secrétaire,avec signature collective à deux,...,,5586.0,,,,,6458,,,1.0
10,1328630,2017-12-04,3907923,personnes inscrites special,Alain,alain,Jaccard,jaccard,vice-président,avec signature collective à deux,...,,5568.0,,,,,5568,,,5.0
17,1600448,2023-08-16,1005817188,eingetragene personen,Serdar,serdar,Gelebek,gelebek,Mitglied des Verwaltungsrates,mit Kollektivunterschrift zu zweien,...,,,,,,,4781,,,6.0
16,1600448,2023-08-16,1005817188,eingetragene personen,Daniel,daniel,Böhringer,boehringer,Mitglied des Verwaltungsrates,mit Kollektivunterschrift zu zweien,...,,247.0,,,,,1709,,,7.0
15,1600448,2023-08-16,1005817188,eingetragene personen,Kevin,kevin,Bollmann,bollmann,Mitglied des Verwaltungsrates,mit Kollektivunterschrift zu zweien,...,,180.0,,,,,230,,,8.0
