# Coding Setup

### Load Packages

In [40]:
# All packages (see requirements.txt)

## Data Collection and Processing
import pandas as pd
from collections import defaultdict

## System and Environment
import os
from dotenv import load_dotenv
import glob
import time

## Api calling
import requests
from opencage.geocoder import OpenCageGeocode
import gender_guesser.detector as gender
from pathlib import Path
import logging

## Math operations
from math import radians, sin, cos, sqrt, atan2

### Set Working Environment

In [41]:
# ---insert your path here ---
os.chdir('/Users/janlinzner/Projects/thesis-spatial-seed-syndication') 

### Load Api-keys

In [42]:
# load api keys from .env file or insert the key directly
load_dotenv()
geocage = os.getenv("geocage")
genderize = os.getenv("genderize")

### Create Harvesine Formula
[How to calculate the harvesine formula](https://stackoverflow.com/questions/27928/calculate-distance-between-two-latitude-longitude-points-haversine-formula)

In [43]:
def haversine(lat1, lon1, lat2, lon2):
    R = 6371 

    phi1, phi2 = radians(lat1), radians(lat2)
    d_phi = radians(lat2 - lat1)
    d_lambda = radians(lon2 - lon1)

    a = sin(d_phi / 2) ** 2 + cos(phi1) * cos(phi2) * sin(d_lambda / 2) ** 2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return R * c

# Data Collection

![Data Generation](data_generation.png)

## Data Sources Overview

## Data Loading and Preparation

### A: Companies

#### Companies: Load Data

Load all company files from `data/companies`

In [44]:
root_directory = 'data/companies'

df_list = []

selected_columns = [
    "Organization Name", "Organization Name URL", "Description", "Industry Groups", "Headquarters Location",
    "Founded Date", "Number of Founders", "Last Equity Funding Type", "Exit Date", "Founders" 
]

for subdir, _, _ in os.walk(root_directory):
    csv_files = glob.glob(os.path.join(subdir, '*.csv'))
    for file in csv_files:
        try:
            df = pd.read_csv(file)
            df_list.append(df)
        except Exception as e:
            print(f"Error reading {file}: {e}")

final_df = pd.concat(df_list, ignore_index=True)

companies = final_df[selected_columns].copy()

Rename the variables to small letters with underscore

In [45]:
companies = companies.rename(columns={
    "Organization Name": "organization_name",
    "Organization Name URL": "organization_name_url",
    "Description": "organization_description",
    "Industry Groups": "industry_groups",
    "Headquarters Location": "hq_location",
    "Founded Date": "founded_date",
    "Number of Founders": "number_of_founders",
    "Last Equity Funding Type": "last_equity_funding_type",
    "Exit Date": "exit_date",
    "Founders": "founders"
})

#### Companies: Clean Data

Convert data variables to date types

In [46]:
date_cols = ["founded_date"]
for col in date_cols:
    companies[col] = pd.to_datetime(companies[col], errors="coerce")

Get `founding_year`

In [47]:
companies["founded_year"] = companies["founded_date"].dt.year
companies.drop(columns=["founded_date"])

Unnamed: 0,organization_name,organization_name_url,organization_description,industry_groups,hq_location,number_of_founders,last_equity_funding_type,exit_date,founders,founded_year
0,DePoly,https://www.crunchbase.com/organization/depoly,DePoly is a PET-to-raw-material recycling comp...,"Science and Engineering, Sustainability","Sion, Valais, Switzerland",3.0,Seed,,"Bardiya Valizadeh, Christopher Ireland, Samant...",2020
1,Hilo,https://www.crunchbase.com/organization/aktiia,Hilo is a healthcare company that provides blo...,"Artificial Intelligence (AI), Consumer Electro...","Neuchâtel, Neuchatel, Switzerland",3.0,Series B,,"Josep Sola, Mattia Bertschi, Raghav Gupta",2018
2,Sygnum,https://www.crunchbase.com/organization/sygnum,Sygnum is a digital asset banking group that e...,"Blockchain and Cryptocurrency, Financial Servi...","Zürich, Zurich, Switzerland",3.0,Series C,,"Gerald Goh, Luka Muller, Mathias Imbach",2017
3,HAYA Therapeutics,https://www.crunchbase.com/organization/haya-t...,HAYA Therapeutics is a biopharmaceutical compa...,"Biotechnology, Health Care, Science and Engine...","Lausanne, Vaud, Switzerland",2.0,Undisclosed,,"Daniel Blessing, Samir Ounzain",2017
4,Squirro,https://www.crunchbase.com/organization/squirro,Squirro is a software company that utilizes Au...,"Artificial Intelligence (AI), Data and Analyti...","Zürich, Zurich, Switzerland",4.0,Private Equity,,"Dorian Selz, Felix Hürlimann, Patrice Neff, To...",2012
...,...,...,...,...,...,...,...,...,...,...
23093,OutdoorCompute,https://www.crunchbase.com/organization/outdoo...,OutdoorCompute's liquid immersion cooling tech...,,"Benningbroek, Noord-Holland, The Netherlands",,Seed,,,2023
23094,NEKOD,https://www.crunchbase.com/organization/nekod,NEKOD is a platform designed for business auto...,"Artificial Intelligence (AI), Data and Analyti...","Amsterdam, Noord-Holland, The Netherlands",,Pre-Seed,,,2024
23095,Kvikk Insurance,https://www.crunchbase.com/organization/kvikk-...,Kvikk is a digital insurance platform.,Financial Services,"Bergen, Noord-Holland, The Netherlands",,Pre-Seed,,,2021
23096,Dynamicpixels,https://www.crunchbase.com/organization/dynami...,"PaaS,Video games,Back-end","Gaming, Software","Delft, Zuid-Holland, The Netherlands",,Pre-Seed,,,2023


Provide `company_id` with padding zeros (e.g. 000001)

In [48]:
companies.reset_index(drop=True, inplace=True)
companies["company_id"] = companies.index + 1
companies["company_id"] = companies["company_id"].apply(lambda x: f"{x:06d}")
companies["company_id"] = companies["company_id"].astype("string")

cols = ["company_id"] + [col for col in companies.columns if col != "company_id"]
companies = companies[cols]

Fill missing values in `number_of_founders` with mean

In [49]:
mean_founders = companies["number_of_founders"].mean(skipna=True)
companies["number_of_founders"] = companies["number_of_founders"].fillna(round(mean_founders)).astype(int)

Create `exit_binary` (True if value inside `exit_date`)

In [50]:
companies.loc[:, "exit_binary"]  = companies["exit_date"].notna().astype(int)
companies["exit_binary"] = companies["exit_binary"].astype(bool)

Add geocoding and get `longitude` and `latitude` data

In [51]:
data_dir = Path("data/locations")
data_dir.mkdir(parents=True, exist_ok=True)
cache_file = data_dir / "location_cache.csv"
BASE_URL = "https://api.opencagedata.com/geocode/v1/json"
API_KEY = geocage
DELAY = 0.5

logging.basicConfig(level=logging.INFO, format="%(message)s")

def load_cache(path):
    if path.exists():
        return pd.read_csv(path)
    df = pd.DataFrame(columns=["hq_location","latitude","longitude"])
    df.to_csv(path, index=False)
    return df

def geocode(companies, loc_col="hq_location"):
    cache = load_cache(cache_file)
    cache_map = dict(zip(cache.hq_location, zip(cache.latitude, cache.longitude)))
    results, new = [], []

    for i, loc in enumerate(companies[loc_col].dropna().unique(), 1):
        if not loc.strip():
            lat, lng, tag = None, None, 'blank'
        elif loc in cache_map:
            lat, lng, tag = *cache_map[loc], 'cached'
        else:
            try:
                r = requests.get(BASE_URL, params={"key":API_KEY, "q":loc, "limit":1})
                data = r.json().get('results', [])
                lat, lng = data[0]['geometry']['lat'], data[0]['geometry']['lng'] if data else (None, None)
            except:
                lat, lng = None, None
            new.append({'hq_location':loc,'latitude':lat,'longitude':lng})
            tag = 'fetched'
            time.sleep(DELAY)

        logging.info(f"[{i}/{len(cache_map)+len(new)}] ({tag}) {loc!r} → {lat}, {lng}")
        results.append({'hq_location':loc,'latitude':lat,'longitude':lng})

    if new:
        pd.concat([cache, pd.DataFrame(new)]).drop_duplicates('hq_location').to_csv(cache_file, index=False)
    return companies.merge(pd.DataFrame(results), on='hq_location', how='left')

companies = geocode(companies, 'hq_location')

[1/4500] (cached) 'Sion, Valais, Switzerland' → 46.2311749, 7.3588795
[2/4500] (cached) 'Neuchâtel, Neuchatel, Switzerland' → 46.9895828, 6.9292641
[3/4500] (cached) 'Zürich, Zurich, Switzerland' → 47.3744489, 8.5410422
[4/4500] (cached) 'Lausanne, Vaud, Switzerland' → 46.5218269, 6.6327025
[5/4500] (cached) 'Zug, Zug, Switzerland' → 47.1679898, 8.5173652
[6/4500] (cached) 'Sankt Gallen, Sankt Gallen, Switzerland' → 47.425618, 9.3762397
[7/4500] (cached) 'Kemptthal, Zurich, Switzerland' → 47.4528517, 8.7058075
[8/4500] (cached) 'Basel, Basel-Stadt, Switzerland' → 47.5581077, 7.5878261
[9/4500] (cached) 'Baar, Zug, Switzerland' → 47.1951976, 8.5253985
[10/4500] (cached) 'Bern, Bern, Switzerland' → 46.9484742, 7.4521749
[11/4500] (cached) 'Lugano, Ticino, Switzerland' → 46.0050102, 8.9520281
[12/4500] (cached) 'Zofingen, Aargau, Switzerland' → 47.288491, 7.9458259
[13/4500] (cached) 'Horgen, Zurich, Switzerland' → 47.260692, 8.5976831
[14/4500] (cached) 'Monthey, Valais, Switzerland' → 4

Add the homecountry (`hq_country`)

In [52]:
companies["hq_country"] = companies["hq_location"].apply(lambda x: x.split(',')[-1].strip() if pd.notna(x) else None)

Add startup ecosystem binary (`hub_binary`) if next hotspot less than 100km

In [53]:
hotspot_data = [
    {"city": "London",       "lat": 51.5074, "lng": -0.1278},
    {"city": "Paris",        "lat": 48.8566, "lng": 2.3522},
    {"city": "Berlin",       "lat": 52.5200, "lng": 13.4050},
    {"city": "Stockholm",    "lat": 59.3293, "lng": 18.0686},
    {"city": "Munich",       "lat": 48.1351, "lng": 11.5820},
    {"city": "Helsinki",     "lat": 60.1695, "lng": 24.9354},
    {"city": "Madrid",       "lat": 40.4168, "lng": -3.7038},
    {"city": "Dublin",       "lat": 53.3498, "lng": -6.2603},
    {"city": "Tallinn",      "lat": 59.4370, "lng": 24.7536},
    {"city": "Copenhagen",   "lat": 55.6761, "lng": 12.5683},
    {"city": "Milan",        "lat": 45.4642, "lng": 9.1900},
    {"city": "Zurich",       "lat": 47.3769, "lng": 8.5417},
    {"city": "Oslo",         "lat": 59.9139, "lng": 10.7522},
    {"city": "Cambridge",    "lat": 52.2053, "lng": 0.1218},
    {"city": "Kyiv",         "lat": 50.4501, "lng": 30.5234},
    {"city": "Vienna",       "lat": 48.2082, "lng": 16.3738},
    {"city": "Brussels",     "lat": 50.8503, "lng": 4.3517},
    {"city": "Manchester",   "lat": 53.4808, "lng": -2.2426},
    {"city": "Lisbon",       "lat": 38.7169, "lng": -9.1399},
    {"city": "Prague",       "lat": 50.0755, "lng": 14.4378},
    {"city": "Warsaw",       "lat": 52.2297, "lng": 21.0122},
    {"city": "Hamburg",      "lat": 53.5511, "lng": 9.9937},
    {"city": "Oxford",       "lat": 51.7520, "lng": -1.2577},
    {"city": "Amsterdam",    "lat": 52.3676, "lng": 4.9041},
    {"city": "Barcelona",    "lat": 41.3851, "lng": 2.1734},
    {"city": "Lausanne",     "lat": 46.5197, "lng": 6.6323},
]

hotspots_df = pd.DataFrame(hotspot_data)

def haversine(lat1, lon1, lat2, lon2):
    R = 6371.0  # Earth radius in km
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat / 2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2)**2
    return R * 2 * atan2(sqrt(a), sqrt(1 - a))

def min_dist_to_hotspot(lat, lon):
    if pd.isna(lat) or pd.isna(lon):
        return float('inf')
    return hotspots_df.apply(
        lambda r: haversine(lat, lon, r["lat"], r["lng"]), axis=1
    ).min()

companies["latitude"]  = pd.to_numeric(companies["latitude"],  errors="coerce")
companies["longitude"] = pd.to_numeric(companies["longitude"], errors="coerce")

companies["distance_to_hub"] = companies.apply(
    lambda r: min_dist_to_hotspot(r["latitude"], r["longitude"]), axis=1
)
companies["hub_binary"] = (companies["distance_to_hub"] < 20).astype(bool)

Business orientation (B2B or B2C)<br>
<sub>Gathered through LLM-approach (see notebook: )<sub>

In [54]:
focus = pd.read_csv('data/business_focus/companies_business_focus_save.csv')
url_to_b2b = dict(zip(focus['Organization Name URL'], focus['B2B Binary']))
companies['b2b'] = companies['organization_name_url'].map(url_to_b2b)
df_missing_b2b = companies[companies['b2b'].isna()]
df_missing_b2b.to_csv('data/business_focus/df_missing_b2b.csv', index=False)

GICS Groups

In [55]:
companies["industry_groups"] = companies["industry_groups"].fillna('').str.replace(r'\s*,\s*', ',', regex=True)
dummies = companies["industry_groups"].str.get_dummies(sep=',').astype(bool)
companies = pd.concat([companies, dummies], axis=1)

In [56]:
to_move = [
    "Administrative Services", "Advertising", "Agriculture and Farming",
    "Apps", "Artificial Intelligence (AI)", "Biotechnology",
    "Blockchain and Cryptocurrency", "Clothing and Apparel",
    "Commerce and Shopping", "Community and Lifestyle",
    "Consumer Electronics", "Consumer Goods", "Content and Publishing",
    "Data and Analytics", "Design", "Education", "Energy", "Events",
    "Financial Services", "Food and Beverage", "Gaming",
    "Government and Military", "Hardware", "Health Care",
    "Information Technology", "Internet Services",
    "Lending and Investments", "Manufacturing", "Media and Entertainment",
    "Messaging and Telecommunications", "Mobile", "Music and Audio",
    "Natural Resources", "Navigation and Mapping", "Other", "Payments",
    "Platforms", "Privacy and Security", "Professional Services",
    "Real Estate", "Sales and Marketing", "Science and Engineering",
    "Social Impact", "Software", "Sports", "Sustainability",
    "Transportation", "Travel and Tourism", "Video"
]
cols_to_move = [c for c in to_move if c in companies.columns]

new_order = [c for c in companies.columns if c not in cols_to_move] + cols_to_move
companies = companies[new_order]

gics_map = {
    "Energy": "energy",
    "Blockchain and Cryptocurrency": "information_technology",
    "Natural Resources": "materials",
    "Administrative Services": "industrials",
    "Design": "industrials",
    "Government and Military": "industrials",
    "Manufacturing": "industrials",
    "Professional Services": "industrials",
    "Science and Engineering": "industrials",
    "Social Impact": "industrials",
    "Transportation": "industrials",
    "Clothing and Apparel": "consumer_discretionary",
    "Commerce and Shopping": "consumer_discretionary",
    "Community and Lifestyle": "consumer_discretionary",
    "Consumer Electronics": "consumer_discretionary",
    "Consumer Goods": "consumer_discretionary",
    "Education": "consumer_discretionary",
    "Events": "consumer_discretionary",
    "Sports": "consumer_discretionary",
    "Travel and Tourism": "consumer_discretionary",
    "Agriculture and Farming": "consumer_staples",
    "Food and Beverage": "consumer_staples",
    "Biotechnology": "health_care",
    "Health Care": "health_care",
    "Financial Services": "financials",
    "Lending and Investments": "financials",
    "Payments": "financials",
    "Apps": "information_technology",
    "Artificial Intelligence (AI)": "information_technology",
    "Blockchain and Cryptocurrency": "information_technology",
    "Data and Analytics": "information_technology",
    "Hardware": "information_technology",
    "Information Technology": "information_technology",
    "Internet Services": "information_technology",
    "Mobile": "information_technology",
    "Navigation and Mapping": "information_technology",
    "Platforms": "information_technology",
    "Privacy and Security": "information_technology",
    "Software": "information_technology",
    "Advertising": "communication_services",
    "Content and Publishing": "communication_services",
    "Gaming": "communication_services",
    "Media and Entertainment": "communication_services",
    "Messaging and Telecommunications": "communication_services",
    "Music and Audio": "communication_services",
    "Sales and Marketing": "communication_services",
    "Video": "communication_services",
    "Sustainability": "utilities",
    "Real Estate": "real_estate",
    "Other": "other",
}

head_to_orig = defaultdict(list)
for orig, head in gics_map.items():
    head_to_orig[head].append(orig)

for head, orig_cols in head_to_orig.items():
    companies[head] = companies[orig_cols].any(axis=1)

to_drop = [c for c in cols_to_move if gics_map[c] != c]
companies.drop(columns=to_drop, inplace=True)

gics_heads = list(head_to_orig.keys())
final_order = [c for c in companies.columns if c not in gics_heads] + gics_heads
companies = companies[final_order]

### B: Rounds

#### Rounds: Load Data

Load all rounds from `data/rounds`

In [57]:
root_directory = 'data/rounds'

selected_columns = [
    'Transaction Name URL', 'Organization Name', 'Organization Name URL', 'Funding Type', 'Money Raised (in USD)', 'Announced Date', 'Lead Investors', 'Investor Names'
]

df_list = []

for subdir, _, _ in os.walk(root_directory):
    csv_files = glob.glob(os.path.join(subdir, '*.csv'))
    for file in csv_files:
        try:
            df = pd.read_csv(file)
            df_list.append(df)
        except Exception as e:
            print(f"Error reading {file}: {e}")

final_df = pd.concat(df_list, ignore_index=True)

rounds = final_df[selected_columns].copy()

#### Rounds: Clean Data

Drop duplicate rounds

In [58]:
rounds = rounds.drop_duplicates()

Rename the variables

In [59]:
rounds = rounds.rename(columns={
    "Transaction Name URL": "transaction_name_url",
    "Organization Name": "organization_name",
    "Organization Name URL": "organization_name_url",
    "Funding Type": "funding_type",
    "Money Raised (in USD)": "money_raised_usd",
    "Announced Date": "announced_date",
    "Lead Investors": "lead_investors",
    "Investor Names": "investor_names"
})

Drop all rounds other than Pre-Seed or Seed

In [60]:
rounds = rounds[rounds['funding_type'].isin(['Pre-Seed', 'Seed'])]

Change `announced_date` to date type

In [61]:
rounds['announced_date'] = pd.to_datetime(rounds['announced_date'], errors='coerce')

Add IDs (`round_id`)

In [62]:
rounds.reset_index(drop=True, inplace=True)
rounds['round_id'] = rounds.index + 1
rounds['round_id'] = rounds['round_id'].apply(lambda x: f"{x:06d}")
rounds['round_id'] = rounds['round_id'].astype('string')

cols = ['round_id'] + [c for c in rounds.columns if c != 'round_id']
rounds = rounds[cols]

### C: Investors

#### Investors: Load Data

Load all files from `data/investor`

In [63]:
csv_dir = 'data/investor'
columns = [
    "Organization/Person Name", "Organization/Person Name URL", "Investor Type",
    "Location", "Description", "Founded Date", "CB Rank (Investor)"
]

files = glob.glob(os.path.join(csv_dir, '*.csv'))
dfs = [pd.read_csv(f) for f in files if os.path.getsize(f) > 0]

if dfs:
    investors = pd.concat(dfs, ignore_index=True)
    investors = investors[[col for col in columns if col in investors.columns]]
    investors.drop_duplicates(subset=["Organization/Person Name", "Organization/Person Name URL"], inplace=True)
else:
    investors = pd.DataFrame(columns=columns)

#### Investors: Clean Data

Rename the variables

In [64]:
investors = investors.rename(columns={
    "Organization/Person Name": "organization_person_name",
    "Organization/Person Name URL": "organization_person_name_url",
    "Investor Type": "investor_type",
    "Location": "hq_location",
    "Description": "description",
    "Founded Date": "founded_date",
    "CB Rank (Investor)": "cb_rank_investor"
})

Convert `founded_date` into the date type

In [65]:
date_cols = ["founded_date"]
for col in date_cols:
    investors[col] = pd.to_datetime(investors[col], errors='coerce')

Provide `investor_id`

In [66]:
investors.reset_index(drop=True, inplace=True)
investors['investor_id'] = investors.index + 1
investors['investor_id'] = investors['investor_id'].apply(lambda x: f"{x:06d}")
investors['investor_id'] = investors['investor_id'].astype('string')

cols = ['investor_id'] + [c for c in investors.columns if c != 'investor_id']
investors = investors[cols]

Add `longitude` and `latitude` pairs

In [67]:
data_dir = Path("data/locations")
data_dir.mkdir(parents=True, exist_ok=True)
cache_file = data_dir / "location_cache.csv"
BASE_URL = "https://api.opencagedata.com/geocode/v1/json"
API_KEY = geocage
DELAY = 0.5

logging.basicConfig(level=logging.INFO, format="%(message)s")

def load_cache(path):
    if path.exists():
        return pd.read_csv(path)
    df = pd.DataFrame(columns=["hq_location", "latitude", "longitude"])
    df.to_csv(path, index=False)
    return df

def geocode_locations(df, loc_col="hq_location"):
    cache = load_cache(cache_file)
    cache_map = dict(zip(cache.hq_location, zip(cache.latitude, cache.longitude)))
    results = []
    new_entries = []

    unique_locs = df[loc_col].dropna().unique()
    total = len(unique_locs)

    for idx, loc in enumerate(unique_locs, start=1):
        if not loc.strip():
            lat = lng = None
            tag = 'blank'
        elif loc in cache_map:
            lat, lng = cache_map[loc]
            tag = 'cached'
        else:
            try:
                r = requests.get(BASE_URL, params={"key": API_KEY, "q": loc, "limit": 1}, timeout=10)
                r.raise_for_status()
                res = r.json().get('results')
                if res:
                    lat = res[0]['geometry']['lat']
                    lng = res[0]['geometry']['lng']
                else:
                    lat = lng = None
            except Exception as e:
                logging.warning(f"Error fetching '{loc}': {e}")
                lat = lng = None
            new_entries.append({'hq_location': loc, 'latitude': lat, 'longitude': lng})
            tag = 'fetched'
            time.sleep(DELAY)

        logging.info(f"[{idx}/{total}] ({tag}) {loc!r} → {lat}, {lng}")
        results.append({'hq_location': loc, 'latitude': lat, 'longitude': lng})

    if new_entries:
        updated = pd.concat([cache, pd.DataFrame(new_entries)], ignore_index=True)
        updated.drop_duplicates(subset='hq_location', keep='first', inplace=True)
        updated.to_csv(cache_file, index=False)
        logging.info(f"Cache updated ({len(new_entries)} new entries)")

    coords_df = pd.DataFrame(results)
    return df.merge(coords_df, on='hq_location', how='left')

investors = geocode_locations(investors, 'hq_location')

[1/2830] (cached) 'Tokyo, Tokyo, Japan' → 35.6812546, 139.766706
[2/2830] (cached) 'London, England, United Kingdom' → 51.4893335, -0.1440551
[3/2830] (cached) 'Olten, Aargau, Switzerland' → 47.3085666, 7.8932696
[4/2830] (cached) 'Toronto, Ontario, Canada' → 43.6534817, -79.3839347
[5/2830] (cached) 'Muttenz, Basel-Landschaft, Switzerland' → 47.525113, 7.6477401
[6/2830] (cached) 'Mechelen-bovelingen, Limburg, Belgium' → 50.7427937, 5.2629291
[7/2830] (cached) 'Rome, Lazio, Italy' → 41.8933203, 12.4829321
[8/2830] (cached) 'Östermalm, Stockholms Lan, Sweden' → 59.3382751, 18.0718928
[9/2830] (cached) 'Mountain View, California, United States' → 37.3893889, -122.0832101
[10/2830] (cached) 'Charlotte, North Carolina, United States' → 35.2272086, -80.8430827
[11/2830] (cached) 'Bangalore, Karnataka, India' → 12.9767936, 77.590082
[12/2830] (cached) 'Vienna, Wien, Austria' → 48.1857192, 16.4221587
[13/2830] (cached) 'Hamburg, Hamburg, Germany' → 53.550341, 10.000654
[14/2830] (cached) 'Li

Get homecountry (`hq_country`)

In [68]:
investors['hq_country'] = investors['hq_location'].apply(lambda x: x.split(',')[-1].strip() if pd.notna(x) else None)

Investor Type

In [69]:
def parse_types(x):
    if pd.isna(x):
        return []
    if isinstance(x, list):
        return [t.strip().lower() for t in x]
    return [t.strip().lower() for t in str(x).split(",")]

keywords = [
    "accelerator",
    "incubator",
    "micro vc",
    "corporate venture capital",
    "angel group",
    "angel",
    "university program",
    "entrepreneurship program",
    "family investment office"
]

investors["type_list"] = investors["investor_type"].apply(parse_types)

for kw in keywords:
    col = kw.replace(" ", "_")
    investors[col] = investors["type_list"].apply(
        lambda types: any(kw in t for t in types)
    )

investors.drop(columns="type_list", inplace=True)

### Grants

#### Grants: Load Data

Load all grants from `data/grants`

In [70]:
root_directory = 'data/grants'

df_list = []

selected_columns = [
    "Transaction Name", "Organization Name", "Organization Name URL", "Announced Date", "Money Raised (in USD)", "Investor Names"
]

for subdir, _, _ in os.walk(root_directory):
    csv_files = glob.glob(os.path.join(subdir, '*.csv'))
    for file in csv_files:
        try:
            df = pd.read_csv(file)
            df_list.append(df)
        except Exception as e:
            print(f"Error reading {file}: {e}")

final_df = pd.concat(df_list, ignore_index=True)

grants = final_df[selected_columns].copy()

#### Grants: Data Cleaning

Renaming Variables

In [71]:
grants = grants.rename(columns={
    "Transaction Name": "transaction_name",
    "Organization Name": "organization_name",
    "Organization Name URL": "organization_name_url",
    "Announced Date": "announced_date",
    "Money Raised (in USD)": "money_raised_usd",
    "Investor Names": "investor_names"
})

`announced_date` into date type

In [72]:
cols = ["announced_date"]
for col in cols:
    grants[col] = pd.to_datetime(grants[col], errors='coerce')

# Data Engineering

## Preparation

Create a help table `seed_help`<br>
<sub>`seed_help` lists all investor-investee pairs over all rounds<sub>

In [73]:
seed_help = rounds[["round_id", "organization_name_url", "investor_names"]].copy()
rounds_investors = seed_help.dropna(subset=["investor_names"])

seed_help["investor_names"] = seed_help["investor_names"].str.split(",")
seed_help = seed_help.explode("investor_names")
seed_help["investor_names"] = seed_help["investor_names"].str.strip()

seed_help = seed_help[seed_help["investor_names"] != ""]

seed_help = seed_help.merge(companies[["organization_name_url", "company_id"]], on="organization_name_url", how="left")

seed_help.reset_index(drop=True, inplace=True)

seed_help = seed_help[seed_help["company_id"].notna()].reset_index(drop=True)

Only keep investors in `seed_help`with an investor entry in `investors`

In [74]:
seed_help = seed_help.merge(
    investors[["organization_person_name", "investor_id"]],
    left_on="investor_names",
    right_on="organization_person_name",
    how="inner"
)

seed_help = seed_help.drop(columns=["organization_person_name"])

## Company

## Founding Team

### Female Founder<br>
Check if there is at least one female founder involved in the new venture. A founder is labeled as female based on the first name and the help of a dedicated library and the genderize-api.

Find all unique names to make the library/api requests more efficient

In [75]:
founders = companies[["company_id", "founders"]].copy()
founders = founders.dropna(subset=["founders"])
founders["founders"] = founders["founders"].astype(str)

founders = founders.assign(founder=founders["founders"].str.split(",")).explode("founder")
founders["founder"] = founders["founder"].str.strip()
founders = founders[founders["founder"] != ""]
founders = founders[["company_id", "founder"]].reset_index(drop=True)

prefixes = {"Dr.", "Dr", "Mr.", "Prof.", "PROF", "prof.", "St.", "2", "3", "Prof. Dr.", "Dr.-Ing.", "Dr. -Ing.", "Dr .", "dr."}

def extract_first_name(name):
    parts = name.strip().split()
    if not parts:
        return ""
    if parts[0] in prefixes:
        parts = parts[1:]
    if not parts:
        return ""
    return parts[0].split("-")[0]

founders["first_name"] = founders["founder"].apply(extract_first_name)

unique_first_names = founders["first_name"].dropna().unique()
unique_first_names_df = pd.DataFrame(unique_first_names, columns=["first_name"])

The gender-detector library provides a gender label ([gender-detector](https://pypi.org/project/gender-detector/)).<br>Mostly-female is classified as female; mostly-male is classified as male. 

In [76]:
d = gender.Detector()

unique_first_names_df["gender"] = unique_first_names_df["first_name"].apply(d.get_gender)
unique_first_names_df["gender"] = unique_first_names_df["gender"].replace({
    "mostly_female": "female",
    "mostly_male": "male"
})

Some names are not detected and they are labeled as unknown. We clean the output and create a blank cell.

In [77]:
unknown_gender_df = unique_first_names_df[unique_first_names_df["gender"] == "unknown"].reset_index(drop=True)
unknown_gender_df["gender"] = ""

We call the [genderize](https://genderize.io) to analyze the remaining names. We cache the results.

In [78]:
API_KEY = genderize
CACHE_CSV = 'data/gender/gender_cache.csv'
API_URL = 'https://api.genderize.io'

if os.path.exists(CACHE_CSV):
    gender_cache = pd.read_csv(CACHE_CSV)
else:
    gender_cache = pd.DataFrame(columns=['first_name', 'gender'])

unknown_gender_df['first_name'] = unknown_gender_df['first_name'].str.lower()
gender_cache['first_name'] = gender_cache['first_name'].str.lower()

gender_cache = gender_cache.drop_duplicates(subset='first_name')

results = []

for name in unknown_gender_df['first_name']:
    cached_row = gender_cache[gender_cache['first_name'] == name]

    if not cached_row.empty:
        
        gender = cached_row['gender'].values[0]
    else:
        
        response = requests.get(API_URL, params={'name': name, 'apikey': API_KEY})
        if response.status_code == 200:
            data = response.json()
            gender = data.get('gender')
            
            gender_cache = pd.concat([gender_cache, pd.DataFrame([{
                'first_name': name,
                'gender': gender
            }])], ignore_index=True)
        else:
            gender = None  

    results.append(gender)

unknown_gender_df['gender'] = results

gender_cache.to_csv(CACHE_CSV, index=False)

The results are merged 

In [79]:
merged = unique_first_names_df.copy()
merged['first_name_lower'] = merged['first_name'].str.lower()
gender_cache['first_name_lower'] = gender_cache['first_name'].str.lower()

merged = merged.merge(
    gender_cache[['first_name_lower', 'gender']],
    on='first_name_lower',
    how='left',
    suffixes=('', '_cache')
)

merged['gender'] = merged['gender_cache'].combine_first(merged['gender'])

unique_first_names_df['gender'] = merged['gender']

In [80]:
founders = founders.merge(
    unique_first_names_df[['first_name', 'gender']],
    on='first_name',
    how='left'
)

`female_founder` is positive, if the company has at least one female team member in the founding team.

In [81]:
companies['female_founder'] = companies['company_id'].isin(
    founders[founders['gender'] == 'female']['company_id']
)

## Financing

### Number Seed Rounds <br>
<sub>Get `number_seed_rounds` for each company by counting all unique rounds in `rounds`.</sub>

In [82]:
seed_round_counts = rounds["organization_name_url"].value_counts()

companies["number_seed_rounds"] = companies["organization_name_url"].map(seed_round_counts).fillna(0).astype(int)

### Number Seed Investors <br>
<sub>Get the number_of_investors by counting all unique imvestors for a company in `seed_help`.<sub>

In [83]:
unique_investors_per_company = seed_help.groupby("company_id")["investor_id"].nunique()

companies["number_seed_investors"] = companies["company_id"].map(unique_investors_per_company).fillna(0).astype(int)

### Help-Variables: Seed Time Stamps <br>
<sub>Get `earliest_seed_round_date` and `latest_seed_round_date` as the gap between the first and last seed round.<sub>

In [84]:
first_round_date = rounds.groupby("organization_name_url")["announced_date"].min()
last_round_date = rounds.groupby("organization_name_url")["announced_date"].max()

companies["earliest_seed_round_date"] = companies["organization_name_url"].map(first_round_date)
companies["latest_seed_round_date"] = companies["organization_name_url"].map(last_round_date)

### Year First Seed Round<br>
<sub>Get `first_seed_round_year` as the year of the earliest seed round of the new venture.<sub>

In [85]:
companies["first_seed_round_year"] = companies["earliest_seed_round_date"].dt.year

### Help-Variable: Seed Funding<br>
<sub>Get `total_seed_funding_m` in millions USD by sum `money_raised_usd` over all seed investment rounds of the new venture.<sub>

In [86]:
company_seed_funding_m = (
    rounds.groupby("organization_name_url")["money_raised_usd"].sum() / 1_000_000
).reset_index()

company_seed_funding_m = company_seed_funding_m.rename(
    columns={"money_raised_usd": "total_seed_funding_m"}
)

companies = companies.merge(
    company_seed_funding_m, on="organization_name_url", how="left"
)

companies["total_seed_funding_m"] = companies["total_seed_funding_m"].fillna(0)

### Low Seed Funding<br>
<sub>`low_seed_funding` is positive if `total_seed_funding_m` is less than 0.25 m USD<sub>

In [87]:
companies["low_seed_funding"] = companies["total_seed_funding_m"] < 0.25 

### High Seed Funding<br>
<sub>`high_seed_funding` is positive if `total_seed_funding_m` is more than 4.75 m USD<sub>

In [88]:
companies["high_seed_funding"] = companies["total_seed_funding_m"] > 4.75

### Follow-on Seed Investor<br>
<sub>`follow_on_investor` is positive if one investor participates in more than 1 round.<sub>

In [89]:
seed_help['follow_on_investor'] = (
    seed_help.groupby(['company_id', 'investor_id'])['round_id'].transform('count') > 1
)

follow_on_any = seed_help.groupby("company_id")["follow_on_investor"].any()
companies["follow_on_investor"] = companies["company_id"].map(follow_on_any).infer_objects(copy=False).fillna(False)


  companies["follow_on_investor"] = companies["company_id"].map(follow_on_any).infer_objects(copy=False).fillna(False)


### Age<br>
<sub>`Age` of the company at the point of the first investment round as `earliest_seed_round_date` minus `founded_date` in months.<sub>

In [90]:
companies["age"] = (
    (companies["earliest_seed_round_date"] - companies["founded_date"]).dt.days // 30
)

### Pre Seed Round<br>
<sub>`pre_seed` is positive if the company raised a round with the `funding_type` = "Pre-Seed".<sub>

In [91]:
pre_seed_companies = set(rounds.loc[rounds['funding_type'] == 'Pre-Seed', 'organization_name_url'])
companies['pre_seed'] = companies['organization_name_url'].isin(pre_seed_companies)

### Duration of Seed Stage<br>
<sub>Months between first and last seed investment<sub>

In [92]:
companies["duration_seed_stage"] = (
    (companies["latest_seed_round_date"] - companies["earliest_seed_round_date"]).dt.days / 30.44
).round(1)

## Grants

### Grant before Seed<br>
<sub>Received a grants before seed funding<sub>

In [93]:
earliest_seed_round_map = companies.set_index('company_id')['earliest_seed_round_date']

def has_grant_before_seed(row):
    company_id = companies.loc[companies['organization_name_url'] == row['organization_name_url'], 'company_id']
    if company_id.empty or pd.isna(row['announced_date']):
        return False
    earliest_date = earliest_seed_round_map.get(company_id.values[0], pd.NaT)
    return pd.notna(earliest_date) and row['announced_date'] < earliest_date

grants['grant_before_seed'] = grants.apply(has_grant_before_seed, axis=1)

grant_before_seed_map = grants[grants['grant_before_seed']].groupby('organization_name_url').size() > 0
companies['grant_before_seed'] = companies['organization_name_url'].map(grant_before_seed_map).fillna(False).astype(bool)

  companies['grant_before_seed'] = companies['organization_name_url'].map(grant_before_seed_map).fillna(False).astype(bool)


## Investor

### No Syndication<br>
<sub>Only one investor in the seed stage

In [94]:
companies["one_seed_investor_binary"] = (companies["number_seed_investors"] == 1)

### Lead Investor in Seed<br>
<sub>Existence of an lead investor entry in at least one round in the seed stage<sub>

In [95]:
lead_investor_any = (
    rounds.groupby("organization_name_url")["lead_investors"]
    .apply(lambda x: x.notna() & (x.str.strip() != ""))
    .groupby("organization_name_url")
    .any()
)

companies["lead_investor_in_seed"] = companies["organization_name_url"].map(lead_investor_any).fillna(False)

  companies["lead_investor_in_seed"] = companies["organization_name_url"].map(lead_investor_any).fillna(False)


### All Homecountry Investors<br>
<sub>All investors from the same homecountry<sub>

In [96]:
def all_investors_from_homecountry(company_id):
    company_country = companies.loc[companies["company_id"] == company_id, "hq_country"].values[0]
    investor_countries = seed_help.loc[seed_help["company_id"] == company_id, "investor_id"].map(
        investors.set_index("investor_id")["hq_country"]
    )
    return investor_countries.notna().all() and (investor_countries == company_country).all()

companies["all_homecountry_investors_in_seed_binary"] = companies["company_id"].apply(all_investors_from_homecountry)

### US investor<br>
<sub>Participation of at least one investor from the united states<sub>

In [97]:
def has_us_investor(company_id):
    investor_ids = seed_help.loc[seed_help["company_id"] == company_id, "investor_id"]
    if investor_ids.empty:
        return False
    investor_countries = investors.set_index("investor_id").loc[investor_ids, "hq_country"]
    return (investor_countries == "United States").any()

companies["us_investor_binary"] = companies["company_id"].apply(has_us_investor)

### Regional Investor<br>
<sub>At least one investor is from the same region as the new venture (less than 100km distance)<sub>

In [98]:
def has_regional_seed_investor(company_id):
    company = companies.loc[companies["company_id"] == company_id]
    if company.empty or pd.isna(company.iloc[0]["latitude"]) or pd.isna(company.iloc[0]["longitude"]):
        return False
    lat1 = company.iloc[0]["latitude"]
    lon1 = company.iloc[0]["longitude"]
    investor_ids = seed_help.loc[seed_help["company_id"] == company_id, "investor_id"]
    if investor_ids.empty:
        return False
    investor_locs = investors.set_index("investor_id").loc[investor_ids, ["latitude", "longitude"]]
    investor_locs = investor_locs.dropna()
    if investor_locs.empty:
        return False
    for _, row in investor_locs.iterrows():
        lat2, lon2 = row["latitude"], row["longitude"]
        dist = haversine(lat1, lon1, lat2, lon2)
        if dist <= 100:
            return True
    return False

companies["regional_seed_investor_binary"] = companies["company_id"].apply(has_regional_seed_investor)

### Institutional Regional Investor<br>
<sub>There is at least one institutional investor (defined by no person = angel) in the seed stage<sub>

In [99]:
def is_institutional_investor(investor_id):
    url = investors.loc[investors["investor_id"] == investor_id, "organization_person_name_url"].values[0]
    return "organization" in url

def has_regional_institutional_investor(company_id):
    company = companies.loc[companies["company_id"] == company_id]
    if company.empty or pd.isna(company.iloc[0]["latitude"]) or pd.isna(company.iloc[0]["longitude"]):
        return False
    lat1 = company.iloc[0]["latitude"]
    lon1 = company.iloc[0]["longitude"]
    investor_ids = seed_help.loc[seed_help["company_id"] == company_id, "investor_id"]
    if investor_ids.empty:
        return False
    investor_locs = investors.set_index("investor_id").loc[investor_ids, ["latitude", "longitude", "organization_person_name_url"]]
    investor_locs = investor_locs.dropna()
    if investor_locs.empty:
        return False
    for _, row in investor_locs.iterrows():
        if "organization" in row["organization_person_name_url"]:
            lat2, lon2 = row["latitude"], row["longitude"]
            dist = haversine(lat1, lon1, lat2, lon2)
            if dist <= 100:
                return True
    return False

companies["regional_seed_investor_institutional_binary"] = companies["company_id"].apply(has_regional_institutional_investor)

### Same City Region Investor<br>
<sub>Investor is in the same city region as the new venture (less than 30km)<sub>

In [100]:
def has_regional_seed_investor(company_id):
    company = companies.loc[companies["company_id"] == company_id]
    if company.empty or pd.isna(company.iloc[0]["latitude"]) or pd.isna(company.iloc[0]["longitude"]):
        return False
    lat1 = company.iloc[0]["latitude"]
    lon1 = company.iloc[0]["longitude"]
    investor_ids = seed_help.loc[seed_help["company_id"] == company_id, "investor_id"]
    if investor_ids.empty:
        return False
    investor_locs = investors.set_index("investor_id").loc[investor_ids, ["latitude", "longitude"]]
    investor_locs = investor_locs.dropna()
    if investor_locs.empty:
        return False
    for _, row in investor_locs.iterrows():
        lat2, lon2 = row["latitude"], row["longitude"]
        dist = haversine(lat1, lon1, lat2, lon2)
        if dist <= 30:
            return True
    return False

companies["city_seed_investor_binary"] = companies["company_id"].apply(has_regional_seed_investor)

### Investor Types<br>
<sub> Checking if at least one Accelerator, Incubator, Micro VC, Corporate Venture Capital, Angel Group, Angel, University Program, Entrepreneurship Program, Family Investment Office is involved in the seed stage<sub>

In [101]:
keywords = [
    "accelerator",
    "incubator",
    "micro vc",
    "corporate venture capital",
    "angel group",
    "angel",
    "university program",
    "entrepreneurship program",
    "family investment office"
]

for kw in keywords:
    col = kw.replace(" ", "_")
    investor_ids_with_type = set(investors.loc[investors[col], "investor_id"])
    companies[col + "_seed_investor_binary"] = companies["company_id"].isin(
        seed_help.loc[seed_help["investor_id"].isin(investor_ids_with_type), "company_id"]
    )

## Ecosystem

### Top University<br>
<sub>Check if there is a university from the top200 university ranking inside a range of 20km<sub>

In [102]:
uni = pd.read_csv('data/university/universities_coordinates_backup.csv')

def is_near_top_university_haversine(row):
    if pd.isna(row['latitude']) or pd.isna(row['longitude']):
        return False
    company_lat, company_lng = row['latitude'], row['longitude']
    for _, uni_row in uni.iterrows():
        if pd.isna(uni_row['Latitude']) or pd.isna(uni_row['Longitude']):
            continue
        uni_lat, uni_lng = uni_row['Latitude'], uni_row['Longitude']
        if haversine(company_lat, company_lng, uni_lat, uni_lng) <= 20:
            return True
    return False

companies['top_university'] = companies.apply(is_near_top_university_haversine, axis=1)

## Economy

### GDP rates
<sub>Collected data from the worldbank per country and founding-year of the company<sub>

In [103]:
gdp = pd.read_csv('data/gdp_rates/gdp_rates.csv')

def get_gdp(row):
    country = row['hq_country']
    year = row['founded_year']
    if pd.isna(country) or pd.isna(year):
        return None
    try:
        return gdp.loc[gdp['Country Name'] == country, str(int(year))].values[0]
    except Exception:
        return None

companies['gdp'] = companies.apply(get_gdp, axis=1)

## Success definition

### Overall Success <br>
<sub>Exit Flag or Late Stage Funding<sub>

In [104]:
success_types = ['Series A', 'Series B', 'Private Equity',
 'Series C', 'Corporate Round', 'Series D', 'Series E', 'Series F', 'Series G', 'Series I']


companies["success"] = (
    companies["last_equity_funding_type"].isin(success_types) | companies["exit_binary"]
)

### Post Seed Round Success <br>
<sub>New venture raised an investment round after the seed stage<sub>

In [105]:
success_types = ['Series A', 'Series B', 'Private Equity',
 'Series C', 'Corporate Round', 'Series D', 'Series E', 'Series F', 'Series G', 'Series I']


companies["post_seed_success"] = (
    companies["last_equity_funding_type"].isin(success_types)
)

# Scope 

Help-Dataframe: df<br>
<sub>Creating a copy of companies and store it in df to make adjustments to the scope without running the code from the beginning<sub>

In [118]:
df = companies.copy()

Remove all companies with 0 investment rounds

In [119]:
df = df[df["number_seed_rounds"] > 0].reset_index(drop=True)

Remove blockchain companies

In [120]:
df = df[~df["industry_groups"].str.split(",").apply(lambda x: "Blockchain and Cryptocurrency" in x)].reset_index(drop=True)

Only companies founded between 2007 and 2018

In [121]:
df = df[(df["founded_year"] >= 2007) & (df["founded_year"] <= 2020)].reset_index(drop=True)

At least one investor with coordinates

In [122]:
valid_investors = seed_help.merge(
    investors[['investor_id', 'latitude', 'longitude']],
    on='investor_id',
    how='left'
)

valid_investors = valid_investors.dropna(subset=['latitude', 'longitude'])

companies_with_valid_investors = valid_investors['company_id'].unique()

df = df[df['company_id'].isin(companies_with_valid_investors)].reset_index(drop=True)

At least one funding round

In [123]:
df = df[df["number_seed_rounds"] > 0].reset_index(drop=True)

First seed year between 2007 and 2018

In [124]:
df = df[
    df["first_seed_round_year"].between(2007, 2020, inclusive="both")
].reset_index(drop=True)

Optional: Last Seed at the end of 2020

In [None]:
'''
df = df[
    (df["latest_seed_round_date"].dt.year <= 2020)
].reset_index(drop=True)
'''

At least one founder

In [125]:
df = df[df["founders"].notna() & (df["founders"].str.strip() != "")].reset_index(drop=True)

# Output

In [126]:
selected_features = [
    "company_id", "organization_name", "hq_country", "latitude", "longitude", "founded_year", "b2b", "hub_binary",
    "number_of_founders", "female_founder", "energy", "information_technology", "materials", "industrials",
    "consumer_discretionary", "consumer_staples", "health_care", "financials", "communication_services",
    "utilities", "real_estate", "other", "grant_before_seed", "first_seed_round_year", "number_seed_rounds",
    "number_seed_investors", "one_seed_investor_binary", "pre_seed", "follow_on_investor", "low_seed_funding", 
    "high_seed_funding", "duration_seed_stage", "all_homecountry_investors_in_seed_binary",
    "regional_seed_investor_binary", "regional_seed_investor_institutional_binary", "city_seed_investor_binary",
    "lead_investor_in_seed", "accelerator_seed_investor_binary", "incubator_seed_investor_binary", "micro_vc_seed_investor_binary",
    "corporate_venture_capital_seed_investor_binary", "angel_group_seed_investor_binary", "angel_seed_investor_binary",
    "university_program_seed_investor_binary", "entrepreneurship_program_seed_investor_binary",
    "family_investment_office_seed_investor_binary", "us_investor_binary", "gdp", "top_university",
    "exit_binary", "success"
]

df = df[selected_features]

In [127]:
output_path = 'data/datasets/companies.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True) 
df.to_csv(output_path, index=False)

# Explanatory Analysis

# Correlation Analysis

# Regression Analysis

## Regression 1

## Regression 2

# Robustness Check