# Data Cleaning

## A: Setup

In [292]:
import os
import pandas as pd
import glob
import time
from opencage.geocoder import OpenCageGeocode
from dotenv import load_dotenv
from math import radians, sin, cos, sqrt, atan2
import requests
import math
from collections import defaultdict
import numpy as np
from sklearn.neighbors import BallTree
from itertools import combinations
import pickle
import kagglehub

load_dotenv()
api_key = os.getenv("geocage")

os.chdir('/Users/janlinzner/Projects/Master-Thesis-Spatial-Proximity-Venture-Capital')

## B: Companies

### Append all CSV files

In [293]:
root_directory = 'data/companies'

df_list = []

selected_columns = [
    "Organization Name", "Organization Name URL", "Description", "Industry Groups", "Headquarters Location",
    "Founded Date", "Founded Date Precision", "Exit Date", "Exit Date Precision", "Number of Founders", "Founders",
    "Last Equity Funding Amount (in USD)", "Last Equity Funding Type", "Total Equity Funding Amount (in USD)"
]

for subdir, _, _ in os.walk(root_directory):
    csv_files = glob.glob(os.path.join(subdir, '*.csv'))
    for file in csv_files:
        try:
            df = pd.read_csv(file)
            df_list.append(df)
        except Exception as e:
            print(f"Error reading {file}: {e}")

final_df = pd.concat(df_list, ignore_index=True)

companies = final_df[selected_columns].copy()

### Convert variables to date variables

In [294]:
date_cols = ['Founded Date', 'Exit Date']
for col in date_cols:
    companies[col] = pd.to_datetime(companies[col], errors='coerce')

### Add Founding Year

In [295]:
founded_year = companies['Founded Date'].dt.year

pos = companies.columns.get_loc('Founded Date') + 1
companies.insert(loc=pos, column='Founded Year', value=founded_year)

### Convert variables to categorical variables

In [296]:
companies['Founded Date Precision'] = pd.Categorical(companies['Founded Date Precision'],
                                    categories=['month','year','day'],
                                    ordered=False)

companies['Exit Date Precision'] = pd.Categorical(companies['Exit Date Precision'],
                                    categories=['day','month'],
                                    ordered=False)

### Provide a company id with leading zeros

In [297]:
companies.reset_index(drop=True, inplace=True)
companies['Company ID'] = companies.index + 1
companies['Company ID'] = companies['Company ID'].apply(lambda x: f"{x:06d}")
companies['Company ID'] = companies['Company ID'].astype('string')

cols = ['Company ID'] + [c for c in companies.columns if c != 'Company ID']
companies = companies[cols]

### Convert company events into binaries

In [298]:
companies = companies.copy()

companies.loc[:, 'Exit Binary']     = companies['Exit Date'].notna().astype(int)
companies['Exit Binary'] = companies['Exit Binary'].astype(bool)

order = [
    'Company ID', 'Organization Name', 'Organization Name URL', 'Description', 'Industry Groups', 'Headquarters Location',
    'Founded Date', 'Founded Date Precision', 'Founded Year', 'Exit Date', 'Exit Date Precision', 'Exit Binary',
    'Number of Founders', 'Founders', 'Last Equity Funding Amount (in USD)', 'Last Equity Funding Type', 'Total Equity Funding Amount (in USD)'
]
companies = companies[order]

### Adding geocoding

In [299]:
cache_file = 'data/locations/location_cache.csv'

if not os.path.exists(cache_file):
    pd.DataFrame({
        'hq_location': pd.Series(dtype='object'),
        'latitude':     pd.Series(dtype='float64'),
        'longitude':    pd.Series(dtype='float64'),
    }).to_csv(cache_file, index=False)
    print(f"Created new file: {cache_file}")
else:
    print(f"File already exists: {cache_file}")

cached_coords = pd.read_csv(cache_file)

unique_locations = companies['Headquarters Location'].dropna().unique()
total = len(unique_locations)
base_url = "https://api.opencagedata.com/geocode/v1/json"
api_key   = os.getenv("geocage")

coords_list = []
for idx, location in enumerate(unique_locations, start=1):

    cached = cached_coords[cached_coords['hq_location'] == location]
    if not cached.empty:
        row = cached.iloc[0].to_dict()
        coords_list.append(row)
        print(f"[{idx}/{total}] (cached)  {location} → {row['latitude']}, {row['longitude']}")
        continue

    if not location.strip():
        row = {'hq_location': location, 'latitude': None, 'longitude': None}
        coords_list.append(row)
        print(f"[{idx}/{total}] (blank)   {location!r} → None")
    else:

        params = {"key": api_key, "q": location, "limit": 1, "no_annotations": 1}
        try:
            resp = requests.get(base_url, params=params)
            data = resp.json()
            if data['status']['code'] == 200 and data['results']:
                lat = data['results'][0]['geometry']['lat']
                lng = data['results'][0]['geometry']['lng']
            else:
                lat = lng = None
        except Exception:
            lat = lng = None

        row = {'hq_location': location, 'latitude': lat, 'longitude': lng}
        coords_list.append(row)
        print(f"[{idx}/{total}] (fetched) {location} → {lat}, {lng}")

    new_row_df = pd.DataFrame([row])
    if cached_coords.empty:
        cached_coords = new_row_df
    else:
        cached_coords = pd.concat([cached_coords, new_row_df], ignore_index=True)
    cached_coords = cached_coords.drop_duplicates(subset='hq_location', keep='first')
    cached_coords.to_csv(cache_file, index=False)

    time.sleep(0.5)

df_coords = pd.DataFrame(coords_list)
companies = companies.merge(
    df_coords,
    left_on='Headquarters Location',
    right_on='hq_location',
    how='left'
).drop(columns=['hq_location'])

lat = companies.pop('latitude')
lon = companies.pop('longitude')

insert_at = companies.columns.get_loc('Headquarters Location') + 1

companies.insert(insert_at, 'latitude', lat)
companies.insert(insert_at + 1, 'longitude', lon)

companies = (
    companies
      .rename(columns={
          'latitude':  'Latitude',
          'longitude': 'Longitude'
      })
)

File already exists: data/locations/location_cache.csv
[1/2369] (cached)  Warsaw, Mazowieckie, Poland → 52.2319581, 21.0067249
[2/2369] (cached)  Kraków, Malopolskie, Poland → 50.0619474, 19.9368564
[3/2369] (cached)  Lublin, Lubelskie, Poland → 51.250559, 22.5701022
[4/2369] (cached)  Torun, Kujawsko-Pomorskie, Poland → 53.0102721, 18.6048094
[5/2369] (cached)  Poznan, Wielkopolskie, Poland → 52.4082663, 16.9335199
[6/2369] (cached)  Piaseczno, Mazowieckie, Poland → 52.0747377, 21.0270885
[7/2369] (cached)  Warszawa, Mazowieckie, Poland → 52.2319581, 21.0067249
[8/2369] (cached)  Wroclaw, Dolnoslaskie, Poland → 51.1089776, 17.0326689
[9/2369] (cached)  Zielona Góra, Lubuskie, Poland → 51.9383777, 15.5050408
[10/2369] (cached)  Cracow, Malopolskie, Poland → 50.0619474, 19.9368564
[11/2369] (cached)  Lodz, Lodzkie, Poland → 51.7687323, 19.4569911
[12/2369] (cached)  Gorzów Wielkopolski, Lubuskie, Poland → 52.7276526, 15.2286373
[13/2369] (cached)  Krakow, Malopolskie, Poland → 50.061947

### Startup Hub
https://www.startupblink.com <br>
Top 100

In [300]:
geocoder = OpenCageGeocode(api_key)

hotspot_names = [
    "London, United Kingdom", "Paris, France", "Berlin, Germany",
    "Stockholm, Sweden", "Munich, Germany", "Helsinki, Finland",
    "Madrid, Spain", "Dublin, Ireland", "Tallinn, Estonia",
    "Copenhagen, Denmark", "Milan, Italy", "Zurich, Switzerland",
    "Oslo, Norway", "Cambridge, United Kingdom", "Kyiv, Ukraine",
    "Vienna, Austria", "Brussels, Belgium", "Manchester, United Kingdom",
    "Lisbon, Portugal", "Prague, Czech Republic", "Warsaw, Poland",
    "Hamburg, Germany", "Oxford, United Kingdom", "Amsterdam, The Netherlands", "Barcelona, Spain", "Lausanne, Switzerland",
]

records = []
for name in hotspot_names:
    results = geocoder.geocode(name, limit=5)
    if results:
        best = max(results, key=lambda x: x.get("confidence", 0))
        geom = best["geometry"]
        records.append({
            "city": name.split(",", 1)[0],
            "lat":  geom["lat"],
            "lng":  geom["lng"]
        })
    time.sleep(0.2)  
hotspots_df = pd.DataFrame(records)

def haversine(lat1, lon1, lat2, lon2):
    R = 6371.0
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat/2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon/2)**2
    return R * 2 * atan2(sqrt(a), sqrt(1 - a))

def min_dist_to_hotspot(lat, lon):
    return hotspots_df.apply(
        lambda r: haversine(lat, lon, r["lat"], r["lng"]), axis=1
    ).min()

companies["Latitude"]  = pd.to_numeric(companies["Latitude"],  errors="coerce")
companies["Longitude"] = pd.to_numeric(companies["Longitude"], errors="coerce")

companies["Distance to Hub"] = companies.apply(
    lambda r: min_dist_to_hotspot(r["Latitude"], r["Longitude"]), axis=1
)
companies["Hub Binary"] = (companies["Distance to Hub"] < 20).astype(int)

In [301]:
geocoder = OpenCageGeocode(api_key)

hotspot_names = [
    "London, United Kingdom", "Paris, France", "Berlin, Germany",
    "Stockholm, Sweden", "Munich, Germany", "Helsinki, Finland",
    "Madrid, Spain", "Dublin, Ireland", "Tallinn, Estonia",
    "Copenhagen, Denmark", "Milan, Italy", "Zurich, Switzerland",
    "Oslo, Norway", "Cambridge, United Kingdom", "Kyiv, Ukraine",
    "Vienna, Austria", "Brussels, Belgium", "Manchester, United Kingdom",
    "Lisbon, Portugal", "Prague, Czech Republic", "Warsaw, Poland",
    "Hamburg, Germany", "Oxford, United Kingdom", "Amsterdam, The Netherlands", "Barcelona, Spain", "Lausanne, Switzerland",
]

records = []
for name in hotspot_names:
    city, country = [s.strip() for s in name.split(",", 1)]
    results = geocoder.geocode(name, limit=5)
    if results:
        best = max(results, key=lambda x: x.get("confidence", 0))
        geom = best["geometry"]
        records.append({
            "city":    city,
            "country": country,
            "lat":     geom["lat"],
            "lng":     geom["lng"]
        })
    time.sleep(0.2)  

hotspots_df = pd.DataFrame(records)

hotspots_df.to_csv("data/sets-for-r/startup_hubs.csv", index=False)

print("Wrote", len(hotspots_df), "hubs to startup_hubs.csv")

Wrote 26 hubs to startup_hubs.csv


### B2B or B2C

In [302]:
focus_path = 'data/business_orientation/companies_business_focus_save.csv'
existing = pd.read_csv(focus_path)

merged = companies.merge(
    existing[['Organization Name', 'Organization Name URL', 'B2B Binary']],
    on=['Organization Name', 'Organization Name URL'],
    how='left'
)

annot_df = merged[[
    'Organization Name',
    'Description',
    'Organization Name URL',
    'B2B Binary'
]].copy()

annot_df['B2B Binary'] = annot_df['B2B Binary'] \
    .map({1: '1', 0: '0'}) \
    .fillna('')

to_annotate = annot_df[annot_df['B2B Binary'] == '']

to_annotate.to_csv(
    'data/business_orientation/companies_business_focus.csv',
    index=False
)

In [303]:
company_focus = pd.read_csv('data/business_orientation/companies_business_focus_save.csv')

companies = companies.merge(
    company_focus[['Organization Name',
                    'Organization Name URL',
                    'B2B Binary']],
    on=['Organization Name', 'Organization Name URL'],
    how='left'
)

companies['B2B Binary'] = (
    companies['B2B Binary']
    .fillna(0)
    .astype(int)
    .astype('boolean')
)

### Print final companies dataframe

In [304]:
companies

Unnamed: 0,Company ID,Organization Name,Organization Name URL,Description,Industry Groups,Headquarters Location,Latitude,Longitude,Founded Date,Founded Date Precision,...,Exit Date Precision,Exit Binary,Number of Founders,Founders,Last Equity Funding Amount (in USD),Last Equity Funding Type,Total Equity Funding Amount (in USD),Distance to Hub,Hub Binary,B2B Binary
0,000001,RedStone,https://www.crunchbase.com/organization/redsto...,RedStone builds two products: A cross-chain da...,"Blockchain and Cryptocurrency, Financial Servi...","Warsaw, Mazowieckie, Poland",52.231958,21.006725,2021-03-01,month,...,,False,3.0,"Alex Suvorov, Jakub Wojciechowski, Marcin Kazm...",15000000.0,Series A,22875000.0,0.000000,1,False
1,000002,Jutro Medical,https://www.crunchbase.com/organization/jutro,Jutro Medical offers a combination of online a...,Health Care,"Warsaw, Mazowieckie, Poland",52.231958,21.006725,2019-01-01,year,...,,False,1.0,Adam Janczewski,13064211.0,Series A,22973232.0,0.000000,1,False
2,000003,Booksy,https://www.crunchbase.com/organization/booksy,Booksy is a booking platform that links local ...,"Apps, Commerce and Shopping, Financial Service...","Warsaw, Mazowieckie, Poland",52.231958,21.006725,2014-07-08,day,...,,False,3.0,"Konrad Howard, Stefan Batory, Tomasz Zembrzycki",,Venture - Series Unknown,118700000.0,0.000000,1,False
3,000004,Ramp Network,https://www.crunchbase.com/organization/ramp-3b7b,Ramp Network is a fintech startup that offers ...,"Blockchain and Cryptocurrency, Financial Servi...","Warsaw, Mazowieckie, Poland",52.231958,21.006725,2017-01-01,year,...,,False,2.0,"Przemek Kowalczyk, Szymon Sypniewicz",70000000.0,Series B,133942437.0,0.000000,1,False
4,000005,Quantee,https://www.crunchbase.com/organization/quantee,SaaS AI-based dynamic insurance pricing,"Artificial Intelligence (AI), Data and Analyti...","Warsaw, Mazowieckie, Poland",52.231958,21.006725,2018-11-01,month,...,day,True,2.0,"Dawid Kopczyk, Mateusz Gintrowski",,Seed,700000.0,0.000000,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21890,021891,ReEmbed,https://www.crunchbase.com/organization/reembed,ReEmbed is a custom video player that supports...,"Advertising, Content and Publishing, Media and...","Athens, Attiki, Greece",37.975565,23.734832,2013-01-01,year,...,,False,2.0,"Alex Papaspiridis, Vasilis Papaconstantinou",,Seed,107844.0,1283.894358,0,False
21891,021892,Athroa Innovations,https://www.crunchbase.com/organization/athroa...,"Venture Builder focusing on biomedical, clean ...",,"Athens, Attiki, Greece",37.975565,23.734832,2019-09-18,day,...,,False,1.0,Mihalis Boutaris,,Seed,,1283.894358,0,True
21892,021893,CYRUS,https://www.crunchbase.com/organization/cyrus,CYRUS offers a noise-free hydrogen compression...,,"Athens, Attiki, Greece",37.975565,23.734832,2019-04-01,month,...,,False,,,,Pre-Seed,,1283.894358,0,True
21893,021894,Captainwise,https://www.crunchbase.com/organization/captai...,CaptainWise is an experienced captain whose pu...,"Commerce and Shopping, Data and Analytics, Eve...","Athens, Attiki, Greece",37.975565,23.734832,2012-01-01,year,...,,False,3.0,"Andreas Karoutzos, Chris Chatziapostolakis, Di...",123814.0,Seed,123814.0,1283.894358,0,False


## C: Rounds

### Append all CSV files

In [305]:
root_directory = 'data/rounds'

selected_columns = [
    'Transaction Name', 'Transaction Name URL', 'Organization Name', 'Organization Name URL', 'Funding Type', 'Money Raised (in USD)', 'Announced Date', 'Lead Investors', 'Investor Names', 'Number of Investors',
]

df_list = []

for subdir, _, _ in os.walk(root_directory):
    csv_files = glob.glob(os.path.join(subdir, '*.csv'))
    for file in csv_files:
        try:
            df = pd.read_csv(file)
            df_list.append(df)
        except Exception as e:
            print(f"Error reading {file}: {e}")

final_df = pd.concat(df_list, ignore_index=True)

rounds = final_df[selected_columns].copy()

### Convert variables to date variables

In [306]:
date_cols = ['Announced Date']
for col in date_cols:
    rounds[col] = pd.to_datetime(rounds[col], errors='coerce')

### Add round id with leading zeros

In [307]:
rounds.reset_index(drop=True, inplace=True)
rounds['Round ID'] = rounds.index + 1
rounds['Round ID'] = rounds['Round ID'].apply(lambda x: f"{x:06d}")
rounds['Round ID'] = rounds['Round ID'].astype('string')

cols = ['Round ID'] + [c for c in rounds.columns if c != 'Round ID']
rounds = rounds[cols]

## D: Investors

### Append all CSV files

In [308]:
root_directory = 'data/investors'
df_list = []

selected_columns = [
    "Organization/Person Name", "Organization/Person Name URL", "Investor Type",
    "Number of Investments", "Number of Exits", "Location",
    "Description", "Number of Lead Investments",
    "Number of Portfolio Organizations", "Founded Date", "Industry Groups", "CB Rank (Investor)"
]

for subdir, _, _ in os.walk(root_directory):
    csv_files = glob.glob(os.path.join(subdir, '*.csv'))
    for file in csv_files:
        try:
            df = pd.read_csv(file)
            df_list.append(df)
        except Exception as e:
            print(f"Error reading {file}: {e}")

final_df = pd.concat(df_list, ignore_index=True)

investors = final_df[selected_columns].copy()

investors.drop_duplicates(
    subset=['Organization/Person Name', 'Organization/Person Name URL'],
    inplace=True
)

### Convert variables to date variables

In [309]:
date_cols = ['Founded Date']
for col in date_cols:
    investors[col] = pd.to_datetime(investors[col], errors='coerce')

### Add investor id with leading zeros

In [310]:
investors.reset_index(drop=True, inplace=True)
investors['Investor ID'] = investors.index + 1
investors['Investor ID'] = investors['Investor ID'].apply(lambda x: f"{x:06d}")
investors['Investor ID'] = investors['Investor ID'].astype('string')

cols = ['Investor ID'] + [c for c in investors.columns if c != 'Investor ID']
investors = investors[cols]

### Adding Geocoding

In [311]:
cache_file = 'data/locations/location_cache.csv'
if not os.path.exists(cache_file):
    pd.DataFrame({
        'hq_location': pd.Series(dtype='object'),
        'latitude':     pd.Series(dtype='float64'),
        'longitude':    pd.Series(dtype='float64'),
    }).to_csv(cache_file, index=False)
    print(f"Created new file: {cache_file}")
else:
    print(f"File already exists: {cache_file}")

cached_coords = pd.read_csv(cache_file)

unique_locations = investors['Location'].dropna().unique()
total = len(unique_locations)
base_url = "https://api.opencagedata.com/geocode/v1/json"
api_key   = os.getenv("geocage")

coords_list = []
for idx, location in enumerate(unique_locations, start=1):

    cached = cached_coords[cached_coords['hq_location'] == location]
    if not cached.empty:
        row = cached.iloc[0].to_dict()
        coords_list.append(row)
        print(f"[{idx}/{total}] (cached)  {location} → {row['latitude']}, {row['longitude']}")
        continue

    if not location.strip():
        row = {'hq_location': location, 'latitude': None, 'longitude': None}
        coords_list.append(row)
        print(f"[{idx}/{total}] (blank)   {location!r} → None")
    else:

        params = {"key": api_key, "q": location, "limit": 1, "no_annotations": 1}
        try:
            resp = requests.get(base_url, params=params)
            data = resp.json()
            if data['status']['code'] == 200 and data['results']:
                lat = data['results'][0]['geometry']['lat']
                lng = data['results'][0]['geometry']['lng']
            else:
                lat = lng = None
        except Exception:
            lat = lng = None

        row = {'hq_location': location, 'latitude': lat, 'longitude': lng}
        coords_list.append(row)
        print(f"[{idx}/{total}] (fetched) {location} → {lat}, {lng}")

    new_row_df = pd.DataFrame([row])
    cached_coords = pd.concat([cached_coords, new_row_df], ignore_index=True)
    cached_coords = cached_coords.drop_duplicates(subset='hq_location', keep='first')
    cached_coords.to_csv(cache_file, index=False)
    time.sleep(0.5)

df_coords = pd.DataFrame(coords_list)

investors = investors.merge(
    df_coords,
    left_on='Location',
    right_on='hq_location',
    how='left'
).drop(columns=['hq_location'])

lat = investors.pop('latitude')
lon = investors.pop('longitude')
insert_at = investors.columns.get_loc('Location') + 1
investors.insert(insert_at,     'Latitude',  lat)
investors.insert(insert_at + 1, 'Longitude', lon)

File already exists: data/locations/location_cache.csv
[1/1339] (cached)  Boulder, Colorado, United States → 40.0149856, -105.270545
[2/1339] (cached)  Princeton, New Jersey, United States → 40.3496953, -74.6597376
[3/1339] (cached)  Palo Alto, California, United States → 37.4443293, -122.1598465
[4/1339] (cached)  Sunnyvale, California, United States → 37.3688301, -122.036349
[5/1339] (cached)  Singapore, Central Region, Singapore → 1.28967, 103.85007
[6/1339] (cached)  Santa Clara, California, United States → 37.3541132, -121.955174
[7/1339] (cached)  Menlo Park, California, United States → 37.4519671, -122.177992
[8/1339] (cached)  Austin, Texas, United States → 30.2711286, -97.7436995
[9/1339] (cached)  Dubai, Dubai, United Arab Emirates → 25.0742823, 55.1885387
[10/1339] (cached)  London, England, United Kingdom → 51.4893335, -0.1440551
[11/1339] (cached)  Tallinn, Harjumaa, Estonia → 59.4372155, 24.7453688
[12/1339] (cached)  San Francisco, California, United States → 37.7792588,

In [312]:
investors.columns

Index(['Investor ID', 'Organization/Person Name',
       'Organization/Person Name URL', 'Investor Type',
       'Number of Investments', 'Number of Exits', 'Location', 'Latitude',
       'Longitude', 'Description', 'Number of Lead Investments',
       'Number of Portfolio Organizations', 'Founded Date', 'Industry Groups',
       'CB Rank (Investor)'],
      dtype='object')

### Add zeros for NaN in Lead Investments and Exits

In [313]:
investors['Number of Lead Investments'] = investors['Number of Lead Investments'].fillna(0)
investors['Number of Exits'] = investors['Number of Exits'].fillna(0)

### Extract Descriptions to classify if it is a sector specific investor

In [314]:
focus_path = 'data/industry-focus/industry_focus_save.csv'
existing = pd.read_csv(focus_path)

merged = investors.merge(
    existing[['Organization/Person Name', 'Organization/Person Name URL', 'Specific VC Binary']],
    on=['Organization/Person Name', 'Organization/Person Name URL'],
    how='left'
)

annot_df = merged[[
    'Organization/Person Name',
    'Description',
    'Organization/Person Name URL',
    'Specific VC Binary'
]].copy()

annot_df['Specific VC Binary'] = annot_df['Specific VC Binary'] \
    .map({1: '1', 0: '0'}) \
    .fillna('')

to_annotate = annot_df[annot_df['Specific VC Binary'] == '']

to_annotate.to_csv(
    'data/industry-focus/industry_focus.csv',
    index=False
)

In [315]:
industry_focus = pd.read_csv('data/industry-focus/industry_focus_save.csv')

investors = investors.merge(
    industry_focus[['Organization/Person Name',
                    'Organization/Person Name URL',
                    'Specific VC Binary']],
    on=['Organization/Person Name', 'Organization/Person Name URL'],
    how='left'
)

investors['Specific VC Binary'] = (
    investors['Specific VC Binary']
    .fillna(0)
    .astype(int)
    .astype('boolean')
)

### Extract Investor Type
Search the 'Investor Type' column and if there is one of the Types, put a True in the assigned Boolean.

In [316]:
def parse_types(x):
    if pd.isna(x):
        return []
    if isinstance(x, list):
        return [t.strip().lower() for t in x]

    return [t.strip().lower() for t in x.split(",")]


keywords = {
    "accelerator":              "Accelerator",
    "micro vc":                 "Micro VC",
    "corporate venture capital":"Corporate Venture Capital",
    "angel group":              "Angel Group"
}

investors["type_list"] = investors["Investor Type"].apply(parse_types)

for kw, col in keywords.items():
    investors[col] = investors["type_list"].apply(lambda lst: kw in lst)

investors.drop(columns="type_list", inplace=True)

### Target VC

In [317]:
investors['CB Rank (Investor)'] = (
    investors['CB Rank (Investor)']
    .astype(str)
    .str.replace(',', '')                
    .str.extract(r'(\d+\.?\d*)')[0]      
    .astype(float)                      
)

threshold = investors['CB Rank (Investor)'].quantile(0.05)

investors['Target VC'] = (
    investors['CB Rank (Investor)'] <= threshold
)

### Print final investors dataframe

In [318]:
investors

Unnamed: 0,Investor ID,Organization/Person Name,Organization/Person Name URL,Investor Type,Number of Investments,Number of Exits,Location,Latitude,Longitude,Description,...,Number of Portfolio Organizations,Founded Date,Industry Groups,CB Rank (Investor),Specific VC Binary,Accelerator,Micro VC,Corporate Venture Capital,Angel Group,Target VC
0,000001,Techstars,https://www.crunchbase.com/organization/techstars,"Accelerator, Venture Capital",6189,527.0,"Boulder, Colorado, United States",40.014986,-105.270545,Techstars is an accelerator that provides pre-...,...,4715,2006-08-01,"Financial Services, Lending and Investments, P...",238.0,False,True,False,False,False,True
1,000002,SOSV,https://www.crunchbase.com/organization/sosv,"Accelerator, Venture Capital",2881,80.0,"Princeton, New Jersey, United States",40.349695,-74.659738,SOSV is a deep tech venture firm that invests ...,...,1445,1995-01-01,"Financial Services, Lending and Investments",951.0,True,True,False,False,False,True
2,000003,Accel,https://www.crunchbase.com/organization/accel,Venture Capital,2184,394.0,"Palo Alto, California, United States",37.444329,-122.159847,Accel is an seed and growth-stage venture capi...,...,1064,1983-01-01,"Financial Services, Lending and Investments",18.0,False,False,False,False,False,True
3,000004,Plug and Play,https://www.crunchbase.com/organization/plug-a...,"Accelerator, Co-Working Space, University Prog...",2068,194.0,"Sunnyvale, California, United States",37.368830,-122.036349,Plug and Play is an innovation platform bringi...,...,1682,2006-01-01,"Commerce and Shopping, Financial Services, Len...",23.0,False,True,False,False,False,True
4,000005,Antler,https://www.crunchbase.com/organization/antler...,Venture Capital,1778,12.0,"Singapore, Central Region, Singapore",1.289670,103.850070,Antler is a venture capital firm that focuses ...,...,1524,2018-01-01,"Financial Services, Lending and Investments",33.0,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7638,007639,Bonsal Capital,https://www.crunchbase.com/organization/bonsal...,Venture Capital,41,18.0,"Baltimore, Maryland, United States",39.290882,-76.610759,Bonsal Capital is a mission-driven partnership...,...,31,1999-05-01,"Education, Financial Services, Information Tec...",115270.0,True,False,False,False,False,False
7639,007640,A*,https://www.crunchbase.com/organization/a-capi...,Venture Capital,40,2.0,"San Francisco, California, United States",37.779259,-122.419329,A* is a venture capital firm that invests in t...,...,33,2020-01-01,"Financial Services, Lending and Investments",1152.0,False,False,False,False,False,True
7640,007641,Symbolic Capital,https://www.crunchbase.com/organization/symbol...,Venture Capital,40,0.0,"New York, New York, United States",40.712728,-74.006015,Symbolic Capital is a thesis-driven investment...,...,39,2022-01-01,,2015.0,True,False,False,False,False,False
7641,007642,f7 Ventures,https://www.crunchbase.com/organization/f7-ven...,Venture Capital,40,4.0,"San Francisco, California, United States",37.779259,-122.419329,f7 Ventures is a Venture Capital.,...,36,2018-01-01,"Financial Services, Lending and Investments",2279.0,False,False,False,False,False,False


## E: Seed Setup

### Seed - Help Table

Create Industry Group Binaries

In [319]:
companies['Industry Groups'] = companies['Industry Groups'].fillna('').str.replace(r'\s*,\s*', ',', regex=True)
dummies = companies['Industry Groups'].str.get_dummies(sep=',').astype(bool)
companies = pd.concat([companies, dummies], axis=1)

Delete companies from the Blockchain environment due to different characteristics

In [320]:
companies = companies[~companies['Blockchain and Cryptocurrency']]

In [321]:
round_type_col = 'Funding Type'
seed_rounds = rounds[rounds[round_type_col].isin(['Seed', 'Pre-Seed', 'Angel'])].copy()

company_lookup = {
    (str(n).strip().lower(), str(u).strip().lower()): cid
    for n, u, cid in zip(
        companies['Organization Name'],
        companies['Organization Name URL'],
        companies['Company ID']
    )
}

investor_lookup = {
    str(n).strip().lower(): (iid, str(url).strip())
    for n, iid, url in zip(
        investors['Organization/Person Name'],
        investors['Investor ID'],
        investors['Organization/Person Name URL']
    )
}

seed_rounds['comp_key'] = seed_rounds.apply(
    lambda r: (str(r['Organization Name']).strip().lower(),
               str(r['Organization Name URL']).strip().lower()),
    axis=1
)
seed_rounds = seed_rounds[seed_rounds['comp_key'].isin(company_lookup)].copy()
seed_rounds.drop(columns=['comp_key'], inplace=True)

records = []
for _, row in seed_rounds.iterrows():
    rid = row['Round ID']
    org_name = row['Organization Name']
    org_url = row['Organization Name URL']
    comp_key = (org_name.strip().lower(), org_url.strip().lower())
    company_id = company_lookup[comp_key]
    lead_raw = row.get('Lead Investors', '')
    lead_list = [inv.strip() for inv in str(lead_raw).split(',') if pd.notna(lead_raw) and inv.strip().lower()!='nan']
    inv_raw = row.get('Investor Names', '')
    inv_list = [inv.strip() for inv in str(inv_raw).split(',') if pd.notna(inv_raw) and inv.strip().lower()!='nan']
    for inv_name in set(lead_list + inv_list):
        lookup = investor_lookup.get(inv_name.lower())
        if not lookup:
            continue
        inv_id, inv_url = lookup
        records.append({
            'Round ID':               rid,
            'Company ID':             company_id,
            'Organization Name':      org_name,
            'Organization Name URL':  org_url,
            'Investor ID':            inv_id,
            'Investor Name':          inv_name,
            'Investor URL':           inv_url,
            'Lead':                   inv_name in lead_list
        })

seed_help = pd.DataFrame(records)

investor_extra = [
    'Number of Investments',
    'Number of Portfolio Organizations',
    'Number of Lead Investments',
    'Number of Exits',
    'Longitude',
    'Latitude',
    'Founded Date',
    'Specific VC Binary',
    'Accelerator',
    'Micro VC',
    'Angel Group',
    'Corporate Venture Capital',
    'Target VC'
]
seed_help = seed_help.merge(
    investors[['Investor ID'] + investor_extra],
    on='Investor ID',
    how='left'
)

company_extra = ['Longitude', 'Latitude', 'Founded Year']
seed_help = seed_help.merge(
    companies[['Company ID'] + company_extra],
    on='Company ID',
    how='left',
    suffixes=('', '_comp')
)

rename_map = {
    'Longitude':           'Investor Longitude',
    'Latitude':            'Investor Latitude',
    'Longitude_comp':      'Company Longitude',
    'Latitude_comp':       'Company Latitude',
    'Founded Date':       'Investor Founded Date',
    'Founded Year':      'Organization Founded Year',
}
seed_help.rename(columns=rename_map, inplace=True)

cols_order = [
    'Round ID',
    'Company ID',
    'Organization Name',
    'Organization Name URL',
    'Organization Founded Year',
    'Investor ID',
    'Investor Name',
    'Investor URL',
    'Investor Founded Date',
    'Investor Longitude',
    'Investor Latitude',
    'Company Longitude',
    'Company Latitude',
    'Number of Investments',
    'Number of Portfolio Organizations',
    'Number of Lead Investments',
    'Number of Exits',
    'Lead',
    'Specific VC Binary',
    'Accelerator',
    'Angel Group',
    'Micro VC',
    'Corporate Venture Capital',
    'Target VC'
]
seed_help = seed_help[cols_order]

def haversine(lat1, lon1, lat2, lon2):
    R = 6371
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(dlat/2)**2 + math.cos(lat1)*math.cos(lat2)*math.sin(dlon/2)**2
    c = 2*math.atan2(math.sqrt(a), math.sqrt(1-a))
    return R * c

seed_help['Distance km'] = seed_help.apply(
    lambda r: haversine(
        r['Investor Latitude'], r['Investor Longitude'],
        r['Company Latitude'], r['Company Longitude']
    ), axis=1
)
seed_help['Local Investor'] = seed_help['Distance km'] < 100

### Create features to companies dataframe about seed round

#### Number of Seed Rounds

In [322]:
counts = seed_help.groupby('Company ID')['Round ID'].nunique().rename('Number Seed Rounds')
companies = companies.merge(counts, on='Company ID', how='left')
companies['Number Seed Rounds'] = companies['Number Seed Rounds'].fillna(0).astype(int)

#### Average Distance to Seed Investors

In [323]:
avg_distance = seed_help.groupby('Company ID')['Distance km'].mean().rename('Avg Seed Investor Distance')
companies = companies.merge(avg_distance, on='Company ID', how='left')
companies['Avg Seed Investor Distance'] = companies['Avg Seed Investor Distance'].fillna(0)

#### Average Portfolio Organizations of Investors

In [324]:
seed_help['Number of Portfolio Organizations'] = pd.to_numeric(
    seed_help['Number of Portfolio Organizations'],
    errors='coerce'
)

avg_portfolio = (
    seed_help
    .groupby('Company ID')['Number of Portfolio Organizations']
    .mean()
    .rename('Average Seed Investors Portfolio Organizations')
)

companies = companies.merge(
    avg_portfolio,
    on='Company ID',
    how='left'
)

companies['Average Seed Investors Portfolio Organizations'] = (
    companies['Average Seed Investors Portfolio Organizations']
    .fillna(0)
)

#### Number of Seed Investors

In [325]:
num_seed_investors = (
    seed_help
    .groupby('Company ID')['Investor ID']
    .nunique()
    .rename('Number Seed Investors')
)

companies = companies.merge(
    num_seed_investors,
    on='Company ID',
    how='left'
)

companies['Number Seed Investors'] = companies['Number Seed Investors'].fillna(0).astype(int)

#### Binary for Regional Investor and Overregional Investor

In [326]:
regional = (
    seed_help
    .groupby('Company ID')['Local Investor']
    .any()
    .rename('Regional Seed Investor Binary')
    .fillna(False)
    .astype(bool)
)

overregional = (
    seed_help
    .groupby('Company ID')['Local Investor']
    .apply(lambda x: (~x).any())
    .rename('Overregional Seed Investor Binary')
    .fillna(False)
    .astype(bool)
)

companies = (
    companies
    .merge(regional, on='Company ID', how='left')
    .merge(overregional, on='Company ID', how='left')
)

Number of Lead Investors and Binary for Regional and Overregional Investor

In [327]:
num_lead_seed_investors = (
    seed_help[seed_help['Lead']]
    .groupby('Company ID')['Investor ID']
    .nunique()
    .rename('Number Lead Seed Investors')
)

has_regional_lead = (
    seed_help[seed_help['Lead']]
    .groupby('Company ID')['Local Investor']
    .any()
    .rename('Regional Lead Seed Investor Binary')
    .fillna(False)
    .astype(bool)
)

has_overregional_lead = (
    seed_help[seed_help['Lead']]
    .groupby('Company ID')['Local Investor']
    .apply(lambda x: (~x).any())
    .rename('Overregional Lead Seed Investor Binary')
    .fillna(False)
    .astype(bool)
)

companies = (
    companies
    .merge(num_lead_seed_investors, on='Company ID', how='left')
    .merge(has_regional_lead, on='Company ID', how='left')
    .merge(has_overregional_lead, on='Company ID', how='left')
)

companies['Number Lead Seed Investors'] = companies['Number Lead Seed Investors'].fillna(0).astype(int)

companies['Number Lead Seed Investors'] = companies['Number Lead Seed Investors'].fillna(0).astype(int)

mask_no_lead = companies['Number Lead Seed Investors'] == 0
companies.loc[mask_no_lead, 'Regional Lead Seed Investor Binary'] = False
companies.loc[mask_no_lead, 'Overregional Lead Seed Investor Binary'] = False

#### Make the binaries to a boolean

In [328]:
companies = companies.convert_dtypes()

binary_cols = [
    'Regional Seed Investor Binary',
    'Overregional Seed Investor Binary',
    'Regional Lead Seed Investor Binary',
    'Overregional Lead Seed Investor Binary'
]

for col in binary_cols:
    companies[col] = companies[col].fillna(False)

#### Average Age of Seed Investors

In [329]:
if 'Announced Date' not in seed_help.columns:
    seed_help = seed_help.merge(
        rounds[['Round ID', 'Announced Date']],
        on='Round ID',
        how='left'
    )

seed_help['Announced Date']        = pd.to_datetime(seed_help['Announced Date'], errors='coerce')
seed_help['Investor Founded Date'] = pd.to_datetime(seed_help['Investor Founded Date'], errors='coerce')

seed_help['Investment Year']       = seed_help['Announced Date'].dt.year
seed_help['Investor Founded Year'] = seed_help['Investor Founded Date'].dt.year

seed_help['Investor Age at Investment'] = (
    (seed_help['Investment Year'] - seed_help['Investor Founded Year'])
    .clip(lower=0)
)

avg_age = (
    seed_help
    .groupby('Company ID')['Investor Age at Investment']
    .mean()
    .round(1)
    .rename('Avg Age of Seed Investors')
)

companies = companies.merge(
    avg_age,
    on='Company ID',
    how='left'
)
companies['Avg Age of Seed Investors'] = companies['Avg Age of Seed Investors'].fillna(0)

#### Average Number of Exits of Seed Investors

In [330]:
avg_exits = (
    seed_help
    .groupby('Company ID')['Number of Exits']
    .mean()
    .round(1)
    .rename('Avg Exits of Seed Investors')
)

companies = companies.merge(
    avg_exits,
    on='Company ID',
    how='left'
)
companies['Avg Exits of Seed Investors'] = companies['Avg Exits of Seed Investors'].fillna(0)

#### Binary Specific Investor in Seed

In [331]:
seed_help['Specific VC Binary'] = (
    seed_help['Specific VC Binary']
    .astype('boolean')
)

industry_specific = (
    seed_help
    .groupby('Company ID')['Specific VC Binary']
    .any()
    .rename('Specific VC in Seed Binary')
)

companies = companies.merge(
    industry_specific,
    on='Company ID',
    how='left'
)

companies['Specific VC in Seed Binary'] = (
    companies['Specific VC in Seed Binary']
    .fillna(False)
    .astype('boolean')
)


#### Binary Specific VC Lead in Seed Binary

In [332]:
industry_specific_lead = (
    seed_help[seed_help['Lead'] & seed_help['Specific VC Binary']]
    .groupby('Company ID')['Specific VC Binary']
    .any()
    .rename('Specific Lead VC in Seed Binary')
)

companies = companies.merge(
    industry_specific_lead,
    on='Company ID',
    how='left'
)

companies['Specific Lead VC in Seed Binary'] = (
    companies['Specific Lead VC in Seed Binary']
    .fillna(False)
    .astype('boolean')
)

### Target VC in Seed Binary

In [333]:
seed_help['Target VC'] = (
    seed_help['Target VC']
    .astype('boolean')
)

target_specific = (
    seed_help
    .groupby('Company ID')['Target VC']
    .any()
    .rename('Target VC in Seed Binary')
)

companies = companies.merge(
    target_specific,
    on='Company ID',
    how='left'
)

companies['Target VC in Seed Binary'] = (
    companies['Target VC in Seed Binary']
    .fillna(False)
    .astype('boolean')
)

### Target VC in Lead Seed Binary

In [334]:
target_lead = (
    seed_help[seed_help['Lead'] & seed_help['Target VC']]
    .groupby('Company ID')['Target VC']
    .any()
    .rename('Target Lead VC in Seed Binary')
)

companies = companies.merge(
    target_lead,
    on='Company ID',
    how='left'
)

companies['Target Lead VC in Seed Binary'] = (
    companies['Target Lead VC in Seed Binary']
    .fillna(False)
    .astype('boolean')
)

### Accelerator, Micro VC, Angel Group or Corporate Venture Capita in Seed

In [335]:
flags = [
    "Accelerator",
    "Angel Group",
    "Micro VC",
    "Corporate Venture Capital",
]

company_flags = (
    seed_help
    .groupby("Company ID")[flags]
    .any()   
    .rename(columns=lambda c: f"{c} Funding Binary")
)

companies = companies.merge(
    company_flags,
    on="Company ID",
    how="left"
)

for col in company_flags.columns:
    companies[col] = companies[col].fillna(False).astype("boolean")


  companies[col] = companies[col].fillna(False).astype("boolean")


In [336]:
flags = [
    "Accelerator",
    "Angel Group",
    "Micro VC",
    "Corporate Venture Capital",
]

lead_rounds = seed_help[seed_help["Lead"] == True]

company_lead_flags = (
    lead_rounds
    .groupby("Company ID")[flags]
    .any() 
    .rename(columns=lambda c: f"{c} Lead Funding Binary")
)

companies = companies.merge(
    company_lead_flags,
    on="Company ID",
    how="left"
)

for col in company_lead_flags.columns:
    companies[col] = companies[col].fillna(False).astype("boolean")

  companies[col] = companies[col].fillna(False).astype("boolean")


#### Time to first Seed

In [337]:
if 'Investment Year' not in seed_help.columns:
    seed_help['Investment Year'] = pd.to_datetime(
        seed_help['Announced Date'], errors='coerce'
    ).dt.year

first_seed = (
    seed_help
    .groupby('Company ID')['Investment Year']
    .min()
    .rename('First Seed Year')
)

companies = companies.merge(
    first_seed,
    on='Company ID',
    how='left'
)

companies['Years to Seed'] = (
    (companies['First Seed Year'] - companies['Founded Year'])
    .clip(lower=0)
    .fillna(0)
    .astype(int)
)

companies.drop(columns=['First Seed Year'], inplace=True)

## F: Success

Look for unique values

In [338]:
unique_vals = companies['Last Equity Funding Type'].dropna().unique().tolist()
print(unique_vals)

['Series A', 'Venture - Series Unknown', 'Seed', 'Series B', 'Series C', 'Pre-Seed', 'Series D', 'Equity Crowdfunding', 'Corporate Round', 'Undisclosed', 'Angel', 'Private Equity', 'Post-IPO Equity', 'Series F', 'Initial Coin Offering', 'Series E', 'Series G', 'Series I']


Define Success binary / boolean

In [339]:
companies['Success'] = (
    companies['Exit Binary'].fillna(False)
).astype(bool)

cols = [c for c in companies.columns if c not in ['Exit Binary', 'Success']]
new_order = cols + ['Exit Binary', 'Success']
companies = companies[new_order]

## G: Prepare Companies Data Frame for Analysis in R

Delete companies without founding round due to errors in Crunchbase filtering

In [340]:
companies = companies[companies['Number Seed Rounds'] != 0]

Create country as a control

In [341]:
companies['Headquarters Country'] = companies['Headquarters Location'].str.split(',').str[-1].str.strip()

insert_at = companies.columns.get_loc('Headquarters Location') + 1
companies.insert(insert_at, 'Headquarters Country', companies.pop('Headquarters Country'))

Calculate Total Funding

In [342]:
seed_rounds = (
    rounds
    .loc[
        (rounds["Funding Type"] == "Seed") &
        (rounds["Money Raised (in USD)"].notna()) &
        (rounds["Money Raised (in USD)"] > 0),
        ["Organization Name URL", "Money Raised (in USD)"]
    ]
)

funding_sums = (
    seed_rounds
    .groupby("Organization Name URL", as_index=False)
    .agg(**{"Total Seed Funding (USD)": ("Money Raised (in USD)", "sum")})
)

companies = (
    companies
    .merge(funding_sums, on="Organization Name URL", how="left")
    .assign(**{
        "Total Seed Funding (USD)": 
            lambda df: df["Total Seed Funding (USD)"].fillna(0)
    })
)

companies["Total Seed Funding (M)"] = (
    companies["Total Seed Funding (USD)"] / 1e6
)


In [343]:
companies['Funding Success'] = companies['Total Equity Funding Amount (in USD)'].fillna(0) > 20000000

companies['Success'] = companies['Exit Binary'] | companies['Funding Success']

funding_success_col = companies.pop('Funding Success')
exit_binary_index = companies.columns.get_loc('Exit Binary') + 1
companies.insert(exit_binary_index, 'Funding Success', funding_success_col)

Only select companies from a certain year

In [344]:
start_year = 2007
end_year = 2022

companies = companies[companies['Founded Year'].between(start_year, end_year)]

## H: Create an investor data frame
Based on the investments in the time frame and prepare it for R analysis

Aggregate features for the investors based on their investment behaviours

In [345]:
seed_help_enriched = seed_help.merge(
    companies[['Company ID', 'Industry Groups', 'Founded Year', 'Success', 'Hub Binary', 'Number Seed Rounds', 'Headquarters Country']],
    on='Company ID',
    how='left'
)

investor_aggregates = seed_help_enriched.groupby('Investor ID').agg(
    Number_of_Seed_Investments              = ('Round ID', 'nunique'),
    Number_of_Companies_Invested            = ('Company ID', 'nunique'),
    Number_of_Unique_Industries_Invested    = ('Industry Groups',
                                               lambda x: x.str.get_dummies(sep=',').sum().gt(0).sum()),
    Average_Company_Age_at_Investment       = ('Organization Founded Year',
                                               lambda x: (seed_help_enriched['Investment Year'] - x).mean()),
    Number_of_Successful_Investments        = ('Success', 'sum'),
    Number_of_Local_Investments             = ('Local Investor', 'sum'),
    Average_Investment_Distance             = ('Distance km', 'mean'),
    Number_of_Investments_in_Hubs           = ('Hub Binary', 'sum'),
    Average_Number_of_Seed_Rounds           = ('Number Seed Rounds', 'mean'),
    Industry_Specific_Investments           = ('Specific VC Binary', 'sum'),
    Number_of_Unique_Countries_Invested     = ('Headquarters Country', 'nunique'),
    Number_of_Investments_with_Fundraising_Success = ('Success', 'sum'),
    Number_of_Lead_Seed_Investments         = ('Lead', 'sum')          
).reset_index()

rename_features = {
    'Number_of_Seed_Investments':               'Seed Investments Count',
    'Number_of_Companies_Invested':             'Unique Companies Count',
    'Number_of_Unique_Industries_Invested':     'Unique Industries Count',
    'Average_Company_Age_at_Investment':        'Avg Company Age at Investment',
    'Number_of_Successful_Investments':         'Successful Investments Count',
    'Number_of_Local_Investments':              'Local Investments Count',
    'Average_Investment_Distance':              'Avg Investment Distance (km)',
    'Number_of_Investments_in_Hubs':            'Hub Investments Count',
    'Average_Number_of_Seed_Rounds':            'Avg Seed Rounds per Company',
    'Industry_Specific_Investments':            'Specific Investments Count',
    'Number_of_Unique_Countries_Invested':      'Unique Countries Count',
    'Number_of_Investments_with_Fundraising_Success': 'Fundraising Success Count',
    'Number_of_Lead_Seed_Investments':          'Seed Lead Investments Count'  
}

investor_aggregates.rename(columns=rename_features, inplace=True)

investors_seed = investors.merge(investor_aggregates, on='Investor ID', how='left')

Select needed features

In [346]:
numeric_columns = investors_seed.select_dtypes(include=['float64', 'int64']).columns
investors_seed[numeric_columns] = investors_seed[numeric_columns].apply(pd.to_numeric, errors='coerce')

required_columns = [
    'Investor ID', 'Organization/Person Name', 'Organization/Person Name URL', 'Description', 'Location', 'Longitude', 'Latitude',
    'Seed Investments Count', 'Seed Lead Investments Count', 'Unique Companies Count', 'Unique Industries Count',
    'Local Investments Count', 'Avg Investment Distance (km)', 'Hub Investments Count',
    'Unique Countries Count', 'Fundraising Success Count', 'Successful Investments Count', 'Specific VC Binary', 'Accelerator', 'Corporate Venture Capital', 'Micro VC', 'Angel Group', "Target VC", "CB Rank (Investor)"
]

investors_seed = investors_seed[required_columns]

Create Country variable

In [347]:
investors_seed['Country'] = investors_seed['Location'].str.split(',').str[-1].str.strip()

insert_at = investors_seed.columns.get_loc('Location') + 1
investors_seed.insert(insert_at, 'Country', investors_seed.pop('Country'))

Create Median Distance, create the local investor variable, and implement restriction on the data quality (seed investment not NaN and Avg Investment Distance not NaN)

In [348]:
investors_seed['Local Investor'] = investors_seed['Avg Investment Distance (km)'] < 100

investors_seed = investors_seed.dropna(subset=['Seed Investments Count'])

investors_seed = investors_seed.dropna(subset=['Avg Investment Distance (km)'])

Primary Role in Investments (based on the majority)

In [349]:
round_counts = (
    seed_help
    .groupby("Round ID")["Investor ID"]
    .nunique()
    .rename("num_investors")
)

seed_help = seed_help.merge(round_counts, on="Round ID", how="left")

def assign_role(row):
    if row["Lead"]:
        return "Lead"
    else:
        return "Co-Investor"

seed_help["Syndication Role"] = seed_help.apply(assign_role, axis=1)

primary_roles = (
    seed_help
    .groupby("Investor ID")["Syndication Role"]
    .agg(lambda x: x.mode().iloc[0] if not x.mode().empty else pd.NA)
    .rename("Primary Role")
)

investors_seed = investors_seed.merge(
    primary_roles,
    on="Investor ID",
    how="left"
)

investors_seed["Lead Investor"] = investors_seed["Primary Role"] == "Lead"

investors_seed = investors_seed.drop(columns=["Primary Role"])

investors_seed["Lead Investor"] = investors_seed["Lead Investor"].astype("boolean")

Hub Investor

In [350]:
investors_seed["Hub Investor"] = (
    investors_seed["Hub Investments Count"] /
    investors_seed["Seed Investments Count"]
) > 0.75

Only select investors with at least a certain number of investments

In [351]:
min_seed_investments = 1
investors_seed = investors_seed[investors_seed['Seed Investments Count'] >= min_seed_investments]

Ordering of dataframe

In [352]:
logical_order = [
    'Investor ID', 'Organization/Person Name', 'Organization/Person Name URL', 'Longitude', 'Latitude', 'Location', 'Country', 'Accelerator', 'Angel Group', 'Micro VC', 
    'Corporate Venture Capital', 'Specific VC Binary', 'Local Investor', 'Hub Investor', 'Lead Investor', 'Target VC', 'Seed Investments Count', 'Seed Lead Investments Count', 
    'Unique Companies Count', 'Unique Industries Count', 'Local Investments Count', 'Avg Investment Distance (km)', 'Hub Investments Count',
    'Unique Countries Count', 'Fundraising Success Count', 'Successful Investments Count', "CB Rank (Investor)"
]
investors_seed = investors_seed[logical_order]

Final output

In [353]:
investors_seed.columns

Index(['Investor ID', 'Organization/Person Name',
       'Organization/Person Name URL', 'Longitude', 'Latitude', 'Location',
       'Country', 'Accelerator', 'Angel Group', 'Micro VC',
       'Corporate Venture Capital', 'Specific VC Binary', 'Local Investor',
       'Hub Investor', 'Lead Investor', 'Target VC', 'Seed Investments Count',
       'Seed Lead Investments Count', 'Unique Companies Count',
       'Unique Industries Count', 'Local Investments Count',
       'Avg Investment Distance (km)', 'Hub Investments Count',
       'Unique Countries Count', 'Fundraising Success Count',
       'Successful Investments Count', 'CB Rank (Investor)'],
      dtype='object')

## I: Save the dataframes and make formatting adjustments

In [354]:
keep_types = ["Pre-Seed", "Seed", "Angel"]
rounds_filtered = rounds[rounds["Funding Type"].isin(keep_types)].copy()
rounds_filtered.reset_index(drop=True, inplace=True)

Add Target VC to Companies

Assigning the Crunchbase Categories to the 11 GICS Categories + Other <br>
https://www.msci.com/our-solutions/indexes/gics 

In [355]:
companies_seed = companies.copy()

to_move = [
    "Administrative Services", "Advertising", "Agriculture and Farming",
    "Apps", "Artificial Intelligence (AI)", "Biotechnology",
    "Blockchain and Cryptocurrency", "Clothing and Apparel",
    "Commerce and Shopping", "Community and Lifestyle",
    "Consumer Electronics", "Consumer Goods", "Content and Publishing",
    "Data and Analytics", "Design", "Education", "Energy", "Events",
    "Financial Services", "Food and Beverage", "Gaming",
    "Government and Military", "Hardware", "Health Care",
    "Information Technology", "Internet Services",
    "Lending and Investments", "Manufacturing", "Media and Entertainment",
    "Messaging and Telecommunications", "Mobile", "Music and Audio",
    "Natural Resources", "Navigation and Mapping", "Other", "Payments",
    "Platforms", "Privacy and Security", "Professional Services",
    "Real Estate", "Sales and Marketing", "Science and Engineering",
    "Social Impact", "Software", "Sports", "Sustainability",
    "Transportation", "Travel and Tourism", "Video"
]
cols_to_move = [c for c in to_move if c in companies_seed.columns]

new_order = [c for c in companies_seed.columns if c not in cols_to_move] + cols_to_move
companies_seed = companies_seed[new_order]

gics_map = {
    "Energy": "Energy",
    "Natural Resources": "Materials",
    "Administrative Services": "Industrials",
    "Design": "Industrials",
    "Government and Military": "Industrials",
    "Manufacturing": "Industrials",
    "Professional Services": "Industrials",
    "Science and Engineering": "Industrials",
    "Social Impact": "Industrials",
    "Transportation": "Industrials",
    "Clothing and Apparel": "Consumer Discretionary",
    "Commerce and Shopping": "Consumer Discretionary",
    "Community and Lifestyle": "Consumer Discretionary",
    "Consumer Electronics": "Consumer Discretionary",
    "Consumer Goods": "Consumer Discretionary",
    "Education": "Consumer Discretionary",
    "Events": "Consumer Discretionary",
    "Sports": "Consumer Discretionary",
    "Travel and Tourism": "Consumer Discretionary",
    "Agriculture and Farming": "Consumer Staples",
    "Food and Beverage": "Consumer Staples",
    "Biotechnology": "Health Care",
    "Health Care": "Health Care",
    "Financial Services": "Financials",
    "Lending and Investments": "Financials",
    "Payments": "Financials",
    "Apps": "Information Technology",
    "Artificial Intelligence (AI)": "Information Technology",
    "Blockchain and Cryptocurrency": "Information Technology",
    "Data and Analytics": "Information Technology",
    "Hardware": "Information Technology",
    "Information Technology": "Information Technology",
    "Internet Services": "Information Technology",
    "Mobile": "Information Technology",
    "Navigation and Mapping": "Information Technology",
    "Platforms": "Information Technology",
    "Privacy and Security": "Information Technology",
    "Software": "Information Technology",
    "Advertising": "Communication Services",
    "Content and Publishing": "Communication Services",
    "Gaming": "Communication Services",
    "Media and Entertainment": "Communication Services",
    "Messaging and Telecommunications": "Communication Services",
    "Music and Audio": "Communication Services",
    "Sales and Marketing": "Communication Services",
    "Video": "Communication Services",
    "Sustainability": "Utilities",
    "Real Estate": "Real Estate",
    "Other": "Other",
}

head_to_orig = defaultdict(list)
for orig, head in gics_map.items():
    head_to_orig[head].append(orig)

for head, orig_cols in head_to_orig.items():
    companies_seed[head] = companies_seed[orig_cols].any(axis=1)

to_drop = [c for c in cols_to_move if gics_map[c] != c]
companies_seed.drop(columns=to_drop, inplace=True)

gics_heads = list(head_to_orig.keys())
final_order = [c for c in companies_seed.columns if c not in gics_heads] + gics_heads
companies_seed = companies_seed[final_order]

Make sure that at least one investor is known for a round

In [356]:
valid_investor_ids = investors_seed['Investor ID'].unique()
seed_help = seed_help[seed_help['Investor ID'].isin(valid_investor_ids)]

valid_company_ids = seed_help['Company ID'].unique()

initial_company_count = companies_seed['Company ID'].nunique()
print(f"Initial number of companies in companies_seed: {initial_company_count}")

companies_seed = companies_seed[companies_seed['Company ID'].isin(valid_company_ids)]

final_company_count = companies_seed['Company ID'].nunique()
print(f"Final number of companies in companies_seed: {final_company_count}")

dropped_company_count = initial_company_count - final_company_count
print(f"Number of companies dropped: {dropped_company_count}")

Initial number of companies in companies_seed: 19543
Final number of companies in companies_seed: 19443
Number of companies dropped: 100


Include controls

In [357]:
if 'Investment Year' not in seed_help.columns:
    seed_help['Investment Year'] = pd.to_datetime(
        seed_help['Announced Date'], errors='coerce'
    ).dt.year

first_seed = (
    seed_help
    .groupby('Company ID')['Investment Year']
    .min()
    .rename('First Seed Year')
)

companies_seed = companies_seed.merge(
    first_seed,
    on='Company ID',
    how='left'
)

companies_seed['First Seed Year'] = companies_seed['First Seed Year'].fillna(0).astype(int)

europe investments as controls

In [358]:
europe_investments = pd.read_csv('data/europe_investments/europe_investment_per_year_country.csv')

# Define the country mapping
country_mapping = {
    'Austria': 'Austria',
    'Belgium': 'Belgium',
    'Czech Republic': 'Czech Republic',
    'Denmark': 'Denmark',
    'Estonia': 'Baltics',
    'Latvia': 'Baltics',
    'Lithuania': 'Baltics',
    'Finland': 'Finland',
    'France': 'France',
    'Germany': 'Germany',
    'Greece': 'Greece',
    'Hungary': 'Hungary',
    'Ireland': 'Ireland',
    'Italy': 'Italy',
    'Norway': 'Norway',
    'Poland': 'Poland',
    'Portugal': 'Portugal',
    'Spain': 'Spain',
    'Switzerland': 'Switzerland',
    'The Netherlands': 'The Netherlands',
    'United Kingdom': 'United Kingdom'
}

companies_seed['Mapped Country'] = companies_seed['Headquarters Country'].map(country_mapping)

companies_seed = companies_seed[companies_seed['Mapped Country'] != 'Slovakia (Slovak Republic)']

merged_data = companies_seed.merge(
    europe_investments,
    left_on=['Mapped Country', 'First Seed Year'],
    right_on=['Country', 'Year'],
    how='left'
)

companies_seed['Investment Volume in First Seed Year'] = merged_data['Sums']

companies_seed['Investment Volume in First Seed Year (log)'] = np.log1p(companies_seed['Investment Volume in First Seed Year'])

companies_seed = companies_seed[
    companies_seed['First Seed Year'].between(2007, 2022)]

Same City

In [359]:
seed_help['Seed Investor Same City'] = seed_help['Distance km'] < 20

num_seed_investors_same_city = (
    seed_help.groupby('Company ID')['Seed Investor Same City']
    .sum()
    .rename('Number Seed Investors Same City')
)

seed_help['Seed Lead Investor Same City'] = seed_help['Lead'] & seed_help['Seed Investor Same City']

num_seed_lead_investors_same_city = (
    seed_help.groupby('Company ID')['Seed Lead Investor Same City']
    .sum()
    .rename('Number Seed Lead Investors Same City')
)

companies_seed = companies_seed.merge(
    num_seed_investors_same_city, on='Company ID', how='left'
).merge(
    num_seed_lead_investors_same_city, on='Company ID', how='left'
)

companies_seed['Number Seed Investors Same City'] = companies_seed['Number Seed Investors Same City'].fillna(0).astype(int)
companies_seed['Number Seed Lead Investors Same City'] = companies_seed['Number Seed Lead Investors Same City'].fillna(0).astype(int)

companies_seed['Seed Investor Same City'] = companies_seed['Number Seed Investors Same City'] > 0
companies_seed['Seed Lead Investor Same City'] = companies_seed['Number Seed Lead Investors Same City'] > 0

Homecountry

In [360]:
seed_help = seed_help.merge(
    investors_seed[['Investor ID', 'Country']],
    on='Investor ID',
    how='left'
)

seed_help = seed_help.merge(
    companies_seed[['Company ID', 'Headquarters Country']],
    on='Company ID',
    how='left'
)

seed_help['Homecountry Investor'] = seed_help['Country'] == seed_help['Headquarters Country']


num_homecountry_investors = (
    seed_help.groupby('Company ID')['Homecountry Investor']
    .sum()
    .rename('Number Homecountry Investors')
)

seed_help['Homecountry Lead Investor'] = seed_help['Lead'] & seed_help['Homecountry Investor']

num_homecountry_lead_investors = (
    seed_help.groupby('Company ID')['Homecountry Lead Investor']
    .sum()
    .rename('Number Homecountry Lead Investors')
)

companies_seed = companies_seed.merge(
    num_homecountry_investors, on='Company ID', how='left'
).merge(
    num_homecountry_lead_investors, on='Company ID', how='left'
)

companies_seed['Number Homecountry Investors'] = companies_seed['Number Homecountry Investors'].fillna(0).astype(int)
companies_seed['Number Homecountry Lead Investors'] = companies_seed['Number Homecountry Lead Investors'].fillna(0).astype(int)


companies_seed['Homecountry Investor'] = companies_seed['Number Homecountry Investors'] > 0
companies_seed['Homecountry Lead Investor'] = companies_seed['Number Homecountry Lead Investors'] > 0

Investor within 50km

In [361]:
seed_help['Seed Investor Within 50km'] = seed_help['Distance km'] <= 10

num_seed_investors_within_50km = (
    seed_help.groupby('Company ID')['Seed Investor Within 50km']
    .sum()
    .rename('Number Seed Investors Within 50km')
)

companies_seed = companies_seed.merge(
    num_seed_investors_within_50km, on='Company ID', how='left'
)

companies_seed['Number Seed Investors Within 50km'] = companies_seed['Number Seed Investors Within 50km'].fillna(0).astype(int)

companies_seed['Seed Investor Within 50km Binary'] = companies_seed['Number Seed Investors Within 50km'] > 0

Number Seed Lead investors within 50 km

In [362]:
seed_help['Seed Lead Investor Within 50km'] = seed_help['Lead'] & (seed_help['Distance km'] <= 10)

num_seed_lead_investors_within_50km = (
    seed_help.groupby('Company ID')['Seed Lead Investor Within 50km']
    .sum()
    .rename('Number Seed Lead Investors Within 50km')
)

companies_seed = companies_seed.merge(
    num_seed_lead_investors_within_50km, on='Company ID', how='left'
)

companies_seed['Number Seed Lead Investors Within 50km'] = companies_seed['Number Seed Lead Investors Within 50km'].fillna(0).astype(int)

companies_seed['Seed Lead Investor Within 50km Binary'] = companies_seed['Number Seed Lead Investors Within 50km'] > 0

Startups within 50km

In [363]:
companies_seed = companies_seed.dropna(subset=['Latitude', 'Longitude'])
companies_seed['Latitude'] = companies_seed['Latitude'].astype(float)
companies_seed['Longitude'] = companies_seed['Longitude'].astype(float)

companies_seed = companies_seed.sort_values('Founded Year').reset_index(drop=True)

coords_rad = np.radians(companies_seed[['Latitude', 'Longitude']].values)
years = companies_seed['Founded Year'].values

radius = 50 / 6371.0

within_50km_counts = []
incremental_coords = []

for i in range(len(companies_seed)):
    lat_lon = coords_rad[i]
    incremental_coords.append(lat_lon)

    tree = BallTree(np.array(incremental_coords), metric='haversine')
    count = tree.query_radius([lat_lon], r=radius, count_only=True)[0]

    within_50km_counts.append(count)

companies_seed['Startups Within 50km'] = within_50km_counts

VCs within 50km

In [364]:
vc_investors = investors_seed.dropna(subset=['Latitude', 'Longitude'])
vc_investors['Latitude'] = vc_investors['Latitude'].astype(float)
vc_investors['Longitude'] = vc_investors['Longitude'].astype(float)

vc_coords_rad = np.radians(vc_investors[['Latitude', 'Longitude']].values)

companies_seed = companies_seed.dropna(subset=['Latitude', 'Longitude'])
companies_seed['Latitude'] = companies_seed['Latitude'].astype(float)
companies_seed['Longitude'] = companies_seed['Longitude'].astype(float)

startup_coords_rad = np.radians(companies_seed[['Latitude', 'Longitude']].values)

radius = 50 / 6371.0

tree = BallTree(vc_coords_rad, metric='haversine')

counts = tree.query_radius(startup_coords_rad, r=radius, count_only=True)

companies_seed['VCs Within 50km'] = counts
companies_seed['VCs Within 50km Binary'] = (counts > 0).astype(int)

gdpr rates

In [365]:
gdp_data = pd.read_csv('data/gdp_rates/european_gdp_rates.csv')

gdp_data.rename(columns={'Year': 'Funding Year', 'GDP Rate (%)': 'GDP Growth (%) in Funding Year'}, inplace=True)

companies_seed = companies_seed.merge(
    gdp_data,
    left_on='Founded Year',
    right_on='Funding Year',
    how='left'
)

companies_seed.drop(columns=['Funding Year'], inplace=True)

Time to Exit

In [366]:
companies_seed['Exit Date'] = pd.to_datetime(companies_seed['Exit Date'], format='%d/%m/%Y', errors='coerce')

companies_seed['Exit Year'] = companies_seed['Exit Date'].dt.year

companies_seed['Years to Exit'] = (
    companies_seed['Exit Year'] - companies_seed['Founded Year']
).where(companies_seed['Exit Year'].notna() & companies_seed['Founded Year'].notna())

companies_seed['Years to Exit'] = companies_seed['Years to Exit'].astype('Int64')

Min, Median Max VC Distance

In [367]:
distance_stats_normal = seed_help[~seed_help['Lead']].groupby('Company ID')['Distance km'].agg(
    Min_Seed_VC_Distance_Normal='min',
    Avg_Seed_VC_Distance_Normal='mean',
    Max_Seed_VC_Distance_Normal='max'
)

distance_stats_lead = seed_help[seed_help['Lead']].groupby('Company ID')['Distance km'].agg(
    Min_Seed_VC_Distance_Lead='min',
    Avg_Seed_VC_Distance_Lead='mean',
    Max_Seed_VC_Distance_Lead='max'
)

companies_seed = companies_seed.merge(distance_stats_normal, on='Company ID', how='left')
companies_seed = companies_seed.merge(distance_stats_lead, on='Company ID', how='left')

columns_to_fill = [
    'Min_Seed_VC_Distance_Normal', 'Avg_Seed_VC_Distance_Normal', 'Max_Seed_VC_Distance_Normal',
    'Min_Seed_VC_Distance_Lead', 'Avg_Seed_VC_Distance_Lead', 'Max_Seed_VC_Distance_Lead'
]
companies_seed[columns_to_fill] = companies_seed[columns_to_fill].fillna(0)

companies_seed.rename(columns={
    'Min_Seed_VC_Distance_Normal': 'Minimum Seed VC Distance',
    'Avg_Seed_VC_Distance_Normal': 'Avg Seed VC Distance',
    'Max_Seed_VC_Distance_Normal': 'Maximum Seed VC Distance',
    'Min_Seed_VC_Distance_Lead': 'Minimum Lead Seed VC Distance',
    'Avg_Seed_VC_Distance_Lead': 'Avg Lead Seed VC Distance',
    'Max_Seed_VC_Distance_Lead': 'Maximum Lead Seed VC Distance'
}, inplace=True)

Investor Distances

In [368]:
companies_with_multiple_investors = (
    seed_help.groupby('Company ID')['Investor ID']
    .nunique()
    .loc[lambda x: x > 1]
    .index
)

filtered_seed_help = seed_help[seed_help['Company ID'].isin(companies_with_multiple_investors)]

grouped_investors = filtered_seed_help.groupby('Company ID')

investor_distance_stats = []
for company_id, investor_group in grouped_investors:
    investor_coords = investor_group[['Investor Latitude', 'Investor Longitude']].dropna().values
    pairwise_distances = [
        haversine(lat1, lon1, lat2, lon2)
        for (lat1, lon1), (lat2, lon2) in combinations(investor_coords, 2)
    ]
    investor_distance_stats.append({
        'Company ID': company_id,
        'Avg Seed Investor Pairwise Distance': sum(pairwise_distances) / len(pairwise_distances),
        'Min Seed Investor Pairwise Distance': min(pairwise_distances),
        'Max Seed Investor Pairwise Distance': max(pairwise_distances)
    })

investor_distance_stats_df = pd.DataFrame(investor_distance_stats)

companies_seed = companies_seed.merge(investor_distance_stats_df, on='Company ID', how='left')

Hub VC

In [369]:
hub_distance_threshold = 20 

seed_help['Hub VC in Seed Binary'] = seed_help['Distance km'] <= hub_distance_threshold

seed_help['Hub Lead VC in Seed Binary'] = seed_help['Lead'] & (seed_help['Distance km'] <= hub_distance_threshold)

hub_vc_binary = (
    seed_help.groupby('Company ID')['Hub VC in Seed Binary']
    .any()
    .rename('Hub VC in Seed Binary')
)

hub_lead_vc_binary = (
    seed_help.groupby('Company ID')['Hub Lead VC in Seed Binary']
    .any()
    .rename('Hub Lead VC in Seed Binary')
)

companies_seed = companies_seed.merge(hub_vc_binary, on='Company ID', how='left')
companies_seed = companies_seed.merge(hub_lead_vc_binary, on='Company ID', how='left')

companies_seed['Hub VC in Seed Binary'] = companies_seed['Hub VC in Seed Binary'].fillna(False)
companies_seed['Hub Lead VC in Seed Binary'] = companies_seed['Hub Lead VC in Seed Binary'].fillna(False)

Only consider companies for the lead features if there are more or equal than 1 lead VC

In [370]:
columns_to_update = [
    "Number Lead Seed Investors",
    "Seed Lead Investor Same City",
    "Homecountry Lead Investor",
    "Regional Lead Seed Investor Binary",
    "Overregional Lead Seed Investor Binary",
    "Specific Lead VC in Seed Binary",
    "Accelerator Lead Funding Binary",
    "Angel Group Lead Funding Binary",
    "Micro VC Lead Funding Binary",
    "Corporate Venture Capital Lead Funding Binary",
    "Target Lead VC in Seed Binary",
    "Hub Lead VC in Seed Binary"
]

mask_no_lead = companies_seed["Number Lead Seed Investors"] == 0
companies_seed.loc[mask_no_lead, columns_to_update] = None

  companies_seed.loc[mask_no_lead, columns_to_update] = None
  companies_seed.loc[mask_no_lead, columns_to_update] = None
  companies_seed.loc[mask_no_lead, columns_to_update] = None


Add university eco system

In [371]:
path = kagglehub.dataset_download("darrylljk/worlds-best-universities-qs-rankings-2025")

file_path = os.path.join(path, "qs-world-rankings-2025.csv")

university = pd.read_csv(file_path)

In [372]:
selected_countries = [
    "United Kingdom", "Switzerland", "France", "Germany", "Netherlands", "Belgium", 
    "Sweden", "Ireland", "Denmark", "Italy", "Finland", "Norway", "Austria", "Spain", 
    "Czech Republic", "Poland", "Portugal", "Greece", "Luxembourg", "Estonia", 
    "Belarus", "Cyprus", "Lithuania", "Hungary", "Slovenia", "Croatia", "Slovakia", 
    "Bulgaria", "Ukraine", "Latvia", "Serbia", "Malta", "Romania", "Bosnia and Herzegovina"
]

filtered_universities = university[
    university["Location Full"].isin(selected_countries)
]

In [373]:
filtered_universities['2025 Rank Numeric'] = pd.to_numeric(
    filtered_universities['2025 Rank'].str.replace('+', '', regex=False), errors='coerce'
)

min_rank = 1
max_rank = 200

universities_in_range = filtered_universities[
    (filtered_universities['2025 Rank Numeric'] >= min_rank) &
    (filtered_universities['2025 Rank Numeric'] <= max_rank)
]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_universities['2025 Rank Numeric'] = pd.to_numeric(


In [374]:
universities_in_range['OpenCage Request'] = universities_in_range['City'] + ", " + universities_in_range['Location Full']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  universities_in_range['OpenCage Request'] = universities_in_range['City'] + ", " + universities_in_range['Location Full']


In [375]:
def get_coordinates(location):
    url = f'https://api.opencagedata.com/geocode/v1/json?q={location}&key={api_key}'
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        if data['results']:
            return data['results'][0]['geometry']['lat'], data['results'][0]['geometry']['lng']
    return None, None

backup_file = 'data/university/universities_coordinates_backup.csv'

if os.path.exists(backup_file):

    universities_in_range = pd.read_csv(backup_file)
    print("Loaded data from backup file.")
else:
    
    universities_in_range['Latitude'], universities_in_range['Longitude'] = zip(
        *universities_in_range['OpenCage Request'].apply(lambda loc: get_coordinates(loc) if pd.notna(loc) else (None, None))
    )
    
    universities_in_range.to_csv(backup_file, index=False)
    print("Coordinates fetched and saved to backup file.")

Loaded data from backup file.


In [376]:
companies_coords_rad = np.radians(companies_seed[['Latitude', 'Longitude']].values)
universities_coords_rad = np.radians(universities_in_range[['Latitude', 'Longitude']].dropna().values)

radius = 20 / 6371.0

university_tree = BallTree(universities_coords_rad, metric='haversine')

counts = university_tree.query_radius(companies_coords_rad, r=radius, count_only=True)

companies_seed['Top University Count'] = counts
companies_seed['Top University Ecosystem Binary'] = (counts > 0).astype(bool)

Successful Environment

In [377]:
EARTH_RADIUS_KM = 6371.0
radius_km       = 10.0
radius_rad      = radius_km / EARTH_RADIUS_KM

coords_rad = np.deg2rad(companies_seed[['Latitude', 'Longitude']].values)
tree       = BallTree(coords_rad, metric='haversine')

all_neighbors = tree.query_radius(coords_rad, r=radius_rad, return_distance=False)

counts         = []
success_counts = []

for idx, neigh_idx in enumerate(all_neighbors):
    year_cutoff = companies_seed.at[idx, 'Founded Year']
    
    valid = [
        j for j in neigh_idx
        if j != idx and companies_seed.at[j, 'Founded Year'] <= year_cutoff
    ]
    counts.append(len(valid))
    
    succ = companies_seed.loc[valid, 'Success'].sum()
    success_counts.append(succ)


companies_seed['Founded Startups in Environment']        = counts
companies_seed['Successful Startups in Environment']     = success_counts


companies_seed['Success Share 10km'] = (
    companies_seed['Successful Startups in Environment']
      .div(companies_seed['Founded Startups in Environment'])
      .replace([np.inf, -np.inf], 0)
      .fillna(0)
)

In [378]:
output_path = 'data/sets-for-r/investors_seed.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True) 
investors_seed.to_csv(output_path, index=False)

output_path = 'data/sets-for-r/companies_seed.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)  
companies_seed.to_csv(output_path, index=False)

output_path = 'data/sets-for-r/seed_help.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)  
seed_help.to_csv(output_path, index=False)

output_path = 'data/sets-for-r/rounds.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
rounds_filtered.to_csv(output_path, index=False)