# Feature Engineering
Explanation:

## Import Packages

In [42]:
from importnb import Notebook
import pandas as pd
import numpy as np
from rapidfuzz import fuzz, process
from transformers import pipeline
from fuzzywuzzy import fuzz
from dateutil import parser
from datetime import datetime
import re

## Dataframe import from 'DataPreprocessing'

In [43]:
with Notebook():
    from DataPreprocessing_Pipeline import companies, funding, investors, founders

## New Features: Companies <> Investment Rounds
Explanation:

### Feature 1: Months Until First Round
Number of months between the funding date and the first investment round. Explanation: The table 'all_rounds' has every investment round made during the existence of the companies. We already mapped the respectively funding round to the startup. 

In [44]:
def calculate_months_until_first_round(company_id, founded_date):
    rounds = funding[funding['Company ID'] == company_id]

    if rounds.empty:
        return np.nan

    first_round_date = rounds['Announced Date'].min()

    delta_months = (first_round_date.year - founded_date.year) * 12 + (first_round_date.month - founded_date.month)
    return max(delta_months, 0)

companies['Months until First Round'] = companies.apply(
    lambda row: calculate_months_until_first_round(row['ID'], row['Founded Date']),
    axis=1
)

companies['Months until First Round'] = companies['Months until First Round'].fillna(-1)

### Feature 2: Grant Y/N
If the company received a Grant, they get a Yes (1). If not, a No (0). For VCs it is desirable to have a company with a Grant because it leverages the invested money as a Grant do not dilute existing shareholders.

In [45]:
def check_grant_for_company(company_id):
    company_rounds = funding[funding['Company ID'] == company_id]
    has_grant = any(company_rounds['Funding Type'] == 'Grant')
    return 1 if has_grant else 0

companies['Grant Y/N'] = companies['ID'].apply(check_grant_for_company)

### Feature 3: Last Round Type
Explanation:

In [46]:
funding['Announced Date'] = pd.to_datetime(funding['Announced Date'])

last_round = (
    funding.sort_values(by=['Company ID', 'Announced Date'])
    .groupby('Company ID')
    .last()
    .reset_index()
)

last_round = last_round[['Company ID', 'Funding Type']]

companies = companies.merge(
    last_round,
    left_on='ID',
    right_on='Company ID',
    how='left'
)

companies = companies.rename(columns={'Funding Type': 'Last Round Type'})

companies.drop(columns=['Company ID'], inplace=True)

companies['Last Round Type'] = companies['Last Round Type'].fillna('No Funding')

companies.drop(columns=['Last Funding Type'], inplace=True)

### Feature 4: Acquisition Status 'Was Acquired', 'Made Acquisitions'
Explanation:

In [47]:
if 'Acquisition Status' in companies.columns:
    unique_values = companies['Acquisition Status'].dropna().unique()
    print(unique_values)

['Was Acquired' 'Made Acquisitions' 'Made Acquisitions, Was Acquired']


In [48]:
companies['Was Acquired'] = 0
companies['Made Acquisitions'] = 0

companies.loc[companies['Acquisition Status'] == 'Was Acquired', 'Was Acquired'] = 1
companies.loc[companies['Acquisition Status'] == 'Made Acquisitions', 'Made Acquisitions'] = 1
companies.loc[companies['Acquisition Status'] == 'Made Acquisitions, Was Acquired', ['Made Acquisitions', 'Was Acquired']] = 1

companies.drop(columns=['Acquisition Status'], inplace=True)

### Feature 5: Funding phases
Explanation:

In [49]:
if 'Last Round Type' in companies.columns:
    distinct_funding_types = companies['Last Round Type'].dropna().unique()
    print(distinct_funding_types)
else:
    print("The column 'Funding Type' does not exist in the DataFrame.")

['Seed' 'No Funding' 'Venture - Series Unknown' 'Series A' 'Series B'
 'Convertible Note' 'Angel' 'Post-IPO Debt' 'Private Equity' 'Pre-Seed'
 'Grant' 'Series C' 'Series F' 'Series D' 'Post-IPO Secondary' 'Series E'
 'Post-IPO Equity']


In [50]:
funding_columns = ['Project Funding', 'Startup Funding', 'Growth Funding', 'Expansion Funding', 'Exit Funding']
for col in funding_columns:
    companies[col] = 0

funding_mapping = {
    'Project Funding': ['Angel', 'Pre-Seed', 'Convertible Note', 'Grant', 'Venture - Series Unknown'],
    'Startup Funding': ['Seed', 'Venture - Series Unknown'],
    'Growth Funding': ['Series A', 'Series B', 'Series C'],
    'Expansion Funding': ['Series D', 'Series E', 'Series F'],
    'Exit Funding': ['Private Equity', 'Post-IPO Secondary', 'Post-IPO Equity']
}

for funding_type, types in funding_mapping.items():
    companies.loc[companies['Last Round Type'].isin(types), funding_type] = 1

companies.loc[companies['Exit Funding'] == 1, funding_columns[:-1]] = 1
companies.loc[companies['Expansion Funding'] == 1, ['Growth Funding', 'Startup Funding', 'Project Funding']] = 1
companies.loc[companies['Growth Funding'] == 1, ['Startup Funding', 'Project Funding']] = 1
companies.loc[companies['Startup Funding'] == 1, 'Project Funding'] = 1

### Feature 6: HighFunding (Series B or higher)

In [51]:
high_funding_rounds = ['Series B', 'Series C', 'Series D', 'Series E', 'Series F', 'Private Equity', 'Post-IPO Equity']
companies['HighFunding'] = companies['Last Round Type'].isin(high_funding_rounds).astype(int)

In [52]:
companies

Unnamed: 0,ID,Organization Name,Industries,Headquarters Location,Description,CB Rank (Company),Postal Code,Founded Date,Exit Date,Website,...,Grant Y/N,Last Round Type,Was Acquired,Made Acquisitions,Project Funding,Startup Funding,Growth Funding,Expansion Funding,Exit Funding,HighFunding
0,000001,2trde,"[Automotive, Software]","Munich, Bayern, Germany",2trde develops a software solution designed fo...,57123,,2017-01-01,NaT,https://www.2trde.com,...,0,Seed,0,0,1,1,0,0,0,0
1,000002,36ZERO Vision,"[Artificial Intelligence (AI), Computer Vision...","Munich, Bayern, Germany",36ZEROVision is an AI-powered visual inspectio...,51326,81671,2019-01-01,NaT,https://36zerovision.com/,...,0,Seed,0,0,1,1,0,0,0,0
2,000003,3Bears Foods,[Food and Beverage],"Munich, Bayern, Germany",3Bears Foods enable a balanced and delicious b...,275817,,2015-01-01,NaT,https://3bears.de/,...,0,Seed,0,0,1,1,0,0,0,0
3,000004,3dTrust,"[3D Printing, Manufacturing, Software]","Munich, Bayern, Germany",3dTrust helps companies integrate 3D printing ...,134694,80797,2015-01-01,NaT,http://3dtrust.de,...,0,No Funding,0,0,0,0,0,0,0,0
4,000005,abaut,"[Analytics, Artificial Intelligence (AI), Cons...","Munich, Bayern, Germany",abaut builds a SaaS that enables businesses al...,219525,80992,2017-07-21,NaT,https://abaut.de,...,0,Seed,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1513,001506,SAYM,"[Apps, B2B, B2C, Human Resources, Mobile Apps,...","Aachen, Nordrhein-Westfalen, Germany",The SAYM platform for swarm mobility defines t...,166910,52070.0,2019-01-01,NaT,https://www.saym.io/,...,1,Grant,0,0,1,0,0,0,0,0
1514,001507,SONAH GmbH,"[Apps, Artificial Intelligence (AI), Computer ...","Aachen, Nordrhein-Westfalen, Germany",SONAH developed a flexible embedded vision sen...,121752,52070.0,2016-01-01,NaT,http://www.sonah.tech,...,0,Seed,0,0,1,1,0,0,0,0
1515,001508,Taxy.io,"[FinTech, Legal Tech, Machine Learning, Software]","Aachen, Nordrhein-Westfalen, Germany",Taxy.io builds the leading platform for B2B ta...,220816,52070.0,2019-01-01,NaT,https://www.taxy.io/,...,0,Seed,0,0,1,1,0,0,0,0
1516,001509,TRINKKOST GmbH,"[Agriculture, Consumer Goods, Fitness, Food an...","Aachen, Nordrhein-Westfalen, Germany",TRINKKOST is a food supplement manufacturing c...,907817,,2016-01-01,NaT,http://www.trinkkost.de,...,0,Venture - Series Unknown,0,0,1,1,0,0,0,0


In [53]:
funding

Unnamed: 0,Funding ID,Organization Name,Funding Type,Money Raised,Announced Date,Lead Investors,Number of Investors,Investor Names,Company ID
0,000001,4stop,Series A,2500000.0,2019-05-15,Ventech,1.0,[Ventech],000000
1,000002,aiconix GmbH,Convertible Note,,2019-06-17,,,[],001193
2,000003,aiconix GmbH,Pre-Seed,300000.0,2019-08-28,,,[],001193
3,000004,AutLay,Seed,,2019-05-22,Crew Ventures,1.0,[Crew Ventures],001196
4,000005,Buynomics,Pre-Seed,,2019-09-01,DvH Ventures,2.0,"[DvH Ventures, Tomahawk.VC]",001205
...,...,...,...,...,...,...,...,...,...
6863,006864,VAMOS.ai,Seed,,2020-01-01,DDG AG,1.0,[DDG AG],000000
6864,006865,WeProfit,Pre-Seed,272000.0,2021-05-28,,5.0,"[Ara Abrahamyan, Armen Kocharyan, Clemens Boll...",000298
6865,006866,WindStar Medical GmbH,Seed,,2020-12-07,Project A Ventures,1.0,[Project A Ventures],000000
6866,006867,WorkMentality Foundation,Seed,,2023-03-01,,1.0,[Deutsche Bank],000000


### Feature 7: Average Time To Next Round
Explanation:

In [54]:
funding['Announced Date'] = pd.to_datetime(funding['Announced Date'])
companies['Founded Date'] = pd.to_datetime(companies['Founded Date'])

def calculate_avg_time_to_next_round(company_id, company_founded_date, funding_df):
    company_funding = funding_df[funding_df['Company ID'] == company_id]

    if len(company_funding) == 1:
        time_diff = (company_funding['Announced Date'].iloc[0] - company_founded_date).days
        avg_time_to_next_round = time_diff / 30
    elif len(company_funding) >= 2:
        time_diffs = []
        first_round_diff = (company_funding['Announced Date'].iloc[0] - company_founded_date).days
        time_diffs.append(first_round_diff)
        company_funding = company_funding.sort_values('Announced Date')
        for i in range(1, len(company_funding)):
            time_diff = (company_funding['Announced Date'].iloc[i] - company_funding['Announced Date'].iloc[i-1]).days
            time_diffs.append(time_diff)
        avg_time_to_next_round = sum(time_diffs) / len(time_diffs) / 30
    else:
        avg_time_to_next_round = None

    if pd.isna(avg_time_to_next_round) or avg_time_to_next_round < -1:
        avg_time_to_next_round = -1

    return avg_time_to_next_round

companies['Average Time To Next Round'] = companies.apply(
    lambda row: calculate_avg_time_to_next_round(row['ID'], row['Founded Date'], funding), axis=1
)

### Feature 8: Average Funding Size
Explanation:

In [55]:
funding_summary = funding.groupby('Company ID').agg(
    TotalMoneyRaised=('Money Raised', 'sum'),
    TotalRounds=('Money Raised', 'count')
).reset_index()

funding_summary['Average Funding Size'] = funding_summary['TotalMoneyRaised'] / funding_summary['TotalRounds']

companies = companies.merge(
    funding_summary[['Company ID', 'Average Funding Size']],
    left_on='ID',
    right_on='Company ID',
    how='left'
)

companies['Average Funding Size'] = companies['Average Funding Size'].fillna(0)

companies.drop(columns=['Company ID'], inplace=True)

## New Features: Companies <> Investors
Explanation:

### Feature 1-4: Average Number of Investments by Investors
Explanation:

In [56]:
funding_exploded = funding.explode('Investor Names')
funding_exploded['Investor Names'] = funding_exploded['Investor Names'].str.strip()
investors['Organization/Person Name'] = investors['Organization/Person Name'].str.strip()

merged = funding_exploded.merge(
    investors,
    left_on='Investor Names',
    right_on='Organization/Person Name',
    how='left'
)

columns_to_process = ['Number of Investments', 'Number of Exits', 'Number of Lead Investments', 'Number of Portfolio Organizations']
for col in columns_to_process:
    if col in merged.columns:
        merged[col] = pd.to_numeric(merged[col], errors='coerce')

not_found_investors = merged[merged['Number of Investments'].isnull()]['Investor Names'].unique()
if len(not_found_investors) > 0:
    print(f"{len(not_found_investors)} investors could not be found.")
else:
    print("All investors were successfully found.")

not_found_investors_df = pd.DataFrame(not_found_investors, columns=['Investor Names'])

if 'Company ID' in companies.columns:
    companies = companies.drop(columns=['Company ID'])

for col in columns_to_process:
    col_average = merged.groupby('Company ID')[col].mean().reset_index()
    companies = companies.merge(
        col_average,
        left_on='ID',
        right_on='Company ID',
        how='left',
        suffixes=('', '_drop')
    )
    companies[col] = companies[col].fillna(0)
    companies = companies.rename(columns={col: f'Average {col} by Investors'})

companies = companies.drop(columns=[col for col in companies.columns if col.endswith('_drop')], errors='ignore')


3608 investors could not be found.


### Feature 5: Origin Country of Investors
Explanation:

In [57]:
funding_exploded = funding.explode('Investor Names')
funding_exploded['Investor Names'] = funding_exploded['Investor Names'].str.strip()
investors['Organization/Person Name'] = investors['Organization/Person Name'].str.strip()

merged = funding_exploded.merge(
    investors[['Organization/Person Name', 'Country']],
    left_on='Investor Names',
    right_on='Organization/Person Name',
    how='left'
)

distinct_countries = investors['Country'].dropna().unique()[:5]

country_encoded = pd.get_dummies(merged['Country'], prefix='Investor Country', dtype=int)

country_aggregated = country_encoded.groupby(merged['Company ID']).max()

companies = companies.merge(
    country_aggregated,
    left_on='ID',
    right_index=True,
    how='left'
)

for country in distinct_countries:
    column_name = f'Investor Country_{country}'
    if column_name in companies.columns:
        companies[column_name] = companies[column_name].fillna(0)

### Feature 6: Top Investor Participation
Identifies whether a company has investors with a high number of exits.

In [58]:
top_investors = investors[investors['Number of Exits'] > investors['Number of Exits'].mean()]['Organization/Person Name']
def has_top_investor(company_id, funding_df, top_investors):
    company_funding = funding_df[funding_df['Company ID'] == company_id]
    if company_funding.empty:
        return 0
    investors_list = company_funding['Investor Names'].explode()
    return 1 if any(investor in top_investors.values for investor in investors_list) else 0

companies['Top Investor Participation'] = companies['ID'].apply(
    lambda x: has_top_investor(x, funding, top_investors)
)

## New Features: Companies

### Feature 1: 'Months between Founding and Acquisition'

In [59]:
companies['Months between Founding and Acquisition'] = companies.apply(
    lambda row: (row['Announced Date Acquisition'] - row['Founded Date']).days // 30
    if pd.notna(row['Announced Date Acquisition']) else -1,
    axis=1
)

### Feature 2: Category One Hot Encoding

In [60]:
exploded = companies.explode('Industry Groups')
exploded['Industry Groups'] = exploded['Industry Groups'].str.strip().str.lower()
exploded = exploded[exploded['Industry Groups'].notna() & (exploded['Industry Groups'] != '')]

one_hot_encoded = pd.get_dummies(exploded['Industry Groups'], prefix='Industry', dtype=int)
one_hot_aggregated = one_hot_encoded.groupby(exploded.index).max()

companies = pd.concat([companies, one_hot_aggregated], axis=1)

## New Features: LinkedIn-Founder (Aggregation to Company Level)

### Feature 1: Average LinkedIn Followers and Connections

In [61]:
def add_founder_metrics_to_companies(founders, companies):
    founder_metrics = founders.groupby('Company ID').agg(
        average_linkedin_followers_founders=('followers', 'mean'),
        average_linkedin_connections_founders=('connections', 'mean'),
        min_linkedin_followers_founders=('followers', 'min'),
        max_linkedin_followers_founders=('followers', 'max'),
        min_linkedin_connections_founders=('connections', 'min'),
        max_linkedin_connections_founders=('connections', 'max')
    ).reset_index()

    companies = companies.merge(founder_metrics, left_on='ID', right_on='Company ID', how='left')

    for col in ['Company ID', 'Company ID_y']:
        if col in companies.columns:
            companies.drop(columns=[col], inplace=True)

    return companies

companies = add_founder_metrics_to_companies(founders, companies)

## Feature 2: Highest Education

In [62]:
def categorize_highest_education(founders):
    education_mapping = {
        "Doktor/PhD": [
            "phd", "doctor", "doctoral", "dr.", "d.phil", "doctorate", "dsc",
            "dr.phil", "doctor of science", "dr.eng", "ph.d", "ed.d", "sc.d",
            "eng.d", "dr.med", "med.d", "doctor of medicine", "doctor of philosophy",
            "d.ed", "jd", "juris doctor", "doctor juris", "law doctorate", "d.v.m",
            "doctor of veterinary", "md", "m.d.", "doctor of law", "doctor of arts",
            "doctor in", "doctoral studies", "d.lit", "d.m.a", "d.clin.psych",
            "doctor of clinical psychology", "d.jur", "d.theol", "d.b.a",
            "doctor of business administration", "d.eng.sc", "d.arch", "d.d.s",
            "doctor of dental surgery", "d.v.sc", "d.med.sc", "d.p.h",
            "d.sc.tech", "doctor of public health", "d.health.sci", "d.n.p",
            "doctor of nursing practice", "doctor of social work", "doctor of theology",
            "d.comm", "doctor of communication", "d.env.sc", "doctor of environmental science"
        ],
        "Master": [
            "master", "m.sc", "msc", "mba", "m.tech", "ma", "m.eng", "m.ed",
            "ms", "m.phil", "mfa", "m.econ", "mfin", "master of science",
            "master of arts", "master of business", "master's degree",
            "m.des", "m.com", "m.div", "m.theol", "mres", "m.arch", "m.acc",
            "master of engineering", "master of finance", "master of commerce",
            "master of education", "master of philosophy", "master of public health",
            "mph", "m.pp", "m.ir", "mib", "m.int.business", "m.litt", "mchem",
            "diplom", "diploma", "diplom-ing", "diploma in engineering",
            "dipl.-ing", "dipl.-wirtschaftsingenieur", "dipl.-kfm", "dipl.-phys",
            "dipl.-math", "diplomkaufmann", "diplomingenieur", "diplomat",
            "m.comm", "m.a.ed", "m.sc.ed", "m.p.a", "m.h.a", "m.i.s", "m.c.s",
            "master of computer science", "master of information systems",
            "m.sc.tech", "m.plan", "master of planning", "mcm", "master of communication",
            "mhl", "master of human resources", "m.intl.rel", "master of international relations",
            "m.i.t", "m.arch.sc", "master of architecture", "m.journ", "m.fish.sci",
            "master of fisheries science", "m.r.s", "master of rural studies",
            "m.theo", "m.e.e", "master of electrical engineering", "m.med.sc",
            "m.env.sc", "master of environmental science", "m.agri", "master of agriculture",
            "m.p.h.a", "master of public health administration", "m.sc.math"
        ],
        "Bachelor": [
            "bachelor", "b.sc", "bsc", "b.tech", "ba", "b.eng", "b.ed",
            "bs", "b.arch", "bcom", "bba", "bfa", "bpharm", "b.econ",
            "bachelor's degree", "undergraduate", "bcom", "bdes", "bca",
            "bacc", "bachelor of science", "bachelor of arts", "bachelor of technology",
            "bachelor of engineering", "bachelor of education", "bachelor of commerce",
            "bachelor of fine arts", "bachelor of pharmacy", "bachelor of law",
            "bachelor of economics", "llb", "bachelor of computer applications",
            "b.litt", "b.a.e", "b.sc.eng", "b.plan", "b.comm", "b.h.sc",
            "b.i.t", "b.math", "b.stat", "b.mus", "b.of.design", "bcs",
            "bachelor of computing science", "b.eng.tech", "b.a.sc", "b.app.sci",
            "b.e", "b.journ", "b.a.hons", "b.sc.hons", "b.nurs", "b.sc.n",
            "b.soc.sc", "b.soc.work", "bachelor of social work", "b.v.sc",
            "bachelor of veterinary science", "b.med.sc", "b.biochem",
            "bachelor of biochemistry", "b.a.s", "b.env.sc", "bachelor of environmental science",
            "b.med", "bachelor of medicine", "b.optom", "bachelor of optometry",
            "b.psych", "bachelor of psychology", "b.public.health", "b.p.t",
            "bachelor of physical therapy"
        ]
    }

    education_priority = {"Doktor/PhD": 3, "Master": 2, "Bachelor": 1}

    def get_highest_education(degree):
        degree_lower = str(degree).lower()
        for level, keywords in education_mapping.items():
            if any(keyword in degree_lower for keyword in keywords):
                return level
        return None

    founders['degree_1_level'] = founders['degree_1'].apply(get_highest_education)
    founders['degree_2_level'] = founders['degree_2'].apply(get_highest_education)

    founders['degree_1_priority'] = founders['degree_1_level'].map(education_priority).fillna(0)
    founders['degree_2_priority'] = founders['degree_2_level'].map(education_priority).fillna(0)

    founders['highest_education'] = founders.apply(
        lambda row: row['degree_1_level'] if row['degree_1_priority'] >= row['degree_2_priority'] else row['degree_2_level'],
        axis=1
    )

    return founders


def add_highest_education_one_hot_to_companies(founders, companies):
    founders = categorize_highest_education(founders)
    one_hot_encoded = pd.get_dummies(founders['highest_education'], prefix='highest_education')
    founders = pd.concat([founders, one_hot_encoded], axis=1)
    aggregated_one_hot = founders.groupby('Company ID')[one_hot_encoded.columns].sum().reset_index()
    companies = companies.merge(aggregated_one_hot, left_on='ID', right_on='Company ID', how='left')
    companies.fillna(0, inplace=True)
    companies.drop(columns=['Company ID'], inplace=True, errors='ignore')
    return companies

companies = add_highest_education_one_hot_to_companies(founders, companies)

  companies.fillna(0, inplace=True)


In [63]:
def calculate_recognition_rate(founders):
    total_with_degrees = founders[
        founders['degree_1'].notna() | founders['degree_2'].notna()
        ]

    recognized_degrees = total_with_degrees['highest_education'].notna().sum()

    total_with_degrees_count = len(total_with_degrees)
    recognition_rate = recognized_degrees / total_with_degrees_count if total_with_degrees_count > 0 else 0

    return recognition_rate, total_with_degrees_count, recognized_degrees

founders = categorize_highest_education(founders)

recognition_rate, total_with_degrees, recognized_count = calculate_recognition_rate(founders)

print(f"Recognition Rate: {recognition_rate:.2%}")
print(f"Total Entries with Degrees: {total_with_degrees}")
print(f"Recognized Degrees: {recognized_count}")

Recognition Rate: 91.04%
Total Entries with Degrees: 1663
Recognized Degrees: 1514


In [64]:
companies

Unnamed: 0,ID,Organization Name,Industries,Headquarters Location,Description,CB Rank (Company),Postal Code,Founded Date,Exit Date,Website,...,Industry_video,average_linkedin_followers_founders,average_linkedin_connections_founders,min_linkedin_followers_founders,max_linkedin_followers_founders,min_linkedin_connections_founders,max_linkedin_connections_founders,highest_education_Bachelor,highest_education_Doktor/PhD,highest_education_Master
0,000001,2trde,"[Automotive, Software]","Munich, Bayern, Germany",2trde develops a software solution designed fo...,57123,0,2017-01-01,0,https://www.2trde.com,...,0.0,4815.000000,3914.000000,4815.0,4815.0,3914.0,3914.0,0.0,0.0,1.0
1,000002,36ZERO Vision,"[Artificial Intelligence (AI), Computer Vision...","Munich, Bayern, Germany",36ZEROVision is an AI-powered visual inspectio...,51326,81671,2019-01-01,0,https://36zerovision.com/,...,0.0,6482.000000,6479.000000,6482.0,6482.0,6479.0,6479.0,0.0,0.0,1.0
2,000003,3Bears Foods,[Food and Beverage],"Munich, Bayern, Germany",3Bears Foods enable a balanced and delicious b...,275817,0,2015-01-01,0,https://3bears.de/,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,000004,3dTrust,"[3D Printing, Manufacturing, Software]","Munich, Bayern, Germany",3dTrust helps companies integrate 3D printing ...,134694,80797,2015-01-01,0,http://3dtrust.de,...,0.0,5664.666667,5337.666667,2089.0,9782.0,1893.0,9238.0,0.0,0.0,2.0
4,000005,abaut,"[Analytics, Artificial Intelligence (AI), Cons...","Munich, Bayern, Germany",abaut builds a SaaS that enables businesses al...,219525,80992,2017-07-21,0,https://abaut.de,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1513,001506,SAYM,"[Apps, B2B, B2C, Human Resources, Mobile Apps,...","Aachen, Nordrhein-Westfalen, Germany",The SAYM platform for swarm mobility defines t...,166910,52070.0,2019-01-01,0,https://www.saym.io/,...,0.0,202.000000,204.000000,202.0,202.0,204.0,204.0,0.0,0.0,1.0
1514,001507,SONAH GmbH,"[Apps, Artificial Intelligence (AI), Computer ...","Aachen, Nordrhein-Westfalen, Germany",SONAH developed a flexible embedded vision sen...,121752,52070.0,2016-01-01,0,http://www.sonah.tech,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1515,001508,Taxy.io,"[FinTech, Legal Tech, Machine Learning, Software]","Aachen, Nordrhein-Westfalen, Germany",Taxy.io builds the leading platform for B2B ta...,220816,52070.0,2019-01-01,0,https://www.taxy.io/,...,0.0,2646.666667,2589.000000,1176.0,5489.0,1116.0,5477.0,0.0,0.0,2.0
1516,001509,TRINKKOST GmbH,"[Agriculture, Consumer Goods, Fitness, Food an...","Aachen, Nordrhein-Westfalen, Germany",TRINKKOST is a food supplement manufacturing c...,907817,0,2016-01-01,0,http://www.trinkkost.de,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
founders

Unnamed: 0,Founder ID,Company ID,Founder Name,Organization Name,first_name,last_name,linkedin_url,logo_url,facebook_url,twitter_url,...,degree_1_priority,degree_2_priority,highest_education,is_international,degree_1_top_match,degree_1_top_score,degree_2_top_match,degree_2_top_score,founder_top_uni,founder_studies_abroad
0,000001,000001,Johannes Stoffel,2trde,Johannes,Stoffel,https://www.linkedin.com/in/johannes-stoffel-2...,https://images.crunchbase.com/image/upload/t_c...,,,...,2.0,2.0,Master,0,,0.0,,0.0,0,0
2,000002,000002,Florian Ziesche,36ZERO Vision,Florian,Ziesche,,,,,...,0.0,0.0,,1,,0.0,,0.0,0,0
3,000003,000002,Zeeshan Karamat,36ZERO Vision,Zeeshan,Karamat,https://www.linkedin.com/in/zkaramat/,https://images.crunchbase.com/image/upload/t_c...,,,...,2.0,2.0,Master,1,,0.0,Georgia Institute of Technology,100.0,1,1
4,000004,000003,Caroline Steingruber,3Bears Foods,Caroline,Steingruber,,,,,...,0.0,0.0,,1,,0.0,,0.0,0,0
5,000005,000003,Tim Nichols,3Bears Foods,Tim,Nichols,,,,,...,0.0,0.0,,1,,0.0,,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3013,003137,001508,Sven Peper,Taxy.io,Sven,Peper,https://www.linkedin.com/in/sven-peper,,,,...,0.0,0.0,,1,,0.0,,0.0,0,0
3014,003138,001508,Sven Weber,Taxy.io,Sven,Weber,http://www.linkedin.com/in/svenweber,https://images.crunchbase.com/image/upload/t_c...,,,...,2.0,0.0,Master,1,,0.0,,0.0,0,0
3015,003139,001509,Dr. Gennadi Schechtmann,TRINKKOST GmbH,Dr.,Gennadi Schechtmann,,,,,...,0.0,0.0,,1,,0.0,,0.0,0,0
3016,003140,001509,Timon Ortloff,TRINKKOST GmbH,Timon,Ortloff,,,,,...,0.0,0.0,,1,,0.0,,0.0,0,0


### Feature 3: International Team

In [66]:
def add_international_team_to_companies(founders, companies):
    founders['is_international'] = founders['country_code'].apply(
        lambda x: 0 if x == 'DEU' or x is None else 1
    )

    international_team = founders.groupby('Company ID')['is_international'].max().reset_index()
    international_team.rename(columns={'is_international': 'international_team'}, inplace=True)

    companies = companies.merge(international_team, left_on='ID', right_on='Company ID', how='left')

    companies['international_team'].fillna(0, inplace=True)
    companies['international_team'] = companies['international_team'].astype(int)

    if 'Company ID' in companies.columns:
        companies.drop(columns=['Company ID'], inplace=True)

    return companies

companies = add_international_team_to_companies(founders, companies)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  companies['international_team'].fillna(0, inplace=True)


### Feature 4: Top University

In [67]:
from rapidfuzz import process, fuzz

ranking = pd.read_csv('/Users/janlinzner/Library/Mobile Documents/com~apple~CloudDocs/Documents/McGill/1_Lectures/Decision Analytics (MGSC 662)/2_Coding/Exercises/ADA_Coding/Datasets/Universities/UniversityRanking.csv')
ranking['rank display'] = pd.to_numeric(ranking['rank display'], errors='coerce')
top100 = ranking[ranking['rank display'] <= 100]
top_universities = top100['institution'].dropna().unique().tolist()

def get_top_university_match(uni, top_universities_list, threshold=90):
    if pd.isna(uni) or not uni.strip():
        return None, 0
    match = process.extractOne(uni, top_universities_list, scorer=fuzz.token_set_ratio)
    if match and match[1] >= threshold:
        return match[0], match[1]
    return None, 0

founders['degree_1_top_match'] = None
founders['degree_1_top_score'] = 0
founders['degree_2_top_match'] = None
founders['degree_2_top_score'] = 0

for idx, row in founders.iterrows():
    uni1 = row.get('degree_1_university', '')
    uni2 = row.get('degree_2_university', '')
    match1, score1 = get_top_university_match(uni1, top_universities)
    match2, score2 = get_top_university_match(uni2, top_universities)
    founders.at[idx, 'degree_1_top_match'] = match1
    founders.at[idx, 'degree_1_top_score'] = score1
    founders.at[idx, 'degree_2_top_match'] = match2
    founders.at[idx, 'degree_2_top_score'] = score2

print(founders[['degree_1_university', 'degree_1_top_match', 'degree_1_top_score',
                'degree_2_university', 'degree_2_top_match', 'degree_2_top_score']].head())

def check_top_university(universities, top_universities_list, threshold=90):
    for uni in universities:
        if pd.isna(uni) or not uni.strip():
            continue
        match = process.extractOne(uni, top_universities_list, scorer=fuzz.token_set_ratio)
        if match and match[1] >= threshold:
            return True
    return False

founders['founder_top_uni'] = founders.apply(
    lambda row: check_top_university(
        [row.get('degree_1_university', ''), row.get('degree_2_university', '')],
        top_universities
    ),
    axis=1
).astype(int)

company_top_uni = founders.groupby('Company ID')['founder_top_uni'].mean().reset_index()

companies = companies.merge(company_top_uni, left_on='ID', right_on='Company ID', how='left')
companies['top_university'] = companies['founder_top_uni'].fillna(0)
companies.drop(['Company ID', 'founder_top_uni'], axis=1, inplace=True)

print(companies.head())

  founders.at[idx, 'degree_1_top_score'] = score1
  founders.at[idx, 'degree_2_top_score'] = score2


              degree_1_university degree_1_top_match  degree_1_top_score  \
0            ESCP Business School               None                 0.0   
2                             NaN               None                 0.0   
3  Technische Universität München               None                 0.0   
4                             NaN               None                 0.0   
5                             NaN               None                 0.0   

                        degree_2_university               degree_2_top_match  \
0  EBS Universität für Wirtschaft und Recht                             None   
2                                       NaN                             None   
3           Georgia Institute of Technology  Georgia Institute of Technology   
4                                       NaN                             None   
5                                       NaN                             None   

   degree_2_top_score  
0                 0.0  
2             

### Feature 5: Studies Abroad

In [68]:
foreign_universities = ranking[~ranking['location code'].isin(['DE', 'CH', 'AT'])]
foreign_universities = foreign_universities['institution'].dropna().unique().tolist()

def check_foreign_university(universities, foreign_universities_list, threshold=90):
    for uni in universities:
        if pd.isna(uni) or not uni.strip():
            continue
        match = process.extractOne(uni, foreign_universities_list, scorer=fuzz.token_set_ratio)
        if match and match[1] >= threshold:
            return True
    return False

founders['founder_studies_abroad'] = founders.apply(
    lambda row: check_foreign_university(
        [row.get('degree_1_university', ''), row.get('degree_2_university', '')],
        foreign_universities
    ),
    axis=1
).astype(int)

company_abroad = founders.groupby('Company ID')['founder_studies_abroad'].mean().reset_index()

companies = companies.merge(company_abroad, left_on='ID', right_on='Company ID', how='left')
companies['studies_abroad_founder'] = companies['founder_studies_abroad'].fillna(0)
companies.drop(['Company ID', 'founder_studies_abroad'], axis=1, inplace=True)

print(companies[['ID', 'studies_abroad_founder']].head())

       ID  studies_abroad_founder
0  000001                     0.0
1  000002                     0.5
2  000003                     0.0
3  000004                     0.0
4  000005                     0.0


In [69]:
companies

Unnamed: 0,ID,Organization Name,Industries,Headquarters Location,Description,CB Rank (Company),Postal Code,Founded Date,Exit Date,Website,...,min_linkedin_followers_founders,max_linkedin_followers_founders,min_linkedin_connections_founders,max_linkedin_connections_founders,highest_education_Bachelor,highest_education_Doktor/PhD,highest_education_Master,international_team,top_university,studies_abroad_founder
0,000001,2trde,"[Automotive, Software]","Munich, Bayern, Germany",2trde develops a software solution designed fo...,57123,0,2017-01-01,0,https://www.2trde.com,...,4815.0,4815.0,3914.0,3914.0,0.0,0.0,1.0,0,0.000000,0.00
1,000002,36ZERO Vision,"[Artificial Intelligence (AI), Computer Vision...","Munich, Bayern, Germany",36ZEROVision is an AI-powered visual inspectio...,51326,81671,2019-01-01,0,https://36zerovision.com/,...,6482.0,6482.0,6479.0,6479.0,0.0,0.0,1.0,1,0.500000,0.50
2,000003,3Bears Foods,[Food and Beverage],"Munich, Bayern, Germany",3Bears Foods enable a balanced and delicious b...,275817,0,2015-01-01,0,https://3bears.de/,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.000000,0.00
3,000004,3dTrust,"[3D Printing, Manufacturing, Software]","Munich, Bayern, Germany",3dTrust helps companies integrate 3D printing ...,134694,80797,2015-01-01,0,http://3dtrust.de,...,2089.0,9782.0,1893.0,9238.0,0.0,0.0,2.0,1,0.333333,0.00
4,000005,abaut,"[Analytics, Artificial Intelligence (AI), Cons...","Munich, Bayern, Germany",abaut builds a SaaS that enables businesses al...,219525,80992,2017-07-21,0,https://abaut.de,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.000000,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1513,001506,SAYM,"[Apps, B2B, B2C, Human Resources, Mobile Apps,...","Aachen, Nordrhein-Westfalen, Germany",The SAYM platform for swarm mobility defines t...,166910,52070.0,2019-01-01,0,https://www.saym.io/,...,202.0,202.0,204.0,204.0,0.0,0.0,1.0,1,0.000000,0.00
1514,001507,SONAH GmbH,"[Apps, Artificial Intelligence (AI), Computer ...","Aachen, Nordrhein-Westfalen, Germany",SONAH developed a flexible embedded vision sen...,121752,52070.0,2016-01-01,0,http://www.sonah.tech,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.000000,0.00
1515,001508,Taxy.io,"[FinTech, Legal Tech, Machine Learning, Software]","Aachen, Nordrhein-Westfalen, Germany",Taxy.io builds the leading platform for B2B ta...,220816,52070.0,2019-01-01,0,https://www.taxy.io/,...,1176.0,5489.0,1116.0,5477.0,0.0,0.0,2.0,1,0.250000,0.25
1516,001509,TRINKKOST GmbH,"[Agriculture, Consumer Goods, Fitness, Food an...","Aachen, Nordrhein-Westfalen, Germany",TRINKKOST is a food supplement manufacturing c...,907817,0,2016-01-01,0,http://www.trinkkost.de,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.000000,0.00


In [70]:
founders

Unnamed: 0,Founder ID,Company ID,Founder Name,Organization Name,first_name,last_name,linkedin_url,logo_url,facebook_url,twitter_url,...,degree_1_priority,degree_2_priority,highest_education,is_international,degree_1_top_match,degree_1_top_score,degree_2_top_match,degree_2_top_score,founder_top_uni,founder_studies_abroad
0,000001,000001,Johannes Stoffel,2trde,Johannes,Stoffel,https://www.linkedin.com/in/johannes-stoffel-2...,https://images.crunchbase.com/image/upload/t_c...,,,...,2.0,2.0,Master,0,,0.0,,0.0,0,0
2,000002,000002,Florian Ziesche,36ZERO Vision,Florian,Ziesche,,,,,...,0.0,0.0,,1,,0.0,,0.0,0,0
3,000003,000002,Zeeshan Karamat,36ZERO Vision,Zeeshan,Karamat,https://www.linkedin.com/in/zkaramat/,https://images.crunchbase.com/image/upload/t_c...,,,...,2.0,2.0,Master,1,,0.0,Georgia Institute of Technology,100.0,1,1
4,000004,000003,Caroline Steingruber,3Bears Foods,Caroline,Steingruber,,,,,...,0.0,0.0,,1,,0.0,,0.0,0,0
5,000005,000003,Tim Nichols,3Bears Foods,Tim,Nichols,,,,,...,0.0,0.0,,1,,0.0,,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3013,003137,001508,Sven Peper,Taxy.io,Sven,Peper,https://www.linkedin.com/in/sven-peper,,,,...,0.0,0.0,,1,,0.0,,0.0,0,0
3014,003138,001508,Sven Weber,Taxy.io,Sven,Weber,http://www.linkedin.com/in/svenweber,https://images.crunchbase.com/image/upload/t_c...,,,...,2.0,0.0,Master,1,,0.0,,0.0,0,0
3015,003139,001509,Dr. Gennadi Schechtmann,TRINKKOST GmbH,Dr.,Gennadi Schechtmann,,,,,...,0.0,0.0,,1,,0.0,,0.0,0,0
3016,003140,001509,Timon Ortloff,TRINKKOST GmbH,Timon,Ortloff,,,,,...,0.0,0.0,,1,,0.0,,0.0,0,0


## New Features: Companies <> LinkedIn Founder Information

In [71]:
ln_filepath= '/Users/janlinzner/Library/Mobile Documents/com~apple~CloudDocs/Documents/McGill/1_Lectures/Decision Analytics (MGSC 662)/2_Coding/Exercises/ADA_Coding/Datasets/LinkedIn/Founders/founders_linkedin_details.csv'

ln_details = pd.read_csv(ln_filepath)

ln_details.head()

Unnamed: 0,url,username,followers,connections,headline,degree_1,degree_1_university,degree_1_duration,degree_2,degree_2_university,...,job_company_9,job_duration_9,job_location_9,job_description_9,job_title_10,job_company_10,job_duration_10,job_location_10,job_description_10,error
0,https://www.linkedin.com/in/johannes-stoffel-2...,johannes-stoffel-27389667,4823.0,3916.0,Founder bei 2trde \n**We are hiring**,"Executive Master, Digital Innovation and Entre...",ESCP Business School,ESCP Business School,"Bachelor of Science - BS, Gerneral Management",EBS Universität für Wirtschaft und Recht,...,,,,,,,,,,
1,https://www.linkedin.com/in/zkaramat/,zkaramat,6489.0,6485.0,CTO/CoFounder 36ZERO Vision,"Masters Computer Science, Artificial Intelligence",Technische Universität München,2018,"Masters Computer Science, Artificial Intelligence",Georgia Institute of Technology,...,,,,,,,,,,
2,https://www.linkedin.com/in/alexandre-gu%C3%A9...,alexandre-gu%C3%A9rin-79734986,9782.0,9238.0,Co-Founder at Holori,"Master's degree, International Business and Fi...","ESB Business School, Reutlingen University",2010–2011,,ESCE International Business School,...,,,,,,,,,,
3,https://www.linkedin.com/in/mituca,mituca,2090.0,1893.0,Human-like visual perception for physical-worl...,"Master's degree, Informatics",Technische Universität München,,"Engineer, Computer Science",Universitatea „Politehnica” din Timișoara,...,,,,,,,,,,
4,https://www.linkedin.com/in/antoine-jeol,antoine-jeol,5126.0,4885.0,Co-Founder at Holori,,Ludwig-Maximilians-Universität München,Ludwig-Maximilians-Universität München,,"ESB Business School, Reutlingen University",...,,,,,,,,,,


### Feature 1: First Time Founder
- firsttime_founder_ratio: The proportion of founders who are first-time founders.
- all_firsttime_founders: Indicator of whether all founders are first-time founders.
- any_firsttime_founder: Indicator of whether at least one founder is a first-time founder.

In [72]:
from transformers import pipeline

ln_details = ln_details.fillna("")
merged = ln_details.merge(founders, left_on="url", right_on="linkedin_url", how="left")

def filter_job_titles(row):
    organization = row["Organization Name"]
    job_titles = []
    match_found = False
    for i in range(1, 11):
        company_col = f"job_company_{i}"
        title_col = f"job_title_{i}"
        if company_col in row and title_col in row:
            company = row[company_col]
            if not match_found and fuzz.partial_ratio(organization.lower(), company.lower()) > 80:
                match_found = True
                continue
            if match_found and row[title_col]:
                job_titles.append(row[title_col])
    return " ".join(job_titles)

merged["filtered_job_titles"] = merged.apply(filter_job_titles, axis=1)

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def is_founder(row):
    job_titles = row["filtered_job_titles"]
    if not job_titles.strip():
        return 1
    result = classifier(
        job_titles,
        candidate_labels=["founder", "not a founder"],
        hypothesis_template="This person is a {}."
    )
    return 0 if result["labels"][0] == "founder" else 1

merged["is_firsttime_founder"] = merged.apply(is_founder, axis=1)

if "is_firsttime_founder" not in merged.columns:
    raise ValueError("is_firsttime_founder column was not created correctly in merged.")
print(merged[["linkedin_url", "is_firsttime_founder"]].head())

founders = founders.merge(
    merged[["linkedin_url", "is_firsttime_founder"]],
    left_on="linkedin_url",
    right_on="linkedin_url",
    how="left"
)

if "is_firsttime_founder" not in founders.columns:
    raise ValueError("is_firsttime_founder column was not added to founders.")
print(founders.head())

company_firsttime_stats = founders.groupby('Company ID')['is_firsttime_founder'].agg(
    firsttime_founder_ratio='mean',
    all_firsttime_founders=lambda x: int(x.mean() == 1),
    any_firsttime_founder=lambda x: int(x.mean() > 0)
).reset_index()

companies['ID'] = companies['ID'].astype(str)
company_firsttime_stats['Company ID'] = company_firsttime_stats['Company ID'].astype(str)

companies = companies.merge(company_firsttime_stats, left_on='ID', right_on='Company ID', how='left')

companies['firsttime_founder_ratio'] = companies['firsttime_founder_ratio'].fillna(0)
companies['all_firsttime_founders'] = companies['all_firsttime_founders'].fillna(0)
companies['any_firsttime_founder'] = companies['any_firsttime_founder'].fillna(0)

companies.drop(['Company ID'], axis=1, inplace=True)

print(companies)

Device set to use mps:0


                                        linkedin_url  is_firsttime_founder
0  https://www.linkedin.com/in/johannes-stoffel-2...                     0
1              https://www.linkedin.com/in/zkaramat/                     0
2  https://www.linkedin.com/in/alexandre-gu%C3%A9...                     0
3                 https://www.linkedin.com/in/mituca                     1
4           https://www.linkedin.com/in/antoine-jeol                     0
  Founder ID Company ID          Founder Name Organization Name first_name  \
0     000001     000001      Johannes Stoffel             2trde   Johannes   
1     000002     000002       Florian Ziesche     36ZERO Vision    Florian   
2     000003     000002       Zeeshan Karamat     36ZERO Vision    Zeeshan   
3     000004     000003  Caroline Steingruber      3Bears Foods   Caroline   
4     000005     000003           Tim Nichols      3Bears Foods        Tim   

     last_name                                       linkedin_url  \
0      Stoff

## Feature 2: Researcher

- researcher_ratio: The proportion of founders who were previously 'Researchers'.
- all_researchers: Indicator of whether all founders were previously “Researchers” (1 = Yes, 0 = No).
- any_researcher: Indicator of whether at least one founder was previously a “Researcher” (1 = Yes, 0 = No).

In [73]:
from fuzzywuzzy import fuzz
from transformers import pipeline
import pandas as pd

ln_details = ln_details.fillna("")
merged = ln_details.merge(founders, left_on="url", right_on="linkedin_url", how="left")

def filter_job_titles(row):
    organization = row["Organization Name"]
    job_titles = []
    match_found = False
    for i in range(1, 11):
        company_col = f"job_company_{i}"
        title_col = f"job_title_{i}"
        if company_col in row and title_col in row:
            company = row[company_col]
            if not match_found and fuzz.partial_ratio(organization.lower(), company.lower()) > 80:
                match_found = True
                continue
            if match_found and row[title_col]:
                job_titles.append(row[title_col])
    return " ".join(job_titles)

merged["filtered_job_titles"] = merged.apply(filter_job_titles, axis=1)

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def is_researcher(row):
    job_titles = row["filtered_job_titles"]
    if not job_titles.strip():
        return 0
    result = classifier(
        job_titles,
        candidate_labels=["researcher", "not a researcher"],
        hypothesis_template="This person is a {}."
    )
    return 1 if result["labels"][0] == "researcher" else 0

merged["was_researcher"] = merged.apply(is_researcher, axis=1)

founders = founders.merge(
    merged[["linkedin_url", "was_researcher"]],
    left_on="linkedin_url",
    right_on="linkedin_url",
    how="left"
)

company_researcher_stats = founders.groupby('Company ID')['was_researcher'].agg(
    researcher_ratio='mean',
    all_researchers=lambda x: int(x.mean() == 1),
    any_researcher=lambda x: int(x.mean() > 0)
).reset_index()

companies['ID'] = companies['ID'].astype(str)
company_researcher_stats['Company ID'] = company_researcher_stats['Company ID'].astype(str)

companies = companies.merge(company_researcher_stats, left_on='ID', right_on='Company ID', how='left')

companies['researcher_ratio'] = companies['researcher_ratio'].fillna(0)
companies['all_researchers'] = companies['all_researchers'].fillna(0)
companies['any_researcher'] = companies['any_researcher'].fillna(0)

companies.drop(['Company ID'], axis=1, inplace=True)

print(companies)

Device set to use mps:0


          ID Organization Name  \
0     000001             2trde   
1     000002     36ZERO Vision   
2     000003      3Bears Foods   
3     000004           3dTrust   
4     000005             abaut   
...      ...               ...   
1513  001506              SAYM   
1514  001507        SONAH GmbH   
1515  001508           Taxy.io   
1516  001509    TRINKKOST GmbH   
1517  001510   worqs Coworking   

                                             Industries  \
0                                [Automotive, Software]   
1     [Artificial Intelligence (AI), Computer Vision...   
2                                   [Food and Beverage]   
3                [3D Printing, Manufacturing, Software]   
4     [Analytics, Artificial Intelligence (AI), Cons...   
...                                                 ...   
1513  [Apps, B2B, B2C, Human Resources, Mobile Apps,...   
1514  [Apps, Artificial Intelligence (AI), Computer ...   
1515  [FinTech, Legal Tech, Machine Learning, Software]   
1

In [74]:
companies

Unnamed: 0,ID,Organization Name,Industries,Headquarters Location,Description,CB Rank (Company),Postal Code,Founded Date,Exit Date,Website,...,highest_education_Master,international_team,top_university,studies_abroad_founder,firsttime_founder_ratio,all_firsttime_founders,any_firsttime_founder,researcher_ratio,all_researchers,any_researcher
0,000001,2trde,"[Automotive, Software]","Munich, Bayern, Germany",2trde develops a software solution designed fo...,57123,0,2017-01-01,0,https://www.2trde.com,...,1.0,0,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.0,0.0
1,000002,36ZERO Vision,"[Artificial Intelligence (AI), Computer Vision...","Munich, Bayern, Germany",36ZEROVision is an AI-powered visual inspectio...,51326,81671,2019-01-01,0,https://36zerovision.com/,...,1.0,1,0.500000,0.50,0.000000,0.0,0.0,1.000000,1.0,1.0
2,000003,3Bears Foods,[Food and Beverage],"Munich, Bayern, Germany",3Bears Foods enable a balanced and delicious b...,275817,0,2015-01-01,0,https://3bears.de/,...,0.0,1,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.0,0.0
3,000004,3dTrust,"[3D Printing, Manufacturing, Software]","Munich, Bayern, Germany",3dTrust helps companies integrate 3D printing ...,134694,80797,2015-01-01,0,http://3dtrust.de,...,2.0,1,0.333333,0.00,0.333333,0.0,1.0,0.666667,0.0,1.0
4,000005,abaut,"[Analytics, Artificial Intelligence (AI), Cons...","Munich, Bayern, Germany",abaut builds a SaaS that enables businesses al...,219525,80992,2017-07-21,0,https://abaut.de,...,0.0,1,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1513,001506,SAYM,"[Apps, B2B, B2C, Human Resources, Mobile Apps,...","Aachen, Nordrhein-Westfalen, Germany",The SAYM platform for swarm mobility defines t...,166910,52070.0,2019-01-01,0,https://www.saym.io/,...,1.0,1,0.000000,0.00,1.000000,1.0,1.0,0.000000,0.0,0.0
1514,001507,SONAH GmbH,"[Apps, Artificial Intelligence (AI), Computer ...","Aachen, Nordrhein-Westfalen, Germany",SONAH developed a flexible embedded vision sen...,121752,52070.0,2016-01-01,0,http://www.sonah.tech,...,0.0,1,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.0,0.0
1515,001508,Taxy.io,"[FinTech, Legal Tech, Machine Learning, Software]","Aachen, Nordrhein-Westfalen, Germany",Taxy.io builds the leading platform for B2B ta...,220816,52070.0,2019-01-01,0,https://www.taxy.io/,...,2.0,1,0.250000,0.25,1.000000,1.0,1.0,0.333333,0.0,1.0
1516,001509,TRINKKOST GmbH,"[Agriculture, Consumer Goods, Fitness, Food an...","Aachen, Nordrhein-Westfalen, Germany",TRINKKOST is a food supplement manufacturing c...,907817,0,2016-01-01,0,http://www.trinkkost.de,...,0.0,1,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.0,0.0


### Feature 3: Senior Roles

In [75]:
from fuzzywuzzy import fuzz
from transformers import pipeline

ln_details = ln_details.fillna("")
merged = ln_details.merge(founders, left_on="url", right_on="linkedin_url", how="left")

def filter_pre_foundation_jobs(row):
    organization = row["Organization Name"]
    job_titles = []
    for i in range(1, 11):
        company_col = f"job_company_{i}"
        title_col = f"job_title_{i}"
        if company_col in row and title_col in row:
            company = row[company_col]
            if fuzz.partial_ratio(organization.lower(), company.lower()) > 80:
                break
            if row[title_col]:
                job_titles.append(row[title_col])
    return " ".join(job_titles)

merged["filtered_job_titles"] = merged.apply(filter_pre_foundation_jobs, axis=1)

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def is_executive(row):
    job_titles = row["filtered_job_titles"]
    if not job_titles.strip():
        return 0
    result = classifier(
        job_titles,
        candidate_labels=["executive", "not an executive"],
        hypothesis_template="This person is a {}."
    )
    return 1 if result["labels"][0] == "executive" else 0

print("Checking filtered_job_titles:")
print(merged[["linkedin_url", "filtered_job_titles"]].head())

merged["was_executive"] = merged.apply(is_executive, axis=1)

if "was_executive" not in merged.columns:
    raise ValueError("Column 'was_executive' was not created in merged.")
print(merged[["linkedin_url", "was_executive"]].head())

founders = founders.merge(
    merged[["linkedin_url", "was_executive"]],
    left_on="linkedin_url",
    right_on="linkedin_url",
    how="left"
)


if "was_executive" not in founders.columns:
    raise ValueError("Column 'was_executive' was not added to founders.")
print(founders[["Company ID", "was_executive"]].head())


company_executive_stats = founders.groupby('Company ID')['was_executive'].agg(
    executive_ratio='mean',
    all_executives=lambda x: int(x.mean() == 1),
    any_executive=lambda x: int(x.mean() > 0)
).reset_index()

companies['ID'] = companies['ID'].astype(str)
company_executive_stats['Company ID'] = company_executive_stats['Company ID'].astype(str)

companies = companies.merge(company_executive_stats, left_on='ID', right_on='Company ID', how='left')

companies['executive_ratio'] = companies['executive_ratio'].fillna(0)
companies['all_executives'] = companies['all_executives'].fillna(0)
companies['any_executive'] = companies['any_executive'].fillna(0)

companies.drop(['Company ID'], axis=1, inplace=True)

print(companies)

Device set to use mps:0


Checking filtered_job_titles:
                                        linkedin_url    filtered_job_titles
0  https://www.linkedin.com/in/johannes-stoffel-2...                       
1              https://www.linkedin.com/in/zkaramat/                       
2  https://www.linkedin.com/in/alexandre-gu%C3%A9...             Co-founder
3                 https://www.linkedin.com/in/mituca  Founder EXIST Founder
4           https://www.linkedin.com/in/antoine-jeol             Co-Founder
                                        linkedin_url  was_executive
0  https://www.linkedin.com/in/johannes-stoffel-2...              0
1              https://www.linkedin.com/in/zkaramat/              0
2  https://www.linkedin.com/in/alexandre-gu%C3%A9...              1
3                 https://www.linkedin.com/in/mituca              0
4           https://www.linkedin.com/in/antoine-jeol              1
  Company ID  was_executive
0     000001            0.0
1     000002            NaN
2     000002          

In [76]:
founders

Unnamed: 0,Founder ID,Company ID,Founder Name,Organization Name,first_name,last_name,linkedin_url,logo_url,facebook_url,twitter_url,...,is_international,degree_1_top_match,degree_1_top_score,degree_2_top_match,degree_2_top_score,founder_top_uni,founder_studies_abroad,is_firsttime_founder,was_researcher,was_executive
0,000001,000001,Johannes Stoffel,2trde,Johannes,Stoffel,https://www.linkedin.com/in/johannes-stoffel-2...,https://images.crunchbase.com/image/upload/t_c...,,,...,0,,0.0,,0.0,0,0,0.0,0.0,0.0
1,000002,000002,Florian Ziesche,36ZERO Vision,Florian,Ziesche,,,,,...,1,,0.0,,0.0,0,0,,,
2,000003,000002,Zeeshan Karamat,36ZERO Vision,Zeeshan,Karamat,https://www.linkedin.com/in/zkaramat/,https://images.crunchbase.com/image/upload/t_c...,,,...,1,,0.0,Georgia Institute of Technology,100.0,1,1,0.0,1.0,0.0
3,000004,000003,Caroline Steingruber,3Bears Foods,Caroline,Steingruber,,,,,...,1,,0.0,,0.0,0,0,,,
4,000005,000003,Tim Nichols,3Bears Foods,Tim,Nichols,,,,,...,1,,0.0,,0.0,0,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2844,003137,001508,Sven Peper,Taxy.io,Sven,Peper,https://www.linkedin.com/in/sven-peper,,,,...,1,,0.0,,0.0,0,0,1.0,0.0,0.0
2845,003138,001508,Sven Weber,Taxy.io,Sven,Weber,http://www.linkedin.com/in/svenweber,https://images.crunchbase.com/image/upload/t_c...,,,...,1,,0.0,,0.0,0,0,1.0,0.0,1.0
2846,003139,001509,Dr. Gennadi Schechtmann,TRINKKOST GmbH,Dr.,Gennadi Schechtmann,,,,,...,1,,0.0,,0.0,0,0,,,
2847,003140,001509,Timon Ortloff,TRINKKOST GmbH,Timon,Ortloff,,,,,...,1,,0.0,,0.0,0,0,,,


In [77]:
ln_details

Unnamed: 0,url,username,followers,connections,headline,degree_1,degree_1_university,degree_1_duration,degree_2,degree_2_university,...,job_company_9,job_duration_9,job_location_9,job_description_9,job_title_10,job_company_10,job_duration_10,job_location_10,job_description_10,error
0,https://www.linkedin.com/in/johannes-stoffel-2...,johannes-stoffel-27389667,4823.0,3916.0,Founder bei 2trde \n**We are hiring**,"Executive Master, Digital Innovation and Entre...",ESCP Business School,ESCP Business School,"Bachelor of Science - BS, Gerneral Management",EBS Universität für Wirtschaft und Recht,...,,,,,,,,,,
1,https://www.linkedin.com/in/zkaramat/,zkaramat,6489.0,6485.0,CTO/CoFounder 36ZERO Vision,"Masters Computer Science, Artificial Intelligence",Technische Universität München,2018,"Masters Computer Science, Artificial Intelligence",Georgia Institute of Technology,...,,,,,,,,,,
2,https://www.linkedin.com/in/alexandre-gu%C3%A9...,alexandre-gu%C3%A9rin-79734986,9782.0,9238.0,Co-Founder at Holori,"Master's degree, International Business and Fi...","ESB Business School, Reutlingen University",2010–2011,,ESCE International Business School,...,,,,,,,,,,
3,https://www.linkedin.com/in/mituca,mituca,2090.0,1893.0,Human-like visual perception for physical-worl...,"Master's degree, Informatics",Technische Universität München,,"Engineer, Computer Science",Universitatea „Politehnica” din Timișoara,...,,,,,,,,,,
4,https://www.linkedin.com/in/antoine-jeol,antoine-jeol,5126.0,4885.0,Co-Founder at Holori,,Ludwig-Maximilians-Universität München,Ludwig-Maximilians-Universität München,,"ESB Business School, Reutlingen University",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1971,https://www.linkedin.com/in/benjamin-d%C3%B6rr...,benjamin-d%C3%B6rries-776ab2169,204.0,206.0,"Es gibt zwei Dinge, die sind unendlich ...","Master of Science - M.Sc., Produktionstechnik",RWTH Aachen University,2017–2019,,,...,,,,,,,,,,
1972,https://www.linkedin.com/in/steffen-kirchhoff/,steffen-kirchhoff,1184.0,1120.0,CTO at Taxy.io: intelligent automation for B2B...,Computer Science,Harvard University,,"Diplom (Dipl.-Inform.) in Computer Science, Co...",RWTH Aachen University,...,,,,,,,,,,
1973,https://www.linkedin.com/in/sven-peper,sven-peper,1280.0,1177.0,CEO Taxy.io GmbH - bridging AI and legal consu...,Ingenieurwissenschaften,RWTH Aachen University,RWTH Aachen University,,,...,,,,,,,,,,
1974,http://www.linkedin.com/in/svenweber,svenweber,5492.0,5480.0,Managing Principal at Knightsbridge Advisers LLC,"Master, Physics",Universität Heidelberg,,,,...,,,,,,,,,,


### Feature 4: Job Durations

In [78]:
import pandas as pd
from dateutil import parser
from datetime import datetime
import re

# Function to translate German months
MONTHS_TRANSLATION = {
    "Jan.": "Jan", "Feb.": "Feb", "März": "Mar", "Apr.": "Apr",
    "Mai": "May", "Juni": "Jun", "Juli": "Jul", "Aug.": "Aug",
    "Sept.": "Sep", "Okt.": "Oct", "Nov.": "Nov", "Dez.": "Dec"
}

def translate_months(date_str):
    for de, en in MONTHS_TRANSLATION.items():
        date_str = date_str.replace(de, en)
    return date_str

def parse_intervals(row):
    intervals = []
    for col in row.index:
        if "duration" in col and isinstance(row[col], str) and row[col].strip():
            duration = translate_months(row[col])
            match = re.match(r"([\w.]+ \d{4}|\d{4})–(Heute|[\w.]+ \d{4}|\d{4})", duration)
            if match:
                start_str, end_str = match.groups()
                start_date = parser.parse(start_str, dayfirst=False)
                end_date = datetime.now() if end_str == "Heute" else parser.parse(end_str, dayfirst=False)
                intervals.append((start_date, end_date))
    return intervals

def merge_intervals(intervals):
    if not intervals:
        return []
    intervals.sort(key=lambda x: x[0])
    merged = [intervals[0]]
    for current in intervals[1:]:
        last = merged[-1]
        if current[0] <= last[1]:
            merged[-1] = (last[0], max(last[1], current[1]))
        else:
            merged.append(current)
    return merged

def calculate_total_months(merged_intervals):
    return sum((end.year - start.year) * 12 + (end.month - start.month) for start, end in merged_intervals)

def calculate_gaps(merged_intervals):
    if len(merged_intervals) < 2:
        return 0
    gaps = [
        (merged_intervals[i][0] - merged_intervals[i-1][1]).days // 30
        for i in range(1, len(merged_intervals))
    ]
    return sum(gaps)

# Processing ln_details to calculate intervals and experience metrics
ln_details["intervals"] = ln_details.apply(lambda row: parse_intervals(row), axis=1)
ln_details["merged_intervals"] = ln_details["intervals"].apply(merge_intervals)
ln_details["total_experience_months"] = ln_details["merged_intervals"].apply(calculate_total_months)
ln_details["total_experience_years"] = ln_details["total_experience_months"] / 12
ln_details["few_years_experience"] = ln_details["total_experience_years"].apply(lambda x: 1 if 1 <= x <= 5 else 0)
ln_details["decade_experience"] = ln_details["total_experience_years"].apply(lambda x: 1 if x >= 10 else 0)
ln_details["mid_career_experience"] = ln_details["total_experience_years"].apply(lambda x: 1 if 5 <= x < 10 else 0)
ln_details["gaps_in_experience"] = ln_details["merged_intervals"].apply(calculate_gaps)
ln_details["longest_position_duration"] = ln_details["merged_intervals"].apply(
    lambda intervals: max((end - start).days / 30 for start, end in intervals) if intervals else 0
)

# Merge ln_details with founders
founders = founders.merge(
    ln_details[["url", "few_years_experience", "decade_experience",
                "mid_career_experience", "gaps_in_experience",
                "longest_position_duration"]],
    left_on="linkedin_url",
    right_on="url",
    how="left",
    suffixes=("_founders", "_ln_details")
)

# Drop redundant column
founders.drop(columns=["url"], inplace=True, errors="ignore")

# Aggregation to company level
agg_funcs = {
    "few_years_experience": "mean",
    "decade_experience": "mean",
    "mid_career_experience": "mean",
    "gaps_in_experience": "mean",
    "longest_position_duration": "mean"
}

company_experience_stats = founders.groupby("Company ID").agg(agg_funcs).reset_index()

# Rename aggregated columns
company_experience_stats.rename(
    columns={
        "few_years_experience": "few_years_experience_ratio",
        "decade_experience": "decade_experience_ratio",
        "mid_career_experience": "mid_career_experience_ratio",
        "gaps_in_experience": "avg_gaps_in_experience",
        "longest_position_duration": "avg_longest_position_duration"
    },
    inplace=True
)

# Merge with companies DataFrame
companies["ID"] = companies["ID"].astype(str)
company_experience_stats["Company ID"] = company_experience_stats["Company ID"].astype(str)
companies = companies.merge(company_experience_stats, left_on="ID", right_on="Company ID", how="left")

# Fill missing values and drop redundant columns
companies.fillna(0, inplace=True)
companies.drop(columns=["Company ID"], inplace=True, errors="ignore")

# Final output
print(companies)

          ID Organization Name  \
0     000001             2trde   
1     000002     36ZERO Vision   
2     000003      3Bears Foods   
3     000004           3dTrust   
4     000005             abaut   
...      ...               ...   
1513  001506              SAYM   
1514  001507        SONAH GmbH   
1515  001508           Taxy.io   
1516  001509    TRINKKOST GmbH   
1517  001510   worqs Coworking   

                                             Industries  \
0                                [Automotive, Software]   
1     [Artificial Intelligence (AI), Computer Vision...   
2                                   [Food and Beverage]   
3                [3D Printing, Manufacturing, Software]   
4     [Analytics, Artificial Intelligence (AI), Cons...   
...                                                 ...   
1513  [Apps, B2B, B2C, Human Resources, Mobile Apps,...   
1514  [Apps, Artificial Intelligence (AI), Computer ...   
1515  [FinTech, Legal Tech, Machine Learning, Software]   
1

In [82]:
companies

Unnamed: 0,ID,Organization Name,Industries,Headquarters Location,Description,CB Rank (Company),Postal Code,Founded Date,Exit Date,Website,...,all_researchers,any_researcher,executive_ratio,all_executives,any_executive,few_years_experience_ratio,decade_experience_ratio,mid_career_experience_ratio,avg_gaps_in_experience,avg_longest_position_duration
0,000001,2trde,"[Automotive, Software]","Munich, Bayern, Germany",2trde develops a software solution designed fo...,57123,0,2017-01-01,0,https://www.2trde.com,...,0.0,0.0,0.000000,0.0,0.0,0.0,1.000000,0.000000,0.000000,150.166667
1,000002,36ZERO Vision,"[Artificial Intelligence (AI), Computer Vision...","Munich, Bayern, Germany",36ZEROVision is an AI-powered visual inspectio...,51326,81671,2019-01-01,0,https://36zerovision.com/,...,1.0,1.0,0.000000,0.0,0.0,0.0,1.000000,0.000000,7.000000,82.233333
2,000003,3Bears Foods,[Food and Beverage],"Munich, Bayern, Germany",3Bears Foods enable a balanced and delicious b...,275817,0,2015-01-01,0,https://3bears.de/,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
3,000004,3dTrust,"[3D Printing, Manufacturing, Software]","Munich, Bayern, Germany",3dTrust helps companies integrate 3D printing ...,134694,80797,2015-01-01,0,http://3dtrust.de,...,0.0,1.0,0.666667,0.0,1.0,0.0,0.666667,0.333333,15.666667,99.055556
4,000005,abaut,"[Analytics, Artificial Intelligence (AI), Cons...","Munich, Bayern, Germany",abaut builds a SaaS that enables businesses al...,219525,80992,2017-07-21,0,https://abaut.de,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1513,001506,SAYM,"[Apps, B2B, B2C, Human Resources, Mobile Apps,...","Aachen, Nordrhein-Westfalen, Germany",The SAYM platform for swarm mobility defines t...,166910,52070.0,2019-01-01,0,https://www.saym.io/,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,1.000000,13.000000,56.833333
1514,001507,SONAH GmbH,"[Apps, Artificial Intelligence (AI), Computer ...","Aachen, Nordrhein-Westfalen, Germany",SONAH developed a flexible embedded vision sen...,121752,52070.0,2016-01-01,0,http://www.sonah.tech,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
1515,001508,Taxy.io,"[FinTech, Legal Tech, Machine Learning, Software]","Aachen, Nordrhein-Westfalen, Germany",Taxy.io builds the leading platform for B2B ta...,220816,52070.0,2019-01-01,0,https://www.taxy.io/,...,0.0,1.0,0.333333,0.0,1.0,0.0,1.000000,0.000000,2.666667,151.888889
1516,001509,TRINKKOST GmbH,"[Agriculture, Consumer Goods, Fitness, Food an...","Aachen, Nordrhein-Westfalen, Germany",TRINKKOST is a food supplement manufacturing c...,907817,0,2016-01-01,0,http://www.trinkkost.de,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
