# Feature Engineering
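Explanation: This notebook derives additional features for each company from the preprocessed `companies`, `funding`, and `investors` tables.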

## Import Packages

In [81]:
from importnb import Notebook
import pandas as pd
import numpy as np

## Dataframe import from 'DataPreprocessing'

In [82]:
# Load the three preprocessed tables from the DataPreprocessing_Pipeline notebook.
# NOTE(review): importing inside `with Notebook()` presumably executes that
# notebook on import — confirm its runtime and side effects before re-running.
with Notebook():
    from DataPreprocessing_Pipeline import companies, funding, investors

## New Features: Companies <> Investment Rounds
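Explanation: The features in this section are computed by matching each company ('ID' in `companies`) to its funding rounds ('Company ID' in `funding`).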

### Feature 1: Months Until First Round
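Number of months between the founding date and the first investment round. Explanation: The rounds table ('all_rounds', available in this notebook as `funding`) contains every investment round made during the existence of the companies. Each funding round has already been mapped to its startup via 'Company ID'. If the first round is dated before the founding date, the value is clamped to 0; companies without a computable value get -1.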

In [83]:
def calculate_months_until_first_round(company_id, founded_date, funding_df=None):
    """Return the calendar months between founding and the first funding round.

    Parameters
    ----------
    company_id
        Value matched against the 'Company ID' column of the rounds table.
    founded_date
        Founding date of the company; must expose ``.year`` and ``.month``
        (e.g. a ``pd.Timestamp``).
    funding_df : pandas.DataFrame, optional
        Rounds table with 'Company ID' and 'Announced Date' columns. When
        omitted, the notebook-level ``funding`` frame is used, so existing
        two-argument calls keep working.

    Returns
    -------
    int or float
        ``(year difference) * 12 + (month difference)``; days are ignored.
        The result is clamped to 0 when the earliest round is dated before
        the founding date. ``np.nan`` is returned when the company has no
        rounds or when either date is missing.
    """
    rounds_table = funding if funding_df is None else funding_df
    rounds = rounds_table[rounds_table['Company ID'] == company_id]

    if rounds.empty:
        return np.nan

    first_round_date = rounds['Announced Date'].min()

    # A missing date cannot produce a month count; report "unknown" explicitly
    # instead of relying on NaN propagating through the arithmetic below.
    if pd.isna(founded_date) or pd.isna(first_round_date):
        return np.nan

    delta_months = (
        (first_round_date.year - founded_date.year) * 12
        + (first_round_date.month - founded_date.month)
    )
    return max(delta_months, 0)

# Evaluate the helper once per company row; missing results (NaN) are then
# encoded with the sentinel -1.
months_until_first_round = companies.apply(
    lambda company: calculate_months_until_first_round(company['ID'], company['Founded Date']),
    axis=1,
)
companies['Months until First Round'] = months_until_first_round.fillna(-1)

### Feature 2: Grant Y/N
If the company received a grant in any of its funding rounds, the feature is 1 (Yes); otherwise it is 0 (No). For VCs, a company with a grant is desirable because the grant leverages the invested money without diluting existing shareholders.

In [84]:
def check_grant_for_company(company_id, funding_df=None):
    """Return 1 if the company has at least one round of type 'Grant', else 0.

    Parameters
    ----------
    company_id
        Value matched against the 'Company ID' column of the rounds table.
    funding_df : pandas.DataFrame, optional
        Rounds table with 'Company ID' and 'Funding Type' columns. When
        omitted, the notebook-level ``funding`` frame is used, so existing
        one-argument calls (e.g. via ``Series.apply``) keep working.

    Returns
    -------
    int
        1 when any of the company's rounds has 'Funding Type' == 'Grant',
        otherwise 0 (including companies without any round).
    """
    rounds_table = funding if funding_df is None else funding_df
    company_rounds = rounds_table[rounds_table['Company ID'] == company_id]
    # Vectorised reduction instead of iterating the boolean Series in Python.
    has_grant = (company_rounds['Funding Type'] == 'Grant').any()
    return 1 if has_grant else 0

# One grant lookup per company ID; the helper returns 1 (grant) or 0 (no grant).
companies['Grant Y/N'] = companies['ID'].map(check_grant_for_company)

### Feature 3: Last Round Type
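Explanation: The funding type of the company's most recent round (ordered by 'Announced Date'). Companies without any round get the value 'No Funding'. The original 'Last Funding Type' column is dropped in favour of this computed column.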

In [85]:
# Round dates must be timestamps so that sorting is chronological.
funding['Announced Date'] = pd.to_datetime(funding['Announced Date'])

# Per company, take the funding type of the chronologically last round.
# Note: GroupBy.last() skips missing values, so a missing type on the latest
# round falls back to the latest round that has one.
rounds_by_date = funding.sort_values(by=['Company ID', 'Announced Date'])
last_round = (
    rounds_by_date
    .groupby('Company ID')['Funding Type']
    .last()
    .reset_index()
)

# Attach the type to the companies table, rename it, and remove the helper
# key together with the superseded 'Last Funding Type' column.
companies = (
    companies
    .merge(last_round, left_on='ID', right_on='Company ID', how='left')
    .rename(columns={'Funding Type': 'Last Round Type'})
    .drop(columns=['Company ID', 'Last Funding Type'])
)

# Companies that never appear in the rounds table have no type after the merge.
companies['Last Round Type'] = companies['Last Round Type'].fillna('No Funding')

### Feature 4: Acquisition Status 'Was Acquired', 'Made Acquisitions'
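Explanation: The text column 'Acquisition Status' is split into two binary flags, 'Was Acquired' and 'Made Acquisitions'. A company with the combined status 'Made Acquisitions, Was Acquired' gets both flags; the original column is dropped afterwards.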

In [86]:
# List the distinct non-missing acquisition states before encoding them.
if 'Acquisition Status' in companies:
    unique_values = companies['Acquisition Status'].dropna().unique()
    print(unique_values)

['Was Acquired' 'Made Acquisitions' 'Made Acquisitions, Was Acquired']


In [87]:
# Translate the textual status into two 0/1 flags. The combined status
# counts for both flags; missing or other values leave both at 0.
acquisition_status = companies['Acquisition Status']

companies['Was Acquired'] = acquisition_status.isin(
    ['Was Acquired', 'Made Acquisitions, Was Acquired']
).astype('int64')
companies['Made Acquisitions'] = acquisition_status.isin(
    ['Made Acquisitions', 'Made Acquisitions, Was Acquired']
).astype('int64')

# The text column is fully represented by the two flags now.
companies = companies.drop(columns=['Acquisition Status'])

### Feature 5: Funding phases
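Explanation: The last round type is mapped to five funding phases (Project, Startup, Growth, Expansion, Exit), each stored as a binary flag. The flags are cumulative: a company that reached a later phase is also flagged for all earlier phases.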

In [88]:
# List the distinct last-round types that the phase mapping has to cover.
if 'Last Round Type' in companies.columns:
    distinct_funding_types = companies['Last Round Type'].dropna().unique()
    print(distinct_funding_types)
else:
    # The message names the column that is actually checked above.
    print("The column 'Last Round Type' does not exist in the DataFrame.")

['Seed' 'No Funding' 'Venture - Series Unknown' 'Series A' 'Series B'
 'Convertible Note' 'Angel' 'Post-IPO Debt' 'Private Equity' 'Pre-Seed'
 'Grant' 'Series C' 'Series F' 'Series D' 'Post-IPO Secondary' 'Series E'
 'Post-IPO Equity']


In [89]:
# Funding phases in chronological order, earliest first.
funding_columns = ['Project Funding', 'Startup Funding', 'Growth Funding', 'Expansion Funding', 'Exit Funding']

# Round types that indicate each phase. 'Venture - Series Unknown' is listed
# under two phases; because the flags are cumulative this is equivalent to
# listing it under 'Startup Funding' only.
# 'Post-IPO Debt' occurs in 'Last Round Type' and is grouped with the other
# post-IPO rounds; without a mapping such companies would get all-zero flags
# and be indistinguishable from companies with 'No Funding'.
funding_mapping = {
    'Project Funding': ['Angel', 'Pre-Seed', 'Convertible Note', 'Grant', 'Venture - Series Unknown'],
    'Startup Funding': ['Seed', 'Venture - Series Unknown'],
    'Growth Funding': ['Series A', 'Series B', 'Series C'],
    'Expansion Funding': ['Series D', 'Series E', 'Series F'],
    'Exit Funding': ['Private Equity', 'Post-IPO Debt', 'Post-IPO Secondary', 'Post-IPO Equity']
}

# A phase flag is 1 when the last round belongs to that phase or to any later
# phase, i.e. reaching a phase implies having passed all earlier ones.
for position, phase in enumerate(funding_columns):
    types_at_or_beyond = [
        round_type
        for later_phase in funding_columns[position:]
        for round_type in funding_mapping[later_phase]
    ]
    companies[phase] = companies['Last Round Type'].isin(types_at_or_beyond).astype('int64')

In [90]:
# Inspect the companies table with the features added so far.
companies

Unnamed: 0,ID,Organization Name,Industries,Description,CB Rank (Company),Founded Date,Exit Date,Website,Twitter,Facebook,...,Months until First Round,Grant Y/N,Last Round Type,Was Acquired,Made Acquisitions,Project Funding,Startup Funding,Growth Funding,Expansion Funding,Exit Funding
0,000001,2trde,"[Automotive, Software]",2trde develops a software solution designed fo...,57123,2017-01-01,NaT,https://www.2trde.com,,,...,40.0,0,Seed,0,0,1,1,0,0,0
1,000002,36ZERO Vision,"[Artificial Intelligence (AI), Computer Vision...",36ZEROVision is an AI-powered visual inspectio...,51326,2019-01-01,NaT,https://36zerovision.com/,,,...,15.0,0,Seed,0,0,1,1,0,0,0
2,000003,3Bears Foods,[Food and Beverage],3Bears Foods enable a balanced and delicious b...,275817,2015-01-01,NaT,https://3bears.de/,,https://www.facebook.com/3bearsporridge/,...,29.0,0,Seed,0,0,1,1,0,0,0
3,000004,3dTrust,"[3D Printing, Manufacturing, Software]",3dTrust helps companies integrate 3D printing ...,134694,2015-01-01,NaT,http://3dtrust.de,https://twitter.com/3dTrust,,...,-1.0,0,No Funding,0,0,0,0,0,0,0
4,000005,abaut,"[Analytics, Artificial Intelligence (AI), Cons...",abaut builds a SaaS that enables businesses al...,219525,2017-07-21,NaT,https://abaut.de,,,...,32.0,0,Seed,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,001506,SAYM,"[Apps, B2B, B2C, Human Resources, Mobile Apps,...",The SAYM platform for swarm mobility defines t...,166910,2019-01-01,NaT,https://www.saym.io/,https://twitter.com/saym_io,https://www.facebook.com/saymcommute/,...,4.0,1,Grant,0,0,1,0,0,0,0
1506,001507,SONAH GmbH,"[Apps, Artificial Intelligence (AI), Computer ...",SONAH developed a flexible embedded vision sen...,121752,2016-01-01,NaT,http://www.sonah.tech,,,...,59.0,0,Seed,0,0,1,1,0,0,0
1507,001508,Taxy.io,"[FinTech, Legal Tech, Machine Learning, Software]",Taxy.io builds the leading platform for B2B ta...,220816,2019-01-01,NaT,https://www.taxy.io/,https://twitter.com/taxy_io,,...,4.0,0,Seed,0,0,1,1,0,0,0
1508,001509,TRINKKOST GmbH,"[Agriculture, Consumer Goods, Fitness, Food an...",TRINKKOST is a food supplement manufacturing c...,907817,2016-01-01,NaT,http://www.trinkkost.de,https://www.twitter.com/trinkkost,https://www.facebook.com/trinkkost/,...,14.0,0,Venture - Series Unknown,0,0,1,1,0,0,0


In [91]:
# Inspect the funding rounds table used by the round-based features.
funding

Unnamed: 0,Funding ID,Organization Name,Funding Type,Money Raised,Announced Date,Lead Investors,Number of Investors,Investor Names,Company ID
0,000001,4stop,Series A,2500000.0,2019-05-15,Ventech,1.0,[Ventech],000000
1,000002,aiconix GmbH,Convertible Note,,2019-06-17,,,[],001193
2,000003,aiconix GmbH,Pre-Seed,300000.0,2019-08-28,,,[],001193
3,000004,AutLay,Seed,,2019-05-22,Crew Ventures,1.0,[Crew Ventures],001196
4,000005,Buynomics,Pre-Seed,,2019-09-01,DvH Ventures,2.0,"[DvH Ventures, Tomahawk.VC]",001205
...,...,...,...,...,...,...,...,...,...
6863,006864,VAMOS.ai,Seed,,2020-01-01,DDG AG,1.0,[DDG AG],000000
6864,006865,WeProfit,Pre-Seed,272000.0,2021-05-28,,5.0,"[Ara Abrahamyan, Armen Kocharyan, Clemens Boll...",000298
6865,006866,WindStar Medical GmbH,Seed,,2020-12-07,Project A Ventures,1.0,[Project A Ventures],000000
6866,006867,WorkMentality Foundation,Seed,,2023-03-01,,1.0,[Deutsche Bank],000000


### Feature 6: Average Time To Next Round
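Explanation: The average gap between consecutive funding events, expressed in 30-day months. The gap between the founding date and the first round is included in the average. Companies without any round get -1.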

In [92]:
# Both date columns are subtracted from each other below, so make sure they
# hold datetime values.
funding['Announced Date'] = pd.to_datetime(funding['Announced Date'])
companies['Founded Date'] = pd.to_datetime(companies['Founded Date'])

def calculate_avg_time_to_next_round(company_id, company_founded_date, funding_df):
    """Return the average gap between funding events in 30-day months.

    The gaps averaged are: founding date -> first round, then each round ->
    the following round, with rounds ordered by 'Announced Date'.

    Parameters
    ----------
    company_id
        Value matched against the 'Company ID' column of ``funding_df``.
    company_founded_date
        Founding date of the company (datetime-like, subtractable from the
        round dates).
    funding_df : pandas.DataFrame
        Rounds table with 'Company ID' and datetime 'Announced Date' columns.

    Returns
    -------
    float or int
        Mean gap in days divided by 30. The sentinel -1 is returned when the
        company has no rounds, when the value cannot be computed because a
        date is missing, or when the computed value is below -1.
    """
    # Sort before measuring anything so that the founding -> first-round gap
    # uses the earliest round rather than whichever row comes first in the
    # table.
    company_funding = (
        funding_df[funding_df['Company ID'] == company_id]
        .sort_values('Announced Date')
    )

    if company_funding.empty:
        return -1

    announced = company_funding['Announced Date']
    gaps_in_days = [(announced.iloc[0] - company_founded_date).days]
    gaps_in_days.extend(
        (announced.iloc[i] - announced.iloc[i - 1]).days
        for i in range(1, len(announced))
    )
    avg_time_to_next_round = sum(gaps_in_days) / len(gaps_in_days) / 30

    # Missing dates propagate as NaN; map those and implausibly negative
    # averages to the sentinel.
    if pd.isna(avg_time_to_next_round) or avg_time_to_next_round < -1:
        avg_time_to_next_round = -1

    return avg_time_to_next_round

# Compute the feature row by row from each company's ID and founding date.
companies['Average Time To Next Round'] = companies.apply(
    lambda company: calculate_avg_time_to_next_round(
        company_id=company['ID'],
        company_founded_date=company['Founded Date'],
        funding_df=funding,
    ),
    axis=1,
)

### Feature 7: Average Funding Size
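Explanation: The total money raised by a company divided by the number of its rounds with a disclosed amount ('Money Raised' not missing). Companies without any disclosed amount get 0.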

In [93]:
# Per company: sum of disclosed amounts and number of rounds that disclose one
# ('count' ignores missing 'Money Raised' values).
money_by_company = funding.groupby('Company ID')['Money Raised']
funding_summary = (
    money_by_company
    .agg(['sum', 'count'])
    .rename(columns={'sum': 'TotalMoneyRaised', 'count': 'TotalRounds'})
    .reset_index()
)

# 0 / 0 yields NaN for companies without any disclosed amount; handled below.
funding_summary['Average Funding Size'] = funding_summary['TotalMoneyRaised'].div(
    funding_summary['TotalRounds']
)

# Attach the average to the companies table and remove the helper key again.
companies = (
    companies
    .merge(
        funding_summary[['Company ID', 'Average Funding Size']],
        left_on='ID',
        right_on='Company ID',
        how='left',
    )
    .drop(columns=['Company ID'])
)

# Companies without rounds or without disclosed amounts get 0.
companies['Average Funding Size'] = companies['Average Funding Size'].fillna(0)

## New Features: Companies <> Investors
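Explanation: The features in this section are computed by matching the investor names of each funding round ('Investor Names' in `funding`) to the `investors` table ('Organization/Person Name').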

### Feature 1-4: Average Number of Investments by Investors
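Explanation: For each company, the mean over the investors of all its rounds of 'Number of Investments', 'Number of Exits', 'Number of Lead Investments', and 'Number of Portfolio Organizations'. Companies without any matched investor get 0.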

In [94]:
# One row per (funding round, investor name); names are stripped on both sides
# so that the join below is not broken by stray whitespace.
funding_exploded = funding.explode('Investor Names')
funding_exploded['Investor Names'] = funding_exploded['Investor Names'].str.strip()
investors['Organization/Person Name'] = investors['Organization/Person Name'].str.strip()

merged = funding_exploded.merge(
    investors,
    left_on='Investor Names',
    right_on='Organization/Person Name',
    how='left'
)

# Investor statistics to average per company; non-numeric entries become NaN
# and are therefore ignored by the mean.
columns_to_process = ['Number of Investments', 'Number of Exits', 'Number of Lead Investments', 'Number of Portfolio Organizations']
for col in columns_to_process:
    if col in merged.columns:
        merged[col] = pd.to_numeric(merged[col], errors='coerce')

# Report investor names for which no 'Number of Investments' is available
# after the join.
not_found_investors = merged[merged['Number of Investments'].isnull()]['Investor Names'].unique()
if len(not_found_investors) > 0:
    print(f"{len(not_found_investors)} investors could not be found.")
else:
    print("All investors were successfully found.")

not_found_investors_df = pd.DataFrame(not_found_investors, columns=['Investor Names'])

if 'Company ID' in companies.columns:
    companies = companies.drop(columns=['Company ID'])

# Average all statistics in a single group-by and join once on the group index.
# Merging column by column on 'Company ID' would add that key column again on
# every pass: the suffixes then collide on 'Company ID_drop' (newer pandas
# versions raise a MergeError for duplicate suffixed columns) and a stray
# 'Company ID' column would remain in `companies`.
investor_averages = (
    merged.groupby('Company ID')[columns_to_process]
    .mean()
    .rename(columns=lambda name: f'Average {name} by Investors')
)

companies = companies.merge(
    investor_averages,
    left_on='ID',
    right_index=True,
    how='left'
)

# Companies without rounds or without matched investors get 0.
average_columns = list(investor_averages.columns)
companies[average_columns] = companies[average_columns].fillna(0)


3608 investors could not be found.


### Feature 5: Origin Country of Investors
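Explanation: One-hot columns named 'Investor Country_<country>' that are 1 if at least one investor in any of the company's rounds comes from that country.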

In [95]:
# One row per (funding round, investor name), joined to the investor's country.
funding_exploded = funding.explode('Investor Names')
funding_exploded['Investor Names'] = funding_exploded['Investor Names'].str.strip()
investors['Organization/Person Name'] = investors['Organization/Person Name'].str.strip()

merged = funding_exploded.merge(
    investors[['Organization/Person Name', 'Country']],
    left_on='Investor Names',
    right_on='Organization/Person Name',
    how='left'
)

# One indicator column per investor country; rows without a country get all 0.
country_encoded = pd.get_dummies(merged['Country'], prefix='Investor Country', dtype=int)

# A company's flag is 1 if any investor in any of its rounds is from that country.
country_aggregated = country_encoded.groupby(merged['Company ID']).max()

companies = companies.merge(
    country_aggregated,
    left_on='ID',
    right_index=True,
    how='left'
)

# Companies without rounds have no match in the merge above. Fill every country
# column (not only a subset of them) so that all flags are consistently 0/1
# integers without NaN.
country_columns = list(country_aggregated.columns)
companies[country_columns] = companies[country_columns].fillna(0).astype(int)

## New Features: Companies

### Feature 1: 'Months between Founding and Acquisition'

In [96]:
def _months_between_founding_and_acquisition(company):
    """Return whole 30-day months from founding to acquisition, or -1.

    ``company`` is one row of the companies table. -1 is returned when the row
    has no 'Announced Date Acquisition'.
    """
    acquisition_date = company['Announced Date Acquisition']
    if pd.isna(acquisition_date):
        return -1
    return (acquisition_date - company['Founded Date']).days // 30


companies['Months between Founding and Acquisition'] = companies.apply(
    _months_between_founding_and_acquisition,
    axis=1,
)

### Feature 2: Category One Hot Encoding

In [97]:
# One row per (company, industry group); the original row index is repeated
# for every group of a company and is used below to fold the rows back.
exploded = companies.explode('Industry Groups')
exploded['Industry Groups'] = exploded['Industry Groups'].str.strip().str.lower()
exploded = exploded[exploded['Industry Groups'].notna() & (exploded['Industry Groups'] != '')]

one_hot_encoded = pd.get_dummies(exploded['Industry Groups'], prefix='Industry', dtype=int)

# Collapse back to one row per company. Companies without any usable industry
# group were filtered out above, so re-align to the full companies index and
# give them 0 in every industry column instead of NaN after the concat.
one_hot_aggregated = (
    one_hot_encoded.groupby(exploded.index).max()
    .reindex(companies.index, fill_value=0)
)

companies = pd.concat([companies, one_hot_aggregated], axis=1)
