# Model Preparation

## Import Packages

In [1]:
from importnb import Notebook
import pandas as pd
import numpy as np

## Data Import from 'Feature Engineering'

In [2]:
# Import the `companies` object from the FeatureEngineering notebook.
# The import is wrapped in importnb's Notebook() context manager; presumably this
# is what makes the .ipynb importable as a module. The printed text that follows
# this cell suggests the import executes that notebook's cells — TODO confirm.
with Notebook():
    from FeatureEngineering import companies


Results with possible duplicates:
     Founder ID Company ID             Founder Name Organization Name  \
0        000001     000001         Johannes Stoffel             2trde   
1        000002     000002          Florian Ziesche     36ZERO Vision   
2        000003     000002          Zeeshan Karamat     36ZERO Vision   
3        000004     000003     Caroline Steingruber      3Bears Foods   
4        000005     000003              Tim Nichols      3Bears Foods   
...         ...        ...                      ...               ...   
3136     003137     001508               Sven Peper           Taxy.io   
3137     003138     001508               Sven Weber           Taxy.io   
3138     003139     001509  Dr. Gennadi Schechtmann    TRINKKOST GmbH   
3139     003140     001509            Timon Ortloff    TRINKKOST GmbH   
3140     003141     001510           Tobias Kollewe   worqs Coworking   

     first_name            last_name  \
0      Johannes              Stoffel   
1       

## Success Definition

In [3]:
# Work on an independent copy: a plain `data = companies` only aliases the
# imported frame, so adding the target column below would silently mutate
# `companies` as well (and make this cell depend on kernel state when re-run).
data = companies.copy()

# Binary target: 1 if at least one of the 'Was Acquired', 'IPO' or
# 'HighFunding' flags equals 1, otherwise 0.
data['Success'] = ((data['Was Acquired'] == 1) |
                   (data['IPO'] == 1) |
                   (data['HighFunding'] == 1)).astype(int)

# Class balance of the target, expressed in percent.
data['Success'].value_counts(normalize=True) * 100

Success
0    82.551223
1    17.448777
Name: proportion, dtype: float64

## Data Preparation

In [4]:
# Keep only the numeric columns; everything else is discarded here.
data = data.select_dtypes(include=[np.number])

# Per-column NaN counts, restricted to columns that actually contain NaNs.
nan_values = data.isna().sum()
nan_values = nan_values[nan_values > 0]
print(nan_values)

# Report the row count. No rows are removed in this cell, so the label must
# not claim that NaNs were dropped (the previous message did).
rows = data.shape[0]
print(f"Total rows (NaN rows not dropped): {rows}")


Last Funding Amount                          754
Total Funding Amount                         615
Industry_administrative services              18
Industry_advertising                          18
Industry_agriculture and farming              18
Industry_apps                                 18
Industry_artificial intelligence (ai)         18
Industry_biotechnology                        18
Industry_blockchain and cryptocurrency        18
Industry_clothing and apparel                 18
Industry_commerce and shopping                18
Industry_community and lifestyle              18
Industry_consumer electronics                 18
Industry_consumer goods                       18
Industry_content and publishing               18
Industry_data and analytics                   18
Industry_design                               18
Industry_education                            18
Industry_energy                               18
Industry_events                               18
Industry_financial s

## Correlation Analysis

In [5]:
# Pairwise Pearson correlation (the pandas default, spelled out here) between
# every remaining column of `data`, including the 'Success' target.
correlation_matrix = data.corr(method="pearson")
correlation_matrix

Unnamed: 0,Number of Founders,Number of Employees,Number of Funding Rounds,Last Funding Amount,Total Funding Amount,Number of Lead Investors,Number of Investors,Number of Acquisitions,IPO,Months until First Round,...,Industry_sales and marketing,Industry_science and engineering,Industry_social impact,Industry_software,Industry_sports,Industry_sustainability,Industry_transportation,Industry_travel and tourism,Industry_video,Success
Number of Founders,1.000000,0.122795,0.219110,0.032472,0.084670,0.250995,0.190135,0.040614,0.012716,-0.081884,...,0.017795,0.031885,0.040773,0.037775,0.022544,0.012360,0.020594,-0.026889,-0.003204,0.182687
Number of Employees,0.122795,1.000000,0.329667,0.313007,0.580628,0.397638,0.372195,0.259662,0.215157,-0.036349,...,-0.035936,-0.009028,-0.015021,-0.091456,0.041796,0.105403,0.046526,0.002177,-0.029546,0.226878
Number of Funding Rounds,0.219110,0.329667,1.000000,0.166756,0.489598,0.741205,0.671119,0.191521,0.130815,-0.272159,...,-0.082236,0.081942,0.037082,0.003283,-0.030580,0.092757,0.056057,-0.038989,-0.049284,0.324414
Last Funding Amount,0.032472,0.313007,0.166756,1.000000,0.422864,0.270947,0.228415,0.193076,0.161930,0.122052,...,-0.044561,-0.012641,-0.022726,-0.050826,0.000627,0.024024,0.050229,-0.031337,-0.030500,0.259367
Total Funding Amount,0.084670,0.580628,0.489598,0.422864,1.000000,0.503766,0.483041,0.146643,0.087949,-0.027047,...,-0.033113,-0.032093,-0.012786,-0.083050,-0.023837,0.089176,0.030160,-0.023961,-0.022516,0.185244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Industry_sustainability,0.012360,0.105403,0.092757,0.024024,0.089176,0.113031,0.055609,-0.046901,0.156076,0.082687,...,-0.068026,-0.013981,0.029746,-0.086444,-0.052410,1.000000,0.039709,-0.057095,-0.030939,0.031917
Industry_transportation,0.020594,0.046526,0.056057,0.050229,0.030160,0.032098,0.036374,-0.010864,0.062435,0.005893,...,-0.090989,-0.069209,0.009176,-0.103862,-0.047923,0.039709,1.000000,0.067139,-0.041383,0.000533
Industry_travel and tourism,-0.026889,0.002177,-0.038989,-0.031337,-0.023961,-0.069187,-0.035079,0.008169,-0.019187,0.012634,...,-0.027520,-0.098672,-0.020738,-0.080822,-0.042073,-0.057095,0.067139,1.000000,-0.024837,0.079985
Industry_video,-0.003204,-0.029546,-0.049284,-0.030500,-0.022516,-0.072713,-0.043793,-0.020403,-0.010397,-0.043177,...,-0.029592,-0.006082,-0.011238,0.064585,-0.022799,-0.030939,-0.041383,-0.024837,1.000000,-0.060022


In [6]:
# Absolute correlation above which a feature pair is reported as strongly correlated.
CORRELATION_THRESHOLD = 0.8

# Pairwise correlations between the features only (the target is excluded).
feature_correlation_matrix = data.drop(columns=['Success']).corr()

# Keep the strict upper triangle (k=1 skips the diagonal) so that each
# unordered pair appears exactly once; all other cells become NaN.
upper_triangle = feature_correlation_matrix.where(
    np.triu(np.ones(feature_correlation_matrix.shape), k=1).astype(bool)
)

# Vectorised scan instead of a nested Python loop with per-cell .iloc lookups.
# NaN cells compare as False, and np.nonzero yields positions in row-major
# order, i.e. the same order a row/column double loop would produce.
upper_values = upper_triangle.to_numpy()
strong_rows, strong_cols = np.nonzero(np.abs(upper_values) > CORRELATION_THRESHOLD)
highly_correlated_features = [
    (feature_correlation_matrix.columns[col], feature_correlation_matrix.index[row], upper_values[row, col])
    for row, col in zip(strong_rows, strong_cols)
]

# Print highly correlated features
if highly_correlated_features:
    print("Strongly correlated feature pairs:")
    for feature1, feature2, correlation in highly_correlated_features:
        print(f"{feature1} and {feature2} with a correlation of {correlation:.2f}")
else:
    print("No strongly correlated features found.")

# Drop unnecessary columns.
# The drop is deliberately strict (no errors='ignore'): a missing or mistyped
# column raises KeyError instead of silently leaving a target-leaking feature
# in place. Consequently, re-running this cell alone on the already-reduced
# frame raises KeyError; re-run the notebook from the top instead.
columns_to_drop = ['Was Acquired', 'IPO', 'HighFunding'         # Already included in target
    ,'Months between Founding and Acquisition']  # Highly correlated with 'Was Acquired'
data = data.drop(columns=columns_to_drop)

Strongly correlated feature pairs:
Average Funding Size and Last Funding Amount with a correlation of 0.91
Average Time To Next Round and Months until First Round with a correlation of 0.82
Months between Founding and Acquisition and Was Acquired with a correlation of 0.94
Average Number of Portfolio Organizations by Investors and Average Number of Investments by Investors with a correlation of 0.94
Industry_data and analytics and Industry_artificial intelligence (ai) with a correlation of 0.85
