In [79]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow.keras.models import Sequential, save_model, load_model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam, Nadam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import joblib
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  

SEED = 42
LEARNING_RATE = 0.001
BATCH_SIZE = 32
EPOCHS = 500
PATIENCE = 15
REDUCE_LR_PATIENCE = 5
MIN_LR = 1e-6

In [80]:
df = pd.read_csv(r"C:\Users\JacopoBinati\OneDrive - Venionaire Capital\Desktop\crunchbase\dataset.csv")
df.drop(columns=[
    "EV/Sales",
    "ROE",
    "Expected growth - next 5 years",
    "Forward PE",
    "% of Money Losing firms (Trailing)",
], inplace=True)

X = df.drop('normalized_valuation', axis=1)

y = df['normalized_valuation']

categorical_variables = {
    'Industry': [col for col in X.columns if col.startswith('Industry_')],
    'Region': [col for col in X.columns if col.startswith('Region_')],
    'Last Funding Type': [col for col in X.columns if col.startswith('LastFundingType__')],
    'All Funding Type': [col for col in X.columns if col.startswith('FundingType__')],
    'size of the company': [col for col in X.columns if col.startswith('NumberEmployees__')]
}

In [81]:
df.shape

(17924, 165)

In [82]:
df.columns.to_list()

['Number of Founders',
 'Number of Funding Rounds',
 'Last Funding Amount (in USD)',
 'Last Equity Funding Amount (in USD)',
 'Total Equity Funding Amount (in USD)',
 'Total Funding Amount (in USD)',
 'Number of Investors',
 'Number of Investments',
 'Last Funding Year',
 'Founded Year',
 'Status__Private',
 'CompanyType__For Profit',
 'CompanyType__Non-profit',
 'NumberEmployees__1-10',
 'NumberEmployees__10001+',
 'NumberEmployees__1001-5000',
 'NumberEmployees__101-250',
 'NumberEmployees__11-50',
 'NumberEmployees__251-500',
 'NumberEmployees__5001-10000',
 'NumberEmployees__501-1000',
 'NumberEmployees__51-100',
 'FundingType__Angel',
 'FundingType__Convertible Note',
 'FundingType__Corporate Round',
 'FundingType__Debt Financing',
 'FundingType__Equity Crowdfunding',
 'FundingType__Grant',
 'FundingType__Initial Coin Offering',
 'FundingType__Non-equity Assistance',
 'FundingType__Post-IPO Debt',
 'FundingType__Post-IPO Equity',
 'FundingType__Post-IPO Secondary',
 'FundingType__

In [85]:
#  mappin from feature to categorical variable - otherwise big problema
feature_to_category = {}
for category, dummy_columns in categorical_variables.items():
    for feature in dummy_columns:
        feature_to_category[feature] = category

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# get vars means, mins, and maxs
feature_means = X_train.mean()
feature_mins = X_train.min()
feature_maxs = X_train.max()


In [86]:
#  variables to be auto filled based on industry name - STILL YOU NEED TO WORK 
auto_fill_variables = [
    'Net Margin', 'Norm EV/Sales', 'Pre-tax Operating Margin', 'PBV', 'Norm ROE',
    'EV/ Invested Capital', 'ROIC', 'EV/EBITDAR&D', 'EV/EBITDA', 'EV/EBIT',
    'EV/EBIT (1-t)', 'EV/EBITDAR&D2', 'EV/EBITDA3', 'EV/EBIT4', 'EV/EBIT (1-t)5',
    'Norm % of Money Losing firms (Trailing)', 'Current PE', 'Trailing PE', 'Norm Forward PE',
    'Aggregate Mkt Cap/ Trailing Net Income (only money making firms)', 'Norm Expected growth 5 years'
]

In [87]:
# build and train the model - THIS IS ACTUALLY REALLY NICE! HAPPY
def build_and_train_enhanced_model(X_train, y_train, learning_rate=0.001, batch_size=32, epochs=500, scaler=None):
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    
    model = Sequential([
        Dense(512, input_dim=X_train.shape[1], activation='relu'),
        BatchNormalization(),
        Dropout(0.4),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.4),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(1)
    ])

    optimizer = Nadam(learning_rate=LEARNING_RATE)
    model.compile(loss='mean_squared_error', optimizer=optimizer)

    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=PATIENCE,
        restore_best_weights=True,
        verbose=1
    )

    reduce_lr = ReduceLROnPlateau( monitor='val_loss',
        factor=0.2,
        patience=PATIENCE,
        min_lr=MIN_LR,
        verbose=1)
     
    history = model.fit(X_train, y_train, 
        validation_data=(X_val, y_val), 
        epochs=EPOCHS, 
        batch_size=BATCH_SIZE, 
        callbacks=[early_stopping, reduce_lr], 
        verbose=1)

    return model, history, scaler

# train & save
model, history, scaler = build_and_train_enhanced_model(X_train_scaled, y_train, scaler=scaler)


Epoch 1/500


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m359/359[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - loss: 0.5157 - val_loss: 0.0283 - learning_rate: 0.0010
Epoch 2/500
[1m359/359[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0710 - val_loss: 0.0102 - learning_rate: 0.0010
Epoch 3/500
[1m359/359[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0317 - val_loss: 0.0052 - learning_rate: 0.0010
Epoch 4/500
[1m359/359[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0173 - val_loss: 0.0045 - learning_rate: 0.0010
Epoch 5/500
[1m359/359[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0108 - val_loss: 0.0070 - learning_rate: 0.0010
Epoch 6/500
[1m359/359[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0082 - val_loss: 0.0046 - learning_rate: 0.0010
Epoch 7/500
[1m359/359[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0066 - val_loss: 0.0042 - learning_rate: 0.00

In [88]:
# update  saving code:
save_model(model, 'optimized_model.h5')
joblib.dump(scaler, 'scaler.save')
joblib.dump(feature_means, 'feature_means.save')
print("Model, scaler, and feature means have been saved.")

# load the model, scaler, and feature means
model = load_model('optimized_model.h5')
scaler = joblib.load('scaler.save')
feature_means = joblib.load('feature_means.save')

# Load the column names (features)
feature_columns = feature_means.index.tolist()



Model, scaler, and feature means have been saved.


In [89]:
# Reconstruct categorical variables and mappings
categorical_variables = {
    'Industry': [col for col in feature_columns if col.startswith('Industry_')],
    'Region': [col for col in feature_columns if col.startswith('Region_')],
    'Last Funding Type': [col for col in feature_columns if col.startswith('LastFundingType__')],
    'All Funding Type': [col for col in feature_columns if col.startswith('FundingType__')],
    'size of the company': [col for col in feature_columns if col.startswith('NumberEmployees__')]
}
# here create mapping from feature to categorical variable
feature_to_category = {}
for category, dummy_columns in categorical_variables.items():
    for feature in dummy_columns:
        feature_to_category[feature] = category

In [90]:
# preprocess new input data
def preprocess_input(input_data):
    input_df = pd.DataFrame([input_data])
    
    for col in X.columns:
        if col not in input_df.columns:
            if col in feature_means:
                input_df[col] = feature_means[col]
            else:
                input_df[col] = 0  
    
    input_df = input_df[X.columns] # reorder columns to match the training data structure
    input_scaled = scaler.transform(input_df)  # scale the input data using the pre-fitted scaler
    
    return input_scaled

In [91]:
# make predictions using the trained model
def predict(input_data):
    # handle missing Total Funding Amount
    if 'Total Funding Amount (in USD)' not in input_data:
        print("Warning: Feature 'Total Funding Amount (in USD)' is missing. Using mean value.")
        input_data['Total Funding Amount (in USD)'] = feature_means['Total Funding Amount (in USD)']
    
    # 
    preprocessed_data = preprocess_input(input_data)
    
    # 
    raw_prediction = model.predict(preprocessed_data)[0][0]
    
    # 
    if 'normalized_valuation' in feature_mins and 'normalized_valuation' in feature_maxs:
        min_val = feature_mins['normalized_valuation']
        max_val = feature_maxs['normalized_valuation']
        
        # 
        denormalized_prediction = raw_prediction * (max_val - min_val) + min_val
        
        print(f"Normalized Predicted Valuation: {raw_prediction:.6f}")
        print(f"Denormalized Predicted Valuation: ${denormalized_prediction:,.2f}")
    else:
        # Fallback behavior in case 'normalized_valuation' is missing
        print("Warning: 'normalized_valuation' not found in feature_mins or feature_maxs. Using raw prediction.")
        denormalized_prediction = raw_prediction
        
        print(f"Raw Predicted Valuation (no denormalization applied): {raw_prediction:.6f}")
    
    return denormalized_prediction

In [94]:
def get_user_input():
    input_data = {}
    input_data['Number of Founders'] = int(input("Enter Number of Founders: "))
    input_data['Number of Funding Rounds'] = int(input("Enter Number of Funding Rounds: "))
    input_data['Last Funding Amount (in USD)'] = float(input("Enter Last Funding Amount (in USD): "))
    input_data['Last Equity Funding Amount (in USD)'] = float(input("Enter Last Equity Funding Amount (in USD): "))
    input_data['Total Equity Funding Amount (in USD)'] = float(input("Enter Total Equity Funding Amount (in USD): "))
    input_data['Total Funding Amount (in USD)'] = float(input("Enter Total Funding Amount (in USD): "))
    input_data['Number of Investors'] = int(input("Enter Number of Investors: "))
    input_data['Last Funding Year'] = int(input("Enter Last Funding Year: "))
    input_data['Founded Year'] = int(input("Enter Founded Year: "))
    
    # Get categorical variables
    status = input("Enter Status (Private/Public): ")
    if status.lower() == "private":
        input_data['Status__Private'] = 1.0
        input_data['Status__Public'] = 0.0
    else:
        input_data['Status__Private'] = 0.0
        input_data['Status__Public'] = 1.0
        
    company_type = input("Enter Company Type (For Profit/Non-profit): ")
    if company_type.lower() == "for profit":
        input_data['CompanyType__For Profit'] = 1.0
        input_data['CompanyType__Non-profit'] = 0.0
    else:
        input_data['CompanyType__For Profit'] = 0.0
        input_data['CompanyType__Non-profit'] = 1.0
        
    # Get list of employees
    employees = input("Enter Number of Employees (separate with commas): ").split(',')
    employees = [employee.strip() for employee in employees]
    for employee in employees:
        input_data[f'NumberEmployees__{employee}'] = 1.0
        
    # Get list of funding types
    funding_types = input("Enter Funding Types (separate with commas): ").split(',')
    funding_types = [funding_type.strip() for funding_type in funding_types]
    for funding_type in funding_types:
        input_data[f'FundingType__{funding_type}'] = 1.0
        
    # Get list of last funding types
    last_funding_types = input("Enter Last Funding Types (separate with commas): ").split(',')
    last_funding_types = [last_funding_type.strip() for last_funding_type in last_funding_types]
    for last_funding_type in last_funding_types:
        input_data[f'LastFundingType__{last_funding_type}'] = 1.0
        
    # Get list of regions
    regions = input("Enter Regions (separate with commas): ").split(',')
    regions = [region.strip() for region in regions]
    for region in regions:
        input_data[f'Region__{region}'] = 1.0
        
    # Get list of industries
    industries = input("Enter Industries (separate with commas): ").split(',')
    industries = [industry.strip() for industry in industries]
    for industry in industries:
        input_data[f'Industry__{industry}'] = 1.0
        
    # Compute Norm Total Funding
    total_funding = input_data['Last Equity Funding Amount (in USD)'] + input_data['Total Equity Funding Amount (in USD)'] + input_data['Total Funding Amount (in USD)']
    min_funding = 0
    max_funding = 10000000000000  # Assuming a maximum funding of 1 billion USD
    input_data['Norm Total Funding'] = (total_funding - min_funding) / (max_funding - min_funding)
    
    def compute_variables(input_data, df, selected_regions, selected_industries):
    # Compute other variables
        for region in selected_regions:
            for industry in selected_industries:
            # Filter the DataFrame for the selected region and industry
                filtered_df = df[(df[f'Region_{region}'] == 1.0) & (df[f'Industry_{industry}'] == 1.0)]
                
                # Compute average values for each variable
                avg_ev_sales = filtered_df[f'{region}__{industry}_EV_Sales'].mean()
                avg_roe = filtered_df[f'{region}__{industry}_ROE'].mean()
                avg_expected_growth = filtered_df[f'{region}__{industry}_Expected_Growth'].mean()
                avg_forward_pe = filtered_df[f'{region}__{industry}_Forward_PE'].mean()
                avg_money_losing_firms = filtered_df[f'{region}__{industry}_Money_Losing_Firms'].mean()
                avg_price_sales = filtered_df[f'{region}__{industry}_Price_Sales'].mean()
                avg_net_margin = filtered_df[f'{region}__{industry}_Net_Margin'].mean()
                avg_pre_tax_operating_margin = filtered_df[f'{region}__{industry}_Pre_Tax_Operating_Margin'].mean()
                avg_pbv = filtered_df[f'{region}__{industry}_PBV'].mean()
                avg_ev_invested_capital = filtered_df[f'{region}__{industry}_EV_Invested_Capital'].mean()
                avg_roic = filtered_df[f'{region}__{industry}_ROIC'].mean()
                avg_ev_ebitdar_d = filtered_df[f'{region}__{industry}_EV_EBITDAR_D'].mean()
                avg_ev_ebitda = filtered_df[f'{region}__{industry}_EV_EBITDA'].mean()
                avg_ev_ebit = filtered_df[f'{region}__{industry}_EV_EBIT'].mean()
                avg_ev_ebit_1_t = filtered_df[f'{region}__{industry}_EV_EBIT_1_T'].mean()
                avg_ev_ebitda3 = filtered_df[f'{region}__{industry}_EV_EBITDA3'].mean()
                avg_ev_ebit4 = filtered_df[f'{region}__{industry}_EV_EBIT4'].mean()
                avg_ev_ebit_1_t5 = filtered_df[f'{region}__{industry}_EV_EBIT_1_T5'].mean()
                avg_current_pe = filtered_df[f'{region}__{industry}_Current_PE'].mean()
                avg_trailing_pe = filtered_df[f'{region}__{industry}_Trailing_PE'].mean()
                avg_aggregate_mkt_cap_net_income = filtered_df[f'{region}__{industry}_Aggregate_Mkt_Cap_Net_Income'].mean()
                avg_aggregate_mkt_cap_trailing_net_income = filtered_df[f'{region}__{industry}_Aggregate_Mkt_Cap_Trailing_Net_Income'].mean()
                avg_peg_ratio = filtered_df[f'{region}__{industry}_PEG_Ratio'].mean()
                
                input_data[f'Norm EV/Sales'] = avg_ev_sales
                input_data[f'Norm ROE'] = avg_roe
                input_data[f'Norm Expected growth 5 years'] = avg_expected_growth
                input_data[f'Norm Forward PE'] = avg_forward_pe
                input_data[f'Norm % of Money Losing firms (Trailing)'] = avg_money_losing_firms
                input_data[f'Price/Sales'] = avg_price_sales
                input_data[f'Net Margin'] = avg_net_margin
                input_data[f'Pre-tax Operating Margin'] = avg_pre_tax_operating_margin
                input_data[f'PBV'] = avg_pbv
                input_data[f'EV/ Invested Capital'] = avg_ev_invested_capital
                input_data[f'ROIC'] = avg_roic
                input_data[f'EV/EBITDAR&D'] = avg_ev_ebitdar_d
                input_data[f'EV/EBITDA'] = avg_ev_ebitda
                input_data[f'EV/EBIT'] = avg_ev_ebit
                input_data[f'EV/EBIT (1-t)'] = avg_ev_ebit_1_t
                input_data[f'EV/EBITDA3'] = avg_ev_ebitda3
                input_data[f'EV/EBIT4'] = avg_ev_ebit4
                input_data[f'EV/EBIT (1-t)5'] = avg_ev_ebit_1_t5
                input_data[f'Current PE'] = avg_current_pe
                input_data[f'Trailing PE'] = avg_trailing_pe
                input_data[f'Aggregate Mkt Cap/ Net Income (all firms)'] = avg_aggregate_mkt_cap_net_income
                input_data[f'Aggregate Mkt Cap/ Trailing Net Income (only money making firms)'] = avg_aggregate_mkt_cap_trailing_net_income
                input_data[f'PEG Ratio'] = avg_peg_ratio

    return input_data

In [93]:
if __name__ == "__main__":
    while True:
        input_data = input_for_prediction()
        print("\nYour input data for prediction:")
        print(input_data)
        proceed = input("\nDo you want to input another set of data? (y/n): ").strip().lower()
        if proceed != 'y':
            print("Exiting the input session.")
            break


Provide input for each feature (or press Enter to use mean value):

Select size of the company(s) from the list (you can select multiple):
1. NumberEmployees__1-10
2. NumberEmployees__10001+
3. NumberEmployees__1001-5000
4. NumberEmployees__101-250
5. NumberEmployees__11-50
6. NumberEmployees__251-500
7. NumberEmployees__5001-10000
8. NumberEmployees__501-1000
9. NumberEmployees__51-100

Select All Funding Type(s) from the list (you can select multiple):
1. FundingType__Angel
2. FundingType__Convertible Note
3. FundingType__Corporate Round
4. FundingType__Debt Financing
5. FundingType__Equity Crowdfunding
6. FundingType__Grant
7. FundingType__Initial Coin Offering
8. FundingType__Non-equity Assistance
9. FundingType__Post-IPO Debt
10. FundingType__Post-IPO Equity
11. FundingType__Post-IPO Secondary
12. FundingType__Pre-Seed
13. FundingType__Private Equity
14. FundingType__Product Crowdfunding
15. FundingType__Secondary Market
16. FundingType__Seed
17. FundingType__Series A
18. Funding

KeyError: 'Industry'