In [1]:
import ast
import joblib
import pickle
import warnings
from collections import Counter
from pathlib import Path
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd

import optuna
from optuna.distributions import IntDistribution, FloatDistribution, CategoricalDistribution
from optuna.integration import OptunaSearchCV

from pocketknife.database import connect_database, read_from_database

from scipy.stats import t, ttest_1samp
import statsmodels.formula.api as smf

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

from xgboost import XGBClassifier

from success_prediction.modelling.config import (
    ALL_BINARY_FEATURE_COLS, ALL_CATEGORICAL_FEATURE_COLS, ALL_CONTINUOUS_FEATURE_COLS, 
    FOUNDING_WEBSITE_FEATURE_COLS, CURRENT_WEBSITE_FEATURE_COLS, TARGET_COLS
)
from success_prediction.config import RAW_DATA_DIR, PROCESSED_DATA_DIR, MODELS_DIR

warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.preprocessing._encoders")
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=optuna.exceptions.ExperimentalWarning)

optuna.logging.set_verbosity(optuna.logging.ERROR)

[32m2025-06-19 15:45:14.240[0m | [1mINFO    [0m | [36msuccess_prediction.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/manuelbolz/Documents/git/for_work/company_success_prediction[0m


# READ COMPANY SAMPLE

In [2]:
company_sample = pd.read_csv(RAW_DATA_DIR / 'company_sample' / 'until_2020' / '2020_sample_encoded_features.csv')

all_feature_df = company_sample[TARGET_COLS + ALL_BINARY_FEATURE_COLS + ALL_CATEGORICAL_FEATURE_COLS + ALL_CONTINUOUS_FEATURE_COLS]

  company_sample = pd.read_csv(RAW_DATA_DIR / 'company_sample' / 'until_2020' / '2020_sample_encoded_features.csv')


In [3]:
all_feature_df[TARGET_COLS].corr(method='pearson')

Unnamed: 0,target_inv_exit,target_acquisition,target_non_gov_investment,target_inno_subsidy
target_inv_exit,1.0,-0.032101,-0.017236,-0.019113
target_acquisition,-0.032101,1.0,-0.001043,-0.004808
target_non_gov_investment,-0.017236,-0.001043,1.0,0.348525
target_inno_subsidy,-0.019113,-0.004808,0.348525,1.0


In [4]:
company_sample[['target_inv_exit', 'current_vp_w']].corr(method='pearson')

Unnamed: 0,target_inv_exit,current_vp_w
target_inv_exit,1.0,-0.064871
current_vp_w,-0.064871,1.0


## Handle missing values in base features

In [5]:
missing_df = all_feature_df.isna().sum().loc[lambda x: x > 0].to_frame('n_missing')
missing_df[missing_df['n_missing'] > 0]

Unnamed: 0,n_missing
district_id,3
canton_id,3
urban_rural,3
typology_9c,3
typology_25c,3
founding_dominant_language,60987
current_dominant_language,64720
firm_name_length,1
population,6
founding_mean_text_len,60987


In [6]:
# Drop the row with missing firm name length
all_feature_df = all_feature_df[all_feature_df['firm_name_length'].notna()]

# Fill missing population with 0
all_feature_df['population'] = all_feature_df['population'].fillna(0)

# Keep the following collumns as missings since they are in locations not belonging to any canton, missing is correct in this case
for col in ['district_id', 'canton_id', 'urban_rural', 'typology_9c', 'typology_25c']:
    all_feature_df[col] = all_feature_df[col].fillna(-1)

In [7]:
for col in ALL_BINARY_FEATURE_COLS + TARGET_COLS:
    if col in all_feature_df.columns:
        all_feature_df[col] = all_feature_df[col].astype('int8')

for col in ALL_CONTINUOUS_FEATURE_COLS:
    if col in all_feature_df.columns:
        all_feature_df[col] = all_feature_df[col].astype('float32')

for col in ALL_CATEGORICAL_FEATURE_COLS:
    if col in all_feature_df.columns:
        all_feature_df[col] = all_feature_df[col].astype('category')

## LogReg Features

For logistic regression, the features listed below are included.
- High-cardinality categorical features are removed and the remaining one-hot encoded.
- Continuous features are scaled using StandardScaler().
- Binary features are used as is.

In [8]:
LOGREG_BINARY_FEATURES = [
    'firm_name_swiss_ref',
    'firm_name_holding_ref',
    'firm_name_geog_ref',
    'firm_name_founder_match',
    'firm_name_male_match',
    'firm_name_female_match',
    'bps_geographic_term',
    'bps_male_name',
    'bps_female_name',
]

# < 30 categories and not strongly correlated
LOGREG_LOW_CAT_FEATURES = [
    'founding_legal_form',
    'section_1_label',
    'typology_9c',
    'canton_id',
    'bps_length_quantiles_5',
    'founding_dominant_language',
    'current_dominant_language',
]

LOGREG_HIGH_CAT_FEATURES = []  # No features with high cardinality

LOGREG_CONTINUOUS_FEATURES = [  
    'capital_chf',
    'firm_name_length',
    'population',
    'n_firms_within_10m',
    'n_firms_within_2.5km',
    'n_founders',
    'n_inscribed_firms',
    'n_distinct_nationalities',
    'pct_female_founders',
    'pct_foreign_founders',
    'pct_dr_titles',
    'pct_founders_same_residence',
    'pct_founders_with_prior_founding',
    'n_dissolved_firms',
    'n_existing_firms',
    'bps_mean_word_length',
    'bps_lix',
    'bps_min_word_freq_norm',
    'bps_max_word_freq_norm',
    'bps_freq_ratio_norm',
    'founding_mean_text_len',
    'founding_n_internal_links_mean',
    'founding_n_external_links_mean',
    'founding_n_languages',
    'founding_pr_sdg_similarity',
    'founding_pr_w_sdg_similarity',
    'founding_pr_w_red_sdg_similarity',
    'founding_doc2vec_diff',
    'founding_lp',
    'founding_lp_w',
    'founding_lp_w_red',
    'founding_vp',
    'founding_vp_w',
    'founding_vp_w_red',
    'current_mean_text_len',
    'current_n_internal_links_mean',
    'current_n_external_links_mean',
    'current_n_languages',
    'current_pr_sdg_similarity',
    'current_pr_w_sdg_similarity',
    'current_pr_w_red_sdg_similarity',
    'current_doc2vec_diff',
    'current_lp',
    'current_lp_w',
    'current_lp_w_red',
    'current_vp',
    'current_vp_w',
    'current_vp_w_red',
    'days_of_prior_observations',
    'prediction_1_score',
]

## RandomForest Features

For RF, the features listed below are included.
- All included categorical features are encoded using an ordinal encoding.
- Binary and categorical are used as is (unscaled).

In [9]:
RF_BINARY_FEATURES = [
    'firm_name_swiss_ref',
    'firm_name_holding_ref',
    'firm_name_geog_ref',
    'firm_name_founder_match',
    'firm_name_male_match',
    'firm_name_female_match',
    'bps_geographic_term',
    'bps_male_name',
    'bps_female_name',
]

# < 30 categories
RF_LOW_CAT_FEATURES = [
    'founding_legal_form',
    'section_1_label',
    'section_2_label',
    'section_3_label',
    'canton_id',
    'urban_rural',
    'typology_9c',
    'founding_dominant_language',
    'current_dominant_language',
]

RF_HIGH_CAT_FEATURES = [
    'division_1_label',
    'group_1_label',
    'class_1_label',
    'division_2_label',
    'group_2_label',
    'class_2_label',
    'division_3_label',
    'group_3_label',
    'class_3_label',
    'founding_bfs_code',
    'district_id',
]

RF_CONTINUOUS_FEATURES = [
    'capital_chf',
    'firm_name_length',
    'latitude',
    'longitude',
    'population',
    'n_firms_within_10m',
    'n_firms_within_1km',
    'n_firms_within_2.5km',
    'n_firms_within_10km',
    'n_founders',
    'n_inscribed_firms',
    'n_distinct_nationalities',
    'pct_female_founders',
    'pct_foreign_founders',
    'pct_dr_titles',
    'pct_founders_same_residence',
    'pct_founders_with_prior_founding',
    'n_dissolved_firms',
    'n_existing_firms',
    'bps_length',
    'bps_mean_word_length',
    'bps_lix',
    'bps_min_word_freq_norm',
    'bps_max_word_freq_norm',
    'bps_freq_ratio_norm',
    'founding_mean_text_len',
    'founding_n_internal_links_mean',
    'founding_n_external_links_mean',
    'founding_n_languages',
    'founding_pr_sdg_similarity',
    'founding_pr_w_sdg_similarity',
    'founding_pr_w_red_sdg_similarity',
    'founding_doc2vec_diff',
    'founding_lp',
    'founding_lp_w',
    'founding_lp_w_red',
    'founding_vp',
    'founding_vp_w',
    'founding_vp_w_red',
    'current_mean_text_len',
    'current_n_internal_links_mean',
    'current_n_external_links_mean',
    'current_n_languages',
    'current_pr_sdg_similarity',
    'current_pr_w_sdg_similarity',
    'current_pr_w_red_sdg_similarity',
    'current_doc2vec_diff',
    'current_lp',
    'current_lp_w',
    'current_lp_w_red',
    'current_vp',
    'current_vp_w',
    'current_vp_w_red',
    'days_of_prior_observations',
    'prediction_1_score',
    'prediction_2_score',
    'prediction_3_score',
]

## XGBoost Features

For XGBoost, the features listed below are included.
- Low-cardinality categorical features are one-hot encoded.
- High-cardinality categorical features are encoded using ordinal encoding.
- Binary and continuous features are included as is (unscaled).

In [10]:
XGB_BINARY_FEATURES = [
    'firm_name_swiss_ref',
    'firm_name_holding_ref',
    'firm_name_geog_ref',
    'firm_name_founder_match',
    'firm_name_male_match',
    'firm_name_female_match',
    'bps_geographic_term',
    'bps_male_name',
    'bps_female_name',
]

XGB_LOW_CAT_FEATURES = [
    'founding_legal_form',
    'section_1_label',  # Top 3 NOGA Levels
    'section_2_label',
    'section_3_label',
    'urban_rural',  # Cat
    'typology_9c',  # Cat
    'founding_dominant_language',
    'current_dominant_language',
]

XGB_HIGH_CAT_FEATURES = [
    'division_1_label',
    'group_1_label',
    'class_1_label',

    'division_2_label',
    'group_2_label',
    'class_2_label',

    'division_3_label',
    'group_3_label',
    'class_3_label',

    'founding_bfs_code',
    'district_id',
    'canton_id',

    'typology_25c',  # Cat
]

XGB_CONTINUOUS_FEATURES = [  
    'capital_chf',
    'firm_name_length',
    'latitude',
    'longitude',
    'population',
    'n_firms_within_10m',
    'n_firms_within_1km',
    'n_firms_within_2.5km',
    'n_firms_within_10km',
    'n_founders',
    'n_inscribed_firms',
    'n_distinct_nationalities',
    'pct_female_founders',
    'pct_foreign_founders',
    'pct_dr_titles',
    'pct_founders_same_residence',
    'pct_founders_with_prior_founding',
    'n_dissolved_firms',
    'n_existing_firms',
    'bps_length',
    'bps_mean_word_length',
    'bps_lix',
    'bps_min_word_freq_norm',
    'bps_max_word_freq_norm',
    'bps_freq_ratio_norm',
    'founding_mean_text_len',
    'founding_n_internal_links_mean',
    'founding_n_external_links_mean',
    'founding_n_languages',
    'founding_pr_sdg_similarity',
    'founding_pr_w_sdg_similarity',
    'founding_pr_w_red_sdg_similarity',
    'founding_doc2vec_diff',
    'founding_lp',
    'founding_lp_w',
    'founding_lp_w_red',
    'founding_vp',
    'founding_vp_w',
    'founding_vp_w_red',
    'current_mean_text_len',
    'current_n_internal_links_mean',
    'current_n_external_links_mean',
    'current_n_languages',
    'current_pr_sdg_similarity',
    'current_pr_w_sdg_similarity',
    'current_pr_w_red_sdg_similarity',
    'current_doc2vec_diff',
    'current_lp',
    'current_lp_w',
    'current_lp_w_red',
    'current_vp',
    'current_vp_w',
    'current_vp_w_red',
    'days_of_prior_observations',
    'prediction_1_score',
    'prediction_2_score',
    'prediction_3_score',
]

# MODEL TRAINING AND EVALUATION CLASS

In [11]:
class ModelEvaluation:

    def __init__(self, target_vars: list[str], model_specs: dict, random_state: int = 42):
        """
        target_vars: list of target column names
        model_specs: dict with keys as model names, values as dicts with:
                     - 'model': a scikit-learn estimator class or a callable returning an instance
                     - 'param_grid': dict of hyperparameter grid for grid search
        Example:
        model_specs = {
            'logreg': {'model': LogisticRegression, 'param_grid': {...}},
            'rf': {'model': RandomForestClassifier, 'param_grid': {...}}
        }
        """
        self.target_vars = target_vars
        self.model_specs = model_specs
        self.random_state = random_state
        self.best_params = {m: {t: None for t in target_vars} for m in model_specs}
        self.selected_features = {m: {t: None for t in target_vars} for m in model_specs}
        self.best_models = {m: {t: None for t in target_vars} for m in model_specs}
        self.production_models = {m: {t: None for t in target_vars} for m in model_specs}
        self.metrics_report = {m: {t: None for t in target_vars} for m in model_specs}
        
        for model_name, spec in self.model_specs.items():
            self._assert_feature_lists_mutually_exclusive(spec['features'])

    def _assert_feature_lists_mutually_exclusive(self, features_dict):
        all_features = []
        for name, feature_list in features_dict.items():
            all_features.extend(feature_list)
        if len(all_features) != len(set(all_features)):
            duplicates = [item for item, count in Counter(all_features).items() if count > 1]
            raise ValueError(f"Duplicate features detected: {duplicates}")

    def _assert_all_features_present(self, features_dict):
        all_features = []
        for name, feature_list in features_dict.items():
            all_features.extend(feature_list)
        missing_features = [feature for feature in all_features if feature not in self.feature_cols]
        if missing_features:
            raise ValueError(f"Missing features detected: {missing_features}")
        
    def load_data(self, df: pd.DataFrame):
        self.df = df.copy()
        self.feature_cols = [col for col in df.columns if col not in self.target_vars]
        for model_name, spec in self.model_specs.items():
            self._assert_all_features_present(spec['features'])

    def load_best_params(self, file_path: Path, model_names: list[str], target_vars: list[str]):
        with open(file_path, 'rb') as f:
            all_params = pickle.load(f)
        self.best_params = {
            m: {t: all_params[m][t] for t in target_vars}
            for m in model_names
        }

    def load_best_features(self, file_path: Path, model_names: list[str], target_vars: list[str], additional_features: list = []):
        with open(file_path, 'rb') as f:
            all_features = pickle.load(f)
        self.selected_features = {
            m: {t: all_features[m][t] + additional_features for t in target_vars}
            for m in model_names
        }

    def _get_feature_importances(self, model):
        """Return feature importances as a pandas Series, sorted descending."""
        # Tree-based models
        if hasattr(model, "feature_importances_"):
            return model.feature_importances_
        # Linear models (coefficients)
        elif hasattr(model, "coef_"):
            return np.abs(model.coef_).flatten()
        else:
            return None

    def _add_class_weights_to_fit_params(self, fit_params, ModelClass, model_name, y):
        
        if not self.model_specs[model_name]['account_for_class_weights']:
            return fit_params

        if model_name == 'xgb':
            classes = np.unique(y)
            n_pos = np.sum(y == classes[1])
            n_neg = np.sum(y == classes[0])
            fit_params['scale_pos_weight'] = n_neg / n_pos if n_pos > 0 else 1.0

        elif hasattr(ModelClass(), 'class_weight'):
            fit_params['class_weight'] = 'balanced'

        return fit_params

    def _build_preprocessor(self, preprocessor_steps: list[tuple], model_name: str, target: str, best_features: bool = False):
        steps = []
        for name, transformer in preprocessor_steps:
            features = self.model_specs[model_name]['features'][name]
            if best_features:
                features = [col for col in features if col in self.selected_features[model_name][target]]
            steps.append((name, transformer, features))
        return ColumnTransformer(steps)

    def nested_cv_with_feature_selection(
        self,
        k_outer: int = 5,
        k_inner: int = 3,
        min_features_to_select: int = 10,
        scoring: str = "average_precision"
    ):
        """
        Perform feature selection (RFECV) using cross-validation without hyperparameter tuning.

        Saves the most frequently selected features across outer folds.
        """
        print("Starting nested CV with feature selection...")

        for model_name, spec in tqdm(self.model_specs.items(), desc="Models"):

            ModelClass = spec['model']

            for target in tqdm(self.target_vars, desc=f"{model_name} targets", leave=False):
                print(f"Model: {model_name} | Target: {target}")

                preprocessor = self._build_preprocessor(spec['preprocessor_steps'], model_name, target)

                X = self.df[self.feature_cols]
                y = self.df[target]

                outer_skf = StratifiedKFold(n_splits=k_outer, shuffle=True, random_state=self.random_state)

                selected_feature_masks = []

                for train_idx, _ in tqdm(list(outer_skf.split(X, y)), desc="Outer folds", leave=False):
                    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]

                    X_train_proc = preprocessor.fit_transform(X_train)

                    fit_params = self._add_class_weights_to_fit_params(spec['fit_params'], ModelClass, model_name, y_train)
                    model = ModelClass(**fit_params)
                    
                    inner_skf = StratifiedKFold(n_splits=k_inner, shuffle=True, random_state=self.random_state)
                    rfecv = RFECV(
                        estimator=model,
                        step=1,
                        min_features_to_select=min_features_to_select,
                        cv=inner_skf,
                        scoring=scoring,
                        n_jobs=-1
                    )

                    rfecv.fit(X_train_proc, y_train)
                    selected_feature_masks.append(rfecv.support_)

                # Aggregate selected features across folds
                selected_feature_masks = np.array(selected_feature_masks)
                mean_mask = selected_feature_masks.mean(axis=0)
                threshold = 0.6  # at least 60% of folds must have selected a feature
                final_mask = mean_mask >= threshold
                final_features = np.array(self.feature_cols)[final_mask].tolist()

                self.selected_features[model_name][target] = final_features
                print(f"[{model_name}/{target}] Selected {len(final_features)} features: {final_features}")

    def nested_cv_with_hyperparam_search(
        self,
        out_folder: Path,
        k_outer: int = 5,
        k_inner: int = 3,
        best_features: bool = False,
        n_trials: int = 200,
        scoring: str = 'average_precision'
    ) -> None:
        
        if best_features and not self.selected_features:
            raise ValueError('To use the best features execure find_best_feature_subset first!')

        print("Starting nested CV with hyperparameter search...")

        for model_name, spec in tqdm(self.model_specs.items(), desc="Models"):

            ModelClass = spec['model']
            param_grid = spec['param_grid']

            for target in tqdm(self.target_vars, desc=f"{model_name} targets", leave=False):
                print(f"[STARTED] Model: {model_name} | Target: {target}")

                # Initialize preprocessor with the specified feature columns from the model specs
                preprocessor = self._build_preprocessor(
                    spec['preprocessor_steps'], model_name, target, best_features
                )

                X = self.df[self.feature_cols]  # Always select all features, dropping of unspecified features is handled by the preprocessor
                y = self.df[target]

                outer_skf = StratifiedKFold(n_splits=k_outer, shuffle=True, random_state=self.random_state)
                
                outer_metrics = {'roc_auc': [], 'pr_auc': [], 'f1_macro': []}
                inner_best_scores = []
                inner_best_params = []
                
                # Outer loop over k_outer folds
                for train_idx, test_idx in tqdm(list(outer_skf.split(X, y)), desc="Outer folds", leave=False):
                    
                    # Init data of the current outer fold
                    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
                    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
                    
                    X_train_proc = preprocessor.fit_transform(X_train)
                    X_test_proc = preprocessor.transform(X_test)

                    # Only include relevant fit_params for this model
                    fit_params = self._add_class_weights_to_fit_params(spec['fit_params'], ModelClass, model_name, y_train)
                    model = ModelClass(**fit_params)

                    # Set up inner grid search loop for k_inner folds
                    inner_skf = StratifiedKFold(n_splits=k_inner, shuffle=True, random_state=self.random_state)
                    if spec['search_type'] == 'grid':
                        searcher = GridSearchCV(
                            estimator=model,
                            param_grid=param_grid,
                            cv=inner_skf,
                            scoring=scoring,  # Because highly imbalanced data
                            n_jobs=-1
                        )
                    elif spec['search_type'] == 'optuna':
                        searcher = OptunaSearchCV(
                            estimator=model,
                            param_distributions=param_grid,
                            cv=inner_skf,
                            scoring=scoring,
                            n_trials=n_trials,
                            n_jobs=-1,
                            random_state=self.random_state,
                            verbose=0,
                        )
                    elif spec['search_type'] is None:
                        pass
                    else:
                        raise ValueError("search_type must be 'grid' or 'optuna'")

                    if spec['search_type'] is None:
                        best_params = {}
                    else:
                        # Determine best hyperparameters of this fold using only the training data and not testing
                        # training data is then again split into k_inner folds
                        searcher.fit(X_train_proc, y_train)

                        best_params = searcher.best_params_
                        # Select and store best hyperparam config determined on the training data
                        inner_best_scores.append(searcher.best_score_)
                        inner_best_params.append(best_params)
                    
                    # Refit model with full training data to estimate auc and feature importance of the outer fold
                    temp_fit_params = {**fit_params, 'n_jobs': -1}  # For training without CV set to -1
                    params = dict(best_params, **temp_fit_params)
                    best_model = ModelClass(**params)
                    best_model.fit(X_train_proc, y_train)
                    y_pred = best_model.predict(X_test_proc)
                    y_pred_proba = best_model.predict_proba(X_test_proc)[:, 1]
                    
                    outer_metrics['roc_auc'].append(roc_auc_score(y_test, y_pred_proba))
                    outer_metrics['pr_auc'].append(average_precision_score(y_test, y_pred_proba))
                    outer_metrics['f1_macro'].append(f1_score(y_test, y_pred, average="macro"))

                if spec['search_type'] is None:
                    overall_lambda_star = {}  # No tuned hyperparams
                else:
                    # Select hyperparameters from the inner folds that achieved the highest score
                    best_idx = np.argmax(inner_best_scores)
                    overall_lambda_star = inner_best_params[best_idx]

                # Set class weights again based on the full data
                fit_params = self._add_class_weights_to_fit_params(spec['fit_params'], ModelClass, model_name, y)

                # Retrain final model on all data
                temp_fit_params = {**fit_params, 'n_jobs': -1}  # For training without CV set to -1
                best_params = dict(overall_lambda_star, **temp_fit_params)
                production_model = ModelClass(**best_params)
                
                X_proc = preprocessor.fit_transform(X)
                production_model.fit(X_proc, y)

                # Store production model
                self.best_models[model_name][target] = production_model

                # Store best hyperparameters
                self.best_params[model_name][target] = overall_lambda_star

                # Store metrics report for the avg performance of the model on the current target
                self.metrics_report[model_name][target] = {
                    'mean_roc_auc': np.mean(outer_metrics['roc_auc']),
                    'std_roc_auc': np.std(outer_metrics['roc_auc']),
                    'all_roc_auc': outer_metrics['roc_auc'],
                    'mean_pr_auc': np.mean(outer_metrics['pr_auc']),
                    'std_pr_auc': np.std(outer_metrics['pr_auc']),
                    'all_pr_auc': outer_metrics['pr_auc'],
                    'mean_f1_macro': np.mean(outer_metrics['f1_macro']),
                    'std_f1_macro': np.std(outer_metrics['f1_macro']),
                    'all_f1_macro': outer_metrics['f1_macro'],
                    'best_params': overall_lambda_star,
                }
                print(f"[FINISHED] Model: {model_name} | Target: {target} | Mean ROC-AUC: {np.mean(outer_metrics['roc_auc']):.4f} | Mean PR-AUC: {np.mean(outer_metrics['pr_auc']):.4f}")

        self._save_models_and_reports(out_folder)

    def _save_models_and_reports(self, out_folder: Path):
        """Retrains model on the full dataset and stores production ready model with additional performance reports
        from the k-fold CV evaluations."""

        out_folder.mkdir(parents=True, exist_ok=True)

        # Save best models stored in self.best_models[model_name][target]
        models_dir = out_folder / 'trained_models'
        models_dir.mkdir(exist_ok=True)
        for model_name, targets in self.best_models.items():
            for target, model in targets.items():
                model_path = models_dir / f'{model_name}_{target}.joblib'
                joblib.dump(model, model_path)
                print(f"Saved model for {model_name}/{target} to {model_path}")

        # Save metrics report as csv file
        summary_rows = []
        for model_name, model_results in self.metrics_report.items():
            for target, d in model_results.items():
                summary_rows.append({
                    'model': model_name,
                    'target': target,
                    'mean_roc_auc': d['mean_roc_auc'],
                    'std_roc_auc': d['std_roc_auc'],
                    'all_roc_auc': str(d['all_roc_auc']),
                    'mean_pr_auc': d['mean_pr_auc'],
                    'std_pr_auc': d['std_pr_auc'],
                    'all_pr_auc': str(d['all_pr_auc']),
                    'mean_f1_macro': d['mean_f1_macro'],
                    'std_f1_macro': d['std_f1_macro'],
                    'all_f1_macro': str(d['all_f1_macro']),
                    'best_params': str(d['best_params']),
                })
        pd.DataFrame(summary_rows).to_csv(out_folder / 'cv_metrics_report.csv', index=False)
        print(f"Saved metrics summary to {str(out_folder / 'cv_metrics_report.csv')}")

        # Save hyperparameters
        best_params_path = out_folder / 'best_hyperparameters.pkl'
        with open(best_params_path, 'wb') as f:
            pickle.dump(self.best_params, f)
        print(f"Saved best hyperparameters to {best_params_path}")

        # Save best feature set
        best_features_path = out_folder / 'best_features.pkl'
        with open(best_features_path, 'wb') as f:
            pickle.dump(self.selected_features, f)
        print(f"Saved best features to {best_features_path}")

In [12]:
# Global params
RANDOM_STATE = 42
WEBSITE_FEATURE_COLS = FOUNDING_WEBSITE_FEATURE_COLS + CURRENT_WEBSITE_FEATURE_COLS

## Experiment A: Reproducability experiment and model evaluation on the full data set without website data


In [11]:
# Config for replication study using only baseline features
MODEL_SPECS = {
    'vanilla_logreg': {
        'model': LogisticRegression,
        'preprocessor_steps': [
            ('categorical_low_card', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
            ('continuous', StandardScaler()),
            ('binary', 'passthrough'),
        ],
        'features': {
            'binary': [f for f in LOGREG_BINARY_FEATURES if f not in WEBSITE_FEATURE_COLS],
            'categorical_low_card': [f for f in LOGREG_LOW_CAT_FEATURES if f not in WEBSITE_FEATURE_COLS],
            'categorical_high_card': [f for f in LOGREG_HIGH_CAT_FEATURES if f not in WEBSITE_FEATURE_COLS],  # empty
            'continuous': [f for f in LOGREG_CONTINUOUS_FEATURES if f not in WEBSITE_FEATURE_COLS],
        },
        'fit_params': {
            'random_state': RANDOM_STATE,
            'n_jobs': 1,  # Avoid thread thrashing, so model n_jobs should be set to 1 because Grid Search CV and Feature Selection is set to -1
            'max_iter': 10_000,
            'solver': 'saga',  # Fixed for computational efficiency
        },
        'param_grid': {},  # No hyperparams for vanilla LogReg
        'search_type': None,
        'account_for_class_weights': True
    },
    'logreg': {
        'model': LogisticRegression,
        'preprocessor_steps': [
            ('categorical_low_card', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
            ('continuous', StandardScaler()),
            ('binary', 'passthrough'),
        ],
        'features': {
            'binary': [f for f in LOGREG_BINARY_FEATURES if f not in WEBSITE_FEATURE_COLS],
            'categorical_low_card': [f for f in LOGREG_LOW_CAT_FEATURES if f not in WEBSITE_FEATURE_COLS],
            'categorical_high_card': [f for f in LOGREG_HIGH_CAT_FEATURES if f not in WEBSITE_FEATURE_COLS],  # empty
            'continuous': [f for f in LOGREG_CONTINUOUS_FEATURES if f not in WEBSITE_FEATURE_COLS],
        },
        'fit_params': {
            'random_state': RANDOM_STATE,
            'n_jobs': 1,  # Avoid thread thrashing, so model n_jobs should be set to 1 because Grid Search CV and Feature Selection is set to -1
            'max_iter': 10_000,
            'solver': 'saga',  # Fixed for computational efficiency
        },
        'param_grid': {
            'penalty': ['l1', 'l2'],  # Test Lasso and Ridge regularization
            'C': [0.01, 0.1, 1, 10, 100],
        },
        'search_type': 'grid',
        'account_for_class_weights': True
    },
    'rf': {
        'model': RandomForestClassifier,
        'preprocessor_steps': [
            ('categorical_low_card', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
            ('categorical_high_card', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
            ('continuous', 'passthrough'),
            ('binary', 'passthrough'),
        ],
        'features': {
            'binary': [f for f in RF_BINARY_FEATURES if f not in WEBSITE_FEATURE_COLS],
            'categorical_low_card': [f for f in RF_LOW_CAT_FEATURES if f not in WEBSITE_FEATURE_COLS],
            'categorical_high_card': [f for f in RF_HIGH_CAT_FEATURES if f not in WEBSITE_FEATURE_COLS],
            'continuous': [f for f in RF_CONTINUOUS_FEATURES if f not in WEBSITE_FEATURE_COLS],
        },
        'fit_params': {
            'random_state': RANDOM_STATE,
            'n_jobs': 1,
        },
        'param_grid': {
            'n_estimators': IntDistribution(100, 400, step=50),
            'max_depth':  CategoricalDistribution([None, 10, 20, 30]),
            'min_samples_split': IntDistribution(2, 20),
            'min_samples_leaf': IntDistribution(1, 10),
            'max_features': CategoricalDistribution(['sqrt', 'log2', 0.5]),
        },
        'search_type': 'optuna',
        'account_for_class_weights': True
    },
    'xgb': {
        'model': XGBClassifier,
        'preprocessor_steps': [
            ('categorical_low_card', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
            ('categorical_high_card', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
            ('continuous', 'passthrough'),
            ('binary', 'passthrough'),
        ],
        'features': {
            'binary': [f for f in XGB_BINARY_FEATURES if f not in WEBSITE_FEATURE_COLS],
            'categorical_low_card': [f for f in XGB_LOW_CAT_FEATURES if f not in WEBSITE_FEATURE_COLS],
            'categorical_high_card': [f for f in XGB_HIGH_CAT_FEATURES if f not in WEBSITE_FEATURE_COLS],
            'continuous': [f for f in XGB_CONTINUOUS_FEATURES if f not in WEBSITE_FEATURE_COLS],
        },
        'fit_params': {
            'random_state': RANDOM_STATE,
            'n_jobs': 1,
            'objective': 'binary:logistic',
            'verbosity': 0,
            'booster': 'gbtree',
            'tree_method': 'hist',
            'use_label_encoder': False,
            'eval_metric': 'aucpr',
        },
        'param_grid': {
            'max_depth': IntDistribution(3, 10),
            'min_child_weight': IntDistribution(1, 10),
            'gamma': FloatDistribution(0, 5.0),
            'subsample': FloatDistribution(0.5, 1.0),
            'colsample_bytree': FloatDistribution(0.5, 1.0),
            'learning_rate': FloatDistribution(0.005, 0.1, log=True),
            'n_estimators': IntDistribution(100, 400, step=50),
            'reg_alpha': FloatDistribution(0, 5.0),  # L1 regularization
            'reg_lambda': FloatDistribution(1.0, 10.0),  # L2 regularization
            'max_delta_step': IntDistribution(0, 10),
        },
        'search_type': 'optuna',
        'account_for_class_weights': True
    }
}

In [12]:
# 1. Load data for experiment A
base_df = all_feature_df[[col for col in all_feature_df.columns if col not in WEBSITE_FEATURE_COLS]]

# 2. Initialize model evaluation with targets and model specs
meval = ModelEvaluation(TARGET_COLS, MODEL_SPECS, random_state=RANDOM_STATE)
meval.load_data(base_df)

# 3. Training procedure on all features for the baseline reproduction
out_folder = MODELS_DIR / 'experiment_A'
meval.nested_cv_with_hyperparam_search(out_folder=out_folder)

Starting nested CV with hyperparameter search...


Models:   0%|          | 0/4 [00:00<?, ?it/s]

vanilla_logreg targets:   0%|          | 0/4 [00:00<?, ?it/s]

[STARTED] Model: vanilla_logreg | Target: target_inv_exit


Outer folds:   0%|          | 0/5 [00:00<?, ?it/s]

[FINISHED] Model: vanilla_logreg | Target: target_inv_exit | Mean ROC-AUC: 0.6897 | Mean PR-AUC: 0.2643
[STARTED] Model: vanilla_logreg | Target: target_acquisition


Outer folds:   0%|          | 0/5 [00:00<?, ?it/s]

[FINISHED] Model: vanilla_logreg | Target: target_acquisition | Mean ROC-AUC: 0.7769 | Mean PR-AUC: 0.0328
[STARTED] Model: vanilla_logreg | Target: target_non_gov_investment


Outer folds:   0%|          | 0/5 [00:00<?, ?it/s]

[FINISHED] Model: vanilla_logreg | Target: target_non_gov_investment | Mean ROC-AUC: 0.9084 | Mean PR-AUC: 0.0766
[STARTED] Model: vanilla_logreg | Target: target_inno_subsidy


Outer folds:   0%|          | 0/5 [00:00<?, ?it/s]

[FINISHED] Model: vanilla_logreg | Target: target_inno_subsidy | Mean ROC-AUC: 0.8603 | Mean PR-AUC: 0.0600


logreg targets:   0%|          | 0/4 [00:00<?, ?it/s]

[STARTED] Model: logreg | Target: target_inv_exit


Outer folds:   0%|          | 0/5 [00:00<?, ?it/s]

[FINISHED] Model: logreg | Target: target_inv_exit | Mean ROC-AUC: 0.6883 | Mean PR-AUC: 0.2648
[STARTED] Model: logreg | Target: target_acquisition


Outer folds:   0%|          | 0/5 [00:00<?, ?it/s]



[FINISHED] Model: logreg | Target: target_acquisition | Mean ROC-AUC: 0.7857 | Mean PR-AUC: 0.0325
[STARTED] Model: logreg | Target: target_non_gov_investment


Outer folds:   0%|          | 0/5 [00:00<?, ?it/s]



[FINISHED] Model: logreg | Target: target_non_gov_investment | Mean ROC-AUC: 0.9108 | Mean PR-AUC: 0.0788
[STARTED] Model: logreg | Target: target_inno_subsidy


Outer folds:   0%|          | 0/5 [00:00<?, ?it/s]



[FINISHED] Model: logreg | Target: target_inno_subsidy | Mean ROC-AUC: 0.8629 | Mean PR-AUC: 0.0613


rf targets:   0%|          | 0/4 [00:00<?, ?it/s]

[STARTED] Model: rf | Target: target_inv_exit


Outer folds:   0%|          | 0/5 [00:00<?, ?it/s]

[FINISHED] Model: rf | Target: target_inv_exit | Mean ROC-AUC: 0.7060 | Mean PR-AUC: 0.2914
[STARTED] Model: rf | Target: target_acquisition


Outer folds:   0%|          | 0/5 [00:00<?, ?it/s]

[FINISHED] Model: rf | Target: target_acquisition | Mean ROC-AUC: 0.7830 | Mean PR-AUC: 0.0724
[STARTED] Model: rf | Target: target_non_gov_investment


Outer folds:   0%|          | 0/5 [00:00<?, ?it/s]

[FINISHED] Model: rf | Target: target_non_gov_investment | Mean ROC-AUC: 0.9113 | Mean PR-AUC: 0.1576
[STARTED] Model: rf | Target: target_inno_subsidy


Outer folds:   0%|          | 0/5 [00:00<?, ?it/s]

[FINISHED] Model: rf | Target: target_inno_subsidy | Mean ROC-AUC: 0.8719 | Mean PR-AUC: 0.1130


xgb targets:   0%|          | 0/4 [00:00<?, ?it/s]

[STARTED] Model: xgb | Target: target_inv_exit


Outer folds:   0%|          | 0/5 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_inv_exit | Mean ROC-AUC: 0.7096 | Mean PR-AUC: 0.2932
[STARTED] Model: xgb | Target: target_acquisition


Outer folds:   0%|          | 0/5 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_acquisition | Mean ROC-AUC: 0.7626 | Mean PR-AUC: 0.0751
[STARTED] Model: xgb | Target: target_non_gov_investment


Outer folds:   0%|          | 0/5 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_non_gov_investment | Mean ROC-AUC: 0.9267 | Mean PR-AUC: 0.1672
[STARTED] Model: xgb | Target: target_inno_subsidy


Outer folds:   0%|          | 0/5 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_inno_subsidy | Mean ROC-AUC: 0.8783 | Mean PR-AUC: 0.1201
Saved model for vanilla_logreg/target_inv_exit to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_A/trained_models/vanilla_logreg_target_inv_exit.joblib
Saved model for vanilla_logreg/target_acquisition to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_A/trained_models/vanilla_logreg_target_acquisition.joblib
Saved model for vanilla_logreg/target_non_gov_investment to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_A/trained_models/vanilla_logreg_target_non_gov_investment.joblib
Saved model for vanilla_logreg/target_inno_subsidy to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_A/trained_models/vanilla_logreg_target_inno_subsidy.joblib
Saved model for logreg/target_inv_exit to /Users/manuelbolz/Documents/git/for_work/company_success_predic

In [13]:
"""
Experiment B: Performance difference between Doc2vec and my implementation
"""

BEST_MODEL = 'xgb'

MODELS = {
    'logreg': LogisticRegression,
    'rf': RandomForestClassifier,
    'xgb': XGBClassifier,
}

FOUNDING_WEBSITE_STATS = [
    'founding_mean_text_len',
    'founding_n_internal_links_mean',
    'founding_n_external_links_mean',
    'founding_n_languages',
]
CURRENT_WEBSITE_STATS = [
    'current_mean_text_len',
    'current_n_internal_links_mean',
    'current_n_external_links_mean',
    'current_n_languages',
]
BINARY_BASE = [f for f in XGB_BINARY_FEATURES if f not in WEBSITE_FEATURE_COLS]
LOW_CAT_BASE = [f for f in XGB_LOW_CAT_FEATURES if f not in WEBSITE_FEATURE_COLS]
HIGH_CAT_BASE = [f for f in XGB_HIGH_CAT_FEATURES if f not in WEBSITE_FEATURE_COLS]
CONTINUOUS_BASE = [f for f in XGB_CONTINUOUS_FEATURES if f not in WEBSITE_FEATURE_COLS]

FEATURE_CONFIGS = {
    'founding_base': {
        'cont': [],
        'low_cat': []
    },
    'founding_doc2vec': {
        'cont': ['founding_doc2vec_diff'] + FOUNDING_WEBSITE_STATS,
        'low_cat': ['founding_dominant_language']
    },
    'founding_dim768': {
        'cont': ['founding_pr_sdg_similarity', 'founding_lp', 'founding_vp'] + FOUNDING_WEBSITE_STATS,
        'low_cat': ['founding_dominant_language']
    },
    'founding_dim768_w': {
        'cont': ['founding_pr_w_sdg_similarity', 'founding_lp_w', 'founding_vp_w'] + FOUNDING_WEBSITE_STATS,
        'low_cat': ['founding_dominant_language']
    },
    'founding_dim300_w': {
        'cont': ['founding_pr_w_red_sdg_similarity', 'founding_lp_w_red', 'founding_vp_w_red'] + FOUNDING_WEBSITE_STATS,
        'low_cat': ['founding_dominant_language']
    },
    'current_base': {
        'cont': [],
        'low_cat': []
    },
    'current_doc2vec': {
        'cont': ['current_doc2vec_diff'] + CURRENT_WEBSITE_STATS,
        'low_cat': ['current_dominant_language']
    },
    'current_dim768': {
        'cont': ['current_pr_sdg_similarity', 'current_lp', 'current_vp'] + CURRENT_WEBSITE_STATS,
        'low_cat': ['current_dominant_language']
    },
    'current_dim768_w': {
        'cont': ['current_pr_w_sdg_similarity', 'current_lp_w', 'current_vp_w'] + CURRENT_WEBSITE_STATS,
        'low_cat': ['current_dominant_language']
    },
    'current_dim300_w': {
        'cont': ['current_pr_w_red_sdg_similarity', 'current_lp_w_red', 'current_vp_w_red'] + CURRENT_WEBSITE_STATS,
        'low_cat': ['current_dominant_language']
    },
}

# 1. Load data for experiment B
for experiment_config, website_features in FEATURE_CONFIGS.items():
    print(f'START CONDUCTING EXPERIMENT B FOR: {experiment_config}')

    # Set model specs
    MODEL_SPECS = {
        'xgb': {
            'model': XGBClassifier,
            'preprocessor_steps': [
                ('categorical_low_card', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
                ('categorical_high_card', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
                ('continuous', 'passthrough'),
                ('binary', 'passthrough'),
            ],
            'features': {
                'binary': BINARY_BASE,
                'categorical_low_card': LOW_CAT_BASE + website_features['low_cat'],
                'categorical_high_card': HIGH_CAT_BASE,
                'continuous': CONTINUOUS_BASE + website_features['cont'],
            },
            'fit_params': {
                'random_state': RANDOM_STATE,
                'n_jobs': 1,
                'objective': 'binary:logistic',
                'verbosity': 0,
                'booster': 'gbtree',
                'tree_method': 'hist',
                'use_label_encoder': False,
                'eval_metric': 'aucpr',
            },
            'param_grid': {
                'max_depth': IntDistribution(3, 10),
                'min_child_weight': IntDistribution(1, 10),
                'gamma': FloatDistribution(0, 5.0),
                'subsample': FloatDistribution(0.5, 1.0),
                'colsample_bytree': FloatDistribution(0.5, 1.0),
                'learning_rate': FloatDistribution(0.005, 0.1, log=True),
                'n_estimators': IntDistribution(100, 400, step=50),
                'reg_alpha': FloatDistribution(0, 5.0),  # L1 regularization
                'reg_lambda': FloatDistribution(1.0, 10.0),  # L2 regularization
                'max_delta_step': IntDistribution(0, 10),
            },
            'search_type': 'optuna',
            'account_for_class_weights': True
        }
    }

    # 1. Load data for experiment A
    if 'current' in experiment_config:
        website_df = all_feature_df[~all_feature_df['current_vp'].isna()][[col for col in all_feature_df.columns if col not in FOUNDING_WEBSITE_FEATURE_COLS]].copy()

    elif 'founding' in experiment_config:
        website_df = all_feature_df[~all_feature_df['founding_vp'].isna()][[col for col in all_feature_df.columns if col not in CURRENT_WEBSITE_FEATURE_COLS]].copy()

    targets = TARGET_COLS

    # 2. Initialize model evaluation with targets and model specs
    meval = ModelEvaluation(targets, MODEL_SPECS, random_state=RANDOM_STATE)
    meval.load_data(website_df)
    
    # 3. Evaluate with doc2vec scores
    out_folder = MODELS_DIR / 'experiment_B' / experiment_config
    meval.nested_cv_with_hyperparam_search(out_folder=out_folder, k_outer=10)

START CONDUCTING EXPERIMENT B FOR: founding_base
Starting nested CV with hyperparameter search...


Models:   0%|          | 0/1 [00:00<?, ?it/s]

xgb targets:   0%|          | 0/4 [00:00<?, ?it/s]

[STARTED] Model: xgb | Target: target_inv_exit


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_inv_exit | Mean ROC-AUC: 0.6690 | Mean PR-AUC: 0.1099
[STARTED] Model: xgb | Target: target_acquisition


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_acquisition | Mean ROC-AUC: 0.8067 | Mean PR-AUC: 0.0727
[STARTED] Model: xgb | Target: target_non_gov_investment


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_non_gov_investment | Mean ROC-AUC: 0.9174 | Mean PR-AUC: 0.2501
[STARTED] Model: xgb | Target: target_inno_subsidy


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_inno_subsidy | Mean ROC-AUC: 0.8575 | Mean PR-AUC: 0.1544
Saved model for xgb/target_inv_exit to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding_base/trained_models/xgb_target_inv_exit.joblib
Saved model for xgb/target_acquisition to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding_base/trained_models/xgb_target_acquisition.joblib
Saved model for xgb/target_non_gov_investment to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding_base/trained_models/xgb_target_non_gov_investment.joblib
Saved model for xgb/target_inno_subsidy to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding_base/trained_models/xgb_target_inno_subsidy.joblib
Saved metrics summary to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding_base/cv_metrics

Models:   0%|          | 0/1 [00:00<?, ?it/s]

xgb targets:   0%|          | 0/4 [00:00<?, ?it/s]

[STARTED] Model: xgb | Target: target_inv_exit


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_inv_exit | Mean ROC-AUC: 0.6719 | Mean PR-AUC: 0.1106
[STARTED] Model: xgb | Target: target_acquisition


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_acquisition | Mean ROC-AUC: 0.8248 | Mean PR-AUC: 0.0665
[STARTED] Model: xgb | Target: target_non_gov_investment


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_non_gov_investment | Mean ROC-AUC: 0.9369 | Mean PR-AUC: 0.2849
[STARTED] Model: xgb | Target: target_inno_subsidy


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_inno_subsidy | Mean ROC-AUC: 0.8643 | Mean PR-AUC: 0.1823
Saved model for xgb/target_inv_exit to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding_doc2vec/trained_models/xgb_target_inv_exit.joblib
Saved model for xgb/target_acquisition to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding_doc2vec/trained_models/xgb_target_acquisition.joblib
Saved model for xgb/target_non_gov_investment to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding_doc2vec/trained_models/xgb_target_non_gov_investment.joblib
Saved model for xgb/target_inno_subsidy to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding_doc2vec/trained_models/xgb_target_inno_subsidy.joblib
Saved metrics summary to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding_doc

Models:   0%|          | 0/1 [00:00<?, ?it/s]

xgb targets:   0%|          | 0/4 [00:00<?, ?it/s]

[STARTED] Model: xgb | Target: target_inv_exit


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_inv_exit | Mean ROC-AUC: 0.6682 | Mean PR-AUC: 0.1086
[STARTED] Model: xgb | Target: target_acquisition


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_acquisition | Mean ROC-AUC: 0.7972 | Mean PR-AUC: 0.0722
[STARTED] Model: xgb | Target: target_non_gov_investment


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_non_gov_investment | Mean ROC-AUC: 0.9298 | Mean PR-AUC: 0.2799
[STARTED] Model: xgb | Target: target_inno_subsidy


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_inno_subsidy | Mean ROC-AUC: 0.8669 | Mean PR-AUC: 0.1780
Saved model for xgb/target_inv_exit to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding_dim768/trained_models/xgb_target_inv_exit.joblib
Saved model for xgb/target_acquisition to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding_dim768/trained_models/xgb_target_acquisition.joblib
Saved model for xgb/target_non_gov_investment to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding_dim768/trained_models/xgb_target_non_gov_investment.joblib
Saved model for xgb/target_inno_subsidy to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding_dim768/trained_models/xgb_target_inno_subsidy.joblib
Saved metrics summary to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding_dim768/

Models:   0%|          | 0/1 [00:00<?, ?it/s]

xgb targets:   0%|          | 0/4 [00:00<?, ?it/s]

[STARTED] Model: xgb | Target: target_inv_exit


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_inv_exit | Mean ROC-AUC: 0.6710 | Mean PR-AUC: 0.1091
[STARTED] Model: xgb | Target: target_acquisition


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_acquisition | Mean ROC-AUC: 0.8144 | Mean PR-AUC: 0.0600
[STARTED] Model: xgb | Target: target_non_gov_investment


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_non_gov_investment | Mean ROC-AUC: 0.9411 | Mean PR-AUC: 0.2866
[STARTED] Model: xgb | Target: target_inno_subsidy


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_inno_subsidy | Mean ROC-AUC: 0.8663 | Mean PR-AUC: 0.1790
Saved model for xgb/target_inv_exit to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding_dim768_w/trained_models/xgb_target_inv_exit.joblib
Saved model for xgb/target_acquisition to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding_dim768_w/trained_models/xgb_target_acquisition.joblib
Saved model for xgb/target_non_gov_investment to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding_dim768_w/trained_models/xgb_target_non_gov_investment.joblib
Saved model for xgb/target_inno_subsidy to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding_dim768_w/trained_models/xgb_target_inno_subsidy.joblib
Saved metrics summary to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding

Models:   0%|          | 0/1 [00:00<?, ?it/s]

xgb targets:   0%|          | 0/4 [00:00<?, ?it/s]

[STARTED] Model: xgb | Target: target_inv_exit


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_inv_exit | Mean ROC-AUC: 0.6707 | Mean PR-AUC: 0.1102
[STARTED] Model: xgb | Target: target_acquisition


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_acquisition | Mean ROC-AUC: 0.8063 | Mean PR-AUC: 0.0625
[STARTED] Model: xgb | Target: target_non_gov_investment


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_non_gov_investment | Mean ROC-AUC: 0.9323 | Mean PR-AUC: 0.2776
[STARTED] Model: xgb | Target: target_inno_subsidy


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_inno_subsidy | Mean ROC-AUC: 0.8684 | Mean PR-AUC: 0.1916
Saved model for xgb/target_inv_exit to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding_dim300_w/trained_models/xgb_target_inv_exit.joblib
Saved model for xgb/target_acquisition to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding_dim300_w/trained_models/xgb_target_acquisition.joblib
Saved model for xgb/target_non_gov_investment to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding_dim300_w/trained_models/xgb_target_non_gov_investment.joblib
Saved model for xgb/target_inno_subsidy to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding_dim300_w/trained_models/xgb_target_inno_subsidy.joblib
Saved metrics summary to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/founding

Models:   0%|          | 0/1 [00:00<?, ?it/s]

xgb targets:   0%|          | 0/4 [00:00<?, ?it/s]

[STARTED] Model: xgb | Target: target_inv_exit


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_inv_exit | Mean ROC-AUC: 0.6810 | Mean PR-AUC: 0.0630
[STARTED] Model: xgb | Target: target_acquisition


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_acquisition | Mean ROC-AUC: 0.8124 | Mean PR-AUC: 0.0599
[STARTED] Model: xgb | Target: target_non_gov_investment


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_non_gov_investment | Mean ROC-AUC: 0.9175 | Mean PR-AUC: 0.2447
[STARTED] Model: xgb | Target: target_inno_subsidy


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_inno_subsidy | Mean ROC-AUC: 0.8715 | Mean PR-AUC: 0.1765
Saved model for xgb/target_inv_exit to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_base/trained_models/xgb_target_inv_exit.joblib
Saved model for xgb/target_acquisition to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_base/trained_models/xgb_target_acquisition.joblib
Saved model for xgb/target_non_gov_investment to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_base/trained_models/xgb_target_non_gov_investment.joblib
Saved model for xgb/target_inno_subsidy to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_base/trained_models/xgb_target_inno_subsidy.joblib
Saved metrics summary to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_base/cv_metrics_repo

Models:   0%|          | 0/1 [00:00<?, ?it/s]

xgb targets:   0%|          | 0/4 [00:00<?, ?it/s]

[STARTED] Model: xgb | Target: target_inv_exit


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_inv_exit | Mean ROC-AUC: 0.7052 | Mean PR-AUC: 0.0745
[STARTED] Model: xgb | Target: target_acquisition


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_acquisition | Mean ROC-AUC: 0.8034 | Mean PR-AUC: 0.0523
[STARTED] Model: xgb | Target: target_non_gov_investment


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_non_gov_investment | Mean ROC-AUC: 0.9444 | Mean PR-AUC: 0.2998
[STARTED] Model: xgb | Target: target_inno_subsidy


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_inno_subsidy | Mean ROC-AUC: 0.8831 | Mean PR-AUC: 0.2022
Saved model for xgb/target_inv_exit to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_doc2vec/trained_models/xgb_target_inv_exit.joblib
Saved model for xgb/target_acquisition to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_doc2vec/trained_models/xgb_target_acquisition.joblib
Saved model for xgb/target_non_gov_investment to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_doc2vec/trained_models/xgb_target_non_gov_investment.joblib
Saved model for xgb/target_inno_subsidy to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_doc2vec/trained_models/xgb_target_inno_subsidy.joblib
Saved metrics summary to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_doc2vec/

Models:   0%|          | 0/1 [00:00<?, ?it/s]

xgb targets:   0%|          | 0/4 [00:00<?, ?it/s]

[STARTED] Model: xgb | Target: target_inv_exit


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_inv_exit | Mean ROC-AUC: 0.7051 | Mean PR-AUC: 0.0797
[STARTED] Model: xgb | Target: target_acquisition


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_acquisition | Mean ROC-AUC: 0.8086 | Mean PR-AUC: 0.0611
[STARTED] Model: xgb | Target: target_non_gov_investment


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_non_gov_investment | Mean ROC-AUC: 0.9441 | Mean PR-AUC: 0.3048
[STARTED] Model: xgb | Target: target_inno_subsidy


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_inno_subsidy | Mean ROC-AUC: 0.8870 | Mean PR-AUC: 0.2101
Saved model for xgb/target_inv_exit to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_dim768/trained_models/xgb_target_inv_exit.joblib
Saved model for xgb/target_acquisition to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_dim768/trained_models/xgb_target_acquisition.joblib
Saved model for xgb/target_non_gov_investment to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_dim768/trained_models/xgb_target_non_gov_investment.joblib
Saved model for xgb/target_inno_subsidy to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_dim768/trained_models/xgb_target_inno_subsidy.joblib
Saved metrics summary to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_dim768/cv_me

Models:   0%|          | 0/1 [00:00<?, ?it/s]

xgb targets:   0%|          | 0/4 [00:00<?, ?it/s]

[STARTED] Model: xgb | Target: target_inv_exit


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_inv_exit | Mean ROC-AUC: 0.7107 | Mean PR-AUC: 0.0839
[STARTED] Model: xgb | Target: target_acquisition


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_acquisition | Mean ROC-AUC: 0.8109 | Mean PR-AUC: 0.0676
[STARTED] Model: xgb | Target: target_non_gov_investment


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_non_gov_investment | Mean ROC-AUC: 0.9446 | Mean PR-AUC: 0.3100
[STARTED] Model: xgb | Target: target_inno_subsidy


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_inno_subsidy | Mean ROC-AUC: 0.8856 | Mean PR-AUC: 0.2020
Saved model for xgb/target_inv_exit to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_dim768_w/trained_models/xgb_target_inv_exit.joblib
Saved model for xgb/target_acquisition to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_dim768_w/trained_models/xgb_target_acquisition.joblib
Saved model for xgb/target_non_gov_investment to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_dim768_w/trained_models/xgb_target_non_gov_investment.joblib
Saved model for xgb/target_inno_subsidy to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_dim768_w/trained_models/xgb_target_inno_subsidy.joblib
Saved metrics summary to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_dim7

Models:   0%|          | 0/1 [00:00<?, ?it/s]

xgb targets:   0%|          | 0/4 [00:00<?, ?it/s]

[STARTED] Model: xgb | Target: target_inv_exit


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_inv_exit | Mean ROC-AUC: 0.7113 | Mean PR-AUC: 0.0821
[STARTED] Model: xgb | Target: target_acquisition


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_acquisition | Mean ROC-AUC: 0.8071 | Mean PR-AUC: 0.0627
[STARTED] Model: xgb | Target: target_non_gov_investment


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_non_gov_investment | Mean ROC-AUC: 0.9441 | Mean PR-AUC: 0.3101
[STARTED] Model: xgb | Target: target_inno_subsidy


Outer folds:   0%|          | 0/10 [00:00<?, ?it/s]

[FINISHED] Model: xgb | Target: target_inno_subsidy | Mean ROC-AUC: 0.8870 | Mean PR-AUC: 0.2046
Saved model for xgb/target_inv_exit to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_dim300_w/trained_models/xgb_target_inv_exit.joblib
Saved model for xgb/target_acquisition to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_dim300_w/trained_models/xgb_target_acquisition.joblib
Saved model for xgb/target_non_gov_investment to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_dim300_w/trained_models/xgb_target_non_gov_investment.joblib
Saved model for xgb/target_inno_subsidy to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_dim300_w/trained_models/xgb_target_inno_subsidy.joblib
Saved metrics summary to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_B/current_dim3

In [14]:
FEATURE_CONFIGS.keys()

dict_keys(['founding_base', 'founding_doc2vec', 'founding_dim768', 'founding_dim768_w', 'founding_dim300_w', 'current_base', 'current_doc2vec', 'current_dim768', 'current_dim768_w', 'current_dim300_w'])

In [None]:
FEATURE_CONFIGS

In [33]:
def get_per_fold_metric(df, target, metric_col):
    """Return list of per-fold metric values for given target."""
    values = df[df['target'] == target][metric_col].values
    return ast.literal_eval(values[0]) if len(values) > 0 else None

results = []

for kind in ['founding', 'current']:
    
    exp_B_base_df = pd.read_csv(MODELS_DIR / 'experiment_B' / f'{kind}_base' / 'cv_metrics_report.csv')

    for report_dir in [col for col in FEATURE_CONFIGS.keys() if kind in col]:

        for target in TARGET_COLS:
            
            comp_df = pd.read_csv(MODELS_DIR / 'experiment_B' / report_dir / 'cv_metrics_report.csv')

            for metric in ["all_roc_auc", "all_pr_auc"]:
            # Get per-fold AP for website and base
                web_ap = get_per_fold_metric(comp_df, target, metric)
                base_ap = get_per_fold_metric(exp_B_base_df, target, metric)
                web_ap, base_ap = np.array(web_ap), np.array(base_ap)
                
                mean_web, mean_base = np.mean(web_ap), np.mean(base_ap)
                std_web, std_base = np.std(web_ap, ddof=1), np.std(base_ap, ddof=1)
                n_web, n_base = len(web_ap), len(base_ap)

                # Welch's SE and df
                se_diff = np.sqrt(std_web**2/n_web + std_base**2/n_base)
                degrees_of_freedom = (std_web**2 / n_web + std_base**2 / n_base)**2 / ((std_web**2 / n_web)**2 / (n_web-1) + (std_base**2 / n_base)**2 / (n_base-1))

                diff = mean_web - mean_base
                diff_pct = diff / mean_base * 100

                t_stat = diff / se_diff if se_diff > 0 else 0
                p_value = 2 * t.sf(np.abs(t_stat), degrees_of_freedom)
                
                ci = {}
                for alpha, label in zip([0.01, 0.05, 0.1], ['99', '95', '90']):
                    t_crit = t.ppf(1 - alpha/2, degrees_of_freedom)
                    ci[f"ci_lower_{label}"] = diff - t_crit * se_diff
                    ci[f"ci_upper_{label}"] = diff + t_crit * se_diff

                results.append({
                    'model': report_dir,
                    'metric': metric,
                    'metric_value': mean_web,
                    'target': target,
                    'mean_ap_website': mean_web,
                    'mean_ap_base': mean_base,
                    'p_value': p_value,
                    **ci,
                })

results_df = pd.DataFrame(results)
results_df.to_csv(MODELS_DIR / 'experiment_B' / 'individual_significance_report.csv', index=False)

In [40]:
results = []

for kind in ['founding', 'current']:
    
    base_df = pd.read_csv(MODELS_DIR / 'experiment_B' / f'{kind}_base' / 'cv_metrics_report.csv')

    # For each target, for each metric, pool differences from all website models
    for target in TARGET_COLS:
        for metric in ["all_roc_auc", "all_pr_auc"]:
            all_diffs = []
            for report_dir in [col for col in FEATURE_CONFIGS.keys() if kind in col and col != f'{kind}_base']:
                comp_df = pd.read_csv(MODELS_DIR / 'experiment_B' / report_dir / 'cv_metrics_report.csv')
                web_scores = get_per_fold_metric(comp_df, target, metric)
                base_scores = get_per_fold_metric(base_df, target, metric)
                if web_scores is None or base_scores is None:
                    continue
                diffs = np.array(web_scores) - np.array(base_scores)
                all_diffs.extend(diffs)

            all_diffs = np.array(all_diffs)
            if len(all_diffs) == 0:
                continue
            mean_diff = np.mean(all_diffs)
            mean_diff_pct = np.round(mean_diff / np.mean(base_scores) * 100, decimals=1)
            std_diff = np.std(all_diffs, ddof=1)
            n = len(all_diffs)
            se = std_diff / np.sqrt(n)

            # t-test and p-value
            t_stat, p_value = ttest_1samp(all_diffs, 0.0)

            # Confidence intervals
            ci_99 = t.ppf(0.995, n-1) * se
            ci_95 = t.ppf(0.975, n-1) * se
            ci_90 = t.ppf(0.95, n-1) * se

            results.append({
                'kind': kind,
                'target': target,
                'metric': metric,
                'mean_improvement': mean_diff,
                'mean_improvement_pct': mean_diff_pct,
                'std': std_diff,
                'n': n,
                'p_value': p_value,
                'ci_lower_99': mean_diff - ci_99,
                'ci_upper_99': mean_diff + ci_99,
                'ci_lower_95': mean_diff - ci_95,
                'ci_upper_95': mean_diff + ci_95,
                'ci_lower_90': mean_diff - ci_90,
                'ci_upper_90': mean_diff + ci_90,
            })

results_df = pd.DataFrame(results)
results_df.to_csv(MODELS_DIR / 'experiment_B' / 'average_significance_report.csv', index=False)

In [44]:
df = pd.read_csv(MODELS_DIR / 'experiment_B' / 'individual_significance_report.csv')
for level, cutoff, previous in [('***', 0.01, 0.0), ('**', 0.05, 0.01), ('*', 0.10, 0.05)]:
    sig = df[(df['p_value'] < cutoff) & (df['p_value'] >= previous)]
    print(f"\nSignificant at {level} ({cutoff}):")
    for _, row in sig.iterrows():
        print(f"{row['model']}, {row['metric']}, {row['target']}, Δ={row['mean_ap_website'] - row['mean_ap_base']:.4f}, p={row['mean_ap_website']:.3g}")



Significant at *** (0.01):
founding_dim768_w, all_roc_auc, target_non_gov_investment, Δ=0.0237, p=0.941
current_doc2vec, all_roc_auc, target_non_gov_investment, Δ=0.0269, p=0.944
current_dim768, all_roc_auc, target_non_gov_investment, Δ=0.0266, p=0.944
current_dim768_w, all_roc_auc, target_inv_exit, Δ=0.0297, p=0.711
current_dim768_w, all_pr_auc, target_inv_exit, Δ=0.0209, p=0.0839
current_dim768_w, all_roc_auc, target_non_gov_investment, Δ=0.0271, p=0.945
current_dim300_w, all_roc_auc, target_inv_exit, Δ=0.0303, p=0.711
current_dim300_w, all_pr_auc, target_inv_exit, Δ=0.0191, p=0.0821
current_dim300_w, all_roc_auc, target_non_gov_investment, Δ=0.0266, p=0.944

Significant at ** (0.05):
founding_doc2vec, all_roc_auc, target_non_gov_investment, Δ=0.0195, p=0.937
founding_dim300_w, all_roc_auc, target_non_gov_investment, Δ=0.0149, p=0.932
founding_dim300_w, all_pr_auc, target_inno_subsidy, Δ=0.0372, p=0.192
current_doc2vec, all_roc_auc, target_inv_exit, Δ=0.0243, p=0.705
current_doc2vec

In [22]:
founding_website_df = all_feature_df[~all_feature_df['founding_vp'].isna()][[col for col in all_feature_df.columns if col not in FOUNDING_WEBSITE_FEATURE_COLS]].copy()
current_website_df = all_feature_df[~all_feature_df['current_vp'].isna()][[col for col in all_feature_df.columns if col not in FOUNDING_WEBSITE_FEATURE_COLS]].copy()

for target in TARGET_COLS:
    print('Base data:')
    print(target, len(base_df[base_df[target] == 1]) / len(base_df[base_df[target] == 0]))
    print('Founding website data:')
    print(target, len(founding_website_df[founding_website_df[target] == 1]) / len(founding_website_df[founding_website_df[target] == 0]))
    print('Current website data:')
    print(target, len(current_website_df[current_website_df[target] == 1]) / len(current_website_df[current_website_df[target] == 0]))

Base data:
target_inv_exit 0.16533611014693972
Founding website data:
target_inv_exit 0.056830514306890074
Current website data:
target_inv_exit 0.029797745870001544
Base data:
target_acquisition 0.00648630970766184
Founding website data:
target_acquisition 0.004452942212382701
Current website data:
target_acquisition 0.0031332247148765507
Base data:
target_non_gov_investment 0.005153142691246915
Founding website data:
target_non_gov_investment 0.009455906148867314
Current website data:
target_non_gov_investment 0.010019433157509527
Base data:
target_inno_subsidy 0.008244908359572648
Founding website data:
target_inno_subsidy 0.015102206854469643
Current website data:
target_inno_subsidy 0.014886009180128319


In [13]:
TARGET_COLS

['target_inv_exit',
 'target_acquisition',
 'target_non_gov_investment',
 'target_inno_subsidy']

In [3]:
"""
Experiment C: Regression analysis of Embedding Scores
"""

query_green = """ 
    SELECT * FROM zefix.green_binary WHERE is_green;
"""

with connect_database() as con:
    df_green = read_from_database(connection=con, query=query_green)

company_sample = company_sample[['ehraid', 'uid'] + TARGET_COLS + ALL_BINARY_FEATURE_COLS + ALL_CATEGORICAL_FEATURE_COLS + ALL_CONTINUOUS_FEATURE_COLS + ['founding_year']]
company_sample = company_sample.merge(df_green, on='uid', how='left')
company_sample['is_green'] = company_sample['is_green'].fillna(0).astype(int)

In [None]:
class CoefficientAnalyser:
    def __init__(self, df: pd.DataFrame, experiment_dir: str, scale: bool = True):
        self.df = df
        self.experiment_dir = Path(experiment_dir)
        self.experiment_dir.mkdir(exist_ok=True, parents=True)

    @staticmethod
    def drop_perfect_separation(df, target, col):
        keep = df.groupby(col)[target].nunique()
        keep = keep[keep > 1].index  # only keep categories that have both 0 and 1
        return df[df[col].isin(keep)]

    def analyse(
        self,
        targets: list[str | tuple[str, str]],
        score_cols,
        controls,
        interaction_terms: list[tuple[str, str]],
        save_full_summary=True,
        subfolder='reg_results'
    ):
        """
        targets: list of target column names (binary outcome)
        score_cols: list of column names (strings) OR list of list of strings (for multi-score models)
            e.g., ['current_vp', 'current_lp']
               or [ ['current_vp'], ['current_lp'], ['current_vp', 'current_lp'] ]
        controls: list of categorical control variable names (for fixed effects)
        """
        summary_rows = []
        auc_rows = []
        out_folder = self.experiment_dir / subfolder
        summary_folder = out_folder / 'summaries'
        out_folder.mkdir(exist_ok=True, parents=True)
        summary_folder.mkdir(exist_ok=True, parents=True)

        # Ensure score_cols is a list of lists
        if score_cols and isinstance(score_cols[0], str):
            score_cols = [[col] for col in score_cols]

        for target_col in targets:

            if isinstance(target_col, tuple):
                target = '_or_'.join(target_col)
                self.df[target] = self.df[list(target_col)].max(axis=1)
            else:
                target = target_col

            for score_set in score_cols:
                cols_needed = [target] + list(score_set) + controls
                reg_df = self.df.replace([np.inf, -np.inf], np.nan)\
                                .dropna(subset=cols_needed).copy()
                reg_df[target] = reg_df[target].astype(int)

                # Drop perfect separation categories
                for control in controls:
                    reg_df = self.drop_perfect_separation(reg_df, target, control)

                # Build formula
                score_formula = " + ".join(score_set)
                control_formula = " + ".join([f'C({c})' for c in controls])
                if interaction_terms:
                    interaction_formula = " + ".join([f'C({c1}):C({c2})' for c1, c2 in interaction_terms])
                    formula = f"{target} ~ {score_formula} + {control_formula} + {interaction_formula}"
                else:
                    formula = f"{target} ~ {score_formula} + {control_formula}"
                print(f"Fitting: {formula} (n={len(reg_df)})")

                try:
                    result = smf.logit(formula=formula, data=reg_df).fit(disp=0, cov_type='HC1')
                    y_pred = result.predict(reg_df)
                    auc = roc_auc_score(reg_df[target], y_pred)
                    pr_auc = average_precision_score(reg_df[target], y_pred)
                    pseudo_r2 = 1 - result.llf/result.llnull

                    # Save all score coefs
                    for score in score_set:
                        summary_rows.append({
                            'target': target,
                            'score': '+'.join(score_set),
                            'coef_name': score,
                            'coef': result.params.get(score, np.nan),
                            'std_err': result.bse.get(score, np.nan),
                            'pval': result.pvalues.get(score, np.nan),
                            'pseudo_r2': pseudo_r2,
                            'n_obs': len(reg_df)
                        })
                    auc_rows.append({
                        'target': target,
                        'score': '+'.join(score_set),
                        'auc': auc,
                        'pr_auc': pr_auc,
                        'n_obs': len(reg_df)
                    })
                    if save_full_summary:

                        fname = summary_folder / f"reg_summary_{target}_{'+'.join(score_set)}.txt"
                        with open(fname, 'w') as f:
                            f.write(result.summary().as_text())
                except Exception as e:
                    print(f"Error with {target}, {score_set}: {e}")

        pd.DataFrame(summary_rows).to_csv(out_folder / 'report_regression_results.csv', index=False)
        pd.DataFrame(auc_rows).to_csv(out_folder / 'report_auc_scores.csv', index=False)
        print(f"\nSaved regression summaries and AUC scores to {out_folder}")

In [None]:
score_cols = [
    'founding_pr_sdg_similarity', 'founding_pr_w_sdg_similarity', 'founding_pr_w_red_sdg_similarity',
    'founding_doc2vec_diff', 'founding_lp', 'founding_lp_w', 'founding_lp_w_red', 'founding_vp', 'founding_vp_w',
    'founding_vp_w_red', 'current_pr_sdg_similarity', 'current_pr_w_sdg_similarity', 'current_pr_w_red_sdg_similarity',
    'current_doc2vec_diff', 'current_lp', 'current_lp_w', 'current_lp_w_red', 'current_vp', 'current_vp_w', 'current_vp_w_red'
]

scaler = StandardScaler()
company_sample[score_cols] = scaler.fit_transform(company_sample[score_cols])

analyser = CoefficientAnalyser(company_sample, experiment_dir=MODELS_DIR / 'experiment_C')

experiment_setups = [
    {
        'title': 'Year FEs', 
        'targets': [('target_inno_subsidy', 'target_non_gov_investment', 'target_acquisition'), ('target_inno_subsidy', 'target_non_gov_investment')],
        'score_cols': score_cols,
        'controls': ['founding_year'], 
        'interaction_terms': None
    },
    {
        'title': 'Year + Industry FEs', 
        'targets': [('target_inno_subsidy', 'target_non_gov_investment', 'target_acquisition'), ('target_inno_subsidy', 'target_non_gov_investment'), 'target_inno_subsidy', 'target_non_gov_investment', 'target_acquisition'],
        'score_cols': score_cols,
        'controls': ['founding_year', 'division_1_label'], 
        'interaction_terms': None
    },
    {
        'title': 'Industry + Year x Canton + Municipality FEs', 
        'targets': [('target_inno_subsidy', 'target_non_gov_investment', 'target_acquisition'), ('target_inno_subsidy', 'target_non_gov_investment')],
        'score_cols': score_cols,
        'controls': ['division_1_label'], 
        'interaction_terms': [('founding_bfs_code', 'canton_id')]
    },
    {
        'title': 'Green - Year + Industry FEs', 
        'targets': ['is_green'],
        'score_cols': ['founding_pr_sdg_similarity','founding_pr_w_sdg_similarity','founding_pr_w_red_sdg_similarity', 'current_pr_sdg_similarity','current_pr_w_sdg_similarity','current_pr_w_red_sdg_similarity'],
        'controls': ['founding_year', 'division_1_label'], 
        'interaction_terms': None
    },
]

for experiment in experiment_setups:
    analyser.analyse(
        targets=experiment.get('targets'),
        score_cols=experiment.get('score_cols'),
        controls=experiment.get('controls'),
        interaction_terms=experiment.get('interaction_terms'),
        subfolder=experiment.get('title'),
        save_full_summary=True
    )

Fitting: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ founding_pr_sdg_similarity + C(founding_year) (n=39927)
Fitting: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ founding_pr_w_sdg_similarity + C(founding_year) (n=39927)
Fitting: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ founding_pr_w_red_sdg_similarity + C(founding_year) (n=39927)
Fitting: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ founding_doc2vec_diff + C(founding_year) (n=39700)
Fitting: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ founding_lp + C(founding_year) (n=39927)
Fitting: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ founding_lp_w + C(founding_year) (n=39927)
Fitting: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ founding_lp_w_red + C(founding_year) (n=39927)
Fitting: target_inno_subsidy_or_target_non_gov_investment_or_target_