In [222]:
import ast
import joblib
import pickle
import warnings
from collections import Counter
from pathlib import Path
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd

import optuna
from optuna.distributions import IntDistribution, FloatDistribution, CategoricalDistribution
from optuna.integration import OptunaSearchCV

from pocketknife.database import connect_database, read_from_database

from scipy.stats import t, ttest_1samp
import statsmodels.formula.api as smf

from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

from xgboost import XGBClassifier

from success_prediction.modelling.config import (
    ALL_BINARY_FEATURE_COLS, ALL_CATEGORICAL_FEATURE_COLS, ALL_CONTINUOUS_FEATURE_COLS, 
    FOUNDING_WEBSITE_FEATURE_COLS, CURRENT_WEBSITE_FEATURE_COLS, TARGET_COLS
)
from success_prediction.config import RAW_DATA_DIR, PROCESSED_DATA_DIR, MODELS_DIR

# Silence warning categories that would otherwise flood the notebook output.
# The UserWarning filter is limited to sklearn's encoder module.
warnings.filterwarnings(
    action="ignore",
    module="sklearn.preprocessing._encoders",
    category=UserWarning,
)
warnings.filterwarnings(action="ignore", category=ConvergenceWarning)
warnings.filterwarnings(action="ignore", category=optuna.exceptions.ExperimentalWarning)

# Only let optuna report errors.
optuna.logging.set_verbosity(optuna.logging.ERROR)

# READ COMPANY SAMPLE

In [223]:
# Load the encoded-feature company sample from RAW_DATA_DIR.
# NOTE(review): the saved cell output echoes this line, i.e. the call emitted a warning.
# Presumably that is pandas' DtypeWarning for mixed-type columns - confirm. With
# low_memory=False the file is parsed in one pass, so each column gets one consistent dtype.
company_sample = pd.read_csv(
    RAW_DATA_DIR / 'company_sample' / 'until_2020' / '2020_sample_encoded_features.csv',
    low_memory=False,
)

  company_sample = pd.read_csv(RAW_DATA_DIR / 'company_sample' / 'until_2020' / '2020_sample_encoded_features.csv')


## Handle missing values in base features

In [224]:
# Number of NaNs per column, restricted to the columns that have at least one.
missing_df = (
    company_sample
    .isna()
    .sum()
    .to_frame('n_missing')
    .loc[lambda frame: frame['n_missing'] > 0]
)
missing_df

Unnamed: 0,n_missing
founding_name,1
current_zip_code,1536
founding_street,11
founding_town,33
population,6
canton_id,3
district_id,3
urban_rural,3
typology_9c,3
typology_25c,3


In [225]:
# Drop rows without a firm name length. The explicit .copy() detaches the filtered
# frame from the original, so the column assignments below operate on an independent
# frame and do not trigger pandas' SettingWithCopyWarning.
company_sample = company_sample[company_sample['firm_name_length'].notna()].copy()

# Fill missing population with 0
company_sample['population'] = company_sample['population'].fillna(0)

# These location columns are empty for places that do not belong to any canton, so a
# missing value is correct there. It is encoded with the sentinel value -1 instead of NaN.
for col in ['district_id', 'canton_id', 'urban_rural', 'typology_9c', 'typology_25c']:
    company_sample[col] = company_sample[col].fillna(-1)

In [226]:
# Cast each configured column group to its modelling dtype, in this order:
# binary features and targets -> int8, continuous -> float32, categorical -> category.
# Columns that are not present in the frame are skipped.
for dtype_name, group_cols in (
    ('int8', ALL_BINARY_FEATURE_COLS + TARGET_COLS),
    ('float32', ALL_CONTINUOUS_FEATURE_COLS),
    ('category', ALL_CATEGORICAL_FEATURE_COLS),
):
    for col in group_cols:
        if col not in company_sample.columns:
            continue
        company_sample[col] = company_sample[col].astype(dtype_name)

In [189]:
# Modelling frame: target columns first, then binary, categorical and continuous features.
all_feature_df = company_sample[
    [
        *TARGET_COLS,
        *ALL_BINARY_FEATURE_COLS,
        *ALL_CATEGORICAL_FEATURE_COLS,
        *ALL_CONTINUOUS_FEATURE_COLS,
    ]
]

In [190]:
# Pairwise Pearson correlation between the target columns.
all_feature_df.loc[:, TARGET_COLS].corr(method='pearson')

Unnamed: 0,target_inv_exit,target_acquisition,target_non_gov_investment,target_inno_subsidy
target_inv_exit,1.0,-0.032101,-0.017236,-0.019113
target_acquisition,-0.032101,1.0,-0.001043,-0.004808
target_non_gov_investment,-0.017236,-0.001043,1.0,0.348525
target_inno_subsidy,-0.019113,-0.004808,0.348525,1.0


## LogReg Features

For logistic regression, the features listed below are included.
- High-cardinality categorical features are removed and the remaining categorical features are one-hot encoded.
- Continuous features are scaled using StandardScaler().
- Binary features are used as is.

In [227]:
# Binary indicator features (used as is).
LOGREG_BINARY_FEATURES = [
    'firm_name_swiss_ref', 'firm_name_holding_ref', 'firm_name_geog_ref',
    'firm_name_founder_match', 'firm_name_male_match', 'firm_name_female_match',
    'bps_geographic_term', 'bps_male_name', 'bps_female_name',
]

# Categorical features with < 30 categories that are not strongly correlated (author's criterion).
LOGREG_LOW_CAT_FEATURES = [
    'founding_legal_form', 'section_1_label', 'typology_9c', 'canton_id',
    'bps_length_quantiles_5', 'founding_dominant_language', 'current_dominant_language',
]

# No features with high cardinality are used for logistic regression.
LOGREG_HIGH_CAT_FEATURES = []

# Continuous features. The website metrics exist once for the founding-time and once for
# the current website, with identical suffixes, so they are generated from one suffix list.
LOGREG_CONTINUOUS_FEATURES = [
    'capital_chf', 'firm_name_length', 'population',
    'n_firms_within_10m', 'n_firms_within_2.5km',
    'n_founders', 'n_inscribed_firms', 'n_distinct_nationalities',
    'pct_female_founders', 'pct_foreign_founders', 'pct_dr_titles',
    'pct_founders_same_residence', 'pct_founders_with_prior_founding',
    'n_dissolved_firms', 'n_existing_firms',
    'bps_mean_word_length', 'bps_lix',
    'bps_min_word_freq_norm', 'bps_max_word_freq_norm', 'bps_freq_ratio_norm',
    *(
        f'{website}_{metric}'
        for website in ('founding', 'current')
        for metric in (
            'mean_text_len', 'n_internal_links_mean', 'n_external_links_mean', 'n_languages',
            'pr_sdg_similarity', 'pr_w_sdg_similarity', 'pr_w_red_sdg_similarity',
            'doc2vec_diff',
            'lp', 'lp_w', 'lp_w_red',
            'vp', 'vp_w', 'vp_w_red',
        )
    ),
    'days_of_prior_observations',
    'prediction_1_score',
]

## RandomForest Features

For RF, the features listed below are included.
- All included categorical features are encoded using an ordinal encoding.
- Binary and continuous features are used as is (unscaled).

In [192]:
# Binary indicator features (used as is).
RF_BINARY_FEATURES = [
    'firm_name_swiss_ref', 'firm_name_holding_ref', 'firm_name_geog_ref',
    'firm_name_founder_match', 'firm_name_male_match', 'firm_name_female_match',
    'bps_geographic_term', 'bps_male_name', 'bps_female_name',
]

# Categorical features with < 30 categories (author's criterion).
RF_LOW_CAT_FEATURES = [
    'founding_legal_form',
    'section_1_label', 'section_2_label', 'section_3_label',
    'canton_id', 'urban_rural', 'typology_9c',
    'founding_dominant_language', 'current_dominant_language',
]

# High-cardinality categorical features: division/group/class labels for the label
# ranks 1-3, followed by two location identifiers.
RF_HIGH_CAT_FEATURES = [
    *(
        f'{level}_{rank}_label'
        for rank in (1, 2, 3)
        for level in ('division', 'group', 'class')
    ),
    'founding_bfs_code',
    'district_id',
]

# Continuous features. The website metrics exist once for the founding-time and once for
# the current website, with identical suffixes, so they are generated from one suffix list.
RF_CONTINUOUS_FEATURES = [
    'capital_chf', 'firm_name_length',
    'latitude', 'longitude', 'population',
    'n_firms_within_10m', 'n_firms_within_1km', 'n_firms_within_2.5km', 'n_firms_within_10km',
    'n_founders', 'n_inscribed_firms', 'n_distinct_nationalities',
    'pct_female_founders', 'pct_foreign_founders', 'pct_dr_titles',
    'pct_founders_same_residence', 'pct_founders_with_prior_founding',
    'n_dissolved_firms', 'n_existing_firms',
    'bps_length', 'bps_mean_word_length', 'bps_lix',
    'bps_min_word_freq_norm', 'bps_max_word_freq_norm', 'bps_freq_ratio_norm',
    *(
        f'{website}_{metric}'
        for website in ('founding', 'current')
        for metric in (
            'mean_text_len', 'n_internal_links_mean', 'n_external_links_mean', 'n_languages',
            'pr_sdg_similarity', 'pr_w_sdg_similarity', 'pr_w_red_sdg_similarity',
            'doc2vec_diff',
            'lp', 'lp_w', 'lp_w_red',
            'vp', 'vp_w', 'vp_w_red',
        )
    ),
    'days_of_prior_observations',
    'prediction_1_score', 'prediction_2_score', 'prediction_3_score',
]

## XGBoost Features

For XGBoost, the features listed below are included.
- Low-cardinality categorical features are one-hot encoded.
- High-cardinality categorical features are encoded using ordinal encoding.
- Binary and continuous features are included as is (unscaled).

In [193]:
# Binary indicator features (used as is).
XGB_BINARY_FEATURES = [
    'firm_name_swiss_ref', 'firm_name_holding_ref', 'firm_name_geog_ref',
    'firm_name_founder_match', 'firm_name_male_match', 'firm_name_female_match',
    'bps_geographic_term', 'bps_male_name', 'bps_female_name',
]

# Low-cardinality categorical features.
XGB_LOW_CAT_FEATURES = [
    'founding_legal_form',
    # Top 3 NOGA levels
    'section_1_label', 'section_2_label', 'section_3_label',
    # Categorical location descriptors
    'urban_rural', 'typology_9c',
    'founding_dominant_language', 'current_dominant_language',
]

# High-cardinality categorical features: division/group/class labels for the label
# ranks 1-3, then location identifiers and the 25-class typology.
XGB_HIGH_CAT_FEATURES = [
    *(
        f'{level}_{rank}_label'
        for rank in (1, 2, 3)
        for level in ('division', 'group', 'class')
    ),
    'founding_bfs_code', 'district_id', 'canton_id',
    'typology_25c',  # Categorical
]

# Continuous features. The website metrics exist once for the founding-time and once for
# the current website, with identical suffixes, so they are generated from one suffix list.
XGB_CONTINUOUS_FEATURES = [
    'capital_chf', 'firm_name_length',
    'latitude', 'longitude', 'population',
    'n_firms_within_10m', 'n_firms_within_1km', 'n_firms_within_2.5km', 'n_firms_within_10km',
    'n_founders', 'n_inscribed_firms', 'n_distinct_nationalities',
    'pct_female_founders', 'pct_foreign_founders', 'pct_dr_titles',
    'pct_founders_same_residence', 'pct_founders_with_prior_founding',
    'n_dissolved_firms', 'n_existing_firms',
    'bps_length', 'bps_mean_word_length', 'bps_lix',
    'bps_min_word_freq_norm', 'bps_max_word_freq_norm', 'bps_freq_ratio_norm',
    *(
        f'{website}_{metric}'
        for website in ('founding', 'current')
        for metric in (
            'mean_text_len', 'n_internal_links_mean', 'n_external_links_mean', 'n_languages',
            'pr_sdg_similarity', 'pr_w_sdg_similarity', 'pr_w_red_sdg_similarity',
            'doc2vec_diff',
            'lp', 'lp_w', 'lp_w_red',
            'vp', 'vp_w', 'vp_w_red',
        )
    ),
    'days_of_prior_observations',
    'prediction_1_score', 'prediction_2_score', 'prediction_3_score',
]

# MODEL TRAINING AND EVALUATION CLASS

In [None]:
class ModelEvaluation:

    def __init__(self, target_vars: list[str], model_specs: dict, random_state: int = 42):
        """
        target_vars: list of target column names
        model_specs: dict with keys as model names, values as dicts with:
                     - 'model': a scikit-learn estimator class or a callable returning an instance
                     - 'param_grid': dict of hyperparameter grid for grid search
        Example:
        model_specs = {
            'logreg': {'model': LogisticRegression, 'param_grid': {...}},
            'rf': {'model': RandomForestClassifier, 'param_grid': {...}}
        }
        """
        self.target_vars = target_vars
        self.model_specs = model_specs
        self.random_state = random_state
        self.best_params = {m: {t: None for t in target_vars} for m in model_specs}
        self.selected_features = {m: {t: None for t in target_vars} for m in model_specs}
        self.best_models = {m: {t: None for t in target_vars} for m in model_specs}
        self.production_models = {m: {t: None for t in target_vars} for m in model_specs}
        self.metrics_report = {m: {t: None for t in target_vars} for m in model_specs}
        
        for model_name, spec in self.model_specs.items():
            self._assert_feature_lists_mutually_exclusive(spec['features'])

    def _assert_feature_lists_mutually_exclusive(self, features_dict):
        all_features = []
        for name, feature_list in features_dict.items():
            all_features.extend(feature_list)
        if len(all_features) != len(set(all_features)):
            duplicates = [item for item, count in Counter(all_features).items() if count > 1]
            raise ValueError(f"Duplicate features detected: {duplicates}")

    def _assert_all_features_present(self, features_dict):
        all_features = []
        for name, feature_list in features_dict.items():
            all_features.extend(feature_list)
        missing_features = [feature for feature in all_features if feature not in self.feature_cols]
        if missing_features:
            raise ValueError(f"Missing features detected: {missing_features}")
        
    def load_data(self, df: pd.DataFrame):
        self.df = df.copy()
        self.feature_cols = [col for col in df.columns if col not in self.target_vars]
        for model_name, spec in self.model_specs.items():
            self._assert_all_features_present(spec['features'])

    def load_best_params(self, file_path: Path, model_names: list[str], target_vars: list[str]):
        with open(file_path, 'rb') as f:
            all_params = pickle.load(f)
        self.best_params = {
            m: {t: all_params[m][t] for t in target_vars}
            for m in model_names
        }

    def load_best_features(self, file_path: Path, model_names: list[str], target_vars: list[str], additional_features: list = []):
        with open(file_path, 'rb') as f:
            all_features = pickle.load(f)
        self.selected_features = {
            m: {t: all_features[m][t] + additional_features for t in target_vars}
            for m in model_names
        }

    def _get_feature_importances(self, model):
        """Return feature importances as a pandas Series, sorted descending."""
        # Tree-based models
        if hasattr(model, "feature_importances_"):
            return model.feature_importances_
        # Linear models (coefficients)
        elif hasattr(model, "coef_"):
            return np.abs(model.coef_).flatten()
        else:
            return None

    def _add_class_weights_to_fit_params(self, fit_params, ModelClass, model_name, y):
        
        if not self.model_specs[model_name]['account_for_class_weights']:
            return fit_params

        if model_name == 'xgb':
            classes = np.unique(y)
            n_pos = np.sum(y == classes[1])
            n_neg = np.sum(y == classes[0])
            fit_params['scale_pos_weight'] = n_neg / n_pos if n_pos > 0 else 1.0

        elif hasattr(ModelClass(), 'class_weight'):
            fit_params['class_weight'] = 'balanced'

        return fit_params

    def _build_preprocessor(self, preprocessor_steps: list[tuple], model_name: str, target: str, best_features: bool = False):
        steps = []
        for name, transformer in preprocessor_steps:
            features = self.model_specs[model_name]['features'][name]
            if best_features:
                features = [col for col in features if col in self.selected_features[model_name][target]]
            steps.append((name, transformer, features))
        return ColumnTransformer(steps)

    def nested_cv_with_feature_selection(
        self,
        k_outer: int = 5,
        k_inner: int = 3,
        min_features_to_select: int = 10,
        scoring: str = "average_precision"
    ):
        """
        Perform feature selection (RFECV) using cross-validation without hyperparameter tuning.

        Saves the most frequently selected features across outer folds.
        """
        print("Starting nested CV with feature selection...")

        for model_name, spec in tqdm(self.model_specs.items(), desc="Models"):

            ModelClass = spec['model']

            for target in tqdm(self.target_vars, desc=f"{model_name} targets", leave=False):
                print(f"Model: {model_name} | Target: {target}")

                preprocessor = self._build_preprocessor(spec['preprocessor_steps'], model_name, target)

                X = self.df[self.feature_cols]
                y = self.df[target]

                outer_skf = StratifiedKFold(n_splits=k_outer, shuffle=True, random_state=self.random_state)

                selected_feature_masks = []

                for train_idx, _ in tqdm(list(outer_skf.split(X, y)), desc="Outer folds", leave=False):
                    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]

                    X_train_proc = preprocessor.fit_transform(X_train)

                    fit_params = self._add_class_weights_to_fit_params(spec['fit_params'], ModelClass, model_name, y_train)
                    model = ModelClass(**fit_params)
                    
                    inner_skf = StratifiedKFold(n_splits=k_inner, shuffle=True, random_state=self.random_state)
                    rfecv = RFECV(
                        estimator=model,
                        step=1,
                        min_features_to_select=min_features_to_select,
                        cv=inner_skf,
                        scoring=scoring,
                        n_jobs=-1
                    )

                    rfecv.fit(X_train_proc, y_train)
                    selected_feature_masks.append(rfecv.support_)

                # Aggregate selected features across folds
                selected_feature_masks = np.array(selected_feature_masks)
                mean_mask = selected_feature_masks.mean(axis=0)
                threshold = 0.6  # at least 60% of folds must have selected a feature
                final_mask = mean_mask >= threshold
                final_features = np.array(self.feature_cols)[final_mask].tolist()

                self.selected_features[model_name][target] = final_features
                print(f"[{model_name}/{target}] Selected {len(final_features)} features: {final_features}")

    def nested_cv_with_hyperparam_search(
        self,
        out_folder: Path,
        k_outer: int = 5,
        k_inner: int = 3,
        best_features: bool = False,
        n_trials: int = 200,
        scoring: str = 'average_precision'
    ) -> None:
        
        if best_features and not self.selected_features:
            raise ValueError('To use the best features execure find_best_feature_subset first!')

        print("Starting nested CV with hyperparameter search...")

        for model_name, spec in tqdm(self.model_specs.items(), desc="Models"):

            ModelClass = spec['model']
            param_grid = spec['param_grid']

            for target in tqdm(self.target_vars, desc=f"{model_name} targets", leave=False):
                print(f"[STARTED] Model: {model_name} | Target: {target}")

                # Initialize preprocessor with the specified feature columns from the model specs
                preprocessor = self._build_preprocessor(
                    spec['preprocessor_steps'], model_name, target, best_features
                )

                X = self.df[self.feature_cols]  # Always select all features, dropping of unspecified features is handled by the preprocessor
                y = self.df[target]

                outer_skf = StratifiedKFold(n_splits=k_outer, shuffle=True, random_state=self.random_state)
                
                outer_metrics = {'roc_auc': [], 'pr_auc': [], 'f1_macro': []}
                inner_best_scores = []
                inner_best_params = []
                
                # Outer loop over k_outer folds
                for train_idx, test_idx in tqdm(list(outer_skf.split(X, y)), desc="Outer folds", leave=False):
                    
                    # Init data of the current outer fold
                    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
                    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
                    
                    X_train_proc = preprocessor.fit_transform(X_train)
                    X_test_proc = preprocessor.transform(X_test)

                    # Only include relevant fit_params for this model
                    fit_params = self._add_class_weights_to_fit_params(spec['fit_params'], ModelClass, model_name, y_train)
                    model = ModelClass(**fit_params)

                    # Set up inner grid search loop for k_inner folds
                    inner_skf = StratifiedKFold(n_splits=k_inner, shuffle=True, random_state=self.random_state)
                    if spec['search_type'] == 'grid':
                        searcher = GridSearchCV(
                            estimator=model,
                            param_grid=param_grid,
                            cv=inner_skf,
                            scoring=scoring,  # Because highly imbalanced data
                            n_jobs=-1
                        )
                    elif spec['search_type'] == 'optuna':
                        searcher = OptunaSearchCV(
                            estimator=model,
                            param_distributions=param_grid,
                            cv=inner_skf,
                            scoring=scoring,
                            n_trials=n_trials,
                            n_jobs=-1,
                            random_state=self.random_state,
                            verbose=0,
                        )
                    elif spec['search_type'] is None:
                        pass
                    else:
                        raise ValueError("search_type must be 'grid' or 'optuna'")

                    if spec['search_type'] is None:
                        best_params = {}
                    else:
                        # Determine best hyperparameters of this fold using only the training data and not testing
                        # training data is then again split into k_inner folds
                        searcher.fit(X_train_proc, y_train)

                        best_params = searcher.best_params_
                        # Select and store best hyperparam config determined on the training data
                        inner_best_scores.append(searcher.best_score_)
                        inner_best_params.append(best_params)
                    
                    # Refit model with full training data to estimate auc and feature importance of the outer fold
                    temp_fit_params = {**fit_params, 'n_jobs': -1}  # For training without CV set to -1
                    params = dict(best_params, **temp_fit_params)
                    best_model = ModelClass(**params)
                    best_model.fit(X_train_proc, y_train)
                    y_pred = best_model.predict(X_test_proc)
                    y_pred_proba = best_model.predict_proba(X_test_proc)[:, 1]
                    
                    outer_metrics['roc_auc'].append(roc_auc_score(y_test, y_pred_proba))
                    outer_metrics['pr_auc'].append(average_precision_score(y_test, y_pred_proba))
                    outer_metrics['f1_macro'].append(f1_score(y_test, y_pred, average="macro"))

                if spec['search_type'] is None:
                    overall_lambda_star = {}  # No tuned hyperparams
                else:
                    # Select hyperparameters from the inner folds that achieved the highest score
                    best_idx = np.argmax(inner_best_scores)
                    overall_lambda_star = inner_best_params[best_idx]

                # Set class weights again based on the full data
                fit_params = self._add_class_weights_to_fit_params(spec['fit_params'], ModelClass, model_name, y)

                # Retrain final model on all data
                temp_fit_params = {**fit_params, 'n_jobs': -1}  # For training without CV set to -1
                best_params = dict(overall_lambda_star, **temp_fit_params)
                production_model = ModelClass(**best_params)
                
                X_proc = preprocessor.fit_transform(X)
                production_model.fit(X_proc, y)

                # Store production model
                self.best_models[model_name][target] = production_model

                # Store best hyperparameters
                self.best_params[model_name][target] = overall_lambda_star

                # Store metrics report for the avg performance of the model on the current target
                self.metrics_report[model_name][target] = {
                    'mean_roc_auc': np.mean(outer_metrics['roc_auc']),
                    'std_roc_auc': np.std(outer_metrics['roc_auc']),
                    'all_roc_auc': outer_metrics['roc_auc'],
                    'mean_pr_auc': np.mean(outer_metrics['pr_auc']),
                    'std_pr_auc': np.std(outer_metrics['pr_auc']),
                    'all_pr_auc': outer_metrics['pr_auc'],
                    'mean_f1_macro': np.mean(outer_metrics['f1_macro']),
                    'std_f1_macro': np.std(outer_metrics['f1_macro']),
                    'all_f1_macro': outer_metrics['f1_macro'],
                    'best_params': overall_lambda_star,
                }
                print(f"[FINISHED] Model: {model_name} | Target: {target} | Mean ROC-AUC: {np.mean(outer_metrics['roc_auc']):.4f} | Mean PR-AUC: {np.mean(outer_metrics['pr_auc']):.4f}")

        self._save_models_and_reports(out_folder)

    def _save_models_and_reports(self, out_folder: Path):
        """Retrains model on the full dataset and stores production ready model with additional performance reports
        from the k-fold CV evaluations."""

        out_folder.mkdir(parents=True, exist_ok=True)

        # Save best models stored in self.best_models[model_name][target]
        models_dir = out_folder / 'trained_models'
        models_dir.mkdir(exist_ok=True)
        for model_name, targets in self.best_models.items():
            for target, model in targets.items():
                model_path = models_dir / f'{model_name}_{target}.joblib'
                joblib.dump(model, model_path)
                print(f"Saved model for {model_name}/{target} to {model_path}")

        # Save metrics report as csv file
        summary_rows = []
        for model_name, model_results in self.metrics_report.items():
            for target, d in model_results.items():
                summary_rows.append({
                    'model': model_name,
                    'target': target,
                    'mean_roc_auc': d['mean_roc_auc'],
                    'std_roc_auc': d['std_roc_auc'],
                    'all_roc_auc': str(d['all_roc_auc']),
                    'mean_pr_auc': d['mean_pr_auc'],
                    'std_pr_auc': d['std_pr_auc'],
                    'all_pr_auc': str(d['all_pr_auc']),
                    'mean_f1_macro': d['mean_f1_macro'],
                    'std_f1_macro': d['std_f1_macro'],
                    'all_f1_macro': str(d['all_f1_macro']),
                    'best_params': str(d['best_params']),
                })
        pd.DataFrame(summary_rows).to_csv(out_folder / 'cv_metrics_report.csv', index=False)
        print(f"Saved metrics summary to {str(out_folder / 'cv_metrics_report.csv')}")

        # Save hyperparameters
        best_params_path = out_folder / 'best_hyperparameters.pkl'
        with open(best_params_path, 'wb') as f:
            pickle.dump(self.best_params, f)
        print(f"Saved best hyperparameters to {best_params_path}")

        # Save best feature set
        best_features_path = out_folder / 'best_features.pkl'
        with open(best_features_path, 'wb') as f:
            pickle.dump(self.selected_features, f)
        print(f"Saved best features to {best_features_path}")

In [None]:
# Global params shared by all experiments
RANDOM_STATE = 42
# All website-derived feature columns: founding-time website first, then current website.
WEBSITE_FEATURE_COLS = [*FOUNDING_WEBSITE_FEATURE_COLS, *CURRENT_WEBSITE_FEATURE_COLS]

## Experiment A: Reproducibility experiment and model evaluation on the full data set without website data


In [None]:
# Config for replication study using only baseline features
MODEL_SPECS = {
    'vanilla_logreg': {
        'model': LogisticRegression,
        'preprocessor_steps': [
            ('categorical_low_card', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
            ('continuous', StandardScaler()),
            ('binary', 'passthrough'),
        ],
        'features': {
            'binary': [f for f in LOGREG_BINARY_FEATURES if f not in WEBSITE_FEATURE_COLS],
            'categorical_low_card': [f for f in LOGREG_LOW_CAT_FEATURES if f not in WEBSITE_FEATURE_COLS],
            'categorical_high_card': [f for f in LOGREG_HIGH_CAT_FEATURES if f not in WEBSITE_FEATURE_COLS],  # empty
            'continuous': [f for f in LOGREG_CONTINUOUS_FEATURES if f not in WEBSITE_FEATURE_COLS],
        },
        'fit_params': {
            'random_state': RANDOM_STATE,
            'n_jobs': 1,  # Avoid thread thrashing, so model n_jobs should be set to 1 because Grid Search CV and Feature Selection is set to -1
            'max_iter': 10_000,
            'solver': 'saga',  # Fixed for computational efficiency
        },
        'param_grid': {},  # No hyperparams for vanilla LogReg
        'search_type': None,
        'account_for_class_weights': True
    },
    'logreg': {
        'model': LogisticRegression,
        'preprocessor_steps': [
            ('categorical_low_card', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
            ('continuous', StandardScaler()),
            ('binary', 'passthrough'),
        ],
        'features': {
            'binary': [f for f in LOGREG_BINARY_FEATURES if f not in WEBSITE_FEATURE_COLS],
            'categorical_low_card': [f for f in LOGREG_LOW_CAT_FEATURES if f not in WEBSITE_FEATURE_COLS],
            'categorical_high_card': [f for f in LOGREG_HIGH_CAT_FEATURES if f not in WEBSITE_FEATURE_COLS],  # empty
            'continuous': [f for f in LOGREG_CONTINUOUS_FEATURES if f not in WEBSITE_FEATURE_COLS],
        },
        'fit_params': {
            'random_state': RANDOM_STATE,
            'n_jobs': 1,  # Avoid thread thrashing, so model n_jobs should be set to 1 because Grid Search CV and Feature Selection is set to -1
            'max_iter': 10_000,
            'solver': 'saga',  # Fixed for computational efficiency
        },
        'param_grid': {
            'penalty': ['l1', 'l2'],  # Test Lasso and Ridge regularization
            'C': [0.01, 0.1, 1, 10, 100],
        },
        'search_type': 'grid',
        'account_for_class_weights': True
    },
    'rf': {
        'model': RandomForestClassifier,
        'preprocessor_steps': [
            ('categorical_low_card', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
            ('categorical_high_card', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
            ('continuous', 'passthrough'),
            ('binary', 'passthrough'),
        ],
        'features': {
            'binary': [f for f in RF_BINARY_FEATURES if f not in WEBSITE_FEATURE_COLS],
            'categorical_low_card': [f for f in RF_LOW_CAT_FEATURES if f not in WEBSITE_FEATURE_COLS],
            'categorical_high_card': [f for f in RF_HIGH_CAT_FEATURES if f not in WEBSITE_FEATURE_COLS],
            'continuous': [f for f in RF_CONTINUOUS_FEATURES if f not in WEBSITE_FEATURE_COLS],
        },
        'fit_params': {
            'random_state': RANDOM_STATE,
            'n_jobs': 1,
        },
        'param_grid': {
            'n_estimators': IntDistribution(100, 400, step=50),
            'max_depth':  CategoricalDistribution([None, 10, 20, 30]),
            'min_samples_split': IntDistribution(2, 20),
            'min_samples_leaf': IntDistribution(1, 10),
            'max_features': CategoricalDistribution(['sqrt', 'log2', 0.5]),
        },
        'search_type': 'optuna',
        'account_for_class_weights': True
    },
    'xgb': {
        'model': XGBClassifier,
        'preprocessor_steps': [
            ('categorical_low_card', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
            ('categorical_high_card', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
            ('continuous', 'passthrough'),
            ('binary', 'passthrough'),
        ],
        'features': {
            'binary': [f for f in XGB_BINARY_FEATURES if f not in WEBSITE_FEATURE_COLS],
            'categorical_low_card': [f for f in XGB_LOW_CAT_FEATURES if f not in WEBSITE_FEATURE_COLS],
            'categorical_high_card': [f for f in XGB_HIGH_CAT_FEATURES if f not in WEBSITE_FEATURE_COLS],
            'continuous': [f for f in XGB_CONTINUOUS_FEATURES if f not in WEBSITE_FEATURE_COLS],
        },
        'fit_params': {
            'random_state': RANDOM_STATE,
            'n_jobs': 1,
            'objective': 'binary:logistic',
            'verbosity': 0,
            'booster': 'gbtree',
            'tree_method': 'hist',
            'use_label_encoder': False,
            'eval_metric': 'aucpr',
        },
        'param_grid': {
            'max_depth': IntDistribution(3, 10),
            'min_child_weight': IntDistribution(1, 10),
            'gamma': FloatDistribution(0, 5.0),
            'subsample': FloatDistribution(0.5, 1.0),
            'colsample_bytree': FloatDistribution(0.5, 1.0),
            'learning_rate': FloatDistribution(0.005, 0.1, log=True),
            'n_estimators': IntDistribution(100, 400, step=50),
            'reg_alpha': FloatDistribution(0, 5.0),  # L1 regularization
            'reg_lambda': FloatDistribution(1.0, 10.0),  # L2 regularization
            'max_delta_step': IntDistribution(0, 10),
        },
        'search_type': 'optuna',
        'account_for_class_weights': True
    }
}

In [None]:
# Experiment A (baseline): evaluate every model spec on the non-website features only.

# 1. Keep only the columns that are not website-derived
non_website_cols = [col for col in all_feature_df.columns if col not in WEBSITE_FEATURE_COLS]
base_df = all_feature_df[non_website_cols]

# 2. Set up the evaluation for all targets / model specs and hand it the baseline frame
meval = ModelEvaluation(TARGET_COLS, MODEL_SPECS, random_state=RANDOM_STATE)
meval.load_data(base_df)

# 3. Nested CV with hyperparameter search; artefacts are written below MODELS_DIR / 'experiment_A'
out_folder = MODELS_DIR / 'experiment_A'
meval.nested_cv_with_hyperparam_search(out_folder=out_folder)

In [None]:
"""
Experiment B: Performance difference between Doc2vec and my implementation
"""

BEST_MODEL = 'xgb'

MODELS = dict(
    logreg=LogisticRegression,
    rf=RandomForestClassifier,
    xgb=XGBClassifier,
)


def _website_stats(kind):
    """Return the descriptive website statistic columns for one snapshot kind ('founding' or 'current')."""
    suffixes = ('mean_text_len', 'n_internal_links_mean', 'n_external_links_mean', 'n_languages')
    return [f'{kind}_{suffix}' for suffix in suffixes]


def _without_website_features(features):
    """Return `features` with every column listed in WEBSITE_FEATURE_COLS removed (order preserved)."""
    return [f for f in features if f not in WEBSITE_FEATURE_COLS]


def _feature_configs_for(kind, website_stats):
    """Build the Experiment B feature configurations for one snapshot kind.

    Each configuration maps to the extra continuous ('cont') and low-cardinality
    categorical ('low_cat') website columns that are added on top of the base features.
    The '<kind>_base' configuration adds nothing.
    """
    score_features_by_variant = {
        'doc2vec': [f'{kind}_doc2vec_diff'],
        'dim768': [f'{kind}_pr_sdg_similarity', f'{kind}_lp', f'{kind}_vp'],
        'dim768_w': [f'{kind}_pr_w_sdg_similarity', f'{kind}_lp_w', f'{kind}_vp_w'],
        'dim300_w': [f'{kind}_pr_w_red_sdg_similarity', f'{kind}_lp_w_red', f'{kind}_vp_w_red'],
    }
    configs = {f'{kind}_base': {'cont': [], 'low_cat': []}}
    for variant, score_features in score_features_by_variant.items():
        configs[f'{kind}_{variant}'] = {
            'cont': score_features + website_stats,
            'low_cat': [f'{kind}_dominant_language'],
        }
    return configs


FOUNDING_WEBSITE_STATS = _website_stats('founding')
CURRENT_WEBSITE_STATS = _website_stats('current')

# Base (non-website) feature groups, taken from the XGBoost feature lists
BINARY_BASE = _without_website_features(XGB_BINARY_FEATURES)
LOW_CAT_BASE = _without_website_features(XGB_LOW_CAT_FEATURES)
HIGH_CAT_BASE = _without_website_features(XGB_HIGH_CAT_FEATURES)
CONTINUOUS_BASE = _without_website_features(XGB_CONTINUOUS_FEATURES)

# Founding configurations first, then current ones; the experiment loop iterates in this order
FEATURE_CONFIGS = {
    **_feature_configs_for('founding', FOUNDING_WEBSITE_STATS),
    **_feature_configs_for('current', CURRENT_WEBSITE_STATS),
}

# Run the nested-CV evaluation once per website feature configuration of Experiment B
for experiment_config, website_features in FEATURE_CONFIGS.items():
    print(f'START CONDUCTING EXPERIMENT B FOR: {experiment_config}')

    # 1. Set model specs: XGBoost on the base features plus this configuration's website features.
    # NOTE: this rebinds the notebook-level MODEL_SPECS name on every iteration.
    MODEL_SPECS = {
        'xgb': {
            'model': XGBClassifier,
            'preprocessor_steps': [
                ('categorical_low_card', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
                ('categorical_high_card', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
                ('continuous', 'passthrough'),
                ('binary', 'passthrough'),
            ],
            'features': {
                'binary': BINARY_BASE,
                'categorical_low_card': LOW_CAT_BASE + website_features['low_cat'],
                'categorical_high_card': HIGH_CAT_BASE,
                'continuous': CONTINUOUS_BASE + website_features['cont'],
            },
            'fit_params': {
                'random_state': RANDOM_STATE,
                'n_jobs': 1,
                'objective': 'binary:logistic',
                'verbosity': 0,
                'booster': 'gbtree',
                'tree_method': 'hist',
                'use_label_encoder': False,
                'eval_metric': 'aucpr',
            },
            'param_grid': {
                'max_depth': IntDistribution(3, 10),
                'min_child_weight': IntDistribution(1, 10),
                'gamma': FloatDistribution(0, 5.0),
                'subsample': FloatDistribution(0.5, 1.0),
                'colsample_bytree': FloatDistribution(0.5, 1.0),
                'learning_rate': FloatDistribution(0.005, 0.1, log=True),
                'n_estimators': IntDistribution(100, 400, step=50),
                'reg_alpha': FloatDistribution(0, 5.0),  # L1 regularization
                'reg_lambda': FloatDistribution(1.0, 10.0),  # L2 regularization
                'max_delta_step': IntDistribution(0, 10),
            },
            'search_type': 'optuna',
            'account_for_class_weights': True
        }
    }

    # 2. Load data for experiment B: keep only rows that have the relevant website snapshot
    #    (non-missing '<kind>_vp') and drop the columns of the other snapshot kind.
    if 'current' in experiment_config:
        snapshot_col, excluded_cols = 'current_vp', FOUNDING_WEBSITE_FEATURE_COLS
    elif 'founding' in experiment_config:
        snapshot_col, excluded_cols = 'founding_vp', CURRENT_WEBSITE_FEATURE_COLS
    else:
        # Without this guard an unrecognised name would silently reuse the previous iteration's frame
        raise ValueError(
            f"Cannot infer snapshot kind ('founding' or 'current') from config name: {experiment_config!r}"
        )

    website_df = all_feature_df[~all_feature_df[snapshot_col].isna()][[col for col in all_feature_df.columns if col not in excluded_cols]].copy()

    targets = TARGET_COLS

    # 3. Initialize model evaluation with targets and model specs
    meval = ModelEvaluation(targets, MODEL_SPECS, random_state=RANDOM_STATE)
    meval.load_data(website_df)

    # 4. Nested CV with hyperparameter search; one output folder per feature configuration
    out_folder = MODELS_DIR / 'experiment_B' / experiment_config
    meval.nested_cv_with_hyperparam_search(out_folder=out_folder, k_outer=10)

In [None]:
# Inspection cell: display the names of the Experiment B feature configurations.
FEATURE_CONFIGS.keys()

In [None]:
# Inspection cell: display the full mapping of configuration name -> added website feature columns.
FEATURE_CONFIGS

In [None]:
def get_per_fold_metric(df, target, metric_col):
    """Return the per-fold metric values stored for `target` in a CV metrics report.

    Parameters
    ----------
    df : pd.DataFrame
        Report with a 'target' column and one column per metric.
    target : str
        Value of the 'target' column to look up (the first matching row is used).
    metric_col : str
        Name of the column holding the per-fold values.

    Returns
    -------
    The per-fold values, or None when the report has no row for `target`.
    A string cell (as produced by a CSV round-trip) is parsed with
    `ast.literal_eval`; any other cell value is returned unchanged.
    """
    values = df.loc[df['target'] == target, metric_col].values
    if len(values) == 0:
        return None

    raw = values[0]
    # Reports read back from CSV hold the list as its string repr; in-memory reports may hold the list itself.
    if isinstance(raw, str):
        return ast.literal_eval(raw)
    return raw

# Compare every Experiment B configuration against the base configuration of the same kind with
# Welch's unequal-variance t-test on the per-fold scores, separately per target and metric.
# NOTE: the '<kind>_base' configuration itself is part of the loop, so the report also contains
# base-vs-base rows with a zero difference.
results = []

for kind in ['founding', 'current']:

    exp_B_base_df = pd.read_csv(MODELS_DIR / 'experiment_B' / f'{kind}_base' / 'cv_metrics_report.csv')

    for report_dir in [col for col in FEATURE_CONFIGS.keys() if kind in col]:

        # The report does not depend on the target, so read it once per configuration
        comp_df = pd.read_csv(MODELS_DIR / 'experiment_B' / report_dir / 'cv_metrics_report.csv')

        for target in TARGET_COLS:

            for metric in ["all_roc_auc", "all_pr_auc"]:
                # Per-fold scores of the website configuration and of the base configuration
                web_ap = get_per_fold_metric(comp_df, target, metric)
                base_ap = get_per_fold_metric(exp_B_base_df, target, metric)
                if web_ap is None or base_ap is None:
                    # No row for this target in one of the reports: nothing to compare
                    continue
                web_ap, base_ap = np.array(web_ap), np.array(base_ap)

                mean_web, mean_base = np.mean(web_ap), np.mean(base_ap)
                std_web, std_base = np.std(web_ap, ddof=1), np.std(base_ap, ddof=1)
                n_web, n_base = len(web_ap), len(base_ap)

                # Welch's SE and df (Welch-Satterthwaite approximation)
                se_diff = np.sqrt(std_web**2/n_web + std_base**2/n_base)
                degrees_of_freedom = (std_web**2 / n_web + std_base**2 / n_base)**2 / ((std_web**2 / n_web)**2 / (n_web-1) + (std_base**2 / n_base)**2 / (n_base-1))

                diff = mean_web - mean_base

                # Two-sided p-value; a zero standard error is treated as "no effect"
                t_stat = diff / se_diff if se_diff > 0 else 0
                p_value = 2 * t.sf(np.abs(t_stat), degrees_of_freedom)

                # Confidence intervals of the mean difference at the 99 / 95 / 90 % levels
                ci = {}
                for alpha, label in zip([0.01, 0.05, 0.1], ['99', '95', '90']):
                    t_crit = t.ppf(1 - alpha/2, degrees_of_freedom)
                    ci[f"ci_lower_{label}"] = diff - t_crit * se_diff
                    ci[f"ci_upper_{label}"] = diff + t_crit * se_diff

                # The 'mean_ap_*' keys hold the mean of whichever metric is named in 'metric'
                results.append({
                    'model': report_dir,
                    'metric': metric,
                    'metric_value': mean_web,
                    'target': target,
                    'mean_ap_website': mean_web,
                    'mean_ap_base': mean_base,
                    'p_value': p_value,
                    **ci,
                })

results_df = pd.DataFrame(results)
results_df.to_csv(MODELS_DIR / 'experiment_B' / 'individual_significance_report.csv', index=False)

In [None]:
# Pool the per-fold improvements over the base configuration across all website configurations
# of one kind and test whether the pooled mean improvement differs from zero.
# NOTE(review): the one-sample t-test treats all pooled fold differences as independent observations.
results = []

for kind in ['founding', 'current']:

    # Deliberately not named `base_df`: that name holds the Experiment A feature frame used by other cells
    base_report_df = pd.read_csv(MODELS_DIR / 'experiment_B' / f'{kind}_base' / 'cv_metrics_report.csv')

    # Read every website report once instead of once per target and metric
    website_reports = {
        report_dir: pd.read_csv(MODELS_DIR / 'experiment_B' / report_dir / 'cv_metrics_report.csv')
        for report_dir in FEATURE_CONFIGS.keys()
        if kind in report_dir and report_dir != f'{kind}_base'
    }

    # For each target, for each metric, pool differences from all website models
    for target in TARGET_COLS:
        for metric in ["all_roc_auc", "all_pr_auc"]:
            base_scores = get_per_fold_metric(base_report_df, target, metric)
            if base_scores is None:
                # Without base scores there is nothing to compare against
                continue
            base_scores = np.array(base_scores)

            all_diffs = []
            for report_df in website_reports.values():
                web_scores = get_per_fold_metric(report_df, target, metric)
                if web_scores is None:
                    continue
                all_diffs.extend(np.array(web_scores) - base_scores)

            all_diffs = np.array(all_diffs)
            if len(all_diffs) == 0:
                continue
            mean_diff = np.mean(all_diffs)
            mean_diff_pct = np.round(mean_diff / np.mean(base_scores) * 100, decimals=1)
            std_diff = np.std(all_diffs, ddof=1)
            n = len(all_diffs)
            se = std_diff / np.sqrt(n)

            # t-test and p-value
            t_stat, p_value = ttest_1samp(all_diffs, 0.0)

            # Half-widths of the confidence intervals
            ci_99 = t.ppf(0.995, n-1) * se
            ci_95 = t.ppf(0.975, n-1) * se
            ci_90 = t.ppf(0.95, n-1) * se

            results.append({
                'kind': kind,
                'target': target,
                'metric': metric,
                'mean_improvement': mean_diff,
                'mean_improvement_pct': mean_diff_pct,
                'std': std_diff,
                'n': n,
                'p_value': p_value,
                'ci_lower_99': mean_diff - ci_99,
                'ci_upper_99': mean_diff + ci_99,
                'ci_lower_95': mean_diff - ci_95,
                'ci_upper_95': mean_diff + ci_95,
                'ci_lower_90': mean_diff - ci_90,
                'ci_upper_90': mean_diff + ci_90,
            })

results_df = pd.DataFrame(results)
results_df.to_csv(MODELS_DIR / 'experiment_B' / 'average_significance_report.csv', index=False)

In [None]:
# List the individual comparisons by significance band: p < 0.01, 0.01 <= p < 0.05, 0.05 <= p < 0.10
df = pd.read_csv(MODELS_DIR / 'experiment_B' / 'individual_significance_report.csv')
for level, cutoff, previous in [('***', 0.01, 0.0), ('**', 0.05, 0.01), ('*', 0.10, 0.05)]:
    sig = df[(df['p_value'] < cutoff) & (df['p_value'] >= previous)]
    print(f"\nSignificant at {level} ({cutoff}):")
    for _, row in sig.iterrows():
        # Δ is the mean score difference (website - base); p is the p-value stored in the report
        print(f"{row['model']}, {row['metric']}, {row['target']}, Δ={row['mean_ap_website'] - row['mean_ap_base']:.4f}, p={row['p_value']:.3g}")


In [None]:
# Positive-to-negative class ratio of every target in the three samples used by the experiments.

# Derived here rather than taken from a `base_df` left behind by another cell, which may have been rebound
base_feature_df = all_feature_df[[col for col in all_feature_df.columns if col not in WEBSITE_FEATURE_COLS]]
# Same row/column selection as in the Experiment B loop: rows with the snapshot, minus the other kind's columns
founding_website_df = all_feature_df[~all_feature_df['founding_vp'].isna()][[col for col in all_feature_df.columns if col not in CURRENT_WEBSITE_FEATURE_COLS]].copy()
current_website_df = all_feature_df[~all_feature_df['current_vp'].isna()][[col for col in all_feature_df.columns if col not in FOUNDING_WEBSITE_FEATURE_COLS]].copy()

for target in TARGET_COLS:
    for label, frame in [
        ('Base data:', base_feature_df),
        ('Founding website data:', founding_website_df),
        ('Current website data:', current_website_df),
    ]:
        print(label)
        print(target, len(frame[frame[target] == 1]) / len(frame[frame[target] == 0]))

In [None]:
# Inspection cell: display the list of target columns.
TARGET_COLS

# EXPERIMENT C

In [228]:
"""
Experiment C: Regression analysis of Embedding Scores
"""

query_green = """ 
    SELECT * FROM zefix.green_binary WHERE is_green;
"""

# Fetch the rows flagged as green from the database
with connect_database() as con:
    df_green = read_from_database(connection=con, query=query_green)

# Columns kept for the regression analysis; the two identifiers are only needed for the merge
regression_columns = (
    ['ehraid', 'uid']
    + TARGET_COLS
    + ALL_BINARY_FEATURE_COLS
    + ALL_CATEGORICAL_FEATURE_COLS
    + ALL_CONTINUOUS_FEATURE_COLS
    + ['founding_year']
)

# NOTE: this rebinds `company_sample` and removes 'ehraid' / 'uid', so the cell cannot be re-run
# without first reloading the sample.
company_sample = (
    company_sample[regression_columns]
    .merge(df_green, on='uid', how='left')
    # Companies without a match in the green table get is_green = 0
    .assign(is_green=lambda merged: merged['is_green'].fillna(0).astype(int))
    .drop(columns=['ehraid', 'uid'])
)

In [238]:
# Categorical control variables
CAT_CONTROLS = [
    'founding_legal_form',
    'division_1_label',
    'typology_9c',
    'canton_id',
    'bps_length_quantiles_5',
    'founding_year',
]
# Binary and continuous logistic-regression features that are not website features of either snapshot
OTHER_CONTROLS = [
    c
    for c in LOGREG_BINARY_FEATURES + LOGREG_CONTINUOUS_FEATURES
    if c not in CURRENT_WEBSITE_FEATURE_COLS and c not in FOUNDING_WEBSITE_FEATURE_COLS
]

In [231]:
# Embedding score columns that are standardised before the regressions
score_cols = [
    'founding_doc2vec_diff',
    'current_doc2vec_diff',
    # Used as a score in the 'Green' regression setups; listed here so it is standardised like its
    # 'current_' counterpart below
    'founding_pr_sdg_similarity',
    'founding_pr_w_sdg_similarity',
    'founding_pr_w_red_sdg_similarity',
    'founding_lp_w',
    'founding_lp_w_red',
    'founding_vp_w',
    'founding_vp_w_red',
    'current_pr_sdg_similarity',
    'current_pr_w_sdg_similarity',
    'current_pr_w_red_sdg_similarity',
    'current_lp_w',
    'current_lp_w_red',
    'current_vp_w',
    'current_vp_w_red',
    'founding_core_diff_w_red_pca',
    'current_core_diff_w_red_pca'
]

# First principal component of the (lp_w_red, vp_w_red) pair, per snapshot kind. Rows with a
# missing input keep NaN in the new column.
# NOTE: run this cell once on freshly loaded data. The scaling step below overwrites the lp/vp
# inputs in place, so a re-run would fit the PCA on already standardised columns.
pca = PCA(n_components=1)
for prefix in ('founding', 'current'):
    component_cols = [f'{prefix}_lp_w_red', f'{prefix}_vp_w_red']
    mask = company_sample[component_cols].notnull().all(axis=1)
    # fit_transform returns an (n_rows, 1) array; flatten it for assignment to a single column
    company_sample.loc[mask, f'{prefix}_core_diff_w_red_pca'] = pca.fit_transform(
        company_sample.loc[mask, component_cols]
    ).ravel()

# Standardise scores and continuous features; de-duplicate in case a score column is also
# listed in LOGREG_CONTINUOUS_FEATURES
scaled_cols = list(dict.fromkeys(score_cols + LOGREG_CONTINUOUS_FEATURES))
scaler = StandardScaler()
company_sample[scaled_cols] = scaler.fit_transform(company_sample[scaled_cols])

In [246]:
class CoefficientAnalyser:
    def __init__(self, df: pd.DataFrame, experiment_dir: str, maxiter: int = 1500, random_state: int = 42):
        self.df = df
        self.experiment_dir = Path(experiment_dir)
        self.experiment_dir.mkdir(exist_ok=True, parents=True)
        self.maxiter = maxiter
        self.random_state = random_state

    @staticmethod
    def drop_perfect_separation(df, target, col):
        keep = df.groupby(col, observed=False)[target].nunique()
        keep = keep[keep > 1].index  # only keep categories that have both 0 and 1
        return df[df[col].isin(keep)]

    @staticmethod
    def collapse_and_drop_sparse(df, target, cat_controls, min_count=25):
        # Collapse rare industries
        for col in cat_controls:
            df[col] = df[col].astype(str)
            vc = df[col].value_counts()
            rare = vc[vc < min_count].index
            df.loc[:, col] = df[col].replace(rare, 'Other')

        # Drop no-variation groups
        group_sizes = df.groupby(cat_controls, observed=False)[target].nunique()
        valid_groups = group_sizes[group_sizes > 1].index
        mask = df.set_index(cat_controls).index.isin(valid_groups)
        return df[mask].copy()

    def _estimate_performance(
        self, df, formula, target, score_set, cat_controls, k_folds
    ) -> dict:
        df = df.copy()
        
        skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=self.random_state)
        X, y = df.drop(columns=[target]), df[target]
        aucs, ap_aucs = [], []
        valid_folds = 0
        for train_idx, test_idx in skf.split(X, y):
            train_df, test_df = df.iloc[train_idx].copy(), df.iloc[test_idx].copy()
            """
            # Collapse and drop rare categories inside the train fold
            train_df = self.collapse_and_drop_sparse(train_df, target, cat_controls)
            print(len(train_df))
            # Keep only test samples where categories exist in train_df
            for col in cat_controls:
                allowed_cats = set(train_df[col].unique())
                test_df = test_df[test_df[col].isin(allowed_cats)]
            if len(train_df) < 25_000 or len(test_df) < 2500:
                continue  # skip if not enough data
            """
            try:
                model = smf.logit(formula=formula, data=train_df).fit(disp=0, cov_type='HC1', maxiter=self.maxiter)
            except Exception as e:
                continue
            y_pred = model.predict(test_df)
            y_true = test_df[target]
            aucs.append(roc_auc_score(y_true, y_pred))
            ap_aucs.append(average_precision_score(y_true, y_pred))
            valid_folds += 1

        return {
            'target': target,
            'score': '+'.join(score_set),
            'valid_folds': valid_folds,
            'roc_aucs': aucs,
            'mean_roc_auc': np.mean(aucs) if aucs else np.nan,
            'std_roc_auc': np.std(aucs) if aucs else np.nan,
            'pr_aucs': ap_aucs,
            'mean_pr_auc': np.mean(ap_aucs) if ap_aucs else np.nan,
            'std_pr_auc': np.std(ap_aucs) if ap_aucs else np.nan,
        }

    def estimate(
        self,
        targets: list[str | tuple[str, str]],
        score_cols,
        cat_controls,
        other_controls,
        cat_interaction_terms: list[tuple[str, str]],
        other_interaction_terms: list[tuple[str, str]],
        k_folds: int = 10,
        save_full_summary=True,
        subfolder='reg_results'
    ):
        """
        targets: list of target column names (binary outcome)
        score_cols: list of column names (strings) OR list of list of strings (for multi-score models)
            e.g., ['current_vp', 'current_lp']
               or [ ['current_vp'], ['current_lp'], ['current_vp', 'current_lp'] ]
        controls: list of categorical control variable names (for fixed effects)
        """
        summary_rows = []
        auc_rows = []
        out_folder = self.experiment_dir / subfolder
        summary_folder = out_folder / 'summaries'
        out_folder.mkdir(exist_ok=True, parents=True)
        summary_folder.mkdir(exist_ok=True, parents=True)

        # Ensure score_cols is a list of lists
        score_cols = [col if isinstance(col, list) else [col] for col in score_cols]
        for target_col in targets:

            if isinstance(target_col, tuple):
                target = '_or_'.join(target_col)
                self.df[target] = self.df[list(target_col)].max(axis=1)
            else:
                target = target_col

            for score_set in score_cols:
                cols_needed = [target] + list(score_set)
                for col in [cat_controls, other_controls]:
                    if col:
                        cols_needed += col
                reg_df = self.df.replace([np.inf, -np.inf], np.nan)\
                                .dropna(subset=cols_needed).copy()

                reg_df[target] = reg_df[target].astype(int)

                reg_df = self.collapse_and_drop_sparse(reg_df, target, cat_controls)

                # Drop perfect separation categories
                for control in cat_controls:
                    reg_df = self.drop_perfect_separation(reg_df, target, control)

                # Build formula
                terms = list(score_set)
                if cat_controls:
                    terms.extend([f'C({c})' for c in cat_controls])
                if other_controls:
                    terms.extend(other_controls)
                if cat_interaction_terms:
                    terms.extend([f'C({c1}):C({c2})' for c1, c2 in cat_interaction_terms])
                    cols_needed += cat_interaction_terms
                if other_interaction_terms:
                    terms.extend([f'{c1} * {c2}' for c1, c2 in other_interaction_terms])
                    cols_needed += other_interaction_terms

                formula = f"{target} ~ {' + '.join(terms)}"
                print(f"Fitting: {formula} (n={len(reg_df)})")

                try:
                    result = smf.logit(formula=formula, data=reg_df).fit(disp=0, cov_type='HC1', maxiter=self.maxiter)
                    pseudo_r2 = 1 - result.llf/result.llnull

                    # Save all score coefs
                    for score in score_set:
                        summary_rows.append({
                            'target': target,
                            'score': '+'.join(score_set),
                            'coef_name': score,
                            'coef': result.params.get(score, np.nan),
                            'std_err': result.bse.get(score, np.nan),
                            'pval': result.pvalues.get(score, np.nan),
                            'pseudo_r2': pseudo_r2,
                            'n_obs': len(reg_df)
                        })
                    if save_full_summary:
                        fname = summary_folder / f"reg_summary_{target}_{'+'.join(score_set)}.txt"
                        with open(fname, 'w') as f:
                            f.write(result.summary().as_text())
                    print(f"Successfully fitted full data for: {formula}")

                except Exception as e:
                    print(f"Fitting error with {target}, {score_set}: {e}")

                try:
                    auc_rows.append(self._estimate_performance(
                        reg_df[list(set(cols_needed))],
                        formula,
                        target,
                        score_set,
                        cat_controls,
                        k_folds
                    ))

                except Exception as e:
                    print(f"CV error with {target}, {score_set}: {e}")


        pd.DataFrame(summary_rows).to_csv(out_folder / 'report_regression_results.csv', index=False)
        pd.DataFrame(auc_rows).to_csv(out_folder / 'report_auc_scores.csv', index=False)
        print(f"\nSaved regression summaries and AUC scores to {out_folder}")

In [None]:
    # Stand-alone 'Full Controls' setup: same targets and score sets as the fixed-effect setups,
    # but with the complete CAT_CONTROLS / OTHER_CONTROLS lists. This cell only displays the dict;
    # it is not an element of the `experiment_setups` list that is passed to `analyser.estimate`.
    {
        'title': 'Full Controls', 
        'targets': [
            ('target_inno_subsidy', 'target_non_gov_investment', 'target_acquisition'), 
            ('target_inno_subsidy', 'target_non_gov_investment'),
            'target_inno_subsidy', 'target_non_gov_investment', 'target_acquisition'
        ],
        'score_cols': [
            'founding_doc2vec_diff', 
            ['founding_core_diff_w_red_pca', 'founding_pr_w_red_sdg_similarity'], 
            'current_doc2vec_diff', 
            ['current_core_diff_w_red_pca', 'current_pr_w_red_sdg_similarity']
        ],
        'cat_controls': CAT_CONTROLS,
        'other_controls': OTHER_CONTROLS,
    }

In [None]:
# NOTE(review): results of this regression analysis (Experiment C) are written below
# MODELS_DIR / 'experiment_B' - confirm that this output location is intended.
analyser = CoefficientAnalyser(company_sample, experiment_dir=MODELS_DIR / 'experiment_B')

# Increasingly rich sets of fixed effects; each level is run once for the success targets
# and once for the 'is_green' target.
fixed_effect_levels = [
    ('Year FEs', ['founding_year']),
    ('Year + Industry FEs', ['founding_year', 'division_1_label']),
    ('Year + Industry + Canton FEs', ['founding_year', 'division_1_label', 'canton_id']),
]

# Success targets (combined and individual) against the doc2vec and embedding-based scores
success_setups = [
    {
        'title': title,
        'targets': [
            ('target_inno_subsidy', 'target_non_gov_investment', 'target_acquisition'),
            ('target_inno_subsidy', 'target_non_gov_investment'),
            'target_inno_subsidy',
            'target_non_gov_investment',
            'target_acquisition'
        ],
        'score_cols': [
            'founding_doc2vec_diff',
            ['founding_core_diff_w_red_pca', 'founding_pr_w_red_sdg_similarity'],
            'current_doc2vec_diff',
            ['current_core_diff_w_red_pca', 'current_pr_w_red_sdg_similarity']
        ],
        'cat_controls': list(fixed_effects),
    }
    for title, fixed_effects in fixed_effect_levels
]

# Green label against each SDG-similarity score variant, one score per model
green_setups = [
    {
        'title': f'Green - {title}',
        'targets': ['is_green'],
        'score_cols': [
            'founding_pr_sdg_similarity',
            'founding_pr_w_sdg_similarity',
            'founding_pr_w_red_sdg_similarity',
            'current_pr_sdg_similarity',
            'current_pr_w_sdg_similarity',
            'current_pr_w_red_sdg_similarity'
        ],
        'cat_controls': list(fixed_effects),
    }
    for title, fixed_effects in fixed_effect_levels
]

experiment_setups = success_setups + green_setups

# Keys that a setup does not define are passed on as None
for experiment in experiment_setups:
    analyser.estimate(
        targets=experiment.get('targets'),
        score_cols=experiment.get('score_cols'),
        cat_controls=experiment.get('cat_controls'),
        other_controls=experiment.get('other_controls'),
        cat_interaction_terms=experiment.get('cat_interaction_terms'),
        other_interaction_terms=experiment.get('other_interaction_terms'),
        subfolder=experiment.get('title'),
        save_full_summary=True
    )

Fitting: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ founding_doc2vec_diff + C(founding_year) (n=39699)
Successfully fitted full data for: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ founding_doc2vec_diff + C(founding_year)
Fitting: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ founding_core_diff_w_red_pca + founding_pr_w_red_sdg_similarity + C(founding_year) (n=39926)
Successfully fitted full data for: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ founding_core_diff_w_red_pca + founding_pr_w_red_sdg_similarity + C(founding_year)
Fitting: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ current_doc2vec_diff + C(founding_year) (n=39891)
Successfully fitted full data for: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ current_doc2vec_diff + C(founding_year)
Fitting: target_inno_subsidy_or_target_non_gov_investment_or_target_acqu

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ founding_core_diff_w_red_pca + founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) (n=34060)
Successfully fitted full data for: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ founding_core_diff_w_red_pca + founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ current_doc2vec_diff + C(founding_year) + C(division_1_label) (n=32888)
Successfully fitted full data for: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ current_doc2vec_diff + C(founding_year) + C(division_1_label)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))


Fitting: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ current_core_diff_w_red_pca + current_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) (n=33028)
Successfully fitted full data for: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ current_core_diff_w_red_pca + current_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label)


  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_inno_subsidy_or_target_non_gov_investment ~ founding_doc2vec_diff + C(founding_year) + C(division_1_label) (n=30861)
Successfully fitted full data for: target_inno_subsidy_or_target_non_gov_investment ~ founding_doc2vec_diff + C(founding_year) + C(division_1_label)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_inno_subsidy_or_target_non_gov_investment ~ founding_core_diff_w_red_pca + founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) (n=31053)
Successfully fitted full data for: target_inno_subsidy_or_target_non_gov_investment ~ founding_core_diff_w_red_pca + founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_inno_subsidy_or_target_non_gov_investment ~ current_doc2vec_diff + C(founding_year) + C(division_1_label) (n=29833)
Successfully fitted full data for: target_inno_subsidy_or_target_non_gov_investment ~ current_doc2vec_diff + C(founding_year) + C(division_1_label)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_inno_subsidy_or_target_non_gov_investment ~ current_core_diff_w_red_pca + current_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) (n=29965)
Successfully fitted full data for: target_inno_subsidy_or_target_non_gov_investment ~ current_core_diff_w_red_pca + current_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_inno_subsidy ~ founding_doc2vec_diff + C(founding_year) + C(division_1_label) (n=28990)
Successfully fitted full data for: target_inno_subsidy ~ founding_doc2vec_diff + C(founding_year) + C(division_1_label)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_inno_subsidy ~ founding_core_diff_w_red_pca + founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) (n=29169)
Successfully fitted full data for: target_inno_subsidy ~ founding_core_diff_w_red_pca + founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_inno_subsidy ~ current_doc2vec_diff + C(founding_year) + C(division_1_label) (n=27981)
Successfully fitted full data for: target_inno_subsidy ~ current_doc2vec_diff + C(founding_year) + C(division_1_label)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_inno_subsidy ~ current_core_diff_w_red_pca + current_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) (n=28077)
Successfully fitted full data for: target_inno_subsidy ~ current_core_diff_w_red_pca + current_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_non_gov_investment ~ founding_doc2vec_diff + C(founding_year) + C(division_1_label) (n=22086)
Successfully fitted full data for: target_non_gov_investment ~ founding_doc2vec_diff + C(founding_year) + C(division_1_label)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_non_gov_investment ~ founding_core_diff_w_red_pca + founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) (n=22216)
Successfully fitted full data for: target_non_gov_investment ~ founding_core_diff_w_red_pca + founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))


Fitting: target_non_gov_investment ~ current_doc2vec_diff + C(founding_year) + C(division_1_label) (n=23970)
Successfully fitted full data for: target_non_gov_investment ~ current_doc2vec_diff + C(founding_year) + C(division_1_label)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_non_gov_investment ~ current_core_diff_w_red_pca + current_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) (n=24084)
Successfully fitted full data for: target_non_gov_investment ~ current_core_diff_w_red_pca + current_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))


Fitting: target_acquisition ~ founding_doc2vec_diff + C(founding_year) + C(division_1_label) (n=24366)
Successfully fitted full data for: target_acquisition ~ founding_doc2vec_diff + C(founding_year) + C(division_1_label)


  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_acquisition ~ founding_core_diff_w_red_pca + founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) (n=24505)
Successfully fitted full data for: target_acquisition ~ founding_core_diff_w_red_pca + founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_acquisition ~ current_doc2vec_diff + C(founding_year) + C(division_1_label) (n=20193)
Successfully fitted full data for: target_acquisition ~ current_doc2vec_diff + C(founding_year) + C(division_1_label)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_acquisition ~ current_core_diff_w_red_pca + current_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) (n=20396)
Successfully fitted full data for: target_acquisition ~ current_core_diff_w_red_pca + current_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))



Saved regression summaries and AUC scores to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_C/Year + Industry FEs
Fitting: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ founding_doc2vec_diff + C(founding_year) + C(division_1_label) + C(canton_id) (n=12905)
Successfully fitted full data for: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ founding_doc2vec_diff + C(founding_year) + C(division_1_label) + C(canton_id)




CV error with target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition, ['founding_doc2vec_diff']: predict requires that you use a DataFrame when predicting from a model
that was created using the formula api.

The original error message returned by patsy is:
Error converting data to categorical: observation with value '18.0' does not match any of the expected levels (expected: ['1.0', '10.0', ..., '96.0', 'Other'])
    target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ founding_doc2vec_diff + C(founding_year) + C(division_1_label) + C(canton_id)
                                                                                                                        ^^^^^^^^^^^^^^^^^^^
Fitting: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ founding_core_diff_w_red_pca + founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id) (n=12988)
Successfully fitted full data for: target_inno_subsidy_or_t

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))


Fitting: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ current_doc2vec_diff + C(founding_year) + C(division_1_label) + C(canton_id) (n=12211)
Successfully fitted full data for: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ current_doc2vec_diff + C(founding_year) + C(division_1_label) + C(canton_id)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


CV error with target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition, ['current_doc2vec_diff']: predict requires that you use a DataFrame when predicting from a model
that was created using the formula api.

The original error message returned by patsy is:
Error converting data to categorical: observation with value '42.0' does not match any of the expected levels (expected: ['1.0', '10.0', ..., '96.0', 'Other'])
    target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ current_doc2vec_diff + C(founding_year) + C(division_1_label) + C(canton_id)
                                                                                                                       ^^^^^^^^^^^^^^^^^^^
Fitting: target_inno_subsidy_or_target_non_gov_investment_or_target_acquisition ~ current_core_diff_w_red_pca + current_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id) (n=12273)
Successfully fitted full data for: target_inno_subsidy_or_target

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_inno_subsidy_or_target_non_gov_investment ~ founding_doc2vec_diff + C(founding_year) + C(division_1_label) + C(canton_id) (n=10717)
Successfully fitted full data for: target_inno_subsidy_or_target_non_gov_investment ~ founding_doc2vec_diff + C(founding_year) + C(division_1_label) + C(canton_id)


  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_inno_subsidy_or_target_non_gov_investment ~ founding_core_diff_w_red_pca + founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id) (n=10786)
Successfully fitted full data for: target_inno_subsidy_or_target_non_gov_investment ~ founding_core_diff_w_red_pca + founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id)




CV error with target_inno_subsidy_or_target_non_gov_investment, ['founding_core_diff_w_red_pca', 'founding_pr_w_red_sdg_similarity']: predict requires that you use a DataFrame when predicting from a model
that was created using the formula api.

The original error message returned by patsy is:
Error converting data to categorical: observation with value '6.0' does not match any of the expected levels (expected: ['1.0', '10.0', ..., '8.0', '9.0'])
    target_inno_subsidy_or_target_non_gov_investment ~ founding_core_diff_w_red_pca + founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id)
                                                                                                                                                                  ^^^^^^^^^^^^
Fitting: target_inno_subsidy_or_target_non_gov_investment ~ current_doc2vec_diff + C(founding_year) + C(division_1_label) + C(canton_id) (n=10850)
Successfully fitted full data for: target_inno_subsi

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_inno_subsidy_or_target_non_gov_investment ~ current_core_diff_w_red_pca + current_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id) (n=10900)
Successfully fitted full data for: target_inno_subsidy_or_target_non_gov_investment ~ current_core_diff_w_red_pca + current_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_inno_subsidy ~ founding_doc2vec_diff + C(founding_year) + C(division_1_label) + C(canton_id) (n=9297)
Successfully fitted full data for: target_inno_subsidy ~ founding_doc2vec_diff + C(founding_year) + C(division_1_label) + C(canton_id)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_inno_subsidy ~ founding_core_diff_w_red_pca + founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id) (n=9357)
Successfully fitted full data for: target_inno_subsidy ~ founding_core_diff_w_red_pca + founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


CV error with target_inno_subsidy, ['founding_core_diff_w_red_pca', 'founding_pr_w_red_sdg_similarity']: predict requires that you use a DataFrame when predicting from a model
that was created using the formula api.

The original error message returned by patsy is:
Error converting data to categorical: observation with value '6.0' does not match any of the expected levels (expected: ['1.0', '10.0', ..., '8.0', '9.0'])
    target_inno_subsidy ~ founding_core_diff_w_red_pca + founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id)
                                                                                                                                     ^^^^^^^^^^^^
Fitting: target_inno_subsidy ~ current_doc2vec_diff + C(founding_year) + C(division_1_label) + C(canton_id) (n=9216)
Successfully fitted full data for: target_inno_subsidy ~ current_doc2vec_diff + C(founding_year) + C(division_1_label) + C(canton_id)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_inno_subsidy ~ current_core_diff_w_red_pca + current_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id) (n=9251)
Successfully fitted full data for: target_inno_subsidy ~ current_core_diff_w_red_pca + current_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_non_gov_investment ~ founding_doc2vec_diff + C(founding_year) + C(division_1_label) + C(canton_id) (n=5963)
Successfully fitted full data for: target_non_gov_investment ~ founding_doc2vec_diff + C(founding_year) + C(division_1_label) + C(canton_id)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_non_gov_investment ~ founding_core_diff_w_red_pca + founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id) (n=6002)
Successfully fitted full data for: target_non_gov_investment ~ founding_core_diff_w_red_pca + founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_non_gov_investment ~ current_doc2vec_diff + C(founding_year) + C(division_1_label) + C(canton_id) (n=6686)
Successfully fitted full data for: target_non_gov_investment ~ current_doc2vec_diff + C(founding_year) + C(division_1_label) + C(canton_id)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_non_gov_investment ~ current_core_diff_w_red_pca + current_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id) (n=6721)
Successfully fitted full data for: target_non_gov_investment ~ current_core_diff_w_red_pca + current_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id)


  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))


Fitting: target_acquisition ~ founding_doc2vec_diff + C(founding_year) + C(division_1_label) + C(canton_id) (n=4552)
Successfully fitted full data for: target_acquisition ~ founding_doc2vec_diff + C(founding_year) + C(division_1_label) + C(canton_id)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: target_acquisition ~ founding_core_diff_w_red_pca + founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id) (n=4580)
Successfully fitted full data for: target_acquisition ~ founding_core_diff_w_red_pca + founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))


Fitting: target_acquisition ~ current_doc2vec_diff + C(founding_year) + C(division_1_label) + C(canton_id) (n=3276)
Successfully fitted full data for: target_acquisition ~ current_doc2vec_diff + C(founding_year) + C(division_1_label) + C(canton_id)


  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))


Fitting: target_acquisition ~ current_core_diff_w_red_pca + current_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id) (n=3289)
Successfully fitted full data for: target_acquisition ~ current_core_diff_w_red_pca + current_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


CV error with target_acquisition, ['current_core_diff_w_red_pca', 'current_pr_w_red_sdg_similarity']: predict requires that you use a DataFrame when predicting from a model
that was created using the formula api.

The original error message returned by patsy is:
Error converting data to categorical: observation with value '20.0' does not match any of the expected levels (expected: ['23.0', '28.0', ..., '88.0', '96.0'])
    target_acquisition ~ current_core_diff_w_red_pca + current_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id)
                                                                                                            ^^^^^^^^^^^^^^^^^^^

Saved regression summaries and AUC scores to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_C/Year + Industry + Canton FEs
Fitting: is_green ~ founding_pr_sdg_similarity + C(founding_year) (n=39926)
Successfully fitted full data for: is_green ~ founding_pr_sdg_simil

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: is_green ~ founding_pr_w_sdg_similarity + C(founding_year) + C(division_1_label) (n=33027)
Successfully fitted full data for: is_green ~ founding_pr_w_sdg_similarity + C(founding_year) + C(division_1_label)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: is_green ~ founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) (n=33027)
Successfully fitted full data for: is_green ~ founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: is_green ~ current_pr_sdg_similarity + C(founding_year) + C(division_1_label) (n=33280)
Successfully fitted full data for: is_green ~ current_pr_sdg_similarity + C(founding_year) + C(division_1_label)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: is_green ~ current_pr_w_sdg_similarity + C(founding_year) + C(division_1_label) (n=33280)
Successfully fitted full data for: is_green ~ current_pr_w_sdg_similarity + C(founding_year) + C(division_1_label)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: is_green ~ current_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) (n=33280)
Successfully fitted full data for: is_green ~ current_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))



Saved regression summaries and AUC scores to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_C/Green - Year + Industry FEs
Fitting: is_green ~ founding_pr_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id) (n=16236)
Successfully fitted full data for: is_green ~ founding_pr_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id)


  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: is_green ~ founding_pr_w_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id) (n=16236)
Successfully fitted full data for: is_green ~ founding_pr_w_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))


Fitting: is_green ~ founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id) (n=16236)
Successfully fitted full data for: is_green ~ founding_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id)




Fitting: is_green ~ current_pr_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id) (n=16214)
Successfully fitted full data for: is_green ~ current_pr_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: is_green ~ current_pr_w_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id) (n=16214)
Successfully fitted full data for: is_green ~ current_pr_w_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


Fitting: is_green ~ current_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id) (n=16214)
Successfully fitted full data for: is_green ~ current_pr_w_red_sdg_similarity + C(founding_year) + C(division_1_label) + C(canton_id)


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))



Saved regression summaries and AUC scores to /Users/manuelbolz/Documents/git/for_work/company_success_prediction/models/experiment_C/Green - Year + Industry + Canton FEs


