In [None]:
!pip install sentence-transformers

In [None]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    mean_absolute_error,
    mean_absolute_percentage_error,
)
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler


In [None]:
# Load the founder dataset and shuffle the rows once.
# The shuffle is seeded: the train/test split below picks rows by position, so an
# unseeded shuffle would put different rows in the test set on every run.
df = pd.read_csv("founder_data1.csv").sample(frac=1, random_state=42).reset_index(drop=True)

# Define feature categories
# Columns coerced to numbers, median-imputed and standardised.
continuous_features = [
    'number_of_roles', 'number_of_companies', 'industry_achievements', 'max_amount_raised',
    'previous_orgs_max_num_founders', 'org_num_founders', 'num_acquisitions', 'max_acquisition_amount',
    'max_ipo_amount_raised', 'repeat_ideal_days', 'press_media_coverage_count', 'experienced_funding_rounds'
]

# Columns that are mode-imputed and one-hot encoded.
categorical_features = [
    'perseverance', 'risk_tolerance', 'vision', 'adaptability', 'personal_branding', 'education_level',
    'education_institution', 'education_field_of_study', 'big_leadership', 'nasdaq_leadership',
    'number_of_leadership_roles', 'big_tech_position', 'worked_at_consultancy', 'worked_at_bank',
    'vc_experience', 'angel_experience', 'quant_experience', 'investor_quality_prior_startup',
    'previous_startup_funding_experience','org_state','org_city','org_category_list','domain_expertise','skill_relevance'
]

# TRUE/FALSE flag columns that are converted to 0/1 integers.
binary_features = [
    'professional_athlete', 'childhood_entrepreneurship', 'competitions', 'ten_thousand_hours_of_mastery',
    'education_international_experience', 'education_extracurricular_involvement', 'education_awards_and_honors',
    'being_lead_of_nonprofits', 'big_company_experience', 'nasdaq_company_experience', 'big_tech_experience',
    'google_experience', 'facebook_meta_experience', 'microsoft_experience', 'amazon_experience', 'apple_experience',
    'career_growth', 'moving_around', 'international_work_experience', 'worked_at_military', 'board_advisor_roles',
    'tier_1_vc_experience', 'startup_experience', 'ceo_experience', 'ipo_experience', 'founder_of_nonprofit'
]

# Convert binary features to 0/1 integers.
# pandas.read_csv can parse TRUE/FALSE cells into real booleans, so every value is
# normalised to an upper-case string before the lookup; a lookup keyed only on the
# strings 'TRUE'/'FALSE' would miss booleans and silently zero the whole column.
# Missing or unrecognised values become 0.
binary_lookup = {'TRUE': 1, 'FALSE': 0, '1': 1, '0': 0, '1.0': 1, '0.0': 0}
for col in binary_features:
    normalised_flags = df[col].astype(str).str.strip().str.upper()
    df[col] = normalised_flags.map(binary_lookup).fillna(0).astype(int)

# Ensure numeric features (unparseable entries become NaN)
df[continuous_features] = df[continuous_features].apply(pd.to_numeric, errors='coerce')
print(len(df))
# Target variable: coerce to numeric, drop rows without a target, then model log1p(funding)
target = 'org_total_funding_usd'
df[target] = pd.to_numeric(df[target], errors='coerce')
df = df.dropna(subset=[target]).reset_index(drop=True)
df[target] = np.log1p(df[target])
print(len(df))
# Process text features: one sentence embedding per row from company name + description
text_model = SentenceTransformer('all-MiniLM-L6-v2')
df['combined_text'] = df['org_name'].fillna('') + " " + df['org_description'].fillna('')
text_embeddings = text_model.encode(df['combined_text'].tolist())

# Split data
# Tabular features, log-scale target and text embeddings are split in a single call so
# that the same rows land in the train and test portions of all three.
feature_columns = continuous_features + categorical_features + binary_features
X = df[feature_columns]
y = df[target]
text_data = text_embeddings

split_parts = train_test_split(X, y, text_data, test_size=0.2, random_state=42, shuffle=True)
X_train, X_test, y_train, y_test, text_train, text_test = split_parts

# Preprocessing pipelines
# Continuous columns: fill gaps with the column median, then standardise.
continuous_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' encodes categories not seen during fit as all zeros.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Binary columns are already 0/1; only imputation is applied.
binary_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

preprocessor = ColumnTransformer([
    ('continuous', continuous_pipeline, continuous_features),
    ('categorical', categorical_pipeline, categorical_features),
    ('binary', binary_pipeline, binary_features)
])

# First-level models
# Each base model owns its own copy of the preprocessing steps. A Pipeline fits its
# steps in place, so a ColumnTransformer instance shared by both pipelines would be
# refit -- changing it for both models -- whenever either one is fitted.
model_1 = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(n_estimators=100, random_state=42))
])

model_2 = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Out-of-fold base-model predictions for the training rows.
# cross_val_predict fits clones of each pipeline on the other folds and predicts the
# held-out fold, so every training row is scored by a model that never saw its target.
# Training the meta-model on in-sample predictions instead would let it learn from base
# models that have already fitted those exact targets, and it would over-trust them on
# unseen data.
train_pred1 = cross_val_predict(model_1, X_train, y_train, cv=5)
train_pred2 = cross_val_predict(model_2, X_train, y_train, cv=5)

# Train base models on the full training split; these fitted pipelines score new rows
model_1.fit(X_train, y_train)
model_2.fit(X_train, y_train)

# Generate test predictions
test_pred1 = model_1.predict(X_test)
test_pred2 = model_2.predict(X_test)

# Combine predictions with text embeddings
# Column layout: [model_1 prediction, model_2 prediction, embedding dimensions...]
stacked_train = np.column_stack((train_pred1, train_pred2, text_train))
stacked_test = np.column_stack((test_pred1, test_pred2, text_test))

# Meta-model with text-enhanced features
meta_model = LinearRegression()
meta_model.fit(stacked_train, y_train)
final_predictions = meta_model.predict(stacked_test)

# Evaluation
# The model works on log1p(funding); undo the transform so both metrics are in dollars.
actual_usd = np.expm1(y_test)
predicted_usd = np.expm1(final_predictions)
mape = mean_absolute_percentage_error(actual_usd, predicted_usd)
mae = mean_absolute_error(actual_usd, predicted_usd)

print(f"Mean Absolute Percentage Error: {mape:.2f}")
print(f"Mean Absolute Error: ${mae:,.2f}")

# Save predictions
# Predictions go into a separate results frame so that X_test stays the pure feature
# matrix; it is passed to the fitted pipelines again when feature importance is computed.
test_results = X_test.copy()
test_results["predicted_funding_usd"] = predicted_usd
#test_results.to_csv("test_set_with_predictions.csv", index=False)
print(test_results.head())

In [None]:
def calculate_mae(model1, model2, meta_model, X, y_true, text_embeddings):
    """Return the dollar-scale MAE of the full stacked model.

    Both base models predict from ``X``; their predictions are placed side by side
    with ``text_embeddings`` and passed to ``meta_model``. ``y_true`` and the
    meta-model output are mapped through ``np.expm1`` before the error is computed.

    Args:
        model1: First base estimator; must expose ``predict``.
        model2: Second base estimator; must expose ``predict``.
        meta_model: Estimator that predicts from the stacked columns.
        X: Feature rows handed to both base models.
        y_true: True target values, on the scale the models predict.
        text_embeddings: Embedding rows, one per row of ``X``.

    Returns:
        Mean absolute error between ``expm1(y_true)`` and ``expm1(prediction)``.
    """
    base_predictions = [estimator.predict(X) for estimator in (model1, model2)]
    meta_features = np.column_stack((*base_predictions, text_embeddings))
    log_scale_prediction = meta_model.predict(meta_features)
    return mean_absolute_error(np.expm1(y_true), np.expm1(log_scale_prediction))

# Calculate baseline MAE with original features and text embeddings
baseline_mae = calculate_mae(model_1, model_2, meta_model, X_test, y_test, text_test)
print(f"Baseline MAE: ${baseline_mae:,.2f}")

# Initialize feature importance storage: feature name -> % change in MAE when shuffled
feature_impact = {}

# Calculate permutation importance for each feature
for feature in X.columns:
    # Create permuted test set: only this column is shuffled, breaking its link to the target
    X_permuted = X_test.copy()
    X_permuted[feature] = X_test[feature].sample(frac=1, random_state=42).values  # Shuffle feature values

    # Calculate MAE with permuted feature
    permuted_mae = calculate_mae(model_1, model_2, meta_model, X_permuted, y_test, text_test)

    # Store importance as MAE increase ratio
    feature_impact[feature] = (permuted_mae - baseline_mae) / baseline_mae * 100

# Calculate importance for text embeddings
# Whole embedding rows are shuffled between samples. The generator is seeded so this
# score is reproducible, like the seeded per-column shuffles above.
text_rng = np.random.default_rng(42)
text_permuted = text_rng.permutation(text_test)
text_mae = calculate_mae(model_1, model_2, meta_model, X_test, y_test, text_permuted)
feature_impact['text_embeddings'] = (text_mae - baseline_mae) / baseline_mae * 100

# Rank features by how much shuffling them increased the MAE (largest increase first)
impact_df = (
    pd.DataFrame(list(feature_impact.items()), columns=['Feature', 'Impact (%)'])
    .sort_values('Impact (%)', ascending=False)
    .reset_index(drop=True)
)

# Display results: strongest, weakest, then a longer view of the strongest
for heading, rows in (
    ("\nTop 10 Most Impactful Features:", impact_df.head(10)),
    ("\nTop 10 Least Impactful Features:", impact_df.tail(10)),
    ("\nTop 20 Features:", impact_df.head(20)),
):
    print(heading)
    print(rows)

# Text feature weights in the meta-model.
# Coefficients 0 and 1 belong to the two base-model predictions; the rest are the
# embedding dimensions.
# NOTE(review): this compares raw coefficient magnitudes of inputs that were not
# standardised before fitting the meta-model, so treat the share as a rough indicator.
text_weights = meta_model.coef_[2:]
text_weight_magnitude = np.abs(text_weights)
text_share = text_weight_magnitude.sum() / np.abs(meta_model.coef_).sum()
print(f"\nOverall text contribution: {text_share:.1%}")

# Embedding dimensions with the largest absolute weight
top_dims = np.argsort(-text_weight_magnitude)[:5]
print("Top 5 impactful text embedding dimensions:", top_dims)

In [None]:
# Rebuild the tabular feature matrix and target, then redo the 80/20 split.
# NOTE(review): the code below reuses model_1 and final_predictions from the earlier
# cell, which presumes this split selects the same rows as the earlier one -- confirm.
tabular_columns = continuous_features + categorical_features + binary_features
X, y = df[tabular_columns], df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Rerun the model training and prediction steps
# ... (rerun the model training code here)

# Categorize funding function
def categorize_funding(amount):
    """Map a dollar amount to a coarse funding-size bucket label.

    Buckets are half-open on the right: an amount equal to an upper bound falls into
    the next bucket. Every amount below 1e6 (including amounts under 100k) is
    labelled "100k-1M".

    Args:
        amount: Funding amount in dollars.

    Returns:
        One of "100k-1M", "1M-10M", "10M-100M", "100M-1B", "1B+", or ``None`` when
        ``amount`` is missing (NaN/None). Without that guard a NaN would fail every
        ``<`` comparison and be labelled "1B+".
    """
    if pd.isna(amount):
        return None

    # (exclusive upper bound, label), in ascending order
    buckets = (
        (1e6, "100k-1M"),
        (1e7, "1M-10M"),
        (1e8, "10M-100M"),
        (1e9, "100M-1B"),
    )
    for upper_bound, label in buckets:
        if amount < upper_bound:
            return label
    return "1B+"

# Add predictions and funding category to the training set.
# Training rows are scored with the same stacked model (both base models + text
# embeddings -> meta-model) that produced final_predictions for the test rows, so the
# per-category success rates are learned on the same kind of prediction they are later
# applied to. Scoring the training rows with model_1 alone would mix two predictors.
# NOTE(review): text_train comes from the earlier split and is presumed to be
# row-aligned with X_train here -- confirm.
train_stacked_features = np.column_stack(
    (model_1.predict(X_train), model_2.predict(X_train), text_train)
)
X_train_copy = X_train.copy()
X_train_copy["predicted_funding_usd"] = np.expm1(meta_model.predict(train_stacked_features))
X_train_copy["funding_category"] = X_train_copy["predicted_funding_usd"].apply(categorize_funding)
X_train_copy["success"] = df.loc[X_train.index, "success"]

# Same columns for the test set, using the stacked-model predictions
X_test_copy = X_test.copy()
X_test_copy["predicted_funding_usd"] = np.expm1(final_predictions)
X_test_copy["funding_category"] = X_test_copy["predicted_funding_usd"].apply(categorize_funding)
X_test_copy["success"] = df.loc[X_test.index, "success"]

# Calculate success probabilities using the training set
def calculate_success_probability(df, success_column):
    """Return the mean of ``success_column`` within each funding category.

    Args:
        df: Frame containing a "funding_category" column and ``success_column``.
        success_column: Name of the column to average.

    Returns:
        Series indexed by funding category holding the per-category mean.
    """
    return df[success_column].groupby(df["funding_category"]).mean()

# Per-category success rate, estimated on the training rows
success_probabilities = calculate_success_probability(X_train_copy, "success")

print("Success probabilities by funding category (from training set):")
print(success_probabilities)

# Now apply these probabilities to the test set.
# Each test row gets the training-set success rate of its predicted funding category;
# a category that is absent from success_probabilities maps to NaN.
X_test_copy = X_test.assign(
    predicted_funding_usd=np.expm1(final_predictions),
    funding_category=lambda frame: frame["predicted_funding_usd"].apply(categorize_funding),
    success=df.loc[X_test.index, "success"],
    predicted_success_prob=lambda frame: frame["funding_category"].map(success_probabilities),
)

# Calculate accuracy of the success prediction on the test set
def calculate_accuracy(df, prob_threshold=0.5):
    """Threshold the predicted success probability and score it against "success".

    Adds (or overwrites) a boolean "predicted_success" column on ``df`` in place; it is
    True where "predicted_success_prob" is strictly greater than ``prob_threshold``.

    Args:
        df: Frame with "predicted_success_prob" and "success" columns.
        prob_threshold: Probability above which a row is predicted as a success.

    Returns:
        Fraction of rows where "predicted_success" equals "success".
    """
    df["predicted_success"] = df["predicted_success_prob"].gt(prob_threshold)
    return df["predicted_success"].eq(df["success"]).mean()

# Accuracy at the default threshold; this call also adds "predicted_success" to X_test_copy
accuracy = calculate_accuracy(X_test_copy)
print(f"\nAccuracy of success prediction on test set: {accuracy:.2f}")

# Calculate additional metrics
flagged_success = X_test_copy["predicted_success"] == 1
actual_success = X_test_copy["success"] == 1
actual_failure = X_test_copy["success"] == 0
true_positives = (flagged_success & actual_success).sum()
false_positives = (flagged_success & actual_failure).sum()
total_positives = X_test_copy["success"].sum()
test_set_length = len(X_test_copy)

print("\nAdditional Metrics:")
print(f"Total number of positives predicted correctly (True Positives): {true_positives}")
print(f"Total number of positives incorrectly predicted (False Positives): {false_positives}")
print(f"Total number of actual positives in test set: {total_positives}")
print(f"Length of test data: {test_set_length}")

# Calculate precision and recall (reported as 0 when the denominator is 0)
predicted_positives = true_positives + false_positives
precision = true_positives / predicted_positives if predicted_positives > 0 else 0
recall = true_positives / total_positives if total_positives > 0 else 0

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")


In [None]:
# Inputs and both targets: log-scale funding (regression) and the success label (classification).
# The funding column itself is not among the input features.
input_columns = continuous_features + categorical_features + binary_features
X = df[input_columns]
y_funding = df[target]
y_success = df["success"]

# Process text features: embed company name + description for every row
text_model = SentenceTransformer('all-MiniLM-L6-v2')
df['combined_text'] = df['org_name'].fillna('') + " " + df['org_description'].fillna('')
combined_text_list = df['combined_text'].tolist()
text_embeddings = text_model.encode(combined_text_list)

# Split data while maintaining alignment: all four arrays are split in one call so the
# same rows end up in the train and test portions of each
(
    X_train, X_test,
    y_train_funding, y_test_funding,
    y_train_success, y_test_success,
    text_train, text_test,
) = train_test_split(X, y_funding, y_success, text_embeddings, test_size=0.2, random_state=42)

# ... (keep preprocessing and stacking model code)

def _stacked_log_funding(features, embeddings):
    """Score rows with the stacked model: both base models plus embeddings -> meta-model."""
    meta_inputs = np.column_stack(
        (model_1.predict(features), model_2.predict(features), embeddings)
    )
    return meta_model.predict(meta_inputs)

# Funding predictions from the stacked model, still on the log1p scale
train_funding_pred = _stacked_log_funding(X_train, text_train)
test_funding_pred = _stacked_log_funding(X_test, text_test)

# The success classifier has a single input: the predicted (log-scale) funding
X_train_logistic = pd.DataFrame({'predicted_funding': train_funding_pred})
X_test_logistic = pd.DataFrame({'predicted_funding': test_funding_pred})
print(X_train_logistic.head())
print(X_test_logistic.head())

# Fit the classifier on the training predictions
log_reg = LogisticRegression()
log_reg.fit(X_train_logistic, y_train_success)

# Column 1 of predict_proba is the probability of the second class in log_reg.classes_;
# a row is predicted as a success only when that probability is at least 0.75
test_class_probabilities = log_reg.predict_proba(X_test_logistic)
success_probabilities = test_class_probabilities[:, 1]
success_predictions = (success_probabilities >= 0.75).astype(int)
print(success_probabilities)

# Evaluate the thresholded success predictions against the true labels
print("\nLogistic Regression Performance:")
print(f"Accuracy: {accuracy_score(y_test_success, success_predictions):.2f}")
print("Classification Report:")
print(classification_report(y_test_success, success_predictions))

# Pin the label order so the matrix is always 2x2. Without `labels`, a test split in
# which only one class appears yields a 1x1 matrix and the four-way unpacking fails.
cm = confusion_matrix(y_test_success, success_predictions, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

print("\nDetailed Success Prediction Metrics:")
print(f"True Positives (Correct Successes): {tp}")
print(f"False Positives (Incorrect Successes): {fp}")
print(f"True Negatives (Correct Non-Successes): {tn}")
print(f"False Negatives (Incorrect Non-Successes): {fn}")
print(f"Total Actual Positives: {tp + fn}")
print(f"Total Test Samples: {len(X_test)}")

# Attach the predictions to a copy of the test rows
X_test = X_test.copy()
X_test["predicted_success"] = success_predictions
X_test["success_probability"] = success_probabilities
X_test["predicted_funding_usd"] = np.expm1(test_funding_pred)  # Convert back from log scale

# Text embedding weights in the funding meta-model.
# Coefficients 0 and 1 belong to the two base-model predictions; the rest are the
# embedding dimensions.
text_weights = meta_model.coef_[2:]
funding_text_share = np.abs(text_weights).sum() / np.abs(meta_model.coef_).sum()
print(f"\nText contribution in funding prediction: {funding_text_share:.1%}")

# Most influential embedding dimensions for funding prediction
top_dims = np.argsort(-np.abs(text_weights))[:5]
print("Top 5 impactful text embedding dimensions for funding:", top_dims)

# Text impact on success prediction: meta-model weights scaled by the logistic weight.
# NOTE(review): log_reg is fitted on one input column, so its coefficient row holds a
# single value. That value rescales every meta-model weight equally and cancels in the
# ratio, so this share and the top dimensions match the funding figures printed above.
logistic_weights = log_reg.coef_[0]
text_success_impact = logistic_weights * meta_model.coef_[2:]
success_text_share = (
    np.abs(text_success_impact).sum() / np.abs(logistic_weights * meta_model.coef_).sum()
)
print(f"\nText contribution in success prediction: {success_text_share:.1%}")

# Most influential embedding dimensions for success prediction
top_success_dims = np.argsort(-np.abs(text_success_impact))[:5]
print("Top 5 impactful text embedding dimensions for success:", top_success_dims)