In [None]:
# 1. Imports
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    precision_recall_curve, classification_report,
    confusion_matrix, average_precision_score
)

from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.combine import SMOTETomek
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans


In [None]:
# 2. Paths
# RAW_DATA_PATH is read by the load/clean cell; CLEANED_DATA_PATH is where the
# preprocessed table is written and later re-read for modelling.
RAW_DATA_PATH = Path("big_startup_secsees_dataset.csv")
CLEANED_DATA_PATH = Path("final") / "preprocessed_data.csv"


In [None]:
# 3. Load and Clean Raw Data
df = pd.read_csv(RAW_DATA_PATH)

# Binary target: 0 when status is 'closed', 1 for every other value
# (a missing status also compares unequal to 'closed' and therefore maps to 1).
df['success'] = df['status'].ne('closed').astype('int64')
df = df.drop(columns=['permalink', 'homepage_url', 'name', 'status', 'state_code'])

# Missing text attributes become the explicit label 'Unknown'; rows without a
# first funding date are removed.
fill_cols = ['category_list', 'country_code', 'region', 'city']
df[fill_cols] = df[fill_cols].fillna('Unknown')
df = df.dropna(subset=['first_funding_at'])

# '-' placeholders and any other non-numeric entries end up as NaN.
df['funding_total_usd'] = pd.to_numeric(
    df['funding_total_usd'].replace('-', np.nan),
    errors='coerce',
)

# Unparseable dates are coerced to NaT instead of raising.
date_cols = ['founded_at', 'first_funding_at', 'last_funding_at']
df[date_cols] = df[date_cols].apply(pd.to_datetime, errors='coerce')

# Calendar-year and elapsed-day features derived from the three dates.
df = df.assign(
    founded_year=lambda d: d['founded_at'].dt.year,
    first_funding_year=lambda d: d['first_funding_at'].dt.year,
    last_funding_year=lambda d: d['last_funding_at'].dt.year,
    days_to_first_funding=lambda d: (d['first_funding_at'] - d['founded_at']).dt.days,
    funding_duration=lambda d: (d['last_funding_at'] - d['first_funding_at']).dt.days,
)

# Comparisons against NaN evaluate to False, so rows with missing funding or a
# missing founding year are removed by these masks as well.
within_funding_cap = df['funding_total_usd'] <= 5_000_000_000
founded_in_range = df['founded_year'].between(1990, 2015)
df = df[within_funding_cap & founded_in_range].dropna(
    subset=['days_to_first_funding', 'funding_duration']
)


In [None]:
# 4. Category Clustering via TF-IDF + KMeans
# Each distinct category_list string is vectorised with TF-IDF (unigrams and
# bigrams) and assigned to one of 20 KMeans clusters; the cluster id then
# replaces the raw category text as a feature.
unique_categories = pd.Series(df['category_list'].dropna().unique())
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=0.95, min_df=2)
category_matrix = vectorizer.fit_transform(unique_categories)

# n_init is pinned explicitly: its default differs between scikit-learn
# releases, so leaving it unset makes the clusters (and a FutureWarning on some
# versions) depend on the installed version.
kmeans = KMeans(n_clusters=20, n_init=10, random_state=42)
category_clusters = kmeans.fit_predict(category_matrix)
category_cluster_map = dict(zip(unique_categories, category_clusters))
df['category_cluster'] = df['category_list'].map(category_cluster_map)

# The raw category text and the datetime columns are not written to the
# cleaned file; only the features derived from them are kept.
df = df.drop(columns=['category_list', 'founded_at', 'first_funding_at', 'last_funding_at'])

# to_csv does not create missing directories, so make sure the output folder
# exists before writing (a fresh checkout has no 'final/' directory).
CLEANED_DATA_PATH.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(CLEANED_DATA_PATH, index=False)


In [None]:
# 5. Load and Prepare Data for Modeling
# Reload the cleaned table from disk so this cell does not depend on the
# in-memory frame left behind by the preprocessing cells.
df = pd.read_csv(CLEANED_DATA_PATH)
df = df.drop(columns=["city"])

X = df.drop(columns=["success"])
y = df["success"]

# category_cluster holds arbitrary KMeans labels, not magnitudes. It is one-hot
# encoded with the other nominal columns instead of reaching the model as an
# unscaled ordinal number through remainder='passthrough'.
categorical_features = ["country_code", "region", "category_cluster"]
numerical_features = [
    "funding_total_usd", "funding_rounds", "founded_year",
    "first_funding_year", "last_funding_year",
    "days_to_first_funding", "funding_duration"
]

# handle_unknown='ignore' lets categories that appear only in the test split
# be encoded as all-zero rows instead of raising at transform time.
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numerical_features),
    ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features)
], remainder='passthrough')

pipeline = Pipeline([("preprocessor", preprocessor)])

# Stratified 80/20 split keeps the class ratio of `success` in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit scaling/encoding on the training rows only, then apply it to the test rows.
X_train_scaled = pipeline.fit_transform(X_train)
X_test_scaled = pipeline.transform(X_test)


In [None]:
# 6. Resample with SMOTE + Tomek
# Only the transformed training rows are resampled; the test rows are left as they are.
sampler = SMOTETomek(random_state=42)
X_train_resampled, y_train_resampled = sampler.fit_resample(
    X=X_train_scaled,
    y=y_train,
)

# Report the post-resampling sizes of the feature matrix and the label vector.
print(f"Resampled training shape: {X_train_resampled.shape}, {y_train_resampled.shape}")


In [None]:
# 7. Train Balanced Random Forest Model
# Hyperparameters are collected in one mapping so they can be read (and tuned)
# at a glance before the estimator is built.
# NOTE(review): sampling_strategy / replacement / bootstrap are left at the
# library defaults, which are believed to differ between imbalanced-learn
# releases — confirm and consider pinning them.
brf_params = {
    "n_estimators": 500,
    "max_depth": 10,
    "max_features": 'sqrt',
    "n_jobs": -1,
    "random_state": 42,
}
brf_model = BalancedRandomForestClassifier(**brf_params)

# Fit on the resampled training data produced by the SMOTETomek cell.
brf_model.fit(X_train_resampled, y_train_resampled)


In [None]:
# 8. Threshold Optimization
# Score the test rows with the predicted probability of class 1 and take the
# candidate cut-offs from the precision-recall curve.
y_proba = brf_model.predict_proba(X_test_scaled)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)

# NOTE(review): the threshold is chosen on the same test split that the final
# evaluation reports on, so the class-0 F1 obtained here is an optimistic
# estimate — consider tuning on a separate validation split.

# Class-0 F1 for every candidate threshold in one vectorised pass, instead of
# building a complete classification_report per threshold.
# A row is predicted 0 exactly when its score is < t (the prediction rule is
# `y_proba >= t`), so each count is a binary search over the sorted scores of
# one true class.
y_true = np.asarray(y_test)
scores_true0 = np.sort(y_proba[y_true == 0])
scores_true1 = np.sort(y_proba[y_true == 1])

tp0 = np.searchsorted(scores_true0, thresholds, side='left')  # true 0, predicted 0
fp0 = np.searchsorted(scores_true1, thresholds, side='left')  # true 1, predicted 0
fn0 = scores_true0.size - tp0                                  # true 0, predicted 1

# F1 = 2*TP / (2*TP + FP + FN); thresholds with an empty denominator score 0,
# matching the zero_division=0 convention.
f1_denominator = 2 * tp0 + fp0 + fn0
f1_scores_class0 = np.divide(
    2.0 * tp0,
    f1_denominator,
    out=np.zeros(thresholds.shape, dtype=float),
    where=f1_denominator > 0,
)

# First threshold reaching the maximum class-0 F1, and the labels it induces.
best_index = int(np.argmax(f1_scores_class0))
optimal_threshold = thresholds[best_index]
best_f1_score = float(f1_scores_class0[best_index])
y_pred_optimal = (y_proba >= optimal_threshold).astype(int)


In [None]:
# 9. Final Evaluation
# Metrics for the thresholded predictions (confusion matrix, per-class report)
# and for the raw class-1 scores (average precision).
class_names = ["Failure", "Success"]
clf_report = classification_report(
    y_test,
    y_pred_optimal,
    target_names=class_names,
    zero_division=0,
)
conf_matrix = confusion_matrix(y_test, y_pred_optimal)
pr_auc = average_precision_score(y_test, y_proba)

# Headline numbers first, then the detailed tables.
summary_lines = [
    "\n=== Model Performance Summary ===",
    f"Optimized Threshold:         {optimal_threshold:.4f}",
    f"Class 0 Best F1-Score:       {best_f1_score:.4f}",
    f"Precision-Recall AUC:        {pr_auc:.4f}",
]
print("\n".join(summary_lines))
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", clf_report)
