```
=== Improved Balanced Random Forest ===
Best Threshold (Class 0 F1 optimized): 0.4326
Class 0 Best F1-Score: 0.3057
Precision-Recall AUC: 0.9738

Confusion Matrix:
 [[ 260  385]
 [ 796 6719]]

Classification Report:
               precision    recall  f1-score   support

     Failure       0.25      0.40      0.31       645
     Success       0.95      0.89      0.92      7515

    accuracy                           0.86      8160
   macro avg       0.60      0.65      0.61      8160
weighted avg       0.89      0.86      0.87      8160
```

In [None]:
from pathlib import Path
# Location of the processed data, resolved relative to the parent of the
# current working directory. The previous version of this line ended in a
# dangling `/` operator, which is a SyntaxError; no file name is visible in
# the notebook, so none is appended here.
# NOTE(review): append the intended file name if this is meant to be a file.
file_path = Path.cwd().parent / 'data' / 'processed'

In [None]:
# 0. Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    precision_recall_curve,
    classification_report,
    confusion_matrix,
    average_precision_score
)
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.combine import SMOTETomek

# 1. Load and Preprocess
# Read the dataset and discard the high-cardinality "city" column in one step.
df = pd.read_csv("final/preprocessed_data.csv").drop(columns=["city"])

# "success" is the target; every other remaining column is a feature.
y = df["success"]
X = df.drop(columns=["success"])

numerical_features = [
    "funding_total_usd",
    "funding_rounds",
    "founded_year",
    "first_funding_year",
    "last_funding_year",
    "days_to_first_funding",
    "funding_duration",
]
categorical_features = ["country_code", "region"]

# Numeric columns are standardised, categorical columns are one-hot encoded,
# and any column not listed above is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    remainder='passthrough',
)
pipeline = Pipeline(steps=[("preprocessor", preprocessor)])

# Stratified split with 20% of the rows held out as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# The preprocessing is fitted on the training split only and then applied
# unchanged to the test split.
X_train_processed = pipeline.fit_transform(X_train)
X_test_processed = pipeline.transform(X_test)

# 2. Smarter Sampling (SMOTE+Tomek)
# Only the preprocessed training split is resampled; the test split is not
# passed to the sampler.
smote_tomek = SMOTETomek(random_state=42)
resampled_pair = smote_tomek.fit_resample(X_train_processed, y_train)
X_train_resampled, y_train_resampled = resampled_pair

# Report the shapes of the resampled features and labels.
print(f"Resampled training shape: {X_train_resampled.shape}, {y_train_resampled.shape}")

# 3. Train Improved Balanced Random Forest
# Hyperparameters are gathered in one mapping so they can be read at a glance.
brf_params = {
    "n_estimators": 500,
    "max_depth": 10,
    "max_features": 'sqrt',
    "n_jobs": -1,
    "random_state": 42,
}
brf = BalancedRandomForestClassifier(**brf_params)

# Fit on the resampled training data produced by the SMOTE+Tomek step.
brf.fit(X_train_resampled, y_train_resampled)

# 4. Threshold Optimization
# Score used for thresholding: predicted probability of class 1. A sample is
# predicted as class 1 when its score is >= the threshold, otherwise class 0.
y_pred_proba = brf.predict_proba(X_test_processed)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

# NOTE(review): the threshold is selected on the same test split that is used
# for the final evaluation, so the reported class-0 F1 is optimistic. Consider
# tuning on a separate validation split.

# Class-0 F1 for every candidate threshold, computed in one vectorised pass
# instead of building a full classification report per threshold (which is
# quadratic in the number of test samples).
y_true_arr = np.asarray(y_test)
score_order = np.argsort(y_pred_proba, kind="mergesort")
sorted_scores = y_pred_proba[score_order]

# class0_cumcount[k] = number of true class-0 samples among the k lowest scores.
class0_cumcount = np.concatenate(([0], np.cumsum(y_true_arr[score_order] == 0)))
n_class0 = class0_cumcount[-1]

# Number of samples with score strictly below each threshold, i.e. the number
# of samples predicted as class 0 at that threshold.
n_pred_class0 = np.searchsorted(sorted_scores, thresholds, side="left")
tp_class0 = class0_cumcount[n_pred_class0]

# F1 = 2*TP / (2*TP + FP + FN) with FP = n_pred_class0 - TP and
# FN = n_class0 - TP, so the denominator reduces to n_pred_class0 + n_class0.
# A zero denominator yields 0, matching zero_division=0.
f1_denominator = n_pred_class0 + n_class0
f1_scores_class0 = np.divide(
    2.0 * tp_class0,
    f1_denominator,
    out=np.zeros(len(thresholds), dtype=float),
    where=f1_denominator > 0,
)

# First threshold that attains the maximum class-0 F1.
best_idx = np.argmax(f1_scores_class0)
best_threshold = thresholds[best_idx]
best_f1_class0 = f1_scores_class0[best_idx]

y_pred_best = (y_pred_proba >= best_threshold).astype(int)

# 5. Final Evaluation
# The classification report and confusion matrix use the thresholded
# predictions; average precision uses the raw class-1 scores.
class_names = ["Failure", "Success"]
report = classification_report(
    y_test,
    y_pred_best,
    target_names=class_names,
    zero_division=0,
)
confusion = confusion_matrix(y_test, y_pred_best)
pr_auc = average_precision_score(y_test, y_pred_proba)

# Headline metrics, one per line.
for summary_line in (
    "\n=== Improved Balanced Random Forest ===",
    f"Best Threshold (Class 0 F1 optimized): {best_threshold:.4f}",
    f"Class 0 Best F1-Score: {best_f1_class0:.4f}",
    f"Precision-Recall AUC: {pr_auc:.4f}",
):
    print(summary_line)

# Detailed breakdowns.
print("\nConfusion Matrix:\n", confusion)
print("\nClassification Report:\n", report)






In [None]:
# 0. Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    precision_recall_curve,
    classification_report,
    confusion_matrix,
    average_precision_score
)
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.combine import SMOTETomek


In [None]:
# 1. Load and Preprocess
# Read the dataset and discard the high-cardinality "city" column in one step.
df = pd.read_csv("final/preprocessed_data.csv").drop(columns=["city"])

# "success" is the target; every other remaining column is a feature.
y = df["success"]
X = df.drop(columns=["success"])

numerical_features = [
    "funding_total_usd",
    "funding_rounds",
    "founded_year",
    "first_funding_year",
    "last_funding_year",
    "days_to_first_funding",
    "funding_duration",
]
categorical_features = ["country_code", "region"]

# Numeric columns are standardised, categorical columns are one-hot encoded,
# and any column not listed above is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    remainder='passthrough',
)
pipeline = Pipeline(steps=[("preprocessor", preprocessor)])

# Stratified split with 20% of the rows held out as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# The preprocessing is fitted on the training split only and then applied
# unchanged to the test split.
X_train_processed = pipeline.fit_transform(X_train)
X_test_processed = pipeline.transform(X_test)


In [None]:
# 2. Smarter Sampling (SMOTE+Tomek)
# Only the preprocessed training split is resampled; the test split is not
# passed to the sampler.
smote_tomek = SMOTETomek(random_state=42)
resampled_pair = smote_tomek.fit_resample(X_train_processed, y_train)
X_train_resampled, y_train_resampled = resampled_pair

# Report the shapes of the resampled features and labels.
print(f"Resampled training shape: {X_train_resampled.shape}, {y_train_resampled.shape}")


In [None]:
# 3. Train Improved Balanced Random Forest
# Hyperparameters are gathered in one mapping so they can be read at a glance.
brf_params = {
    "n_estimators": 500,
    "max_depth": 10,
    "max_features": 'sqrt',
    "n_jobs": -1,
    "random_state": 42,
}
brf = BalancedRandomForestClassifier(**brf_params)

# Fit on the resampled training data; kept as the final expression so the
# fitted estimator is still shown as the cell's output.
brf.fit(X_train_resampled, y_train_resampled)


In [None]:
# 4. Threshold Optimization
# Score used for thresholding: predicted probability of class 1. A sample is
# predicted as class 1 when its score is >= the threshold, otherwise class 0.
y_pred_proba = brf.predict_proba(X_test_processed)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

# NOTE(review): the threshold is selected on the same test split that is used
# for the final evaluation, so the reported class-0 F1 is optimistic. Consider
# tuning on a separate validation split.

# Class-0 F1 for every candidate threshold, computed in one vectorised pass
# instead of building a full classification report per threshold (which is
# quadratic in the number of test samples).
y_true_arr = np.asarray(y_test)
score_order = np.argsort(y_pred_proba, kind="mergesort")
sorted_scores = y_pred_proba[score_order]

# class0_cumcount[k] = number of true class-0 samples among the k lowest scores.
class0_cumcount = np.concatenate(([0], np.cumsum(y_true_arr[score_order] == 0)))
n_class0 = class0_cumcount[-1]

# Number of samples with score strictly below each threshold, i.e. the number
# of samples predicted as class 0 at that threshold.
n_pred_class0 = np.searchsorted(sorted_scores, thresholds, side="left")
tp_class0 = class0_cumcount[n_pred_class0]

# F1 = 2*TP / (2*TP + FP + FN) with FP = n_pred_class0 - TP and
# FN = n_class0 - TP, so the denominator reduces to n_pred_class0 + n_class0.
# A zero denominator yields 0, matching zero_division=0.
f1_denominator = n_pred_class0 + n_class0
f1_scores_class0 = np.divide(
    2.0 * tp_class0,
    f1_denominator,
    out=np.zeros(len(thresholds), dtype=float),
    where=f1_denominator > 0,
)

# First threshold that attains the maximum class-0 F1.
best_idx = np.argmax(f1_scores_class0)
best_threshold = thresholds[best_idx]
best_f1_class0 = f1_scores_class0[best_idx]

y_pred_best = (y_pred_proba >= best_threshold).astype(int)


In [None]:
# 5. Final Evaluation
# The classification report and confusion matrix use the thresholded
# predictions; average precision uses the raw class-1 scores.
class_names = ["Failure", "Success"]
report = classification_report(
    y_test,
    y_pred_best,
    target_names=class_names,
    zero_division=0,
)
confusion = confusion_matrix(y_test, y_pred_best)
pr_auc = average_precision_score(y_test, y_pred_proba)

# Headline metrics, one per line.
for summary_line in (
    "\n=== Improved Balanced Random Forest ===",
    f"Best Threshold (Class 0 F1 optimized): {best_threshold:.4f}",
    f"Class 0 Best F1-Score: {best_f1_class0:.4f}",
    f"Precision-Recall AUC: {pr_auc:.4f}",
):
    print(summary_line)

# Detailed breakdowns.
print("\nConfusion Matrix:\n", confusion)
print("\nClassification Report:\n", report)

In [5]:
# Markdown-formatted summary of the Improved Balanced Random Forest results.
# The figures are hard-coded text, not recomputed from the variables above.
results_summary_md = """
### Improved Balanced Random Forest
- Best Class 0 Threshold: 0.4326
- Best Class 0 F1-Score: 0.3057
- Precision-Recall AUC: 0.9738
---

**Confusion Matrix**:
```
 [[ 260  385]
 [ 796 6719]]
```
**Classification Report**:
```
               precision    recall  f1-score   support

     Failure       0.25      0.40      0.31       645
     Success       0.95      0.89      0.92      7515

    accuracy                           0.86      8160
   macro avg       0.60      0.65      0.61      8160
weighted avg       0.89      0.86      0.87      8160
```
"""
print(results_summary_md)


### Improved Balanced Random Forest
- Best Class 0 Threshold: 0.4326
- Best Class 0 F1-Score: 0.3057
- Precision-Recall AUC: 0.9738
---

**Confusion Matrix**:
```
 [[ 260  385]
 [ 796 6719]]
```
**Classification Report**:
```
               precision    recall  f1-score   support

     Failure       0.25      0.40      0.31       645
     Success       0.95      0.89      0.92      7515

    accuracy                           0.86      8160
   macro avg       0.60      0.65      0.61      8160
weighted avg       0.89      0.86      0.87      8160
```



**Confusion Matrix**:
```
 [[ 260  385]
 [ 796 6719]]
```