# 🚀 Kickstarter Projects — ML Analysis

**Dataset**: 374,853 crowdfunding projects (2009–2018), 15 categories, 22 countries

## ML Tasks
| # | Task | Type | Target |
|---|------|------|--------|
| 1 | Project Success Prediction | Binary Classification | Successful vs Failed |
| 2 | Pledged Amount Regression | Regression | Log(USD pledged + 1) |
| 3 | Backer Count Regression | Regression | Log(backers + 1) |
| 4 | Project Category Clustering | Unsupervised | K-Means / DBSCAN |

In [None]:
import warnings, os, base64, io, pathlib
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
from jinja2 import Template

from sklearn.model_selection import (train_test_split, cross_val_score,
                                     GridSearchCV, RandomizedSearchCV,
                                     learning_curve, StratifiedKFold)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor,
                              GradientBoostingClassifier, GradientBoostingRegressor,
                              AdaBoostClassifier, VotingClassifier, StackingClassifier)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report,
                             mean_absolute_error, mean_squared_error, r2_score)
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA

# Single seed shared by NumPy's global RNG and every seeded estimator below.
SEED = 42
np.random.seed(SEED)
sns.set_style('whitegrid')
# Disable the "too many open figures" warning and set the figure DPI.
plt.rcParams.update({'figure.max_open_warning': 0, 'figure.dpi': 120})

# Directory that receives every figure written by save(); created if missing.
PLOT_DIR = pathlib.Path('outputs/plots')
PLOT_DIR.mkdir(parents=True, exist_ok=True)

# Notebook-wide accumulators filled in by later cells.
all_results = {}  # task name -> metrics for that task
saved_plots = []  # (title, file path) tuples appended by save()

def save(fig, name, title=None):
    """Write *fig* into ``PLOT_DIR``, close it and register it in ``saved_plots``.

    Args:
        fig: Matplotlib figure to persist.
        name: File name (relative to ``PLOT_DIR``) to write to.
        title: Display title stored alongside the path. When falsy, a title is
            derived from *name* by dropping ``.png``, turning underscores into
            spaces and title-casing the result.
    """
    target = PLOT_DIR / name
    fig.savefig(target, bbox_inches='tight', facecolor='white')
    plt.close(fig)  # release the figure once it is on disk
    if title:
        label = title
    else:
        label = name.replace('.png', '').replace('_', ' ').title()
    saved_plots.append((label, str(target)))
    print(f'  âœ“ {name}')

import sklearn

# Report the versions of the core libraries so a run can be reproduced.
version_lines = (
    'All imports successful',
    f'  scikit-learn {sklearn.__version__}',
    f'  pandas {pd.__version__}, numpy {np.__version__}',
)
for version_line in version_lines:
    print(version_line)

## 1 · Data Loading & Exploration

In [None]:
# Folder containing the Kickstarter CSV (relative to the working directory).
DATA = pathlib.Path('Kickstarter+Projects')
raw = pd.read_csv(DATA / 'kickstarter_projects.csv')

# Structural overview: shape, columns, outcome balance, numeric ranges,
# cardinality of the categorical columns and total number of missing cells.
print(f'Raw dataset: {raw.shape[0]:,} rows Ã— {raw.shape[1]} columns')
print(f'Columns: {list(raw.columns)}')
print(f'\nState distribution:')
print(raw['State'].value_counts().to_string())
print(f'\nNumeric summary:')
print(raw[['Goal', 'Pledged', 'Backers']].describe().round(1).to_string())
print(f'\nCategories: {raw["Category"].nunique()}, Subcategories: {raw["Subcategory"].nunique()}')
print(f'Countries: {raw["Country"].nunique()}')
print(f'Missing values: {raw.isnull().sum().sum()}')
# Final expression of the cell: shows the first rows when run in a notebook.
raw.head()

## 2 · Data Cleaning & Feature Engineering

In [None]:
# Work on a copy so `raw` stays untouched for later inspection.
df = raw.copy()

# ── Parse dates ──
df['Launched'] = pd.to_datetime(df['Launched'])
df['Deadline'] = pd.to_datetime(df['Deadline'])

# ── Campaign duration in days ──
# 86400 = seconds per day; the result is a fractional number of days.
df['campaign_days'] = (df['Deadline'] - df['Launched']).dt.total_seconds() / 86400
df['campaign_days'] = df['campaign_days'].clip(lower=1)  # minimum 1 day

# ── Time features ──
df['launch_year'] = df['Launched'].dt.year
df['launch_month'] = df['Launched'].dt.month
df['launch_dow'] = df['Launched'].dt.dayofweek   # 0=Mon, 6=Sun
df['launch_hour'] = df['Launched'].dt.hour

# ── Log transforms (highly skewed numerics) ──
# log1p = log(x + 1), so zero values map to 0 instead of -inf.
df['log_goal'] = np.log1p(df['Goal'])
df['log_pledged'] = np.log1p(df['Pledged'])
df['log_backers'] = np.log1p(df['Backers'])

# ── Name length as proxy for project description detail ──
# Missing names are treated as empty strings (length 0, 0 words).
df['name_length'] = df['Name'].fillna('').str.len()
df['name_word_count'] = df['Name'].fillna('').str.split().str.len()

# ── Category encoding ──
# LabelEncoder maps each distinct label to an integer code; the codes are
# identifiers only and carry no ordinal meaning.
le_cat = LabelEncoder()
df['category_enc'] = le_cat.fit_transform(df['Category'])
cat_classes = list(le_cat.classes_)

le_country = LabelEncoder()
df['country_enc'] = le_country.fit_transform(df['Country'])

# ── Binary target: focus on Successful vs Failed (drop others) ──
# Rows whose State is anything else are excluded from df_binary.
df_binary = df[df['State'].isin(['Successful', 'Failed'])].copy()
df_binary['success'] = (df_binary['State'] == 'Successful').astype(int)  # 1 = Successful, 0 = Failed

# Summary of the engineered frame and the class balance of the binary target.
print(f'Full dataset with features: {df.shape}')
print(f'Binary (Successful/Failed): {len(df_binary):,} rows')
print(f'  Successful: {df_binary["success"].sum():,} ({df_binary["success"].mean():.1%})')
print(f'  Failed:     {(1-df_binary["success"]).sum():,} ({1-df_binary["success"].mean():.1%})')
print(f'\nEngineered features: campaign_days, launch_year/month/dow/hour, '
      f'log_goal, name_length, name_word_count, category_enc, country_enc')
print(f'\nCampaign days: mean={df_binary["campaign_days"].mean():.1f}, '
      f'median={df_binary["campaign_days"].median():.1f}')

## 3 · Exploratory Data Analysis

In [None]:
# ── Plot 1: Success rate by category ──
# Horizontal bars sorted by success rate and coloured by that rate; each bar is
# annotated with "rate (project count)" and a dashed line marks the overall rate.
fig, ax = plt.subplots(figsize=(10, 6))
cat_success = df_binary.groupby('Category')['success'].agg(['mean', 'count']).sort_values('mean', ascending=True)
colors = plt.cm.RdYlGn(cat_success['mean'])
bars = ax.barh(cat_success.index, cat_success['mean'] * 100, color=colors, edgecolor='white')
ax.set_xlabel('Success Rate (%)')
ax.set_title('Kickstarter Success Rate by Category')
for i, (rate, count) in enumerate(zip(cat_success['mean'], cat_success['count'])):
    # Place the label just to the right of the bar end.
    ax.text(rate * 100 + 0.5, i, f'{rate:.0%} ({count:,})', va='center', fontsize=8)
ax.axvline(x=df_binary['success'].mean() * 100, color='red', linestyle='--', alpha=0.7, label='Overall avg')
ax.legend()
fig.tight_layout()
save(fig, 'success_by_category.png', 'Success Rate by Category')

# ── Plot 2: Goal distribution — success vs failed ──
# One histogram of log(goal + 1) per outcome, side by side. The dashed line is
# the median, converted back to dollars with expm1 for the legend label.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for i, state in enumerate(['Successful', 'Failed']):
    subset = df_binary[df_binary['State'] == state]
    axes[i].hist(subset['log_goal'], bins=50, color='#2E86AB' if i == 0 else '#E74C3C',
                 alpha=0.8, edgecolor='white')
    axes[i].set_xlabel('Log(Goal + 1)')
    axes[i].set_ylabel('Count')
    axes[i].set_title(f'{state} Projects â€” Goal Distribution')
    axes[i].axvline(subset['log_goal'].median(), color='black', linestyle='--',
                     label=f'Median: ${np.expm1(subset["log_goal"].median()):,.0f}')
    axes[i].legend()
fig.tight_layout()
save(fig, 'goal_distribution.png', 'Goal Distribution â€” Success vs Failed')

# ── Plot 3: Projects over time ──
# Quarterly project volume (bars, left axis) with the quarterly success rate
# overlaid as a line on a secondary right-hand axis.
fig, ax = plt.subplots(figsize=(10, 5))
launch_quarter = df_binary['Launched'].dt.to_period('Q')
quarterly = df_binary.groupby([launch_quarter])['success'].agg(['count', 'mean'])
n_quarters = len(quarterly)
bar_positions = range(n_quarters)

ax.bar(bar_positions, quarterly['count'], color='#2E86AB', alpha=0.6, label='Total Projects')
ax2 = ax.twinx()
ax2.plot(bar_positions, quarterly['mean'] * 100, 'o-', color='#F18F01', linewidth=2, label='Success Rate %')

ax.set_xlabel('Quarter')
ax.set_ylabel('Number of Projects', color='#2E86AB')
ax2.set_ylabel('Success Rate %', color='#F18F01')

# Label roughly ten evenly spaced quarters to keep the axis readable.
tick_step = max(1, n_quarters // 10)
tick_idx = list(range(0, n_quarters, tick_step))
ax.set_xticks(tick_idx)
ax.set_xticklabels([str(quarterly.index[pos]) for pos in tick_idx], rotation=45, fontsize=8)

# Merge the legend entries of both axes into a single legend box.
bar_handles, bar_labels = ax.get_legend_handles_labels()
line_handles, line_labels = ax2.get_legend_handles_labels()
ax.legend(bar_handles + line_handles, bar_labels + line_labels, loc='upper left')
ax.set_title('Kickstarter Projects Over Time')
fig.tight_layout()
save(fig, 'projects_over_time.png', 'Projects Over Time')

# ── Plot 4: Success rate by campaign duration ──
# Durations are bucketed into the intervals listed in `bins`; each bar is
# annotated with the success rate and the number of projects in the bucket.
fig, ax = plt.subplots(figsize=(10, 5))
dur_bins = pd.cut(df_binary['campaign_days'], bins=[0, 7, 15, 30, 45, 60, 90, 365])
dur_success = df_binary.groupby(dur_bins, observed=True)['success'].agg(['mean', 'count'])
dur_success.plot.bar(y='mean', ax=ax, color='#2E86AB', edgecolor='white', legend=False)
ax.set_xlabel('Campaign Duration (days)')
ax.set_ylabel('Success Rate')
ax.set_title('Success Rate by Campaign Duration')
ax.tick_params(axis='x', rotation=45)
for i, (rate, count) in enumerate(zip(dur_success['mean'], dur_success['count'])):
    # Label sits slightly above the top of each bar.
    ax.text(i, rate + 0.01, f'{rate:.0%}\n({count:,})', ha='center', fontsize=7)
fig.tight_layout()
save(fig, 'success_by_duration.png', 'Success Rate by Campaign Duration')

# ── Plot 5: Top 15 countries by project volume ──
# The 15 countries with the most projects are kept, then ordered by success
# rate for display; each bar is annotated with "rate (project count)".
fig, ax = plt.subplots(figsize=(10, 5))
top_countries = df_binary.groupby('Country')['success'].agg(['mean', 'count']).nlargest(15, 'count')
top_countries = top_countries.sort_values('mean', ascending=True)
ax.barh(top_countries.index, top_countries['mean'] * 100,
        color=plt.cm.viridis(np.linspace(0.2, 0.9, len(top_countries))), edgecolor='white')
ax.set_xlabel('Success Rate (%)')
ax.set_title('Success Rate by Country (top 15 by volume)')
for i, (rate, count) in enumerate(zip(top_countries['mean'], top_countries['count'])):
    ax.text(rate * 100 + 0.3, i, f'{rate:.0%} ({count:,})', va='center', fontsize=8)
fig.tight_layout()
save(fig, 'success_by_country.png', 'Success Rate by Country')

# ── Plot 6: Correlation heatmap ──
# Pairwise correlations between the numeric features and the binary target.
num_cols = ['Goal', 'log_goal', 'campaign_days', 'name_length', 'name_word_count',
            'launch_year', 'launch_month', 'launch_dow', 'launch_hour',
            'category_enc', 'country_enc', 'success']
fig, ax = plt.subplots(figsize=(10, 8))
corr = df_binary[num_cols].corr()
# Mask the upper triangle (diagonal included) so each pair is shown only once.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, annot=True, fmt='.2f', cmap='RdBu_r',
            center=0, square=True, linewidths=0.5, ax=ax, annot_kws={'size': 7})
ax.set_title('Feature Correlation Heatmap')
fig.tight_layout()
save(fig, 'correlation_heatmap.png', 'Feature Correlation Heatmap')

print('All EDA plots saved!')

## 4 · Task 1 — Project Success Prediction (Binary Classification)
Predict whether a Kickstarter project will be **Successful** or **Failed** using only pre-launch features
(goal, category, country, campaign duration, name length, launch timing).

In [None]:
# ── Pre-launch features only (no pledged/backers — those are post-launch!) ──
clf_features = ['log_goal', 'campaign_days', 'category_enc', 'country_enc',
                'launch_year', 'launch_month', 'launch_dow', 'launch_hour',
                'name_length', 'name_word_count']

X_clf = df_binary[clf_features].astype(float)
y_clf = df_binary['success']

# 80/20 split, stratified so both parts keep the same success rate.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=SEED, stratify=y_clf)

# Scaler is fitted on the training split only and then applied to the test split.
scaler_c = StandardScaler()
X_train_cs = scaler_c.fit_transform(X_train_c)
X_test_cs = scaler_c.transform(X_test_c)

print(f'Classification dataset: {len(X_clf):,} rows')
print(f'Train: {len(X_train_c):,}  |  Test: {len(X_test_c):,}')
print(f'Positive rate: {y_clf.mean():.1%}')
print(f'Features: {clf_features}')

# ── Use a sample for slow models (SVM, MLP) ──
# Draw SAMPLE_SIZE training rows without replacement; if the training set is
# not larger than that, the full training set is used instead.
SAMPLE_SIZE = 50000
if len(X_train_c) > SAMPLE_SIZE:
    idx_sample = np.random.choice(len(X_train_c), SAMPLE_SIZE, replace=False)
    X_train_sample = X_train_c.iloc[idx_sample]
    y_train_sample = y_train_c.iloc[idx_sample]
    # Reuse the scaler fitted on the full training split.
    X_train_sample_s = scaler_c.transform(X_train_sample)
    print(f'Sampled {SAMPLE_SIZE:,} rows for slow models (SVM, MLP)')
else:
    X_train_sample = X_train_c
    y_train_sample = y_train_c
    X_train_sample_s = X_train_cs

def eval_clf(name, model, Xtr, ytr, Xte, yte):
    """Fit *model* on the training data and score its test-set predictions.

    Args:
        name: Display name of the model (not used inside the function).
        model: Estimator exposing ``fit`` and ``predict``; fitted in place.
        Xtr, ytr: Training features and labels.
        Xte, yte: Test features and labels.

    Returns:
        Dict with ``Accuracy`` and class-weighted ``Precision``, ``Recall`` and
        ``F1 Score``, plus the fitted ``model`` and its test ``predictions``.
    """
    model.fit(Xtr, ytr)
    y_hat = model.predict(Xte)
    weighted = {'average': 'weighted'}
    scores = {
        'Accuracy': accuracy_score(yte, y_hat),
        'Precision': precision_score(yte, y_hat, **weighted),
        'Recall': recall_score(yte, y_hat, **weighted),
        'F1 Score': f1_score(yte, y_hat, **weighted),
    }
    scores['model'] = model
    scores['predictions'] = y_hat
    return scores

In [None]:
clf_results = {}

# Each entry: (display name, estimator, needs scaled features?, train on sample?)

# ── Fast models on full training data ──
fast_classifiers = [
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=SEED), True, False),
    ('Decision Tree', DecisionTreeClassifier(max_depth=12, random_state=SEED), False, False),
    ('Random Forest', RandomForestClassifier(n_estimators=200, max_depth=15, random_state=SEED, n_jobs=-1), False, False),
    ('Gradient Boosting', GradientBoostingClassifier(n_estimators=200, max_depth=5, random_state=SEED), False, False),
    ('AdaBoost', AdaBoostClassifier(n_estimators=100, random_state=SEED), False, False),
    ('Naive Bayes', GaussianNB(), True, False),
]

# ── Slow models on sampled data ──
slow_classifiers = [
    ('SVM (linear)', SVC(kernel='linear', random_state=SEED), True, True),
    ('SVM (rbf)', SVC(kernel='rbf', random_state=SEED), True, True),
    ('MLP Neural Network', MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=300,
                                          random_state=SEED, early_stopping=True), True, True),
]

print('Project Success Classification')
print('=' * 65)

# Training matrix keyed by (needs_scale, use_sample); the test matrix depends
# only on whether the model wants scaled features.
train_matrix = {
    (True, True): X_train_sample_s,
    (False, True): X_train_sample,
    (True, False): X_train_cs,
    (False, False): X_train_c,
}
test_matrix = {True: X_test_cs, False: X_test_c}

for name, model, needs_scale, use_sample in fast_classifiers + slow_classifiers:
    Xtr = train_matrix[(needs_scale, use_sample)]
    ytr = y_train_sample if use_sample else y_train_c
    Xte = test_matrix[needs_scale]
    r = eval_clf(name, model, Xtr, ytr, Xte, y_test_c)
    clf_results[name] = r
    print(f'  {name:30s} Acc={r["Accuracy"]:.4f}  F1={r["F1 Score"]:.4f}')

# KNN with k search (on sample).
# k is chosen by 3-fold cross-validated accuracy on the training sample.
# Picking k by test-set accuracy would tune a hyperparameter on the same data
# that produces the reported score and bias that score upward, so the test set
# is only used once, for the final evaluation below.
k_scores = {}
for k in [3, 5, 7, 9, 11]:
    knn = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
    fold_acc = cross_val_score(knn, X_train_sample_s, y_train_sample,
                               cv=3, scoring='accuracy')
    k_scores[k] = float(fold_acc.mean())
best_k = max(k_scores, key=k_scores.get)
print(f'  KNN CV accuracy by k: {k_scores} -> best k={best_k}')

# Refit the selected k on the whole sample and score it on the held-out test set.
knn_name = f'KNN (k={best_k})'
r = eval_clf(knn_name, KNeighborsClassifier(n_neighbors=best_k, n_jobs=-1),
             X_train_sample_s, y_train_sample, X_test_cs, y_test_c)
clf_results[knn_name] = r
print(f'  {knn_name:30s} Acc={r["Accuracy"]:.4f}  F1={r["F1 Score"]:.4f}')

print(f'\nAll {len(clf_results)} classifiers trained!')

## 5 · Task 2 — Pledged Amount Regression
Predict the **log-transformed pledged amount** for successful projects using goal, category, country, and campaign features.

In [None]:
# ── Regression: predict log(pledged) for successful projects ──
# Only rows with success == 1 are used for this task.
succ = df_binary[df_binary['success'] == 1].copy()

reg_features = ['log_goal', 'campaign_days', 'category_enc', 'country_enc',
                'launch_year', 'launch_month', 'launch_dow', 'launch_hour',
                'name_length', 'name_word_count']

X_reg = succ[reg_features].astype(float)
y_reg = succ['log_pledged']

# Plain (non-stratified) 80/20 split.
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=SEED)

# Scaler fitted on the training split only, then applied to the test split.
scaler_r = StandardScaler()
X_train_rs = scaler_r.fit_transform(X_train_r)
X_test_rs = scaler_r.transform(X_test_r)

print(f'Regression dataset (successful projects): {len(X_reg):,} rows')
print(f'Train: {len(X_train_r):,}  |  Test: {len(X_test_r):,}')
print(f'Target: log(pledged+1), range [{y_reg.min():.2f}, {y_reg.max():.2f}]')

# Candidate regressors, keyed by display name.
regressors = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.01),
    'Decision Tree': DecisionTreeRegressor(max_depth=10, random_state=SEED),
    'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=12, random_state=SEED, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=200, max_depth=5, random_state=SEED),
}

reg_results = {}
print(f'\nPledged Amount Regression:')
print('=' * 70)
for name, model in regressors.items():
    # Only Ridge and Lasso are fed the standardised features; every other
    # model (including plain Linear Regression) is trained on the raw ones.
    use_scaled = name in ['Ridge Regression', 'Lasso Regression']
    Xtr = X_train_rs if use_scaled else X_train_r
    Xte = X_test_rs if use_scaled else X_test_r
    model.fit(Xtr, y_train_r)
    pred = model.predict(Xte)
    # All metrics are computed on the log(pledged + 1) scale.
    r2 = r2_score(y_test_r, pred)
    mae = mean_absolute_error(y_test_r, pred)
    rmse = np.sqrt(mean_squared_error(y_test_r, pred))
    reg_results[name] = {'RÂ²': r2, 'MAE': mae, 'RMSE': rmse, 'model': model}
    print(f'  {name:30s} RÂ²={r2:.4f}  MAE={mae:.2f}  RMSE={rmse:.2f}')

# ── Plots ──
# Side-by-side horizontal bar charts comparing the regressors: left panel
# shows the R² score, right panel shows the MAE.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
names = list(reg_results.keys())
r2s = [reg_results[n]['RÂ²'] for n in names]
maes = [reg_results[n]['MAE'] for n in names]

axes[0].barh(names, r2s, color='#2E86AB', edgecolor='white')
axes[0].set_xlabel('RÂ² Score'); axes[0].set_title('Regression â€” RÂ² Comparison')
for i, v in enumerate(r2s):
    # max(...) keeps the label at x >= 0.01 even when the score is negative.
    axes[0].text(max(v + 0.01, 0.01), i, f'{v:.4f}', va='center', fontsize=8)

axes[1].barh(names, maes, color='#F18F01', edgecolor='white')
axes[1].set_xlabel('MAE'); axes[1].set_title('Regression â€” MAE Comparison')
for i, v in enumerate(maes):
    axes[1].text(v + 0.02, i, f'{v:.2f}', va='center', fontsize=8)
fig.tight_layout()
save(fig, 'regression_results.png', 'Pledged Amount Regression Results')

# Feature importance of whichever tree-based regressor scored the best R².
tree_model_names = ['Decision Tree', 'Random Forest', 'Gradient Boosting']
best_tree_name = max(tree_model_names, key=lambda n: reg_results[n]['RÂ²'])
best_tree = reg_results[best_tree_name]['model']
imp = pd.Series(best_tree.feature_importances_, index=reg_features).sort_values()

fig, ax = plt.subplots(figsize=(8, 5))
imp.plot.barh(ax=ax, color='#2E86AB', edgecolor='white')
ax.set_xlabel('Feature Importance')
ax.set_title(f'Regression Feature Importance ({best_tree_name})')
fig.tight_layout()
save(fig, 'regression_feature_importance.png', 'Regression Feature Importance')

# Store the metrics only; the fitted estimators are left out of all_results.
all_results['regression'] = {
    model_name: {metric: value for metric, value in scores.items() if metric != 'model'}
    for model_name, scores in reg_results.items()
}
best_reg = max(reg_results, key=lambda n: reg_results[n]['RÂ²'])
print(f'\nBest: {best_reg} (RÂ² = {reg_results[best_reg]["RÂ²"]:.4f})')

## 6 · Task 3 — Backer Count Regression
Predict the **log-transformed number of backers** using pre-launch features.

In [None]:
# ── Backer count regression (Successful/Failed projects, pre-launch features) ──
# 'success' is deliberately NOT a feature: it is the campaign outcome, known
# only after the campaign ends and driven by the backers being predicted.
# Including it leaks the target and contradicts the pre-launch setting of this task.
backer_features = ['log_goal', 'campaign_days', 'category_enc', 'country_enc',
                   'launch_year', 'launch_month', 'launch_dow', 'launch_hour',
                   'name_length', 'name_word_count']

X_bk = df_binary[backer_features].astype(float)
y_bk = df_binary['log_backers']  # log(backers + 1)

# Plain (non-stratified) 80/20 split.
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_bk, y_bk, test_size=0.2, random_state=SEED)

print(f'Backer regression dataset: {len(X_bk):,} rows')
print(f'Train: {len(X_train_b):,}  |  Test: {len(X_test_b):,}')

# Candidate regressors for the backer-count task, keyed by display name.
backer_regressors = {
    'Linear': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=12, random_state=SEED, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=200, max_depth=5, random_state=SEED),
}

backer_results = {}
print(f'\nBacker Count Regression:')
print('=' * 70)
for name, model in backer_regressors.items():
    # Every model here is trained on the unscaled features.
    model.fit(X_train_b, y_train_b)
    pred = model.predict(X_test_b)
    # Metrics are computed on the log(backers + 1) scale.
    r2 = r2_score(y_test_b, pred)
    mae = mean_absolute_error(y_test_b, pred)
    rmse = np.sqrt(mean_squared_error(y_test_b, pred))
    backer_results[name] = {'RÂ²': r2, 'MAE': mae, 'RMSE': rmse}
    print(f'  {name:30s} RÂ²={r2:.4f}  MAE={mae:.2f}  RMSE={rmse:.2f}')

# Horizontal bar chart comparing the R² of the backer-count regressors.
fig, ax = plt.subplots(figsize=(8, 5))
names_b = list(backer_results.keys())
r2s_b = [backer_results[n]['RÂ²'] for n in names_b]
ax.barh(names_b, r2s_b, color='#2E86AB', edgecolor='white')
ax.set_xlabel('RÂ² Score'); ax.set_title('Backer Count Regression â€” RÂ² Comparison')
for i, v in enumerate(r2s_b):
    # max(...) keeps the label at x >= 0.01 even when the score is negative.
    ax.text(max(v + 0.01, 0.01), i, f'{v:.4f}', va='center', fontsize=8)
fig.tight_layout()
save(fig, 'backer_regression.png', 'Backer Count Regression')

all_results['backer_regression'] = backer_results
print(f'\nBest: {max(backer_results, key=lambda n: backer_results[n]["RÂ²"])}')

## 7 · Task 4 — Project Category Clustering
Cluster project categories by their success rate, average goal, average pledged, average backers, and campaign duration.

In [None]:
# ── Category-level profiles ──
# One row per category. The goal/pledged/backers averages are means of the
# log-transformed columns, not of the raw amounts.
cat_profiles = df_binary.groupby('Category').agg(
    success_rate=('success', 'mean'),
    avg_goal=('log_goal', 'mean'),
    avg_pledged=('log_pledged', 'mean'),
    avg_backers=('log_backers', 'mean'),
    avg_duration=('campaign_days', 'mean'),
    n_projects=('success', 'count'),
).sort_values('n_projects', ascending=False)

print(f'Category profiles: {len(cat_profiles)} categories')
print(cat_profiles.round(3).to_string())

# n_projects is used for ordering only; it is not a clustering feature.
cluster_feats = ['success_rate', 'avg_goal', 'avg_pledged', 'avg_backers', 'avg_duration']
X_clust = cat_profiles[cluster_feats].values
# Standardise so every feature contributes on a comparable scale.
scaler_cl = StandardScaler()
X_clust_s = scaler_cl.fit_transform(X_clust)

# ── Elbow + Silhouette ──
from sklearn.metrics import silhouette_score

# Try k = 2 .. 10, capped so that k stays below the number of categories.
K_range = range(2, min(11, len(cat_profiles)))
inertias, sils = [], []
for k in K_range:
    km = KMeans(n_clusters=k, random_state=SEED, n_init=10)
    km.fit(X_clust_s)
    inertias.append(km.inertia_)
    sils.append(silhouette_score(X_clust_s, km.labels_))

# Left panel: inertia vs k (elbow). Right panel: silhouette score vs k.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
axes[0].plot(list(K_range), inertias, 'o-', color='#2E86AB', linewidth=2)
axes[0].set_xlabel('k'); axes[0].set_ylabel('Inertia'); axes[0].set_title('Elbow Method')
axes[1].plot(list(K_range), sils, 'o-', color='#F18F01', linewidth=2)
axes[1].set_xlabel('k'); axes[1].set_ylabel('Silhouette Score'); axes[1].set_title('Silhouette Scores')
fig.tight_layout()
save(fig, 'elbow_silhouette.png', 'Elbow & Silhouette Analysis')

# The k with the highest silhouette score is used for the final clustering.
# NOTE: this rebinds the notebook-level name `best_k`, which the KNN cell also assigns.
best_k = list(K_range)[np.argmax(sils)]
print(f'Best k = {best_k} (silhouette = {max(sils):.4f})')

# ── Final clustering ──
# K-Means with the silhouette-selected k; labels are stored on cat_profiles.
km_final = KMeans(n_clusters=best_k, random_state=SEED, n_init=10)
cat_profiles['cluster'] = km_final.fit_predict(X_clust_s)

# PCA: project the standardised profiles to 2-D (used for plotting below).
pca = PCA(n_components=2, random_state=SEED)
X_pca = pca.fit_transform(X_clust_s)

# DBSCAN: second, density-based clustering of the same standardised profiles.
# A label of -1 marks noise points (plotted as 'Noise' below).
db = DBSCAN(eps=1.5, min_samples=2)
cat_profiles['dbscan_cluster'] = db.fit_predict(X_clust_s)

# Two scatter plots in the PCA plane: K-Means clusters on the left, DBSCAN
# clusters on the right. Every point is annotated with its category name.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
for cl in range(best_k):
    # Boolean row selector for the categories assigned to this cluster.
    mask = cat_profiles['cluster'] == cl
    axes[0].scatter(X_pca[mask, 0], X_pca[mask, 1], label=f'Cluster {cl}',
                     alpha=0.8, s=100, edgecolors='white', linewidths=0.5)
for i, cat in enumerate(cat_profiles.index):
    axes[0].annotate(cat, (X_pca[i, 0], X_pca[i, 1]), fontsize=7, alpha=0.8,
                      xytext=(5, 5), textcoords='offset points')
# Axis labels include the share of variance explained by each component.
axes[0].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%})')
axes[0].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%})')
axes[0].set_title(f'K-Means Clustering (k={best_k})')
axes[0].legend(fontsize=8)

unique_db = sorted(cat_profiles['dbscan_cluster'].unique())
for cl in unique_db:
    mask = cat_profiles['dbscan_cluster'] == cl
    # Negative labels are shown as 'Noise'.
    label = f'Cluster {cl}' if cl >= 0 else 'Noise'
    axes[1].scatter(X_pca[mask, 0], X_pca[mask, 1], label=label, alpha=0.8, s=100,
                     edgecolors='white', linewidths=0.5)
for i, cat in enumerate(cat_profiles.index):
    axes[1].annotate(cat, (X_pca[i, 0], X_pca[i, 1]), fontsize=7, alpha=0.8,
                      xytext=(5, 5), textcoords='offset points')
axes[1].set_xlabel('PC1'); axes[1].set_ylabel('PC2')
axes[1].set_title('DBSCAN Clustering')
axes[1].legend(fontsize=8)
fig.tight_layout()
save(fig, 'clustering_results.png', 'Category Clustering Results')

# Mean of each clustering feature within every K-Means cluster.
print(f'\nCluster Profiles:')
print(cat_profiles.groupby('cluster')[cluster_feats].mean().round(3).to_string())

# Record the chosen k, its silhouette score and the per-cluster feature means.
all_results['clustering'] = {
    'k': best_k, 'silhouette': max(sils),
    'profiles': cat_profiles.groupby('cluster')[cluster_feats].mean().round(3).to_dict()
}

## 8 · Hyperparameter Tuning
GridSearchCV and RandomizedSearchCV on the best classifiers for project success prediction.

In [None]:
# ── Use a subsample for tuning (full dataset is 300K+ rows) ──
# Cap the draw at the training-set size: np.random.choice(..., replace=False)
# raises ValueError when asked for more rows than exist. With a training set
# of at least TUNE_SIZE rows the draw is the same as an uncapped one.
TUNE_SIZE = 80000
n_tune = min(TUNE_SIZE, len(X_train_c))
idx_tune = np.random.choice(len(X_train_c), n_tune, replace=False)
X_tune = X_train_c.iloc[idx_tune]
y_tune = y_train_c.iloc[idx_tune]

# ── GridSearchCV — Random Forest ──
# Exhaustive search over 2 x 3 x 2 = 12 parameter combinations with 3-fold CV,
# scored by weighted F1, on the tuning subsample.
print('GridSearchCV: Random Forest ...')
rf_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 5],
}
gs_rf = GridSearchCV(RandomForestClassifier(random_state=SEED, n_jobs=-1),
                      rf_grid, cv=3, scoring='f1_weighted', n_jobs=-1, verbose=0)
gs_rf.fit(X_tune, y_tune)
# Predictions come from the search's best estimator, evaluated on the test split.
pred_gs = gs_rf.predict(X_test_c)
acc_gs = accuracy_score(y_test_c, pred_gs)
f1_gs = f1_score(y_test_c, pred_gs, average='weighted')
print(f'  Best params: {gs_rf.best_params_}')
print(f'  Best CV F1:  {gs_rf.best_score_:.4f}')
print(f'  Test Acc:    {acc_gs:.4f}  F1: {f1_gs:.4f}')

# Same record layout as the dicts returned by eval_clf().
# NOTE: this model was fitted on the tuning subsample (X_tune), not on the
# full training split used by the untuned 'Random Forest' entry.
clf_results['RF (Tuned)'] = {
    'Accuracy': acc_gs, 'F1 Score': f1_gs,
    'Precision': precision_score(y_test_c, pred_gs, average='weighted'),
    'Recall': recall_score(y_test_c, pred_gs, average='weighted'),
    'model': gs_rf.best_estimator_, 'predictions': pred_gs}

# ── RandomizedSearchCV — Gradient Boosting ──
# Samples 15 of the 81 possible parameter combinations, 3-fold CV each,
# scored by weighted F1, on the tuning subsample.
print(f'\nRandomizedSearchCV: Gradient Boosting ...')
gb_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1, 0.2],
    'min_samples_split': [2, 5, 10],
}
rs_gb = RandomizedSearchCV(GradientBoostingClassifier(random_state=SEED),
                            gb_dist, n_iter=15, cv=3, scoring='f1_weighted',
                            random_state=SEED, n_jobs=-1, verbose=0)
rs_gb.fit(X_tune, y_tune)
# Predictions come from the search's best estimator, evaluated on the test split.
pred_rs = rs_gb.predict(X_test_c)
acc_rs = accuracy_score(y_test_c, pred_rs)
f1_rs = f1_score(y_test_c, pred_rs, average='weighted')
print(f'  Best params: {rs_gb.best_params_}')
print(f'  Test Acc:    {acc_rs:.4f}  F1: {f1_rs:.4f}')

# Same record layout as the dicts returned by eval_clf().
clf_results['GB (Tuned)'] = {
    'Accuracy': acc_rs, 'F1 Score': f1_rs,
    'Precision': precision_score(y_test_c, pred_rs, average='weighted'),
    'Recall': recall_score(y_test_c, pred_rs, average='weighted'),
    'model': rs_gb.best_estimator_, 'predictions': pred_rs}

print(f'\nHyperparameter tuning complete!')

## 9 · Cross-Validation, Confusion Matrices & Learning Curves

In [None]:
# ── 5-Fold Cross-Validation (on subsample for speed) ──
# Cap the draw at the training-set size: np.random.choice(..., replace=False)
# raises ValueError when asked for more rows than exist. With a training set
# of at least CV_SIZE rows the draw is the same as an uncapped one.
CV_SIZE = 60000
n_cv = min(CV_SIZE, len(X_train_c))
idx_cv = np.random.choice(len(X_train_c), n_cv, replace=False)
X_cv_data = X_train_c.iloc[idx_cv]
y_cv_data = y_train_c.iloc[idx_cv]
# Scaled copy for the models that need standardised features; reuses the
# scaler fitted on the full training split.
X_cv_data_s = scaler_c.transform(X_cv_data)

# Models to cross-validate: display name -> (estimator, needs scaled features?).
cv_models = {
    'Logistic Regression': (LogisticRegression(max_iter=1000, random_state=SEED), True),
    'Random Forest': (RandomForestClassifier(n_estimators=200, max_depth=15, random_state=SEED, n_jobs=-1), False),
    'Gradient Boosting': (GradientBoostingClassifier(n_estimators=200, max_depth=5, random_state=SEED), False),
}

cv_scores = {}
# Shuffled, stratified 5-fold splitter shared by all models so they are
# scored on identical folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
for name, (model, needs_scale) in cv_models.items():
    X_cv = X_cv_data_s if needs_scale else X_cv_data
    scores = cross_val_score(model, X_cv, y_cv_data, cv=skf, scoring='f1_weighted', n_jobs=-1)
    cv_scores[name] = scores
    print(f'{name:30s} CV F1: {scores.mean():.4f} Â± {scores.std():.4f}')

# Box plot of the per-fold weighted F1 scores, one box per model.
fig, ax = plt.subplots(figsize=(8, 5))
# NOTE(review): newer Matplotlib releases rename boxplot's `labels` argument to
# `tick_labels` — confirm against the Matplotlib version this notebook targets.
ax.boxplot(cv_scores.values(), labels=cv_scores.keys(), patch_artist=True,
           boxprops=dict(facecolor='#2E86AB', alpha=0.7))
ax.set_ylabel('F1 Score'); ax.set_title('5-Fold Cross-Validation â€” F1 Scores')
ax.tick_params(axis='x', rotation=15)
fig.tight_layout()
save(fig, 'cv_comparison.png', 'Cross-Validation Comparison')

# ── Feature Importance — RF ──
# Prefer the tuned random forest; fall back to the untuned one if the tuning
# cell has not populated 'RF (Tuned)'.
rf_model = clf_results.get('RF (Tuned)', clf_results.get('Random Forest'))['model']
if hasattr(rf_model, 'feature_importances_'):
    imp_clf = pd.Series(rf_model.feature_importances_, index=clf_features).sort_values()
    fig, ax = plt.subplots(figsize=(8, 5))
    imp_clf.plot.barh(ax=ax, color='#2E86AB', edgecolor='white')
    ax.set_xlabel('Feature Importance')
    ax.set_title('Success Prediction â€” Feature Importance (Random Forest)')
    fig.tight_layout()
    save(fig, 'feature_importance.png', 'Success Prediction Feature Importance')

# ── Confusion Matrices (top 4) ──
# The four classifiers with the highest weighted F1 so far, on a 2x2 grid.
top4 = sorted(clf_results, key=lambda n: clf_results[n]['F1 Score'], reverse=True)[:4]
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# Tick labels in label order: 0 = Failed, 1 = Successful.
class_names = ['Failed', 'Successful']
for ax, name in zip(axes.ravel(), top4):
    cm = confusion_matrix(y_test_c, clf_results[name]['predictions'])
    sns.heatmap(cm, annot=True, fmt=',d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_title(f'{name}\n(F1={clf_results[name]["F1 Score"]:.4f})')
    ax.set_xlabel('Predicted'); ax.set_ylabel('Actual')
fig.suptitle('Confusion Matrices â€” Top 4 Classifiers', fontsize=14, y=1.02)
fig.tight_layout()
save(fig, 'confusion_matrices.png', 'Confusion Matrices â€” Top 4')

# ── Learning Curves ──
best_clf_name = top4[0]
# The curve is drawn with a fresh Random Forest unless the best classifier is
# a Gradient Boosting variant (and not a Random Forest one).
# NOTE: the plot title carries best_clf_name even when the curve comes from
# this stand-in estimator rather than the best model itself.
is_rf_family = 'Random Forest' in best_clf_name or 'RF' in best_clf_name
is_gb_family = 'Gradient Boosting' in best_clf_name or 'GB' in best_clf_name
if is_gb_family and not is_rf_family:
    lc_model = GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=SEED)
else:
    lc_model = RandomForestClassifier(n_estimators=100, max_depth=15, random_state=SEED, n_jobs=-1)

# Subsample for learning curve speed. The draw is capped at the size of the CV
# subsample because np.random.choice(..., replace=False) raises ValueError when
# asked for more rows than exist.
LC_SIZE = 40000
n_lc = min(LC_SIZE, len(X_cv_data))
idx_lc = np.random.choice(len(X_cv_data), n_lc, replace=False)
X_lc = X_cv_data.iloc[idx_lc]
y_lc = y_cv_data.iloc[idx_lc]

# Eight training-set fractions from 10% to 100%, each scored with 5-fold CV.
train_sizes, train_scores, val_scores = learning_curve(
    lc_model, X_lc, y_lc, cv=5, scoring='f1_weighted',
    train_sizes=np.linspace(0.1, 1.0, 8), n_jobs=-1)

# Mean and spread across folds, computed once and reused for line and band.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
val_mean, val_std = val_scores.mean(axis=1), val_scores.std(axis=1)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(train_sizes, train_mean, 'o-', color='#2E86AB', label='Train')
ax.fill_between(train_sizes, train_mean - train_std, train_mean + train_std,
                alpha=0.1, color='#2E86AB')
ax.plot(train_sizes, val_mean, 'o-', color='#F18F01', label='Validation')
ax.fill_between(train_sizes, val_mean - val_std, val_mean + val_std,
                alpha=0.1, color='#F18F01')
ax.set_xlabel('Training Size'); ax.set_ylabel('F1 Score')
ax.set_title(f'Learning Curve â€” {best_clf_name}')
ax.legend(); ax.grid(True, alpha=0.3)
fig.tight_layout()
save(fig, 'learning_curves.png', f'Learning Curve â€” {best_clf_name}')

print('CV, confusion matrices, and learning curves complete!')

## 10 · Ensemble Methods & Final Comparison

In [None]:
# ── Voting Classifier ──
# Hard (majority) vote over a random forest, gradient boosting and logistic
# regression. The ensemble is trained on the unscaled features, so logistic
# regression is wrapped in its own StandardScaler pipeline: everywhere else in
# this notebook it is fitted on standardised inputs, and without scaling its
# solver works on features of very different magnitudes (e.g. launch_year vs
# launch_dow). The tree-based members do not depend on feature scale.
from sklearn.pipeline import make_pipeline

print('Training Voting Classifier ...')
voting = VotingClassifier(estimators=[
    ('rf', RandomForestClassifier(n_estimators=200, max_depth=15, random_state=SEED, n_jobs=-1)),
    ('gb', GradientBoostingClassifier(n_estimators=200, max_depth=5, random_state=SEED)),
    ('lr', make_pipeline(StandardScaler(),
                         LogisticRegression(max_iter=1000, random_state=SEED))),
], voting='hard', n_jobs=-1)
voting.fit(X_train_c, y_train_c)
pred_v = voting.predict(X_test_c)
# Same record layout as the dicts returned by eval_clf().
clf_results['Voting Ensemble'] = {
    'Accuracy': accuracy_score(y_test_c, pred_v),
    'F1 Score': f1_score(y_test_c, pred_v, average='weighted'),
    'Precision': precision_score(y_test_c, pred_v, average='weighted'),
    'Recall': recall_score(y_test_c, pred_v, average='weighted'),
    'model': voting, 'predictions': pred_v}
print(f'  Voting Ensemble               Acc={clf_results["Voting Ensemble"]["Accuracy"]:.4f}  '
      f'F1={clf_results["Voting Ensemble"]["F1 Score"]:.4f}')

# ── Stacking Classifier ──
# Three tree-based base learners; a logistic regression meta-learner is
# trained on their cross-validated (cv=3) outputs.
print('Training Stacking Classifier ...')
stacking = StackingClassifier(estimators=[
    ('rf', RandomForestClassifier(n_estimators=200, max_depth=15, random_state=SEED, n_jobs=-1)),
    ('gb', GradientBoostingClassifier(n_estimators=200, max_depth=5, random_state=SEED)),
    ('dt', DecisionTreeClassifier(max_depth=12, random_state=SEED)),
], final_estimator=LogisticRegression(max_iter=1000, random_state=SEED),
    cv=3, n_jobs=-1)
stacking.fit(X_train_c, y_train_c)
pred_s = stacking.predict(X_test_c)
# Same record layout as the dicts returned by eval_clf().
clf_results['Stacking Ensemble'] = {
    'Accuracy': accuracy_score(y_test_c, pred_s),
    'F1 Score': f1_score(y_test_c, pred_s, average='weighted'),
    'Precision': precision_score(y_test_c, pred_s, average='weighted'),
    'Recall': recall_score(y_test_c, pred_s, average='weighted'),
    'model': stacking, 'predictions': pred_s}
print(f'  Stacking Ensemble             Acc={clf_results["Stacking Ensemble"]["Accuracy"]:.4f}  '
      f'F1={clf_results["Stacking Ensemble"]["F1 Score"]:.4f}')

# ── Final Comparison ──
# One row per classifier with its four metrics (fitted models and prediction
# arrays are dropped), sorted so the best weighted F1 comes first.
comp = pd.DataFrame({
    name: {k: v for k, v in metrics.items() if k not in ('model', 'predictions')}
    for name, metrics in clf_results.items()
}).T.sort_values('F1 Score', ascending=False)
comp.index.name = 'Model'

print(f'\nFinal Model Comparison:')
print(comp.to_string(float_format=lambda x: f'{x:.6f}'))

# Horizontal bar chart of weighted F1 for every model.
fig, ax = plt.subplots(figsize=(10, 7))
colors_bar = plt.cm.viridis(np.linspace(0.2, 0.9, len(comp)))
bars = ax.barh(range(len(comp)), comp['F1 Score'], color=colors_bar, edgecolor='white')
ax.set_yticks(range(len(comp)))
ax.set_yticklabels(comp.index, fontsize=9)
ax.set_xlabel('F1 Score (weighted)')
ax.set_title('Project Success Classification â€” All Models')
for i, v in enumerate(comp['F1 Score']):
    ax.text(v + 0.001, i, f'{v:.4f}', va='center', fontsize=8)
# Row 0 of `comp` is the best model; inverting the axis puts it at the top.
ax.invert_yaxis()
fig.tight_layout()
save(fig, 'model_comparison.png', 'Model Comparison â€” Success Prediction')

all_results['classification'] = comp.to_dict('index')
best_overall = comp.index[0]
print(f'\nBest model: {best_overall} (F1 = {comp.loc[best_overall, "F1 Score"]:.4f})')

## 11 · HTML Report Generation

In [None]:
def img_to_b64(path):
    """Return the contents of the file at *path* as a base64 text string.

    The file is read in binary mode and the base64 output is decoded to
    ``str`` so it can be embedded directly in a ``data:`` URI.
    """
    with open(path, 'rb') as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode()

# Jinja2 source for the single-file HTML report.
#
# Context variables read by the template:
#   n_projects, success_rate, n_categories, n_countries, year_range
#       -> header line and summary cards
#   eda_plots, clf_plots, reg_plots, backer_plots, cluster_plots,
#   analysis_plots, final_plots
#       -> iterables of (title, base64_png) pairs, embedded as data: URIs
#   clf_table
#       -> mapping of model name to Accuracy / Precision / Recall / F1 Score
#   reg_results, backer_results
#       -> mappings of model name to the R-squared / MAE / RMSE metrics
#   cluster_k, cluster_sil, sklearn_ver
#
# In every table the first row of the mapping gets the "best" CSS class, so
# callers should pass the mappings already sorted best-first.
#
# NOTE: this string goes through Jinja2 only, never through %-formatting, so
# percent signs in the CSS are written as a single literal '%'. Doubling them
# would reach the browser as '100%%', which is an invalid CSS value.
HTML_TEMPLATE = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8"><meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Kickstarter ML Analysis Report</title>
<style>
  :root { --primary: #0B3D91; --accent: #1E88E5; --light: #E3F2FD; --bg: #F5F7FA; --success: #43A047; }
  * { margin: 0; padding: 0; box-sizing: border-box; }
  body { font-family: 'Segoe UI', system-ui, sans-serif; background: var(--bg); color: #333; line-height: 1.6; }
  .header { background: linear-gradient(135deg, var(--primary), var(--accent)); color: white;
             padding: 3rem 2rem; text-align: center; }
  .header h1 { font-size: 2.2rem; margin-bottom: 0.5rem; }
  .header p { opacity: 0.9; font-size: 1.1rem; }
  .container { max-width: 1200px; margin: 0 auto; padding: 2rem; }
  .section { background: white; border-radius: 12px; padding: 2rem; margin-bottom: 2rem;
             box-shadow: 0 2px 8px rgba(0,0,0,0.08); }
  .section h2 { color: var(--primary); border-bottom: 3px solid var(--accent);
                 padding-bottom: 0.5rem; margin-bottom: 1.5rem; font-size: 1.5rem; }
  .section h3 { color: var(--accent); margin: 1rem 0 0.5rem; }
  table { width: 100%; border-collapse: collapse; margin: 1rem 0; font-size: 0.9rem; }
  th { background: var(--accent); color: white; padding: 10px 12px; text-align: left; }
  td { padding: 8px 12px; border-bottom: 1px solid #e0e0e0; }
  tr:nth-child(even) { background: var(--light); }
  tr:hover { background: #BBDEFB; }
  .best { background: #C8E6C9 !important; font-weight: bold; }
  img { max-width: 100%; border-radius: 8px; margin: 1rem 0; }
  .grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(400px, 1fr)); gap: 1.5rem; }
  .metric-card { background: var(--light); border-radius: 8px; padding: 1.5rem; text-align: center; }
  .metric-card .value { font-size: 2rem; font-weight: bold; color: var(--primary); }
  .metric-card .label { color: #666; font-size: 0.9rem; }
  .footer { text-align: center; padding: 2rem; color: #888; font-size: 0.85rem; }
</style>
</head>
<body>
<div class="header">
  <h1>ðŸš€ Kickstarter Projects â€” ML Report</h1>
  <p>Comprehensive Machine Learning Analysis â€¢ {{ n_projects }} projects â€¢ {{ n_categories }} categories â€¢ {{ year_range }}</p>
</div>
<div class="container">

  <div class="grid" style="margin-bottom:2rem;">
    <div class="metric-card"><div class="value">{{ n_projects }}</div><div class="label">Total Projects</div></div>
    <div class="metric-card"><div class="value">{{ success_rate }}</div><div class="label">Overall Success Rate</div></div>
    <div class="metric-card"><div class="value">{{ n_categories }}</div><div class="label">Categories</div></div>
    <div class="metric-card"><div class="value">{{ n_countries }}</div><div class="label">Countries</div></div>
  </div>

  <div class="section">
    <h2>1 Â· Exploratory Data Analysis</h2>
    <div class="grid">
    {% for title, b64 in eda_plots %}
      <div><h3>{{ title }}</h3><img src="data:image/png;base64,{{ b64 }}" alt="{{ title }}"></div>
    {% endfor %}
    </div>
  </div>

  <div class="section">
    <h2>2 Â· Project Success Classification</h2>
    <p>Binary classification: predict Successful vs Failed using only pre-launch features.</p>
    <table>
      <tr><th>Model</th><th>Accuracy</th><th>Precision</th><th>Recall</th><th>F1 Score</th></tr>
      {% for name, m in clf_table.items() %}
      <tr{% if loop.index == 1 %} class="best"{% endif %}>
        <td>{{ name }}</td><td>{{ "%.4f"|format(m['Accuracy']) }}</td>
        <td>{{ "%.4f"|format(m['Precision']) }}</td><td>{{ "%.4f"|format(m['Recall']) }}</td>
        <td>{{ "%.4f"|format(m['F1 Score']) }}</td></tr>
      {% endfor %}
    </table>
    <div class="grid">
    {% for title, b64 in clf_plots %}
      <div><h3>{{ title }}</h3><img src="data:image/png;base64,{{ b64 }}" alt="{{ title }}"></div>
    {% endfor %}
    </div>
  </div>

  <div class="section">
    <h2>3 Â· Pledged Amount Regression</h2>
    <p>Predict log(pledged+1) for successful projects.</p>
    <table>
      <tr><th>Model</th><th>RÂ²</th><th>MAE</th><th>RMSE</th></tr>
      {% for name, m in reg_results.items() %}
      <tr{% if loop.index == 1 %} class="best"{% endif %}>
        <td>{{ name }}</td><td>{{ "%.4f"|format(m['RÂ²']) }}</td>
        <td>{{ "%.2f"|format(m['MAE']) }}</td><td>{{ "%.2f"|format(m['RMSE']) }}</td></tr>
      {% endfor %}
    </table>
    <div class="grid">
    {% for title, b64 in reg_plots %}
      <div><h3>{{ title }}</h3><img src="data:image/png;base64,{{ b64 }}" alt="{{ title }}"></div>
    {% endfor %}
    </div>
  </div>

  <div class="section">
    <h2>4 Â· Backer Count Regression</h2>
    <table>
      <tr><th>Model</th><th>RÂ²</th><th>MAE</th><th>RMSE</th></tr>
      {% for name, m in backer_results.items() %}
      <tr{% if loop.index == 1 %} class="best"{% endif %}>
        <td>{{ name }}</td><td>{{ "%.4f"|format(m['RÂ²']) }}</td>
        <td>{{ "%.2f"|format(m['MAE']) }}</td><td>{{ "%.2f"|format(m['RMSE']) }}</td></tr>
      {% endfor %}
    </table>
    <div class="grid">
    {% for title, b64 in backer_plots %}
      <div><h3>{{ title }}</h3><img src="data:image/png;base64,{{ b64 }}" alt="{{ title }}"></div>
    {% endfor %}
    </div>
  </div>

  <div class="section">
    <h2>5 Â· Category Clustering</h2>
    <div class="grid" style="margin-bottom:1rem;">
      <div class="metric-card"><div class="value">{{ cluster_k }}</div><div class="label">Optimal Clusters</div></div>
      <div class="metric-card"><div class="value">{{ "%.4f"|format(cluster_sil) }}</div><div class="label">Silhouette Score</div></div>
    </div>
    <div class="grid">
    {% for title, b64 in cluster_plots %}
      <div><h3>{{ title }}</h3><img src="data:image/png;base64,{{ b64 }}" alt="{{ title }}"></div>
    {% endfor %}
    </div>
  </div>

  <div class="section">
    <h2>6 Â· Cross-Validation & Analysis</h2>
    <div class="grid">
    {% for title, b64 in analysis_plots %}
      <div><h3>{{ title }}</h3><img src="data:image/png;base64,{{ b64 }}" alt="{{ title }}"></div>
    {% endfor %}
    </div>
  </div>

  <div class="section">
    <h2>7 Â· Final Model Comparison</h2>
    {% for title, b64 in final_plots %}
      <h3>{{ title }}</h3><img src="data:image/png;base64,{{ b64 }}" alt="{{ title }}">
    {% endfor %}
  </div>

</div>
<div class="footer">
  Generated automatically Â· Kickstarter ML Analysis Â· scikit-learn {{ sklearn_ver }}
</div>
</body></html>
"""

# Report section -> file stems (names without extension) of the saved plots
# that belong in that section. `get_plots` matches saved plot paths against
# these stems.
plot_categories = dict(
    eda=[
        'success_by_category',
        'goal_distribution',
        'projects_over_time',
        'success_by_duration',
        'success_by_country',
        'correlation_heatmap',
    ],
    reg=['regression_results', 'regression_feature_importance'],
    backer=['backer_regression'],
    clf=['confusion_matrices', 'feature_importance'],
    cluster=['elbow_silhouette', 'clustering_results'],
    analysis=['cv_comparison', 'learning_curves'],
    final=['model_comparison'],
)

def get_plots(category):
    """Collect the embedded images for one report section.

    Looks up the file stems registered for *category* in ``plot_categories``
    and returns a list of ``(title, base64_data)`` tuples for every entry of
    ``saved_plots`` whose path stem is among them. Results follow the order
    of ``saved_plots``. Raises ``KeyError`` for an unknown category.
    """
    wanted_stems = plot_categories[category]
    return [
        (plot_title, img_to_b64(plot_path))
        for plot_title, plot_path in saved_plots
        if pathlib.Path(plot_path).stem in wanted_stems
    ]

# Order both regression result sets best-first (highest RÂ² at the top) so
# the first row of each report table is the winner.
reg_sorted = {
    model_name: scores
    for model_name, scores in sorted(all_results['regression'].items(),
                                     key=lambda item: item[1]['RÂ²'],
                                     reverse=True)
}
backer_sorted = {
    model_name: scores
    for model_name, scores in sorted(all_results['backer_regression'].items(),
                                     key=lambda item: item[1]['RÂ²'],
                                     reverse=True)
}

# Classification: rank by F1 descending and keep only the numeric metrics
# (the fitted estimator and raw predictions are not rendered).
clf_sorted = {
    model_name: {metric: value for metric, value in scores.items()
                 if metric not in ('model', 'predictions')}
    for model_name, scores in sorted(clf_results.items(),
                                     key=lambda item: item[1]['F1 Score'],
                                     reverse=True)
}

import sklearn
# Compile the template and render it with the full report context.
tmpl = Template(HTML_TEMPLATE)
html = tmpl.render({
    # Headline figures shown in the header and the summary cards.
    'n_projects': '{:,}'.format(len(df_binary)),
    'success_rate': format(df_binary['success'].mean(), '.1%'),
    'n_categories': df_binary['Category'].nunique(),
    'n_countries': df_binary['Country'].nunique(),
    'year_range': '{}â€“{}'.format(df_binary['launch_year'].min(),
                                   df_binary['launch_year'].max()),
    # Section content: pre-sorted metric tables and base64-embedded plots.
    'eda_plots': get_plots('eda'),
    'clf_table': clf_sorted,
    'clf_plots': get_plots('clf'),
    'reg_results': reg_sorted,
    'reg_plots': get_plots('reg'),
    'backer_results': backer_sorted,
    'backer_plots': get_plots('backer'),
    'cluster_k': all_results['clustering']['k'],
    'cluster_sil': all_results['clustering']['silhouette'],
    'cluster_plots': get_plots('cluster'),
    'analysis_plots': get_plots('analysis'),
    'final_plots': get_plots('final'),
    # Footer.
    'sklearn_ver': sklearn.__version__,
})

# Write the rendered report to disk and print a short summary.
report_path = pathlib.Path('outputs/kickstarter_ml_report.html')
# Create the output directory if it is missing so the write cannot fail on
# a fresh checkout.
report_path.parent.mkdir(parents=True, exist_ok=True)
# The document declares <meta charset="UTF-8"> and contains non-ASCII text,
# so the encoding is pinned instead of relying on the platform default.
report_path.write_text(html, encoding='utf-8')
print(f'âœ… HTML Report generated: {report_path}')
print(f'   File size: {report_path.stat().st_size / 1024:.1f} KB')
# Each embedded plot contributes exactly one data-URI prefix.
print(f'   Embedded images: {html.count("data:image/png;base64,")}')
print(f'\nðŸŽ‰ Analysis complete! Open the HTML file to view the full report.')