# 🚀 Space Missions – ML Analysis

**Dataset**: 4,630 space launches (1957–2022) with 62 companies, mission outcomes, rocket status, and pricing.

| # | Task | Type | Target |
|---|------|------|--------|
| 1 | Mission Success Classification | Binary Classification | Success vs Failure |
| 2 | Mission Price Regression | Regression | Launch price (millions USD) |
| 3 | Rocket Status Prediction | Binary Classification | Active vs Retired |
| 4 | Launch Clustering | Unsupervised | K-Means on mission features |

## 1 · Imports

In [None]:
# --- Standard library ---
import base64
import os
import pathlib
import warnings
from io import BytesIO

# Silence library warnings before the third-party imports below run.
warnings.filterwarnings("ignore")

# --- Numerics and plotting ---
import numpy as np
import pandas as pd
import matplotlib

# The non-interactive Agg backend must be selected before pyplot is imported.
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import seaborn as sns

# --- scikit-learn ---
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    learning_curve,
    StratifiedKFold,
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    r2_score,
    mean_absolute_error,
    silhouette_score,
)
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestClassifier,
    RandomForestRegressor,
    GradientBoostingClassifier,
    GradientBoostingRegressor,
    AdaBoostClassifier,
    VotingClassifier,
    StackingClassifier,
)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans

# --- Other third-party ---
from xgboost import XGBClassifier
import jinja2

# Shared random seed passed to every estimator and split in this notebook.
SEED = 42

# All figures are written here; the directory is created on first run.
PLOT_DIR = pathlib.Path("outputs/plots")
PLOT_DIR.mkdir(parents=True, exist_ok=True)

sns.set_theme(style="whitegrid", palette="viridis")
print("‚úÖ Imports OK")

## 2 · Load & Feature Engineering

In [None]:
# Load the raw launch table and derive the numeric feature columns used by
# every later section. `df_raw` is left untouched; all work happens on `df`.
df_raw = pd.read_csv("Space+Missions/space_missions.csv", encoding="latin-1")
print(f"Raw shape: {df_raw.shape}")
print(f"Missing values:\n{df_raw.isnull().sum()}")

df = df_raw.copy()

# -- Parse Date: unparseable values become NaT (errors="coerce") --
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
df["launch_year"] = df["Date"].dt.year
df["launch_month"] = df["Date"].dt.month
df["launch_dow"] = df["Date"].dt.dayofweek  # 0=Mon
df["launch_decade"] = (df["launch_year"] // 10) * 10

# -- Parse Time: only the hour of day is kept as a feature --
df["Time"] = pd.to_datetime(df["Time"], format="%H:%M:%S", errors="coerce")
df["launch_hour"] = df["Time"].dt.hour

# -- Parse Price --
# Strip thousands separators and whitespace, then convert the whole column in
# one vectorized call. `astype(str)` keeps this working even if pandas already
# inferred a numeric dtype for the column; missing values turn into the text
# "nan", which `to_numeric` maps back to NaN.
price_text = df["Price"].astype(str).str.replace(",", "", regex=False).str.strip()
df["Price_clean"] = pd.to_numeric(price_text, errors="coerce")
print(f"\nPrice stats (millions USD):")
print(df["Price_clean"].describe())

# -- Extract country: the last comma-separated token of Location --
df["country"] = df["Location"].str.split(",").str[-1].str.strip()
print(f"\nCountries ({df['country'].nunique()}):")
print(df["country"].value_counts().head(10).to_string())

# -- Encode categoricals as integer codes (one encoder kept per column) --
le_company = LabelEncoder()
df["company_enc"] = le_company.fit_transform(df["Company"])
le_country = LabelEncoder()
df["country_enc"] = le_country.fit_transform(df["country"])
le_rocket = LabelEncoder()
df["rocket_enc"] = le_rocket.fit_transform(df["Rocket"])

# -- Binary targets / flags --
df["rocket_active"] = (df["RocketStatus"] == "Active").astype(int)
# Anything other than the exact label "Success" counts as a failure.
df["mission_success"] = (df["MissionStatus"] == "Success").astype(int)

# -- Frequency features: total number of rows per company / per rocket --
company_counts = df["Company"].value_counts().to_dict()
df["company_launches"] = df["Company"].map(company_counts)
rocket_counts = df["Rocket"].value_counts().to_dict()
df["rocket_launches"] = df["Rocket"].map(rocket_counts)

# -- Company success rate (global) --
# NOTE(review): this mean is taken over ALL rows and is built from
# `mission_success` itself, so using it as a predictor of `mission_success`
# exposes the target to the model.
company_sr = df.groupby("Company")["mission_success"].mean().to_dict()
df["company_success_rate"] = df["Company"].map(company_sr)

# -- Mission name length --
df["mission_name_len"] = df["Mission"].str.len()

print(f"\nFinal shape: {df.shape}")
print(f"Success rate: {df['mission_success'].mean():.4f}")
print(f"Active rockets: {df['rocket_active'].mean():.4f}")
df.head(3)

## 3 · Exploratory Data Analysis

In [None]:
# EDA figures: a 2x3 overview grid, a per-company success-rate chart and a
# correlation heatmap. Each figure is saved as a PNG under PLOT_DIR.
fig, axes = plt.subplots(2, 3, figsize=(20, 12))
(ax_status, ax_years, ax_companies), (ax_decade, ax_country, ax_price) = axes

# 1 - Mission status distribution
status_counts = df["MissionStatus"].value_counts()
status_counts.plot.bar(ax=ax_status, color=sns.color_palette("viridis", 4))
ax_status.set_title("Mission Status Distribution")
ax_status.set_ylabel("Count")
ax_status.tick_params(axis="x", rotation=15)

# 2 - Launches per year (rows without a parsed year are dropped)
year_counts = df["launch_year"].dropna().astype(int).value_counts().sort_index()
ax_years.plot(year_counts.index, year_counts.values, color="darkgreen", linewidth=2)
ax_years.fill_between(year_counts.index, year_counts.values, alpha=0.2, color="green")
ax_years.set_title("Launches Per Year (1957‚Äì2022)")
ax_years.set_xlabel("Year")
ax_years.set_ylabel("Count")

# 3 - Top 15 companies by number of launches
top_comp = df["Company"].value_counts().head(15)
top_comp.plot.barh(ax=ax_companies, color=sns.color_palette("rocket", 15))
ax_companies.set_title("Top 15 Launch Companies")
ax_companies.set_xlabel("Launches")
ax_companies.invert_yaxis()

# 4 - Success rate by decade
decade_sr = df.groupby("launch_decade")["mission_success"].mean()
ax_decade.bar(decade_sr.index.astype(str), decade_sr.values, color="teal")
ax_decade.set_title("Success Rate by Decade")
ax_decade.set_ylabel("Success Rate")
ax_decade.set_ylim(0, 1)
ax_decade.tick_params(axis="x", rotation=45)

# 5 - Country distribution (top 10)
country_counts = df["country"].value_counts().head(10)
country_counts.plot.bar(ax=ax_country, color=sns.color_palette("mako", 10))
ax_country.set_title("Top 10 Launch Countries")
ax_country.set_ylabel("Count")
ax_country.tick_params(axis="x", rotation=45)

# 6 - Price distribution (where available). Only prices below 1000 are
# drawn, while the n in the title counts every known price.
prices = df["Price_clean"].dropna()
prices[prices < 1000].hist(bins=40, ax=ax_price, color="orange", edgecolor="white")
ax_price.set_title(f"Price Distribution (n={len(prices)})")
ax_price.set_xlabel("Price (millions USD)")
ax_price.set_ylabel("Count")

plt.tight_layout()
plt.savefig(PLOT_DIR / "eda_overview.png", dpi=150, bbox_inches="tight")
plt.show()
print("‚úÖ EDA overview saved")

# Success rate by company (top 15)
fig, ax = plt.subplots(figsize=(12, 7))
top15 = df["Company"].value_counts().head(15).index
in_top15 = df["Company"].isin(top15)
sr_df = (
    df[in_top15]
    .groupby("Company")
    .agg(
        launches=("mission_success", "count"),
        success_rate=("mission_success", "mean"),
    )
    .sort_values("launches", ascending=True)
)
# Bar colour encodes the success rate on a red-yellow-green scale.
colors = plt.cm.RdYlGn(sr_df["success_rate"])
ax.barh(sr_df.index, sr_df["success_rate"], color=colors)
# Annotate each bar with "<rate> (<launch count>)" just right of its end.
for bar_pos, (rate, n_launches) in enumerate(zip(sr_df["success_rate"], sr_df["launches"])):
    ax.text(rate + 0.01, bar_pos, f"{rate:.1%} ({int(n_launches)})",
            va="center", fontsize=9)
ax.set_title("Success Rate by Top 15 Companies")
ax.set_xlabel("Success Rate")
ax.set_xlim(0, 1.15)  # head-room on the right for the annotations
plt.tight_layout()
plt.savefig(PLOT_DIR / "success_by_company.png", dpi=150, bbox_inches="tight")
plt.show()

# Correlation heatmap over the engineered numeric columns
num_cols = [
    "launch_year", "launch_month", "launch_dow", "launch_hour",
    "company_enc", "country_enc", "rocket_enc", "rocket_active",
    "company_launches", "rocket_launches", "company_success_rate",
    "mission_name_len", "mission_success",
]
fig, ax = plt.subplots(figsize=(12, 9))
corr_matrix = df[num_cols].corr()
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax,
            linewidths=0.5, vmin=-1, vmax=1)
ax.set_title("Correlation Heatmap")
plt.tight_layout()
plt.savefig(PLOT_DIR / "correlation_heatmap.png", dpi=150, bbox_inches="tight")
plt.show()
print("‚úÖ All EDA plots saved")

## 4 · Prepare Features for Classification

In [None]:
# Build the train/test matrices for the mission-success classifiers.
#
# Outputs used by later cells: feature_cols, df_clf, X_clf, y_clf,
# X_train_c / X_test_c, y_train_c / y_test_c, scaler_c, X_train_cs / X_test_cs.
feature_cols = [
    "launch_year", "launch_month", "launch_dow", "launch_hour",
    "company_enc", "country_enc", "rocket_enc", "rocket_active",
    "company_launches", "rocket_launches", "company_success_rate",
    "mission_name_len",
]

# Keep only rows where every feature and the target are present.
df_clf = df.dropna(subset=feature_cols + ["mission_success"]).copy()
X_clf = df_clf[feature_cols].values
y_clf = df_clf["mission_success"].values
print(f"Classification dataset: {X_clf.shape}")
print(f"Success: {y_clf.sum()}, Failure: {len(y_clf) - y_clf.sum()}")
print(f"Success rate: {y_clf.mean():.4f}")

# Split the DataFrame (not the bare array) so that the target-derived feature
# can be rebuilt per partition. The same random_state/stratify are used, so
# the rows selected for train and test are the same as when splitting arrays.
train_df, test_df = train_test_split(
    df_clf, test_size=0.2, random_state=SEED, stratify=y_clf
)
train_df = train_df.copy()
test_df = test_df.copy()

# `company_success_rate` in `df` is the mean of `mission_success` over ALL
# rows, so it carries the labels of the test rows. Re-derive it from the
# training rows only; companies that appear only in the test partition fall
# back to the overall training success rate.
train_company_rate = train_df.groupby("Company")["mission_success"].mean()
train_overall_rate = train_df["mission_success"].mean()
for part in (train_df, test_df):
    part["company_success_rate"] = (
        part["Company"].map(train_company_rate).fillna(train_overall_rate)
    )

X_train_c = train_df[feature_cols].values
X_test_c = test_df[feature_cols].values
y_train_c = train_df["mission_success"].values
y_test_c = test_df["mission_success"].values

# Standardise using statistics from the training partition only.
scaler_c = StandardScaler()
X_train_cs = scaler_c.fit_transform(X_train_c)
X_test_cs = scaler_c.transform(X_test_c)
print(f"Train: {X_train_cs.shape}, Test: {X_test_cs.shape}")

## 5 · Mission Success Classification (10 Models)

In [None]:
# Fit ten classifiers on the scaled training split, score each on the test
# split (accuracy and weighted F1) and chart the comparison.
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=SEED),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=7),
    "Decision Tree": DecisionTreeClassifier(random_state=SEED),
    "Random Forest": RandomForestClassifier(
        n_estimators=200, random_state=SEED, n_jobs=-1
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=200, random_state=SEED
    ),
    "AdaBoost": AdaBoostClassifier(n_estimators=150, random_state=SEED),
    "XGBoost": XGBClassifier(
        n_estimators=200, eval_metric="logloss", random_state=SEED, n_jobs=-1
    ),
    "SVM": SVC(kernel="rbf", probability=True, random_state=SEED),
    "Naive Bayes": GaussianNB(),
    "MLP": MLPClassifier(
        hidden_layer_sizes=(128, 64), max_iter=400, random_state=SEED
    ),
}

# name -> {"accuracy", "f1", "model", "y_pred"}; later cells append to this.
clf_results = {}
for name, model in classifiers.items():
    print(f"Training {name}...", end=" ")
    model.fit(X_train_cs, y_train_c)
    y_pred = model.predict(X_test_cs)
    acc = accuracy_score(y_test_c, y_pred)
    f1 = f1_score(y_test_c, y_pred, average="weighted")
    clf_results[name] = {
        "accuracy": acc,
        "f1": f1,
        "model": model,
        "y_pred": y_pred,
    }
    print(f"Acc={acc:.4f}  F1={f1:.4f}")

# The winner is the model with the highest weighted F1 on the test split.
best_clf_name = max(clf_results.items(), key=lambda item: item[1]["f1"])[0]
print(f"\nüèÜ Best classifier: {best_clf_name} (F1={clf_results[best_clf_name]['f1']:.4f})")

# Grouped bar chart: accuracy and F1 side by side for every model.
fig, ax = plt.subplots(figsize=(12, 6))
names = list(clf_results)
accs = [clf_results[model_name]["accuracy"] for model_name in names]
f1s = [clf_results[model_name]["f1"] for model_name in names]
x = np.arange(len(names))
bar_width = 0.4
ax.bar(x - bar_width / 2, accs, bar_width, label="Accuracy", color="steelblue")
ax.bar(x + bar_width / 2, f1s, bar_width, label="F1 (weighted)", color="coral")
ax.set_xticks(x)
ax.set_xticklabels(names, rotation=45, ha="right")
ax.set_ylim(0, 1)
ax.set_title("Mission Success ‚Äì Model Comparison")
ax.legend()
plt.tight_layout()
plt.savefig(PLOT_DIR / "model_comparison.png", dpi=150, bbox_inches="tight")
plt.show()
print("‚úÖ Model comparison saved")

## 6 · Mission Price Regression

In [None]:
# Regress launch price on mission features. Models are trained on
# log1p(price) and evaluated both in log space and back in the original scale.
reg_features = [
    "launch_year", "launch_month", "launch_dow", "launch_hour",
    "company_enc", "country_enc", "rocket_enc", "rocket_active",
    "company_launches", "rocket_launches", "mission_success",
    "mission_name_len",
]

# Only rows with a known price (and complete features) can be used.
df_reg = df.dropna(subset=reg_features + ["Price_clean"]).copy()
X_reg = df_reg[reg_features].values
y_reg = df_reg["Price_clean"].values
print(f"Regression dataset: {X_reg.shape} (rows with known price)")

# Log-transform price for better regression
y_reg_log = np.log1p(y_reg)

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg, y_reg_log, test_size=0.2, random_state=SEED
)
scaler_r = StandardScaler()
X_train_rs = scaler_r.fit_transform(X_train_r)
X_test_rs = scaler_r.transform(X_test_r)

regressors = {
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.1, max_iter=2000),
    "ElasticNet": ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=2000),
    "Decision Tree": DecisionTreeRegressor(random_state=SEED),
    "Random Forest": RandomForestRegressor(
        n_estimators=200, random_state=SEED, n_jobs=-1
    ),
    "Gradient Boosting": GradientBoostingRegressor(
        n_estimators=200, random_state=SEED
    ),
}

# The test target in original units is the same for every model.
y_test_orig = np.expm1(y_test_r)

# name -> {"r2", "rmse", "mae", "r2_orig", "model", "y_pred"}; y_pred is in log space.
reg_results = {}
for name, model in regressors.items():
    print(f"Training {name}...", end=" ")
    model.fit(X_train_rs, y_train_r)
    y_pred_log = model.predict(X_test_rs)

    # Evaluate in log space
    r2 = r2_score(y_test_r, y_pred_log)
    rmse = np.sqrt(mean_squared_error(y_test_r, y_pred_log))
    mae = mean_absolute_error(y_test_r, y_pred_log)

    # Also evaluate in original space
    y_pred_orig = np.expm1(y_pred_log)
    r2_orig = r2_score(y_test_orig, y_pred_orig)

    reg_results[name] = {
        "r2": r2,
        "rmse": rmse,
        "mae": mae,
        "r2_orig": r2_orig,
        "model": model,
        "y_pred": y_pred_log,
    }
    print(f"R¬≤(log)={r2:.4f}  R¬≤(orig)={r2_orig:.4f}  RMSE(log)={rmse:.3f}")

# Best model = highest R² in log space.
best_reg_name = max(reg_results.items(), key=lambda item: item[1]["r2"])[0]
print(f"\nüèÜ Best regressor: {best_reg_name} (R¬≤={reg_results[best_reg_name]['r2']:.4f})")

# Feature importance (best model) - only drawn when the winner exposes it.
best_reg_model = reg_results[best_reg_name]["model"]
if hasattr(best_reg_model, "feature_importances_"):
    imp = best_reg_model.feature_importances_
    idx = np.argsort(imp)  # ascending, so the largest bar ends up on top
    fig, ax = plt.subplots(figsize=(10, 7))
    ax.barh(np.array(reg_features)[idx], imp[idx], color="darkcyan")
    ax.set_title(f"Price Regression ‚Äì Feature Importance ({best_reg_name})")
    ax.set_xlabel("Importance")
    plt.tight_layout()
    plt.savefig(PLOT_DIR / "price_feature_importance.png", dpi=150, bbox_inches="tight")
    plt.show()

# Actual vs predicted scatter, one panel per model, in original units.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
y_test_o = y_test_orig
for ax, (name, res) in zip(axes.flat, reg_results.items()):
    y_pred_o = np.expm1(res["y_pred"])
    ax.scatter(y_test_o, y_pred_o, alpha=0.4, s=15, c="teal")
    # Dashed diagonal marks perfect prediction.
    upper = max(y_test_o.max(), y_pred_o.max()) * 1.05
    lims = [0, upper]
    ax.plot(lims, lims, "r--", linewidth=1.5)
    ax.set_title(f"{name}\nR¬≤={res['r2_orig']:.4f}")
    ax.set_xlabel("Actual (M USD)")
    ax.set_ylabel("Predicted (M USD)")
plt.suptitle("Price Regression ‚Äì Actual vs Predicted", fontsize=14, y=1.01)
plt.tight_layout()
plt.savefig(PLOT_DIR / "price_regression_results.png", dpi=150, bbox_inches="tight")
plt.show()
print("‚úÖ Price regression plots saved")

## 7 · Rocket Status Prediction (Active vs Retired)

In [None]:
# Predict whether a launch used an active (1) or retired (0) rocket, compare
# six classifiers by weighted F1 and chart the scores.
rocket_features = [
    "launch_year", "launch_month", "company_enc", "country_enc",
    "company_launches", "rocket_launches", "company_success_rate",
    "mission_success", "mission_name_len",
]

df_rock = df.dropna(subset=rocket_features + ["rocket_active"]).copy()
X_rock = df_rock[rocket_features].values
y_rock = df_rock["rocket_active"].values
print(f"Rocket status dataset: {X_rock.shape}")
print(f"Active: {y_rock.sum()}, Retired: {len(y_rock) - y_rock.sum()}")

# Stratified 80/20 split, then scale with training statistics only.
X_train_rk, X_test_rk, y_train_rk, y_test_rk = train_test_split(
    X_rock, y_rock, test_size=0.2, random_state=SEED, stratify=y_rock
)
scaler_rk = StandardScaler()
X_train_rks = scaler_rk.fit_transform(X_train_rk)
X_test_rks = scaler_rk.transform(X_test_rk)

rocket_models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=SEED),
    "Random Forest": RandomForestClassifier(
        n_estimators=200, random_state=SEED, n_jobs=-1
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=200, random_state=SEED
    ),
    "XGBoost": XGBClassifier(
        n_estimators=200, eval_metric="logloss", random_state=SEED, n_jobs=-1
    ),
    "SVM": SVC(kernel="rbf", probability=True, random_state=SEED),
    "MLP": MLPClassifier(
        hidden_layer_sizes=(128, 64), max_iter=400, random_state=SEED
    ),
}

# name -> {"accuracy", "f1", "model", "y_pred"}
rocket_results = {}
for name, model in rocket_models.items():
    print(f"Training {name}...", end=" ")
    model.fit(X_train_rks, y_train_rk)
    y_pred = model.predict(X_test_rks)
    acc = accuracy_score(y_test_rk, y_pred)
    f1 = f1_score(y_test_rk, y_pred, average="weighted")
    rocket_results[name] = {
        "accuracy": acc,
        "f1": f1,
        "model": model,
        "y_pred": y_pred,
    }
    print(f"Acc={acc:.4f}  F1={f1:.4f}")

best_rock_name = max(rocket_results.items(), key=lambda item: item[1]["f1"])[0]
print(f"\nüèÜ Best rocket status predictor: {best_rock_name} (F1={rocket_results[best_rock_name]['f1']:.4f})")

# Horizontal bar chart of F1 per model, each bar labelled with its value.
fig, ax = plt.subplots(figsize=(10, 5))
names_r = list(rocket_results)
f1s_r = [rocket_results[model_name]["f1"] for model_name in names_r]
ax.barh(names_r, f1s_r, color=sns.color_palette("rocket", len(names_r)))
ax.set_title("Rocket Status Prediction ‚Äì F1 Scores")
ax.set_xlabel("F1 (weighted)")
ax.set_xlim(0, 1)
for bar_pos, score in enumerate(f1s_r):
    ax.text(score + 0.01, bar_pos, f"{score:.4f}", va="center")
plt.tight_layout()
plt.savefig(PLOT_DIR / "rocket_status_comparison.png", dpi=150, bbox_inches="tight")
plt.show()
print("‚úÖ Rocket status comparison saved")

## 8 · Launch Clustering

In [None]:
# K-Means clustering of launches: scan k = 2..10, pick the k with the best
# silhouette score, refit at that k and plot the per-cluster feature means.
clust_features = [
    "launch_year", "launch_month", "launch_dow", "company_enc",
    "country_enc", "rocket_active", "company_launches", "rocket_launches",
    "company_success_rate", "mission_success",
]

df_clust = df.dropna(subset=clust_features).copy()
X_clust = df_clust[clust_features].values
scaler_cl = StandardScaler()
X_clust_s = scaler_cl.fit_transform(X_clust)

K_range = range(2, 11)
inertias = []
sils = []
for k in K_range:
    candidate = KMeans(n_clusters=k, random_state=SEED, n_init=10)
    assignments = candidate.fit_predict(X_clust_s)
    inertias.append(candidate.inertia_)
    # Silhouette is estimated on a seeded sample of rows.
    sils.append(
        silhouette_score(X_clust_s, assignments, sample_size=3000, random_state=SEED)
    )

# Diagnostics: inertia (elbow) on the left, silhouette on the right.
k_values = list(K_range)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
ax1.plot(k_values, inertias, "bo-")
ax1.set_title("Elbow Method")
ax1.set_xlabel("k")
ax1.set_ylabel("Inertia")
ax2.plot(k_values, sils, "rs-")
ax2.set_title("Silhouette Score")
ax2.set_xlabel("k")
ax2.set_ylabel("Score")
plt.tight_layout()
plt.savefig(PLOT_DIR / "elbow_silhouette.png", dpi=150, bbox_inches="tight")
plt.show()

# Highest silhouette wins; argmax returns the first maximum on ties.
best_k = k_values[np.argmax(sils)]
print(f"Best k={best_k}, silhouette={max(sils):.4f}")

km_final = KMeans(n_clusters=best_k, random_state=SEED, n_init=10)
df_clust["cluster"] = km_final.fit_predict(X_clust_s)

# Cluster profiles: mean of every (unscaled) feature per cluster.
cluster_profile = df_clust.groupby("cluster")[clust_features].mean()
print("\nCluster profiles:")
print(cluster_profile.round(2).to_string())

# Min-max normalise each feature across clusters so the bars share one axis;
# the small epsilon avoids dividing by zero when all clusters agree.
fig, ax = plt.subplots(figsize=(12, 6))
profile_min = cluster_profile.min()
profile_max = cluster_profile.max()
cluster_profile_norm = (cluster_profile - profile_min) / (profile_max - profile_min + 1e-9)
cluster_profile_norm.T.plot(kind="bar", ax=ax, colormap="viridis")
ax.set_title(f"Cluster Profiles (k={best_k})")
ax.set_ylabel("Normalized value")
ax.set_xlabel("Feature")
ax.legend(title="Cluster", bbox_to_anchor=(1.05, 1))
ax.tick_params(axis="x", rotation=45)
plt.tight_layout()
plt.savefig(PLOT_DIR / "clustering_results.png", dpi=150, bbox_inches="tight")
plt.show()
print("‚úÖ Clustering plots saved")

## 9 · Hyperparameter Tuning

In [None]:
# Tune two mission-success classifiers with 3-fold CV on the training split
# (weighted F1), then score the refitted best estimators on the test split
# and add them to clf_results.

# Exhaustive grid search - Random Forest
print("GridSearchCV on Random Forest...")
rf_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5],
}
gs_rf = GridSearchCV(
    RandomForestClassifier(random_state=SEED, n_jobs=-1),
    rf_grid,
    cv=3,
    scoring="f1_weighted",
    n_jobs=-1,
)
gs_rf.fit(X_train_cs, y_train_c)
print(f"  Best params: {gs_rf.best_params_}")
print(f"  Best CV F1:  {gs_rf.best_score_:.4f}")

# Randomised search (20 sampled combinations) - Gradient Boosting
print("\nRandomizedSearchCV on Gradient Boosting...")
gb_dist = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "subsample": [0.7, 0.8, 1.0],
}
rs_gb = RandomizedSearchCV(
    GradientBoostingClassifier(random_state=SEED),
    gb_dist,
    n_iter=20,
    cv=3,
    scoring="f1_weighted",
    random_state=SEED,
    n_jobs=-1,
)
rs_gb.fit(X_train_cs, y_train_c)
print(f"  Best params: {rs_gb.best_params_}")
print(f"  Best CV F1:  {rs_gb.best_score_:.4f}")

# Evaluate tuned models on the held-out test split.
tuned_models = {
    "Tuned RF (Grid)": gs_rf.best_estimator_,
    "Tuned GB (Random)": rs_gb.best_estimator_,
}
for label, model in tuned_models.items():
    y_pred = model.predict(X_test_cs)
    acc = accuracy_score(y_test_c, y_pred)
    f1 = f1_score(y_test_c, y_pred, average="weighted")
    clf_results[label] = {
        "accuracy": acc,
        "f1": f1,
        "model": model,
        "y_pred": y_pred,
    }
    print(f"  {label}: Acc={acc:.4f}  F1={f1:.4f}")

print("\n‚úÖ Hyperparameter tuning complete")

## 10 · Cross-Validation, Feature Importance, Confusion Matrices & Learning Curves

In [None]:
# Diagnostics for the mission-success task: 5-fold cross-validation of four
# models, feature importance of the best classifier, confusion matrices of
# the four best entries in clf_results, and learning curves for two models.
from sklearn.pipeline import make_pipeline

# -- 5-fold CV --
cv_models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=SEED),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=SEED, n_jobs=-1),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=200, random_state=SEED),
    "XGBoost": XGBClassifier(n_estimators=200, eval_metric="logloss", random_state=SEED, n_jobs=-1),
}

# Globally scaled copy of the full feature matrix. It is still used by the
# learning curves below, but NOT for cross-validation any more (see the loop).
scaler_cv = StandardScaler()
X_cvs = scaler_cv.fit_transform(X_clf)

cv_scores = {}
for name, model in cv_models.items():
    # Scale inside the pipeline so the scaler is re-fitted on each training
    # fold; fitting it once on all rows would let validation-fold statistics
    # influence the features the model is scored on.
    fold_model = make_pipeline(StandardScaler(), model)
    scores = cross_val_score(fold_model, X_clf, y_clf, cv=5, scoring="f1_weighted", n_jobs=-1)
    cv_scores[name] = scores
    print(f"{name}: mean F1={scores.mean():.4f} ¬± {scores.std():.4f}")

fig, ax = plt.subplots(figsize=(10, 5))
# Pass a real list (not a dict view) and set the tick labels afterwards, so
# the call does not depend on boxplot's label keyword, whose name differs
# between Matplotlib versions.
ax.boxplot(list(cv_scores.values()))
ax.set_xticklabels(list(cv_scores.keys()))
ax.set_title("5-Fold Cross-Validation F1 Scores (Mission Success)")
ax.set_ylabel("F1 (weighted)")
ax.tick_params(axis="x", rotation=20)
plt.tight_layout()
plt.savefig(PLOT_DIR / "cv_comparison.png", dpi=150, bbox_inches="tight")
plt.show()

# -- Feature importance: only for models that expose feature_importances_ --
best_clf_model = clf_results[best_clf_name]["model"]
if hasattr(best_clf_model, "feature_importances_"):
    imp = best_clf_model.feature_importances_
    idx = np.argsort(imp)  # ascending, so the largest bar is drawn on top
    fig, ax = plt.subplots(figsize=(10, 7))
    ax.barh(np.array(feature_cols)[idx], imp[idx], color="coral")
    ax.set_title(f"Feature Importance ‚Äì {best_clf_name}")
    ax.set_xlabel("Importance")
    plt.tight_layout()
    plt.savefig(PLOT_DIR / "feature_importance.png", dpi=150, bbox_inches="tight")
    plt.show()

# -- Confusion matrices (top 4 by weighted F1) --
top4 = sorted(clf_results, key=lambda k: clf_results[k]["f1"], reverse=True)[:4]
fig, axes = plt.subplots(1, 4, figsize=(20, 5))
for ax, name in zip(axes, top4):
    # Pin the label order so the matrix is always 2x2 and lines up with the
    # "Failure"/"Success" tick labels (0 = failure, 1 = success).
    cm = confusion_matrix(y_test_c, clf_results[name]["y_pred"], labels=[0, 1])
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax,
                xticklabels=["Failure", "Success"], yticklabels=["Failure", "Success"])
    ax.set_title(f"{name}\nF1={clf_results[name]['f1']:.4f}")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
plt.suptitle("Confusion Matrices ‚Äì Top 4 Models", fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig(PLOT_DIR / "confusion_matrices.png", dpi=150, bbox_inches="tight")
plt.show()

# -- Learning curves --
# Both models are tree ensembles, which should be insensitive to the global
# feature scaling, so the pre-scaled X_cvs is kept here.
learning_curve_models = [
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=SEED, n_jobs=-1)),
    ("Gradient Boosting", GradientBoostingClassifier(n_estimators=100, random_state=SEED)),
]
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, (name, model) in zip(axes, learning_curve_models):
    train_sizes, train_scores, val_scores = learning_curve(
        model, X_cvs, y_clf, cv=3, n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 8), scoring="f1_weighted"
    )
    # Average over the CV folds at every training-set size.
    ax.plot(train_sizes, train_scores.mean(axis=1), "o-", label="Train")
    ax.plot(train_sizes, val_scores.mean(axis=1), "s-", label="Validation")
    ax.set_title(f"Learning Curve ‚Äì {name}")
    ax.set_xlabel("Training Size")
    ax.set_ylabel("F1 (weighted)")
    ax.legend()
    ax.grid(True)
plt.tight_layout()
plt.savefig(PLOT_DIR / "learning_curves.png", dpi=150, bbox_inches="tight")
plt.show()
print("‚úÖ CV, feature importance, confusion matrices & learning curves saved")

## 11 · Voting & Stacking Ensembles

In [None]:
# Soft-voting and stacking ensembles over RF / GB / XGBoost, followed by the
# final rankings for the mission-success and rocket-status tasks.


def _base_estimators():
    """Return fresh, unfitted (name, estimator) pairs shared by both ensembles."""
    return [
        ("rf", RandomForestClassifier(n_estimators=200, random_state=SEED, n_jobs=-1)),
        ("gb", GradientBoostingClassifier(n_estimators=200, random_state=SEED)),
        ("xgb", XGBClassifier(n_estimators=200, eval_metric="logloss", random_state=SEED, n_jobs=-1)),
    ]


def _print_ranking(title, results):
    """Print `results` ordered by descending F1 and return the ordered items."""
    rule = "=" * 60
    print("\n" + rule)
    print(title)
    print(rule)
    ordered = sorted(results.items(), key=lambda item: item[1]["f1"], reverse=True)
    for i, (name, res) in enumerate(ordered, 1):
        print(f"  {i:>2}. {name:<25s} Acc={res['accuracy']:.4f}  F1={res['f1']:.4f}")
    return ordered


# Voting Classifier: averages the predicted class probabilities.
print("Training Voting Classifier...")
voting = VotingClassifier(estimators=_base_estimators(), voting="soft")
voting.fit(X_train_cs, y_train_c)
y_pred_v = voting.predict(X_test_cs)
acc_v = accuracy_score(y_test_c, y_pred_v)
f1_v = f1_score(y_test_c, y_pred_v, average="weighted")
clf_results["Voting Ensemble"] = {
    "accuracy": acc_v,
    "f1": f1_v,
    "model": voting,
    "y_pred": y_pred_v,
}
print(f"  Voting: Acc={acc_v:.4f}  F1={f1_v:.4f}")

# Stacking Classifier: logistic regression on top of the base estimators.
print("Training Stacking Classifier...")
stacking = StackingClassifier(
    estimators=_base_estimators(),
    final_estimator=LogisticRegression(max_iter=1000, random_state=SEED),
    cv=3,
    n_jobs=-1,
)
stacking.fit(X_train_cs, y_train_c)
y_pred_s = stacking.predict(X_test_cs)
acc_s = accuracy_score(y_test_c, y_pred_s)
f1_s = f1_score(y_test_c, y_pred_s, average="weighted")
clf_results["Stacking Ensemble"] = {
    "accuracy": acc_s,
    "f1": f1_s,
    "model": stacking,
    "y_pred": y_pred_s,
}
print(f"  Stacking: Acc={acc_s:.4f}  F1={f1_s:.4f}")

# Final ranking across every classifier recorded so far.
ranking = _print_ranking(
    "FINAL MODEL RANKING ‚Äì Mission Success Classification", clf_results
)
best_overall = ranking[0][0]
print(f"\nüèÜ Best overall: {best_overall} (F1={clf_results[best_overall]['f1']:.4f})")

r_ranking = _print_ranking("ROCKET STATUS PREDICTION RANKING", rocket_results)

## 12 · Generate HTML Report

In [None]:
# Render a self-contained HTML report: every PNG in PLOT_DIR is embedded as
# base64 and the three result tables are filled in from the result dicts.
from types import SimpleNamespace


def img_to_base64(path):
    """Return the contents of the file at *path* as a base64-encoded string."""
    return base64.b64encode(pathlib.Path(path).read_bytes()).decode("ascii")


def _ranked_metrics(results, sort_key):
    """Return [(name, metrics)] sorted by descending ``sort_key``.

    The fitted model and the prediction array are dropped so that only the
    scalar metrics are handed to the template; metrics are exposed as
    attributes (``res.f1``) through a SimpleNamespace.
    """
    ordered = sorted(results.items(), key=lambda item: item[1][sort_key], reverse=True)
    return [
        (
            name,
            SimpleNamespace(
                **{k: v for k, v in res.items() if k not in ("model", "y_pred")}
            ),
        )
        for name, res in ordered
    ]


# Plot file stem -> base64 payload; the template looks images up by stem.
images = {}
for p in sorted(PLOT_DIR.glob("*.png")):
    images[p.stem] = img_to_base64(p)

# Jinja2 template. The two feature-importance images are optional because
# those plots are only produced when the best model exposes importances.
TEMPLATE = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>🚀 Space Missions – ML Report</title>
<style>
:root{--bg:#0f172a;--card:#1e293b;--accent:#06b6d4;--text:#e2e8f0;--muted:#94a3b8}
*{margin:0;padding:0;box-sizing:border-box}
body{background:var(--bg);color:var(--text);font-family:'Segoe UI',system-ui,sans-serif;padding:2rem}
h1{text-align:center;font-size:2.2rem;margin-bottom:.4rem;color:var(--accent)}
.subtitle{text-align:center;color:var(--muted);margin-bottom:2rem}
.card{background:var(--card);border-radius:12px;padding:1.5rem;margin-bottom:1.5rem;box-shadow:0 4px 24px #0004}
.card h2{color:var(--accent);margin-bottom:1rem;font-size:1.3rem}
table{width:100%;border-collapse:collapse;margin:1rem 0}
th,td{padding:.55rem .8rem;text-align:left;border-bottom:1px solid #334155}
th{color:var(--accent);font-size:.85rem;text-transform:uppercase}
tr:hover{background:#ffffff08}
.best{background:#06b6d415;font-weight:700}
img{width:100%;border-radius:8px;margin:.8rem 0}
.grid2{display:grid;grid-template-columns:1fr 1fr;gap:1.2rem}
@media(max-width:800px){.grid2{grid-template-columns:1fr}}
.tag{display:inline-block;padding:2px 10px;border-radius:6px;font-size:.82rem;background:#06b6d422;color:var(--accent);margin:2px}
.stat-row{display:flex;gap:1.5rem;flex-wrap:wrap;margin:.8rem 0}
.stat-box{background:#06b6d410;border:1px solid #06b6d433;border-radius:8px;padding:1rem 1.5rem;text-align:center;flex:1;min-width:150px}
.stat-box .val{font-size:1.6rem;font-weight:700;color:var(--accent)}
.stat-box .lbl{font-size:.8rem;color:var(--muted);margin-top:.3rem}
</style>
</head>
<body>
<h1>🚀 Space Missions – ML Report</h1>
<p class="subtitle">4,630 launches · 62 companies · 1957–2022</p>

<div class="card">
<h2>📊 Exploratory Data Analysis</h2>
<img src="data:image/png;base64,{{images.eda_overview}}" alt="EDA Overview">
<div class="grid2">
<img src="data:image/png;base64,{{images.success_by_company}}" alt="Success by Company">
<img src="data:image/png;base64,{{images.correlation_heatmap}}" alt="Correlation Heatmap">
</div>
</div>

<div class="card">
<h2>🎯 Task 1 – Mission Success Classification</h2>
<table>
<tr><th>#</th><th>Model</th><th>Accuracy</th><th>F1 (weighted)</th></tr>
{% for name, res in clf_ranking %}
<tr{% if loop.first %} class="best"{% endif %}><td>{{loop.index}}</td><td>{{name}}</td><td>{{"{:.4f}".format(res.accuracy)}}</td><td>{{"{:.4f}".format(res.f1)}}</td></tr>
{% endfor %}
</table>
<img src="data:image/png;base64,{{images.model_comparison}}" alt="Model Comparison">
</div>

<div class="card">
<h2>💰 Task 2 – Mission Price Regression</h2>
<table>
<tr><th>#</th><th>Model</th><th>R² (log)</th><th>R² (orig)</th><th>RMSE (log)</th></tr>
{% for name, res in reg_ranking %}
<tr{% if loop.first %} class="best"{% endif %}><td>{{loop.index}}</td><td>{{name}}</td><td>{{"{:.4f}".format(res.r2)}}</td><td>{{"{:.4f}".format(res.r2_orig)}}</td><td>{{"{:.3f}".format(res.rmse)}}</td></tr>
{% endfor %}
</table>
<div class="grid2">
{% if images.price_feature_importance %}<img src="data:image/png;base64,{{images.price_feature_importance}}" alt="Price Feature Importance">{% endif %}
<img src="data:image/png;base64,{{images.price_regression_results}}" alt="Price Regression Results">
</div>
</div>

<div class="card">
<h2>🔧 Task 3 – Rocket Status Prediction (Active / Retired)</h2>
<table>
<tr><th>#</th><th>Model</th><th>Accuracy</th><th>F1 (weighted)</th></tr>
{% for name, res in rocket_ranking %}
<tr{% if loop.first %} class="best"{% endif %}><td>{{loop.index}}</td><td>{{name}}</td><td>{{"{:.4f}".format(res.accuracy)}}</td><td>{{"{:.4f}".format(res.f1)}}</td></tr>
{% endfor %}
</table>
<img src="data:image/png;base64,{{images.rocket_status_comparison}}" alt="Rocket Status Comparison">
</div>

<div class="card">
<h2>🔬 Task 4 – Launch Clustering</h2>
<p>Best k={{best_k}}, Silhouette={{"{:.4f}".format(best_sil)}}</p>
<div class="grid2">
<img src="data:image/png;base64,{{images.elbow_silhouette}}" alt="Elbow & Silhouette">
<img src="data:image/png;base64,{{images.clustering_results}}" alt="Clustering Results">
</div>
</div>

<div class="card">
<h2>⚙️ Hyperparameter Tuning & Cross-Validation</h2>
<div class="grid2">
<img src="data:image/png;base64,{{images.cv_comparison}}" alt="CV Comparison">
{% if images.feature_importance %}<img src="data:image/png;base64,{{images.feature_importance}}" alt="Feature Importance">{% endif %}
</div>
<img src="data:image/png;base64,{{images.confusion_matrices}}" alt="Confusion Matrices">
<img src="data:image/png;base64,{{images.learning_curves}}" alt="Learning Curves">
</div>
</body>
</html>
"""

# Classification tables rank by weighted F1, the regression table by R² (log).
clf_ranking = _ranked_metrics(clf_results, "f1")
reg_ranking = _ranked_metrics(reg_results, "r2")
rocket_ranking = _ranked_metrics(rocket_results, "f1")

html = jinja2.Template(TEMPLATE).render(
    images=images,
    clf_ranking=clf_ranking,
    reg_ranking=reg_ranking,
    rocket_ranking=rocket_ranking,
    best_k=best_k,
    best_sil=max(sils),
)

out_path = pathlib.Path("outputs/space_ml_report.html")
# The page declares charset UTF-8 and contains non-ASCII characters, so the
# encoding must be explicit rather than the platform default.
out_path.write_text(html, encoding="utf-8")
print(f"✅ HTML Report generated: {out_path}")
print(f"   File size: {out_path.stat().st_size / 1024:.1f} KB")
print(f"   Embedded images: {len(images)}")