In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import xgboost as xgb


In [None]:
import os
import pandas as pd

input_dir = "/kaggle/input/wormhole-stability"
print("Available files:", os.listdir(input_dir))

train_df = pd.read_csv(f"{input_dir}/train.csv")
test_df = pd.read_csv(f"{input_dir}/test.csv")
sample_submission = pd.read_csv(f"{input_dir}/sample_submission.csv")

print(train_df.head())

# Display dataset information
print(train_df.info())
print(test_df.info())

# Check missing values
print(train_df.isnull().sum().sum(), "missing values in train")
print(test_df.isnull().sum().sum(), "missing values in test")


Available files: ['sample_submission.csv', 'train.csv', 'test.csv']
   ID  Gravitational Flux  Dark Matter Density  Quantum Variance  \
0   0            0.411568           -13.761699          0.962127   
1   1            0.372928           -66.702856         -3.110865   
2   2            0.475778          -184.794916         -1.783947   
3   3           -0.569379          -260.749026         -1.582171   
4   4           -0.068484           572.566093          2.819685   

   Space-Time Curvature  Neutrino Activity  Hawking Radiation  \
0              0.018980           4.670974          -0.041122   
1              0.071914          19.835546          -0.214718   
2             -0.082711          28.923294          -0.239985   
3              0.062023           2.577614          -0.032810   
4             -0.159080         -23.035645           0.077040   

   Magnetar Field Strength  Cosmic Drift  Event Horizon Stability  ...  \
0                -0.305560      0.084127                -0

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer


train_cleaned = train_df.copy()
test_cleaned = test_df.copy()

# Identify feature columns (excluding 'ID' and 'Wormhole Stability')
feature_cols = [col for col in train_cleaned.columns if col not in ["ID", "Wormhole Stability"]]

# Fill missing values with mean
imputer = SimpleImputer(strategy="mean")
train_cleaned[feature_cols] = imputer.fit_transform(train_cleaned[feature_cols])
test_cleaned[feature_cols] = imputer.transform(test_cleaned[feature_cols])

print("Missing values handled successfully")


Missing values handled successfully


In [None]:
# Convert target variable to numeric
label_encoder = LabelEncoder()
train_cleaned["Wormhole Stability"] = label_encoder.fit_transform(train_cleaned["Wormhole Stability"])

print("Target variable encoded")


Target variable encoded


In [None]:
X = train_cleaned[feature_cols]
y = train_cleaned["Wormhole Stability"]
from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print("Data balanced successfully")


Data balanced successfully


In [None]:
# Standardize features
scaler = StandardScaler()
X_resampled = scaler.fit_transform(X_resampled)
test_cleaned[feature_cols] = scaler.transform(test_cleaned[feature_cols])

print("Feature scaling done")


Feature scaling done


In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

log_reg = LogisticRegression()
rf= RandomForestClassifier(n_estimators=100, max_depth=10,
                                  min_samples_split=5, min_samples_leaf=2,
                                  random_state=42)
xgb_model = xgb.XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric="logloss")

log_reg.fit(X_train, y_train)
rf.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)

log_reg_pred = log_reg.predict(X_val)
rf_pred = rf.predict(X_val)
xgb_pred = xgb_model.predict(X_val)

# Compute F1 Scores
log_reg_f1 = f1_score(y_val, log_reg_pred)
rf_f1 = f1_score(y_val, rf_pred)
xgb_f1 = f1_score(y_val, xgb_pred)

# Print F1 Scores
print(f"F1 Score - Logistic Regression: {log_reg_f1}")
print(f"F1 Score - Random Forest: {rf_f1}")


F1 Score - Logistic Regression: 0.7730337078651686
F1 Score - Random Forest: 0.8925531914893616


In [None]:
from sklearn.metrics import accuracy_score

train_preds = xgb_model.predict(X_train)

# Calculate training accuracy
train_accuracy = accuracy_score(y_train, train_preds)

print(f"XGBoost Training Accuracy: {train_accuracy:.4f}")


XGBoost Training Accuracy: 1.0000


In [None]:
from sklearn.metrics import accuracy_score

val_preds = xgb_model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_preds)

print(f"XGBoost Validation Accuracy: {val_accuracy:.4f}")


XGBoost Validation Accuracy: 0.9467


In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score

cat_model = CatBoostClassifier(iterations=1000, learning_rate=0.05, depth=6,
                               l2_leaf_reg=10, random_seed=42, verbose=200)

cat_model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50, verbose=200)

val_preds_cat = cat_model.predict(X_val)
cat_f1 = f1_score(y_val, val_preds_cat, average="binary")

print(f"CatBoost Validation F1 Score: {cat_f1:.4f}")


0:	learn: 0.6662702	test: 0.6684321	best: 0.6684321 (0)	total: 97.4ms	remaining: 1m 37s
200:	learn: 0.2246878	test: 0.2792817	best: 0.2792817 (200)	total: 1.61s	remaining: 6.39s
400:	learn: 0.1266982	test: 0.2083993	best: 0.2083993 (400)	total: 3.38s	remaining: 5.04s
600:	learn: 0.0777519	test: 0.1738090	best: 0.1738090 (600)	total: 5.13s	remaining: 3.4s
800:	learn: 0.0509133	test: 0.1530462	best: 0.1530462 (800)	total: 6.63s	remaining: 1.65s
999:	learn: 0.0351601	test: 0.1400109	best: 0.1400109 (999)	total: 8.14s	remaining: 0us

bestTest = 0.1400109033
bestIteration = 999

CatBoost Validation F1 Score: 0.9480


# Best submission - CatBoost

In [None]:
import os
from catboost import CatBoostClassifier
import pandas as pd

input_dir = "/kaggle/input/wormhole-stability"

train_df = pd.read_csv(f"{input_dir}/train.csv")
test_df = pd.read_csv(f"{input_dir}/test.csv")

train_df["Wormhole Stability"] = train_df["Wormhole Stability"].map({"Success": 1, "Fail": 0})

X_train = train_df.drop(columns=["ID", "Wormhole Stability"])
y_train = train_df["Wormhole Stability"]

cat_model = CatBoostClassifier(
    iterations=1500,
    learning_rate=0.04, 
    depth=7,
    l2_leaf_reg=6,
    random_seed=42,
    loss_function='Logloss',
    eval_metric='F1',
    auto_class_weights="SqrtBalanced",
    verbose=False
)

cat_model.fit(X_train, y_train, early_stopping_rounds=50, verbose=False)

# Ensure test features match train features
test_features = test_df.drop(columns=["ID"], errors='ignore')
test_features = test_features[X_train.columns] 

# Predict probabilities instead of direct class labels
probs = cat_model.predict_proba(test_features)[:, 1]

# Adjust threshold for better balance 
threshold = 0.35 
test_predictions = ["Success" if p > threshold else "Fail" for p in probs]

submission = pd.DataFrame({
    "ID": test_df["ID"],
    "Wormhole Stability": test_predictions
})

submission.to_csv("submission3.csv", index=False)
print("Submission file 'submission3.csv' created successfully")

print(submission["Wormhole Stability"].value_counts(normalize=True))


Submission file 'submission3.csv' created successfully
Wormhole Stability
Fail       0.762308
Success    0.237692
Name: proportion, dtype: float64


In [None]:
print(train_df["Wormhole Stability"].value_counts(normalize=True))


Wormhole Stability
0    0.884615
1    0.115385
Name: proportion, dtype: float64
