In [70]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef, confusion_matrix

In [71]:
pip install xgboost



In [72]:
asteroid_data_df = pd.read_csv(r'/content/model/Asteroid_Updated.csv',low_memory=False)
asteroid_data_df.isnull().sum()

Unnamed: 0,0
name,817747
a,2
e,0
i,0
om,0
w,0
q,0
ad,6
per_y,1
data_arc,15474


In [73]:
orginal_data = asteroid_data_df.copy()
missing_percent = (asteroid_data_df.isnull().sum() / len(asteroid_data_df)) * 100
missing_percent.sort_values(ascending=False)

Unnamed: 0,0
IR,99.999881
GM,99.998333
extent,99.997856
G,99.985829
UB,99.883413
spec_T,99.883294
BV,99.878411
spec_B,99.801599
rot_per,97.761619
name,97.38399


In [74]:
cols_to_drop = missing_percent[missing_percent >= 50].index.tolist()
cols_to_drop

['name',
 'diameter',
 'extent',
 'albedo',
 'rot_per',
 'GM',
 'BV',
 'UB',
 'IR',
 'spec_B',
 'spec_T',
 'G']

In [75]:
asteroid_data_df = asteroid_data_df.drop(columns=cols_to_drop)

In [76]:
asteroid_data_df.isnull().sum().sort_values(ascending=False).head(10)


Unnamed: 0,0
pha,16442
moid,16442
data_arc,15474
H,2689
condition_code,867
ma,8
ad,6
per,6
neo,6
a,2


In [77]:
asteroid_data_df.shape

(839714, 19)

In [78]:
asteroid_data_df.columns

Index(['a', 'e', 'i', 'om', 'w', 'q', 'ad', 'per_y', 'data_arc',
       'condition_code', 'n_obs_used', 'H', 'neo', 'pha', 'moid', 'class', 'n',
       'per', 'ma'],
      dtype='object')

In [79]:
asteroid_data_df.head(10)

Unnamed: 0,a,e,i,om,w,q,ad,per_y,data_arc,condition_code,n_obs_used,H,neo,pha,moid,class,n,per,ma
0,2.769165,0.076009,10.594067,80.305532,73.597694,2.558684,2.979647,4.608202,8822.0,0,1002,3.34,N,N,1.59478,MBA,0.213885,1683.145708,77.372096
1,2.772466,0.230337,34.836234,173.080063,310.048857,2.133865,3.411067,4.616444,72318.0,0,8490,4.13,N,N,1.23324,MBA,0.213503,1686.155999,59.699133
2,2.66915,0.256942,12.988919,169.85276,248.138626,1.983332,3.354967,4.360814,72684.0,0,7104,5.33,N,N,1.03454,MBA,0.226019,1592.787285,34.925016
3,2.361418,0.088721,7.141771,103.810804,150.728541,2.151909,2.570926,3.628837,24288.0,0,9325,3.2,N,N,1.13948,MBA,0.271609,1325.432765,95.861936
4,2.574249,0.191095,5.366988,141.576605,358.687607,2.082324,3.066174,4.130323,63507.0,0,2916,6.85,N,N,1.09589,MBA,0.238632,1508.600458,282.366289
5,2.42516,0.203007,14.737901,138.640203,239.80749,1.932835,2.917485,3.776755,62329.0,0,6034,5.71,N,N,0.973965,MBA,0.260972,1379.459705,86.197923
6,2.385334,0.231206,5.523651,259.563231,145.265106,1.833831,2.936837,3.684105,62452.0,0,5206,5.51,N,N,0.8461,MBA,0.267535,1345.619196,140.419656
7,2.201764,0.156499,5.886955,110.88933,285.287462,1.85719,2.546339,3.267115,62655.0,0,2744,6.49,N,N,0.874176,MBA,0.301681,1193.313717,194.882895
8,2.385637,0.123114,5.576816,68.908577,6.417369,2.091931,2.679342,3.684806,61821.0,0,2649,6.28,N,N,1.10691,MBA,0.267484,1345.875362,276.861623
9,3.141539,0.112461,3.83156,283.202167,312.315206,2.78824,3.494839,5.568291,62175.0,0,3409,5.43,N,N,1.77839,MBA,0.177007,2033.818284,152.184851


In [80]:
#Encoding
asteroid_data_df["pha"] = asteroid_data_df["pha"].map({"Y":1,"N":0})
asteroid_data_df["neo"] = asteroid_data_df["neo"].map({"Y":1, "N":0})


In [81]:
asteroid_data_df = pd.get_dummies(asteroid_data_df, columns=["class"], drop_first=True)

In [82]:
asteroid_data_df.dtypes

Unnamed: 0,0
a,float64
e,float64
i,float64
om,float64
w,float64
q,float64
ad,float64
per_y,float64
data_arc,float64
condition_code,object


In [83]:
asteroid_data_df = asteroid_data_df.dropna(subset=["pha","neo"])
asteroid_data_df["pha"] = asteroid_data_df["pha"].astype(int)
asteroid_data_df["neo"] = asteroid_data_df["neo"].astype(int)

asteroid_data_df.head(10)

Unnamed: 0,a,e,i,om,w,q,ad,per_y,data_arc,condition_code,...,class_CEN,class_HYA,class_IEO,class_IMB,class_MBA,class_MCA,class_OMB,class_PAA,class_TJN,class_TNO
0,2.769165,0.076009,10.594067,80.305532,73.597694,2.558684,2.979647,4.608202,8822.0,0,...,False,False,False,False,True,False,False,False,False,False
1,2.772466,0.230337,34.836234,173.080063,310.048857,2.133865,3.411067,4.616444,72318.0,0,...,False,False,False,False,True,False,False,False,False,False
2,2.66915,0.256942,12.988919,169.85276,248.138626,1.983332,3.354967,4.360814,72684.0,0,...,False,False,False,False,True,False,False,False,False,False
3,2.361418,0.088721,7.141771,103.810804,150.728541,2.151909,2.570926,3.628837,24288.0,0,...,False,False,False,False,True,False,False,False,False,False
4,2.574249,0.191095,5.366988,141.576605,358.687607,2.082324,3.066174,4.130323,63507.0,0,...,False,False,False,False,True,False,False,False,False,False
5,2.42516,0.203007,14.737901,138.640203,239.80749,1.932835,2.917485,3.776755,62329.0,0,...,False,False,False,False,True,False,False,False,False,False
6,2.385334,0.231206,5.523651,259.563231,145.265106,1.833831,2.936837,3.684105,62452.0,0,...,False,False,False,False,True,False,False,False,False,False
7,2.201764,0.156499,5.886955,110.88933,285.287462,1.85719,2.546339,3.267115,62655.0,0,...,False,False,False,False,True,False,False,False,False,False
8,2.385637,0.123114,5.576816,68.908577,6.417369,2.091931,2.679342,3.684806,61821.0,0,...,False,False,False,False,True,False,False,False,False,False
9,3.141539,0.112461,3.83156,283.202167,312.315206,2.78824,3.494839,5.568291,62175.0,0,...,False,False,False,False,True,False,False,False,False,False


In [84]:
listColumnsNumeric = ["a", "ad", "per_y", "n", "per", "ma", "H"]
for i in listColumnsNumeric:
    asteroid_data_df[i].fillna(asteroid_data_df[i].mean(), inplace=True)
asteroid_data_df.dropna(inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  asteroid_data_df[i].fillna(asteroid_data_df[i].mean(), inplace=True)


In [85]:
asteroid_data_df.shape

(822849, 31)

In [86]:
x = asteroid_data_df.drop("pha", axis=1)
y = asteroid_data_df["pha"]
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.20, random_state=42, stratify=y)

In [87]:
bool_cols = xTrain.select_dtypes(include='bool').columns
xTrain[bool_cols] = xTrain[bool_cols].astype(int)
xTest[bool_cols] = xTest[bool_cols].astype(int)



In [88]:
xTrain.dtypes


Unnamed: 0,0
a,float64
e,float64
i,float64
om,float64
w,float64
q,float64
ad,float64
per_y,float64
data_arc,float64
condition_code,object


In [89]:
scaler = StandardScaler()
xTrain = scaler.fit_transform(xTrain)
xTest = scaler.transform(xTest)

In [90]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}
xTrain = xTrain.astype('float32')
xTest = xTest.astype('float32')

X_train_small, _, y_train_small, _ = train_test_split(
    xTrain, yTrain,
    train_size=0.6,
    stratify=yTrain,
    random_state=42
)

X_test_small, _, y_test_small, _ = train_test_split(
    xTest, yTest,
    train_size=0.6,
    stratify=yTest,
    random_state=42
)


# New Section

In [91]:
results = []

for name, model in models.items():
    print(f"Training {name}...")

    model.fit(xTrain, yTrain)

    y_pred = model.predict(xTest)

    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(xTest)[:, 1]
    else:
        y_proba = y_pred

    acc = accuracy_score(yTest, y_pred)
    auc = roc_auc_score(yTest, y_proba)
    prec = precision_score(yTest, y_pred, zero_division=0)
    rec = recall_score(yTest, y_pred, zero_division=0)
    f1 = f1_score(yTest, y_pred, zero_division=0)
    mcc = matthews_corrcoef(yTest, y_pred)

    results.append([name, acc, auc, prec, rec, f1, mcc])

    joblib.dump(model, f"{name.replace(' ', '_').lower()}.pkl")

joblib.dump(scaler, "scaler.pkl")

metrics_df = pd.DataFrame(results, columns=["Model", "Accuracy", "AUC", "Precision", "Recall", "F1", "MCC"])
metrics_df = metrics_df.round(4)

metrics_df


Training Logistic Regression...
Training Decision Tree...
Training KNN...
Training Naive Bayes...
Training Random Forest...
Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1,MCC
0,Logistic Regression,0.9975,0.9977,0.4615,0.2382,0.3142,0.3304
1,Decision Tree,0.9999,0.9838,0.9824,0.9677,0.975,0.975
2,KNN,0.9978,0.9328,0.5789,0.4094,0.4797,0.4858
3,Naive Bayes,0.9794,0.9901,0.1061,1.0,0.1918,0.3223
4,Random Forest,0.9999,1.0,0.9898,0.9653,0.9774,0.9774
5,XGBoost,0.9996,0.975,0.9278,0.8933,0.9102,0.9102


In [94]:
observations = [
    ["Logistic Regression", "High accuracy but low recall; misses many hazardous asteroids"],
    ["Decision Tree", "Excellent balance; almost perfect detection"],
    ["KNN", "Moderate performance; struggles with rare class"],
    ["Naive Bayes", "Good performance; may produce some false positives due to independence assumption"],
    ["Random Forest", "Best overall; strong metrics across the board with high stability"],
    ["XGBoost", "Very strong performance; close to Random Forest with slightly different trade-off"]
]

observations_df = pd.DataFrame(observations, columns=["Model", "Observation"])
observations_df


Unnamed: 0,Model,Observation
0,Logistic Regression,High accuracy but low recall; misses many haza...
1,Decision Tree,Excellent balance; almost perfect detection
2,KNN,Moderate performance; struggles with rare class
3,Naive Bayes,Good performance; may produce some false posit...
4,Random Forest,Best overall; strong metrics across the board ...
5,XGBoost,Very strong performance; close to Random Fores...


In [93]:
train_columns = x.columns   # after dropping target pha
joblib.dump(train_columns.tolist(), "model/train_columns.pkl")

['model/train_columns.pkl']