In [1]:
import polars as pl
import json
import numpy as np
pl.Config.set_fmt_float("mixed")

from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score
from sklearn.model_selection import cross_val_score

In [None]:
with open('data/raw/astroid_2010-01-01_2010-01-08.json', 'r') as f:
    data = json.load(f)

with open('data/raw/astroid_2010-01-08_2010-01-15.json', 'r') as af:
    data_2 = json.load(af)

In [3]:
def format_data(data):
    data_list = []
    for date in data['near_earth_objects']:
        result = {'date': date}
        for idx, obj in enumerate(data['near_earth_objects'][date]):
            new_result = result | {
                # General stuff about astroid
                'obj_that_day': idx,
                'id': str(obj['id']),
                'name': obj['name'],
                'absolute_magniutude_h': obj['absolute_magnitude_h'],
                # Estimated Diameter
                'estimated_diameter_min_km': obj['estimated_diameter']['kilometers']['estimated_diameter_min'],
                'estimated_diameter_max_km': obj['estimated_diameter']['kilometers']['estimated_diameter_max'],
                'estimated_diameter_min_m': obj['estimated_diameter']['meters']['estimated_diameter_min'],
                'estimated_diameter_max_m': obj['estimated_diameter']['meters']['estimated_diameter_max'],
                'estimated_diameter_min_miles': obj['estimated_diameter']['miles']['estimated_diameter_min'],
                'estimated_diameter_max_miles': obj['estimated_diameter']['miles']['estimated_diameter_max'],
                'estimated_diameter_min_feet': obj['estimated_diameter']['feet']['estimated_diameter_min'],
                'estimated_diameter_max_feet': obj['estimated_diameter']['feet']['estimated_diameter_max'],
                # Potentially hazardous, Sentry object refers to if the astroid is tracked by nasa's sentry system 
                'is_potentially_hazardous': obj['is_potentially_hazardous_asteroid'],
                'is_sentry_object': obj['is_sentry_object']
            }
            for val in obj['close_approach_data']:
                final_result = new_result | {
                    # Close approaching dates
                    'close_approach_date': val['close_approach_date_full'],
                    'epoch_date_close_approach': val['epoch_date_close_approach'],
                    # Velocity values
                    'relative_velocity_km/sec': float(val['relative_velocity']['kilometers_per_second']),
                    'relative_velocity_km/hr': float(val['relative_velocity']['kilometers_per_hour']),
                    'relative_velocity_mph': float(val['relative_velocity']['miles_per_hour']),
                    # Miss distance
                    'miss_distance_astronomical': float(val['miss_distance']['astronomical']),
                    'miss_distance_lunar': float(val['miss_distance']['lunar']),
                    'miss_distance_kilometers': float(val['miss_distance']['kilometers']),
                    'miss_distance_miles': float(val['miss_distance']['miles']),
                    # Orbiting body 
                    'oribiting_body': val['orbiting_body']
                }

                data_list.append(final_result)
    return data_list

In [4]:
df = pl.DataFrame(format_data(data))

In [5]:
df.columns

['date',
 'obj_that_day',
 'id',
 'name',
 'absolute_magniutude_h',
 'estimated_diameter_min_km',
 'estimated_diameter_max_km',
 'estimated_diameter_min_m',
 'estimated_diameter_max_m',
 'estimated_diameter_min_miles',
 'estimated_diameter_max_miles',
 'estimated_diameter_min_feet',
 'estimated_diameter_max_feet',
 'is_potentially_hazardous',
 'is_sentry_object',
 'close_approach_date',
 'epoch_date_close_approach',
 'relative_velocity_km/sec',
 'relative_velocity_km/hr',
 'relative_velocity_mph',
 'miss_distance_astronomical',
 'miss_distance_lunar',
 'miss_distance_kilometers',
 'miss_distance_miles',
 'oribiting_body']

In [None]:
# Feature engineering
def feature_engineering(data: pl.DataFrame) -> pl.DataFrame:
    

In [7]:
new_data = feature_engineering(data=df)

In [8]:
new_data.head()

date,obj_that_day,id,name,absolute_magniutude_h,estimated_diameter_min_km,estimated_diameter_max_km,estimated_diameter_min_m,estimated_diameter_max_m,estimated_diameter_min_miles,estimated_diameter_max_miles,estimated_diameter_min_feet,estimated_diameter_max_feet,is_potentially_hazardous,is_sentry_object,close_approach_date,epoch_date_close_approach,relative_velocity_km/sec,relative_velocity_km/hr,relative_velocity_mph,miss_distance_astronomical,miss_distance_lunar,miss_distance_kilometers,miss_distance_miles,oribiting_body,avg_diameter_km,diameter_uncertainty_km,estimated_volume,cross_section_area_km2,diameter_uncertainty_ratio,size_category,kenetic_energy,momentum,velocity_per_au,velocity_distance_ratio,velocity_category,lunar_distance_ratio,earth_radii_distance,close_approach_score,impact_potential,destruction_potential,hazard_index,proximity_level,approach_datetime,approach_year,approach_month,approach_day,approach_hour,day_of_week,day_of_year,month_sin,month_cos,hour_sin,hour_cos,brightness_size_ratio,apparent_density_inverse,brightness_category,size_velocity_product,size_squared_velocity,escape_velocity_ratio,threat_score,size_percentile,velocity_percentile,distance_percentile,size_zscore,velocity_zscore,distance_zscore,log_diameter,log_velocity,log_distance
str,i64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,bool,bool,str,i64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,str,datetime[ms],i32,i8,i8,i8,i8,i16,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""2020-01-01""",0,"""3564720""","""(2011 HS60)""",21.34,0.143402,0.320656,143.401923,320.656449,0.089106,0.199247,470.478767,1052.022504,False,False,"""2020-Jan-01 21:59""",1577915940000,17.774433,63987.957021,39759.628072,0.198878,77.363533,29752000.0,18487000.0,"""Earth""",0.232029,0.177255,0.006541,0.042284,0.763932,"""medium""",2.066417,0.116258,89.373558,0.597425,"""medium""",0.000201,4669.866872,5.028209,20.737274,10.390376,5.717e-07,"""moderate""",2020-01-01 21:59:00,2020,1,1,21,3,1,0.5,0.866025,-0.707107,0.707107,91.971188,152.888045,"""bright""",4.124187,0.956932,1.587003,20.633524,0.865079,0.793651,0.547619,0.493881,0.669826,-0.049103,0.208663,2.932496,17.208398
"""2020-01-01""",1,"""3591759""","""(2011 YE40)""",25.2,0.024241,0.054205,24.241248,54.205079,0.015063,0.033681,79.531656,177.83819,False,False,"""2020-Jan-01 11:55""",1577879700000,12.780287,46009.033071,28588.22391,0.061832,24.052806,9250000.0,5747700.0,"""Earth""",0.039223,0.029964,3.2e-05,0.001208,0.763932,"""tiny""",0.005161,0.000404,206.692375,1.381653,"""medium""",6.3e-05,1451.890804,16.172749,8.107129,0.083463,2.7166e-08,"""close""",2020-01-01 11:55:00,2020,1,1,11,3,1,0.5,0.866025,0.258819,-0.965926,642.477502,31649.986408,"""very_dim""",0.501283,0.019662,1.141097,7.978101,0.357143,0.547619,0.214286,-0.478038,-0.023414,-0.986705,0.038473,2.623239,16.040134
"""2020-01-01""",2,"""3630817""","""(2013 EC20)""",29.0,0.004213,0.00942,4.212646,9.419763,0.002618,0.005853,13.821018,30.904735,False,True,"""2020-Jan-01 03:23""",1577848980000,2.793701,10057.324955,6249.230609,0.162019,63.025272,24238000.0,15061000.0,"""Earth""",0.006816,0.005207,1.6582e-07,3.6e-05,0.763932,"""tiny""",1e-06,4.6324e-07,17.243081,0.115263,"""slow""",0.000164,3804.37159,6.172127,0.117532,8e-06,1.4961e-11,"""moderate""",2020-01-01 03:23:00,2020,1,1,3,3,1,0.5,0.866025,0.707107,0.707107,4254.567124,6030800.0,"""very_dim""",0.019042,0.00013,0.249438,0.116811,0.02381,0.02381,0.468254,-0.641399,-1.409659,-0.301277,0.006793,1.333342,17.003418
"""2020-01-01""",3,"""3747497""","""(2016 EF195)""",25.5,0.021113,0.047211,21.113244,47.21065,0.013119,0.029335,69.269177,154.890589,False,False,"""2020-Jan-01 08:44""",1577868240000,17.548446,63174.405279,39254.118658,0.276326,107.490916,41338000.0,25686000.0,"""Earth""",0.034162,0.026097,2.1e-05,0.000917,0.763932,"""tiny""",0.006428,0.000366,63.506254,0.424513,"""medium""",0.00028,6488.435131,3.61891,2.169497,0.023264,8.6939e-09,"""far""",2020-01-01 08:44:00,2020,1,1,8,3,1,0.5,0.866025,0.866025,-0.5,746.44457,47904.192775,"""very_dim""",0.599489,0.02048,1.566826,2.161674,0.293651,0.777778,0.65873,-0.503551,0.638457,0.480761,0.033591,2.920386,17.537288
"""2020-01-01""",4,"""3893737""","""(2019 WE5)""",23.3,0.058151,0.130029,58.150704,130.028927,0.036133,0.080796,190.783156,426.604105,False,False,"""2020-Jan-01 14:55""",1577890500000,5.002825,18010.17068,11190.819665,0.134597,52.3584,20135000.0,12512000.0,"""Earth""",0.09409,0.071878,0.000436,0.006953,0.763932,"""small""",0.010916,0.002182,37.168802,0.248458,"""slow""",0.000136,3160.491096,7.429562,3.497206,0.0811,1.1004e-08,"""moderate""",2020-01-01 14:55:00,2020,1,1,14,3,1,0.5,0.866025,-0.5,-0.866025,247.635728,2292.838824,"""dim""",0.470715,0.044289,0.446681,3.471415,0.59127,0.095238,0.420635,-0.20146,-1.103009,-0.488881,0.089923,1.79223,16.817994


In [None]:
new_data.write_csv('feature_engineered_data.csv')

In [None]:
print("\n=== MISSING VALUES ===")
null_counts = new_data.null_count()
print("Columns with missing values:")
null_counts

In [None]:
print("=== DATA SHAPE & TYPES ===")
print(f"Shape: {new_data.shape}")
print(f"Columns: {len(new_data.columns)}")
print("\nData types:")
print(new_data.dtypes)

print("\n=== TARGET VARIABLE ===")
print(f"Target distribution:")
print(new_data['is_potentially_hazardous'].value_counts())

print("\n=== MISSING VALUES ===")
null_counts = new_data.null_count().to_series().to_list()
print("Columns with missing values:")
print(null_counts)

print("\n=== SAMPLE ROWS ===")
print(new_data.head(3))

In [27]:
numerical_cols = new_data.select(pl.col(pl.Float64, pl.Int64, pl.Int32, pl.Int8, pl.Int16)).columns
numerical_cols = [col for col in numerical_cols if col not in ['obj_that_day', 'epoch_date_close_approach']]

In [28]:
numerical_cols

['absolute_magniutude_h',
 'estimated_diameter_min_km',
 'estimated_diameter_max_km',
 'estimated_diameter_min_m',
 'estimated_diameter_max_m',
 'estimated_diameter_min_miles',
 'estimated_diameter_max_miles',
 'estimated_diameter_min_feet',
 'estimated_diameter_max_feet',
 'relative_velocity_km/sec',
 'relative_velocity_km/hr',
 'relative_velocity_mph',
 'miss_distance_astronomical',
 'miss_distance_lunar',
 'miss_distance_kilometers',
 'miss_distance_miles',
 'avg_diameter_km',
 'diameter_uncertainty_km',
 'estimated_volume',
 'cross_section_area_km2',
 'diameter_uncertainty_ratio',
 'kenetic_energy',
 'momentum',
 'velocity_per_au',
 'velocity_distance_ratio',
 'lunar_distance_ratio',
 'earth_radii_distance',
 'close_approach_score',
 'impact_potential',
 'destruction_potential',
 'hazard_index',
 'approach_year',
 'approach_month',
 'approach_day',
 'approach_hour',
 'day_of_week',
 'day_of_year',
 'month_sin',
 'month_cos',
 'hour_sin',
 'hour_cos',
 'brightness_size_ratio',
 'app

In [None]:
def feature_selection_corr(data):
    

    correlations = []
    for col in numerical_cols:
        try:
            corr = data.select(pl.corr('is_potentially_hazardous', col)).item()
            if corr is not None:
                correlations.append((col, abs(col)))
        except:
            continue
    
    correlations.sort(key=lambda x: x[1], reverse=True)
    return correlations


In [None]:
correlations = feature_selection_corr(new_data) 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

temp_data = new_data.select(numerical_cols)

correlations = {}
for col in numerical_cols:
    corr = new_data.select(pl.corr('is_potentially_hazardous', col)).item()
    correlations[col] = corr

In [26]:
numerical_cols

In [56]:
# Put all imports at the top
from sklearn.model_selection import train_test_split, StratifiedKFold
def get_all_numerical_features_simple(data):
    """
    Simple approach to get numerical features
    """
    numerical_features = []
    
    for col in data.columns:
        dtype = str(data[col].dtype)
        print(f"Column '{col}': {dtype}")
        
        # Check if it's a numerical type
        if any(num_type in dtype.lower() for num_type in ['int', 'float']):
            if col not in ['obj_that_day', 'epoch_date_close_approach', 'approach_year']:
                numerical_features.append(col)
    
    print(f"\n📊 Selected numerical features: {len(numerical_features)}")
    return numerical_features

# Try this simpler approach
selected_features = get_all_numerical_features_simple(new_data)

def prepare_data_splits(data, selected_features):
    X = data.select(selected_features).to_pandas()
    y = data.select('is_potentially_hazardous').to_pandas().squeeze()
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, stratify=y, random_state=42
    )
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    return X_train, X_test, y_train, y_test, cv

Column 'date': String
Column 'obj_that_day': Int64
Column 'id': String
Column 'name': String
Column 'absolute_magniutude_h': Float64
Column 'estimated_diameter_min_km': Float64
Column 'estimated_diameter_max_km': Float64
Column 'estimated_diameter_min_m': Float64
Column 'estimated_diameter_max_m': Float64
Column 'estimated_diameter_min_miles': Float64
Column 'estimated_diameter_max_miles': Float64
Column 'estimated_diameter_min_feet': Float64
Column 'estimated_diameter_max_feet': Float64
Column 'is_potentially_hazardous': Boolean
Column 'is_sentry_object': Boolean
Column 'close_approach_date': String
Column 'epoch_date_close_approach': Int64
Column 'relative_velocity_km/sec': Float64
Column 'relative_velocity_km/hr': Float64
Column 'relative_velocity_mph': Float64
Column 'miss_distance_astronomical': Float64
Column 'miss_distance_lunar': Float64
Column 'miss_distance_kilometers': Float64
Column 'miss_distance_miles': Float64
Column 'oribiting_body': String
Column 'avg_diameter_km': Flo

In [57]:
selected_features = get_all_numerical_features_simple(new_data)
X_train, X_test, y_train, y_test, cv = prepare_data_splits(new_data, selected_features)

Column 'date': String
Column 'obj_that_day': Int64
Column 'id': String
Column 'name': String
Column 'absolute_magniutude_h': Float64
Column 'estimated_diameter_min_km': Float64
Column 'estimated_diameter_max_km': Float64
Column 'estimated_diameter_min_m': Float64
Column 'estimated_diameter_max_m': Float64
Column 'estimated_diameter_min_miles': Float64
Column 'estimated_diameter_max_miles': Float64
Column 'estimated_diameter_min_feet': Float64
Column 'estimated_diameter_max_feet': Float64
Column 'is_potentially_hazardous': Boolean
Column 'is_sentry_object': Boolean
Column 'close_approach_date': String
Column 'epoch_date_close_approach': Int64
Column 'relative_velocity_km/sec': Float64
Column 'relative_velocity_km/hr': Float64
Column 'relative_velocity_mph': Float64
Column 'miss_distance_astronomical': Float64
Column 'miss_distance_lunar': Float64
Column 'miss_distance_kilometers': Float64
Column 'miss_distance_miles': Float64
Column 'oribiting_body': String
Column 'avg_diameter_km': Flo

In [60]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

def smart_preprocessing(X_train, X_test, selected_features):
    """
    Smart preprocessing that handles only the features that actually exist
    """
    
    # Debug: Check what columns we actually have
    print("🔍 Available columns in X_train:")
    print(X_train.columns.tolist())
    print(f"\n🔍 Selected features we're looking for:")
    print(selected_features)
    
    # Find boolean features that actually exist in the data
    potential_boolean_features = ['is_sentry_object']
    actual_boolean_features = [col for col in potential_boolean_features if col in X_train.columns]
    
    # All other features are numerical
    numerical_features = [col for col in selected_features if col not in actual_boolean_features]
    
    print(f"\n📊 Feature breakdown:")
    print(f"Boolean features found: {actual_boolean_features}")
    print(f"Numerical features: {len(numerical_features)}")
    
    # Create appropriate preprocessor based on what we actually have
    if actual_boolean_features:
        # We have both numerical and boolean features
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numerical_features),
                ('bool', 'passthrough', actual_boolean_features)
            ],
            remainder='drop'
        )
    else:
        # Only numerical features - use simple StandardScaler
        print("ℹ️  Only numerical features found, using StandardScaler")
        preprocessor = StandardScaler()
    
    # Fit and transform
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)  # Use transform, not fit_transform!
    
    print(f"✅ Preprocessing complete:")
    print(f"   Training shape: {X_train_processed.shape}")
    print(f"   Test shape: {X_test_processed.shape}")
    
    return X_train_processed, X_test_processed

In [62]:
X_train_processed, X_test_processed = smart_preprocessing(X_train, X_test, selected_features=selected_features)

🔍 Available columns in X_train:
['absolute_magniutude_h', 'estimated_diameter_min_km', 'estimated_diameter_max_km', 'estimated_diameter_min_m', 'estimated_diameter_max_m', 'estimated_diameter_min_miles', 'estimated_diameter_max_miles', 'estimated_diameter_min_feet', 'estimated_diameter_max_feet', 'relative_velocity_km/sec', 'relative_velocity_km/hr', 'relative_velocity_mph', 'miss_distance_astronomical', 'miss_distance_lunar', 'miss_distance_kilometers', 'miss_distance_miles', 'avg_diameter_km', 'diameter_uncertainty_km', 'estimated_volume', 'cross_section_area_km2', 'diameter_uncertainty_ratio', 'kenetic_energy', 'momentum', 'velocity_per_au', 'velocity_distance_ratio', 'lunar_distance_ratio', 'earth_radii_distance', 'close_approach_score', 'impact_potential', 'destruction_potential', 'hazard_index', 'approach_month', 'approach_day', 'approach_hour', 'day_of_week', 'day_of_year', 'month_sin', 'month_cos', 'hour_sin', 'hour_cos', 'brightness_size_ratio', 'apparent_density_inverse', 'si

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


def baseline_model(X_train_processed, X_test_processed, y_train, y_test, cv):
    # Model 1: Random Forest
    rf = RandomForestClassifier(
        n_estimators=200, # More trees for better performance
        class_weight='balanced', # Handle 12 vs 114 imbalance
        random_state=42,
        max_depth=4, # Shallow to prevent overfitting
        min_samples_split=5, # Conservastive splitting
        min_samples_leaf=2 # Ensure meaning full leaves
    )

    # Cross val scores
    rf_cv_scores = cross_val_score(rf, X_train_processed, y_train, cv=cv, scoring='f1')
    print(f"F1 Scores: {rf_cv_scores}")
    print(f"Mean F1: {rf_cv_scores.mean():.3f} (+/- {rf_cv_scores.std() * 2:.3f})")

    # Fit and evaluate
    rf.fit(X_train_processed, y_train)
    rf_pred = rf.predict(X_test_processed)
    rf_prob = rf.predict_proba(X_test_processed)[:, 1]
    print("-------------RANDOM FOREST TEST RESULTS-------------")
    print(classification_report(y_test, rf_pred))
    print(f"ROC-AUC: {roc_auc_score(y_test, rf_prob):.3f}")

    # Model 2: Logistic Regression
    lr = LogisticRegression(
        class_weight='balanced',
        random_state=42,
        max_iter=2000,
        C=0.1
    )

    lr_cv_scores = cross_val_score(lr, X_train_processed, y_train, cv=cv, scoring='f1')
    print(f"F1 Scores: {lr_cv_scores}")
    print(f"Mean F1: {lr_cv_scores.mean():.3f} (+/- {lr_cv_scores.std() * 2:.3f})")

    lr.fit(X_train_processed, y_train)
    lr_pred = lr.predict(X_test_processed)
    lr_prob = lr.predict_proba(X_test_processed)[:, 1]
    print("-------------LOGISITIC REGRESSION TEST RESULTS-------------")
    print(classification_report(y_test, lr_pred))
    print(f"ROC-AUC: {roc_auc_score(y_test, lr_prob):.3f}")

    return rf_pred, rf_prob, lr_pred, lr_prob, rf, lr

In [None]:
rf_pred, rf_prob, lr_pred, lr_prob, rf, lr = baseline_model(X_train_processed=X_train_processed, X_test_processed=X_test_processed, y_train=y_train, y_test=y_test, cv=cv)

In [None]:
import pandas as pd
def feature_importance(model):
    feature_names = [col for col in selected_features]
    importances = model.feature_importances_

    feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
    }).sort_values('importance', ascending=False)

    return feature_importance_df

In [None]:
feature_importance(rf)

In [None]:
%pip install xgboost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

import xgboost as xgb

def xgb_model():
    # XGBBoost
    xgb_model = xgb.XGBClassifier(
        n_estimators=100,
        max_depth=3,
        learning_rate=0.1,
        scale_pos_weight=len(y_train[y_train == False]) / len(y_train[y_train == False]),
        random_state=42
    )

    xgb_cv_scores = cross_val_score(xgb_model, X_train_processed, y_train, cv=cv, scoring='f1')
    print("🚀 XGBOOST CROSS-VALIDATION:")
    print(f"Mean F1: {xgb_cv_scores.mean():.3f} (+/- {xgb_cv_scores.std() * 2:.3f})")

    # Test performance
    xgb_model.fit(X_train_processed, y_train)
    xgb_pred = xgb_model.predict(X_test_processed)
    xgb_prob = xgb_model.predict_proba(X_test_processed)[:, 1]

    print("\n🚀 XGBOOST TEST RESULTS:")
    print(classification_report(y_test, xgb_pred))
    print(f"ROC-AUC: {roc_auc_score(y_test, xgb_prob):.3f}")

    return xgb_pred, xgb_prob, xgb_model

In [None]:
xgb_pred, xgb_prob, xgb_model = xgb_model()

In [63]:
from sklearn.svm import SVC
def svc_model():
    svm = SVC(
        probability=True, 
        class_weight='balanced',
        kernel='rbf',
        C=1.0,
        random_state=42
    )

    svm_cross_scores = cross_val_score(svm, X_train_processed, y_train, cv=cv, scoring='f1')
    print("🚀 SVM CROSS-VALIDATION:")
    print(f"Mean F1: {svm_cross_scores.mean():.3f} (+/- {svm_cross_scores.std() * 2:.3f})")

    # Test performance
    svm.fit(X_train_processed, y_train)
    svm_pred = svm.predict(X_test_processed)
    svm_prob = svm.predict_proba(X_test_processed)[:, 1]

    print("\n🚀 SVM TEST RESULTS:")
    print(classification_report(y_test, svm_pred))
    print(f"ROC-AUC: {roc_auc_score(y_test, svm_prob):.3f}")

    return svm_pred, svm_prob, svm

In [64]:
svm_pred, svm_prob, svm = svc_model()

🚀 SVM CROSS-VALIDATION:
Mean F1: 0.460 (+/- 0.098)

🚀 SVM TEST RESULTS:
              precision    recall  f1-score   support

       False       1.00      0.86      0.93        29
        True       0.43      1.00      0.60         3

    accuracy                           0.88        32
   macro avg       0.71      0.93      0.76        32
weighted avg       0.95      0.88      0.90        32

ROC-AUC: 0.943


In [None]:
def gradient_boosting_model():
    gb = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=42
    )

    gb_cross_scores = cross_val_score(gb, X_train_processed, y_train, cv=cv, scoring='f1')
    print("🚀 Gradient Boosting CROSS-VALIDATION:")
    print(f"Mean F1: {gb_cross_scores.mean():.3f} (+/- {gb_cross_scores.std() * 2:.3f})")

    # Test performance
    gb.fit(X_train_processed, y_train)
    gb_pred = gb.predict(X_test_processed)
    gb_prob = gb.predict_proba(X_test_processed)[:, 1]

    print("\n🚀 Gradient Boosting TEST RESULTS:")
    print(classification_report(y_test, gb_pred))
    print(f"ROC-AUC: {roc_auc_score(y_test, gb_prob):.3f}")

    return gb_pred, gb_prob, gb

    

In [None]:
gb_pred, gb_prob, gb = gradient_boosting_model()

In [None]:
from sklearn.neural_network import MLPClassifier

def basic_neural_net():
    mlp = MLPClassifier(
        hidden_layer_sizes=(20,10),
        max_iter=1000,
        random_state=42,
        early_stopping=True,
        validation_fraction=0.2
    )

    mlp_cross_scores = cross_val_score(mlp, X_train_processed, y_train, cv=cv, scoring='f1')
    print("🚀 Basic Neural Net CROSS-VALIDATION:")
    print(f"Mean F1: {mlp_cross_scores.mean():.3f} (+/- {mlp_cross_scores.std() * 2:.3f})")

    # Test performance
    mlp.fit(X_train_processed, y_train)
    mlp_pred = mlp.predict(X_test_processed)
    mlp_prob = mlp.predict_proba(X_test_processed)[:, 1]

    print("\n🚀 Basic Neural Net TEST RESULTS:")
    print(classification_report(y_test, mlp_pred))
    print(f"ROC-AUC: {roc_auc_score(y_test, mlp_prob):.3f}")

    return mlp_pred, mlp_prob, mlp

In [None]:
mlp_pred, mlp_prob, mlp = basic_neural_net()

In [65]:
from sklearn.model_selection import GridSearchCV
def tune_svv():
    svm_param_grid = {
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.01, 0.1, 1],
        'kernel': ['rbf', 'poly'],
        'class_weight': ['blanced', {False: 1, True: 5}, {False: 1, True: 10}]
    }

    svm_grid = GridSearchCV(
        svm,
        svm_param_grid,
        cv=cv,
        scoring='f1',
        n_jobs=-1
    )

    print(f"Tuning SVM")
    svm_grid.fit(X_train_processed, y_train)

    print("🎯 BEST SVM PARAMETERS:")
    print(svm_grid.best_params_)
    print(f"Best CV F1: {svm_grid.best_score_:.3f}")

    best_svm = svm_grid.best_estimator_
    best_svm_pred = best_svm.predict(X_test_processed)
    best_svm_prob = best_svm.predict_proba(X_test_processed)[:, 1]

    print('OPTIMISED SVM RESULTS')
    print(classification_report(y_test, best_svm_pred))
    print(f"ROC-AUC: {roc_auc_score(y_test, best_svm_prob):.3f}")

    return best_svm_pred, best_svm_prob, best_svm

In [66]:
svm_pred_new, svm_prob_new, svm_new_model = tune_svv()

Tuning SVM
🎯 BEST SVM PARAMETERS:
{'C': 0.1, 'class_weight': {False: 1, True: 10}, 'gamma': 0.01, 'kernel': 'rbf'}
Best CV F1: 0.541
OPTIMISED SVM RESULTS
              precision    recall  f1-score   support

       False       1.00      0.83      0.91        29
        True       0.38      1.00      0.55         3

    accuracy                           0.84        32
   macro avg       0.69      0.91      0.73        32
weighted avg       0.94      0.84      0.87        32

ROC-AUC: 0.943


200 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
200 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python313\Lib\site-packages\sklearn\base.py", line 1356, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "c:\Python313\Lib\site-packages\sklearn\base.py", line 469, in _validate_params
    validate_parameter_constraints(
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
        self._parameter_constraints,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^

In [67]:
from sklearn.model_selection import RandomizedSearchCV
def svm_tuning_advanced():
    svm_params = {
        'C': [0.01, 0.1, 1, 10, 50, 100, 200],
        'gamma':  ['scale', 'auto', 0.001, 0.01, 0.1, 1, 10],
        'kernel': ['rbf', 'poly', 'sigmoid'],
        'class_weight': [
            'balanced',
            {False: 1, True: 3},
            {False: 1, True: 5},
            {False: 1, True: 8},
            {False: 1, True: 10},
            {False: 1, True: 15},
            {False: 1, True: 20},
        ],
        'degree': [2,3,4]
    }

    svm_random_search = RandomizedSearchCV(
        svm_new_model,
        svm_params,
        n_iter=200,
        cv=cv,
        scoring='f1',
        n_jobs=1,
        random_state=42,
        verbose=1
    )

    print(f"Starting SVM random search")
    svm_random_search.fit(X_train_processed, y_train)
    print("\n🏆 BEST SVM PARAMETERS:")
    print(svm_random_search.best_params_)
    print(f"Best CV F1: {svm_random_search.best_score_:.3f}")

    # Test best SVM
    best_svm_tuned = svm_random_search.best_estimator_
    svm_tuned_pred = best_svm_tuned.predict(X_test_processed)
    svm_tuned_prob = best_svm_tuned.predict_proba(X_test_processed)[:, 1]

    print("\n🎯 OPTIMIZED SVM TEST RESULTS:")
    print(classification_report(y_test, svm_tuned_pred))
    print(f"ROC-AUC: {roc_auc_score(y_test, svm_tuned_prob):.3f}")

    return svm_tuned_pred, svm_tuned_prob, best_svm_tuned

In [68]:
svm_tuned_pred, svm_tuned_prob, best_svm_tuned = svm_tuning_advanced()

Starting SVM random search
Fitting 5 folds for each of 200 candidates, totalling 1000 fits

🏆 BEST SVM PARAMETERS:
{'kernel': 'sigmoid', 'gamma': 10, 'degree': 4, 'class_weight': {False: 1, True: 8}, 'C': 0.01}
Best CV F1: 0.562

🎯 OPTIMIZED SVM TEST RESULTS:
              precision    recall  f1-score   support

       False       1.00      0.76      0.86        29
        True       0.30      1.00      0.46         3

    accuracy                           0.78        32
   macro avg       0.65      0.88      0.66        32
weighted avg       0.93      0.78      0.83        32

ROC-AUC: 0.885


In [72]:
from sklearn.metrics import precision_recall_curve
def svm_threshold_tuning():
    svm_probability = best_svm_tuned.predict_proba(X_test_processed)[:, 1]

    precision, recall, thresholds = precision_recall_curve(y_test, svm_probability)

    target_recall = 0.8
    idx = np.argmax(recall >= target_recall)
    optimal_threshold = thresholds[idx] if idx < len(thresholds) else 0.5

    print(f"Optimal threshold for {target_recall:.0%} recall: {optimal_threshold:.3f}")

    svm_pred_optimised = (svm_probability >= optimal_threshold).astype(int)
    print(classification_report(y_test, svm_pred_optimised))
    return svm_pred_optimised

In [73]:
svm_pred_optimised = svm_threshold_tuning()

Optimal threshold for 80% recall: 0.060
              precision    recall  f1-score   support

       False       0.00      0.00      0.00        29
        True       0.09      1.00      0.17         3

    accuracy                           0.09        32
   macro avg       0.05      0.50      0.09        32
weighted avg       0.01      0.09      0.02        32



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [74]:
# Let's do a more systematic threshold analysis
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, f1_score

# Get probabilities
svm_prob = best_svm_tuned.predict_proba(X_test_processed)[:, 1]

print("🔍 PROBABILITY DISTRIBUTION ANALYSIS:")
print(f"Min probability: {svm_prob.min():.3f}")
print(f"Max probability: {svm_prob.max():.3f}")
print(f"Mean probability: {svm_prob.mean():.3f}")
print(f"Default threshold (0.5) predictions: {(svm_prob >= 0.5).sum()} hazardous")

# Test multiple thresholds systematically
thresholds_to_test = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

print("\n📊 THRESHOLD ANALYSIS:")
print("Threshold | Precision | Recall | F1-Score | False Alarms")
print("-" * 55)

best_f1 = 0
best_threshold = 0.5

for thresh in thresholds_to_test:
    pred = (svm_prob >= thresh).astype(int)
    
    # Calculate metrics
    from sklearn.metrics import precision_score, recall_score, f1_score
    
    precision = precision_score(y_test, pred, zero_division=0)
    recall = recall_score(y_test, pred, zero_division=0)
    f1 = f1_score(y_test, pred, zero_division=0)
    false_alarms = ((y_test == False) & (pred == True)).sum()
    
    print(f"  {thresh:.1f}     |   {precision:.2f}    |  {recall:.2f}  |   {f1:.2f}   |     {false_alarms}")
    
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = thresh

print(f"\n🏆 BEST THRESHOLD: {best_threshold} (F1: {best_f1:.3f})")

# Apply best threshold
best_pred = (svm_prob >= best_threshold).astype(int)
print(f"\n🎯 BEST THRESHOLD RESULTS:")
print(classification_report(y_test, best_pred))

🔍 PROBABILITY DISTRIBUTION ANALYSIS:
Min probability: 0.060
Max probability: 0.178
Mean probability: 0.108
Default threshold (0.5) predictions: 0 hazardous

📊 THRESHOLD ANALYSIS:
Threshold | Precision | Recall | F1-Score | False Alarms
-------------------------------------------------------
  0.1     |   0.20    |  1.00  |   0.33   |     12
  0.2     |   0.00    |  0.00  |   0.00   |     0
  0.3     |   0.00    |  0.00  |   0.00   |     0
  0.4     |   0.00    |  0.00  |   0.00   |     0
  0.5     |   0.00    |  0.00  |   0.00   |     0
  0.6     |   0.00    |  0.00  |   0.00   |     0
  0.7     |   0.00    |  0.00  |   0.00   |     0
  0.8     |   0.00    |  0.00  |   0.00   |     0
  0.9     |   0.00    |  0.00  |   0.00   |     0

🏆 BEST THRESHOLD: 0.1 (F1: 0.333)

🎯 BEST THRESHOLD RESULTS:
              precision    recall  f1-score   support

       False       1.00      0.59      0.74        29
        True       0.20      1.00      0.33         3

    accuracy                   

In [75]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=5,
    class_weight='balanced',
    random_state=42
)

rf_model.fit(X_train_processed, y_train)
rf_pred = rf_model.predict(X_test_processed)
rf_prob = rf_model.predict_proba(X_test_processed)[:, 1]

print(f"\n🌲 RANDOM FOREST RESULTS:")
print(f"Probability range: {rf_prob.min():.3f} - {rf_prob.max():.3f}")
print(classification_report(y_test, rf_pred))
print(f"ROC-AUC: {roc_auc_score(y_test, rf_prob):.3f}")


🌲 RANDOM FOREST RESULTS:
Probability range: 0.000 - 0.896
              precision    recall  f1-score   support

       False       0.94      1.00      0.97        29
        True       1.00      0.33      0.50         3

    accuracy                           0.94        32
   macro avg       0.97      0.67      0.73        32
weighted avg       0.94      0.94      0.92        32

ROC-AUC: 0.966


In [47]:
from sklearn.inspection import permutation_importance
import pandas as pd

def feature_importance_analysis(model):
    perm_importance = permutation_importance(
        model, X_test_processed, y_test,
        n_repeats=20, random_state=42, scoring='f1'
    )

    feature_importance_df = pd.DataFrame({
        'feature': selected_features,
        'importance': perm_importance.importances_mean,
        'std': perm_importance.importances_std
    }).sort_values('importance', ascending=False)

    return feature_importance_df

In [48]:
feature_importance_df = feature_importance_analysis(best_svm_tuned)

In [49]:
feature_importance_df

Unnamed: 0,feature,importance,std
0,threat_score,0.079563,0.103559
1,impact_potential,0.047421,0.088985
7,log_velocity,0.039286,0.12894
3,log_diameter,0.024603,0.049621
2,size_percentile,0.022738,0.250494
11,absolute_magniutude_h,0.014286,0.089214
8,velocity_percentile,0.009167,0.151441
4,avg_diameter_km,0.005556,0.016667
6,relative_velocity_km/sec,0.001984,0.02366
5,velocity_zscore,0.001984,0.02366
