# Preprocessing

### Libraries Importing

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score

In [3]:
df= pd.read_csv('Cleaned_df.csv')
df.head()

Unnamed: 0,neo_id,name,absolute_magnitude,estimated_diameter_min,estimated_diameter_max,relative_velocity,miss_distance,is_hazardous
0,2162117,162117 (1998 SD15),19.14,0.394962,0.883161,71745.401048,58143620.0,False
1,2349507,349507 (2008 QY),18.5,0.530341,1.185878,109949.757148,55801050.0,True
2,2455415,455415 (2003 GA),21.45,0.136319,0.304818,24865.506798,67206890.0,False
3,3132126,(2002 PB),20.63,0.198863,0.444672,78890.076805,30396440.0,False
4,3557844,(2011 DW),22.7,0.076658,0.171412,56036.519484,63118630.0,False


#### Drop unvaluable columns

In [4]:
df.nunique()

neo_id                     33511
name                       33511
absolute_magnitude          1778
estimated_diameter_min      1778
estimated_diameter_max      1778
relative_velocity         338161
miss_distance             337798
is_hazardous                   2
dtype: int64

In [5]:
#dropping columns with high collinearity with absolute_magnitude
df.drop(columns = ['estimated_diameter_min','estimated_diameter_max'], inplace= True)

#dropping columns with high dimentionality
df.drop(columns = ['neo_id','name'], inplace= True)


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338171 entries, 0 to 338170
Data columns (total 4 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   absolute_magnitude  338171 non-null  float64
 1   relative_velocity   338171 non-null  float64
 2   miss_distance       338171 non-null  float64
 3   is_hazardous        338171 non-null  bool   
dtypes: bool(1), float64(3)
memory usage: 8.1 MB


#### Split the Data to taregt and features

In [7]:
X = df.drop(columns = ['is_hazardous'])
print(X.shape)
y=df['is_hazardous']
print(y.shape)

(338171, 3)
(338171,)


#### Target Encoding

Since our target datatype is bolean, we don't need to encode it.
We can replance the encoding step with converting the datatype to int.

In [8]:
y = y.astype('int')
y.unique()

array([0, 1])

#### Handling Target Imbalance


Here we chose to apply SMOTE on the training set only to keep the testing set unseen. also the test set should be from the real world so should be always balanced

In [9]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X,y)

#### Train test split

To prevent data leakage from the test set, we need to split the data into training and testing sets before proceeding with the preprocessing.

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size= 0.2, random_state=42)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(472014, 3)
(472014,)
(118004, 3)
(118004,)


#### Scaling the numeric data


To make sure that all features are contributing in the training equally without one dominating others, we need to standardize the numeric features.

In [12]:
#instantiate the scaler
scaler = StandardScaler()

#Fit and transform the scaler to the training data
scaled_X_train = scaler.fit_transform(X_train)

#use the same scaler for the test data

scaled_X_test = scaler.transform(X_test)

In [13]:
scaled_X_test

array([[ 1.17532275, -1.30969399, -1.41548426],
       [-0.30599469,  0.96867945,  0.20956135],
       [-0.27315419, -0.59985394,  0.76435095],
       ...,
       [-1.050858  ,  1.20189206,  0.96088757],
       [ 1.46125586, -1.60543574, -0.98960451],
       [-1.65391013,  2.52489902,  1.44587736]])

In [14]:
#Convert back to DataFrame
X_train_scaled = pd.DataFrame( scaled_X_train , columns=X_train.columns)
print(X_train_scaled.head())
X_test_scaled = pd.DataFrame( scaled_X_test , columns=X_train.columns)
print(X_train_scaled.head())

   absolute_magnitude  relative_velocity  miss_distance
0            0.059125          -0.661234       0.482430
1            0.072975          -1.045081      -0.169162
2            0.054164          -0.628035       1.097483
3           -0.382260          -0.603823       0.645634
4            1.762238          -0.728276       0.031490
   absolute_magnitude  relative_velocity  miss_distance
0            0.059125          -0.661234       0.482430
1            0.072975          -1.045081      -0.169162
2            0.054164          -0.628035       1.097483
3           -0.382260          -0.603823       0.645634
4            1.762238          -0.728276       0.031490


# Model Building

#### Model Training

In [17]:
# instantiate the model
rf_model = RandomForestClassifier(n_estimators=50, random_state=42,class_weight='balanced')

# Train the model
rf_model.fit(X_train_scaled, y_train)

In [18]:
# Make predictions
y_pred = rf_model.predict(X_test_scaled)

In [19]:
# Accuracy Score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.8878

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.82      0.88     59040
           1       0.84      0.96      0.90     58964

    accuracy                           0.89    118004
   macro avg       0.90      0.89      0.89    118004
weighted avg       0.90      0.89      0.89    118004


Confusion Matrix:
[[48237 10803]
 [ 2438 56526]]


In [22]:
y_proba = rf_model.predict_proba(X_test)[:, 1]
print(f"AUC-ROC: {roc_auc_score(y_test, y_proba):.4f}")

AUC-ROC: 0.5000


In [24]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [20, 30],
    'class_weight': ['balanced', None]
}

random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_grid,
    n_iter=10,
    cv=3,
    scoring='recall',  
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train_scaled, y_train)

In [25]:
random_search.best_estimator_

In [26]:
random_search.best_score_

0.9890825907221185

In [28]:
y_proba = rf_model.predict_proba(X_test_scaled)[:, 1]
print(f"AUC-ROC: {roc_auc_score(y_test, y_proba):.4f}")

AUC-ROC: 0.9551


# Model Deployment

In [30]:
import joblib

# Save the trained model
joblib.dump(rf_model, 'random_forest_model.pkl')


['random_forest_model.pkl']