In [25]:
# Shell escape: print the kernel's current working directory.
! pwd

/home/khalyl/code/KhalylDammas/neo-hazardous-classification/neo/notebooks


In [27]:
import os
from pathlib import Path

# Move the kernel to the repository root so project-relative imports and paths
# resolve. The root is located by walking up from the current working directory
# (the notebook is started from a sub-folder of the repo) rather than relying
# only on one machine's absolute path. Because the current directory itself is
# also checked, re-running this cell after the chdir is a no-op.
_PROJECT_DIR_NAME = "neo-hazardous-classification"
# Original hard-coded location, kept as a fallback for sessions started
# outside the repository tree.
_FALLBACK_ROOT = Path("/home/khalyl/code/KhalylDammas/neo-hazardous-classification")

_start_dir = Path.cwd()
PROJECT_ROOT = next(
    (
        candidate
        for candidate in (_start_dir, *_start_dir.parents)
        if candidate.name == _PROJECT_DIR_NAME
    ),
    _FALLBACK_ROOT,
)
os.chdir(PROJECT_ROOT)

In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from neo.params import *
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV


In [29]:
# Load the dataset from the CSV file at DATA_LOCAL_PATH.
# NOTE(review): DATA_LOCAL_PATH is not defined in this notebook; presumably it
# is provided by the star import from neo.params — confirm.
df = pd.read_csv(DATA_LOCAL_PATH)
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,neo_id,name,absolute_magnitude,estimated_diameter_min,estimated_diameter_max,orbiting_body,relative_velocity,miss_distance,is_hazardous
0,2162117,162117 (1998 SD15),19.14,0.394962,0.883161,Earth,71745.401048,58143620.0,False
1,2349507,349507 (2008 QY),18.5,0.530341,1.185878,Earth,109949.757148,55801050.0,True
2,2455415,455415 (2003 GA),21.45,0.136319,0.304818,Earth,24865.506798,67206890.0,False
3,3132126,(2002 PB),20.63,0.198863,0.444672,Earth,78890.076805,30396440.0,False
4,3557844,(2011 DW),22.7,0.076658,0.171412,Earth,56036.519484,63118630.0,False


In [38]:
# Discard every row containing a missing value and renumber the index from 0.
df = df.dropna(ignore_index=True)

In [39]:
# Feature matrix: the four columns used as model inputs.
feature_columns = [
    'absolute_magnitude',
    'estimated_diameter_min',
    'relative_velocity',
    'miss_distance',
]
X = df.loc[:, feature_columns]

# Target vector: the is_hazardous flag.
y = df.loc[:, 'is_hazardous']

In [40]:
# Preview the first rows of the feature matrix X.
X.head()

Unnamed: 0,absolute_magnitude,estimated_diameter_min,relative_velocity,miss_distance
0,19.14,0.394962,71745.401048,58143620.0
1,18.5,0.530341,109949.757148,55801050.0
2,21.45,0.136319,24865.506798,67206890.0
3,20.63,0.198863,78890.076805,30396440.0
4,22.7,0.076658,56036.519484,63118630.0


In [41]:
# Summary statistics (count, mean, std, min, quartiles, max) for each feature column.
X.describe()

Unnamed: 0,absolute_magnitude,estimated_diameter_min,relative_velocity,miss_distance
count,338171.0,338171.0,338171.0,338171.0
mean,22.932525,0.157812,51060.017994,41535470.0
std,2.911216,0.313885,26399.92283,20774110.0
min,9.25,0.000511,203.346433,6745.533
25%,20.74,0.025384,30710.626399,24944950.0
50%,22.8,0.073207,47557.046397,43327240.0
75%,25.1,0.189041,66673.223798,59339610.0
max,33.58,37.545248,291781.106613,74798650.0


In [47]:
# Hold out 60% of the rows for testing; the remaining 40% form the training set.
# The fixed random_state keeps the shuffle reproducible between runs.
# NOTE(review): a 60% test share is unusually large — confirm it is intentional.
split_parts = train_test_split(X, y, test_size=0.6, random_state=42)
X_train, X_test, y_train, y_test = split_parts
print(*(part.shape for part in split_parts))

(135268, 4) (202903, 4) (135268,) (202903,)


### Pipeline with multiple models

In [48]:
# Two-step pipeline: min-max scaling followed by a classifier.
# The LogisticRegression instance is only a placeholder; the grid search
# substitutes each candidate model through the 'classifier' step.
pipeline_steps = [
    ('scaler', MinMaxScaler()),
    ('classifier', LogisticRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

In [49]:
# Search space for GridSearchCV: one dict per candidate model. The 'classifier'
# entry replaces the pipeline's final step, and 'classifier__<param>' entries
# set hyperparameters on that estimator.
c_values = [0.01, 0.1, 1, 10, 100]

# Logistic regression: regularisation strength only.
logreg_grid = {
    'classifier': [LogisticRegression()],
    'classifier__C': c_values,
}

# SVM with a linear kernel.
linear_svm_grid = {
    'classifier': [SVC(kernel='linear')],
    'classifier__C': c_values,
}

# SVM with an RBF kernel — the winner in the recorded run.
rbf_svm_grid = {
    'classifier': [SVC(kernel='rbf')],
    'classifier__C': c_values,
    'classifier__gamma': ['scale', 0.001, 0.01, 0.1, 1],
}

# K-nearest neighbours: odd neighbourhood sizes from 3 to 11.
knn_grid = {
    'classifier': [KNeighborsClassifier()],
    'classifier__n_neighbors': [3, 5, 7, 9, 11],
}

param_grid = [logreg_grid, linear_svm_grid, rbf_svm_grid, knn_grid]

In [50]:
# Exhaustive search over param_grid, scored by accuracy with 5-fold
# cross-validation; n_jobs=-1 runs the fits in parallel on all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

In [51]:
# Run the grid search on the training split only; every parameter combination
# in param_grid is cross-validated on X_train / y_train.
grid_search.fit(X_train, y_train)

  _data = np.array(data, dtype=dtype, copy=copy,


In [52]:
# Report the winning parameter combination selected by the search.
print("Best parameters found by GridSearchCV:", grid_search.best_params_, sep="\n")

Best parameters found by GridSearchCV:
{'classifier': SVC(), 'classifier__C': 100, 'classifier__gamma': 'scale'}


In [53]:
# Mean cross-validated accuracy of the best parameter combination.
grid_search.best_score_

np.float64(0.8814649388737157)

In [None]:
# Score the best estimator found by the search on the held-out test split.
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Best model accuracy on the test set: {accuracy:.4f}')

### Another way to pipeline multiple models

In [None]:
# Baseline: min-max scaling + logistic regression, fitted on the training
# split and scored for accuracy on the test split.
pipe = Pipeline(steps=[
    ('scaled', MinMaxScaler()),
    ('LR', LogisticRegression()),
])
pipe.fit(X_train, y_train)
baseline_predictions = pipe.predict(X_test)
print(accuracy_score(y_test, baseline_predictions))

In [None]:
# Candidate estimators to compare. Each one is wrapped in a pipeline with the
# same MinMaxScaler preprocessing, so only the final step differs.
candidate_estimators = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('DT', DecisionTreeClassifier()),
    ('SVC', SVC()),
    # Previously labelled 'MNB', but the estimator is Gaussian naive Bayes.
    ('GNB', GaussianNB()),
]
pipelines = [
    ('scaled' + step_name,
     Pipeline([('scaled', MinMaxScaler()), (step_name, estimator)]))
    for step_name, estimator in candidate_estimators
]

# One splitter shared by every candidate so all models are scored on the same
# folds. shuffle=True is required: scikit-learn raises ValueError when
# random_state is set on a KFold that does not shuffle.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# model_name[i] labels results[i]; both are consumed by the comparison plot.
model_name = []
results = []
for name, model in pipelines:
    # Per-fold accuracy of this pipeline on the training split.
    crossv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    results.append(crossv_results)
    model_name.append(name)
    # Report this model's own name (not the accumulated list) with mean and std.
    print("%s: %f (%f)" % (name, crossv_results.mean(), crossv_results.std()))

In [None]:
# Box plot of the per-fold accuracies in `results`, one box per entry,
# labelled on the x-axis with the matching name from `model_name`.
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(model_name)
plt.show()