In [74]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split
import pickle

In [75]:
data = pd.read_csv("nearest-earth-objects(1910-2024).csv")

FileNotFoundError: [Errno 2] No such file or directory: 'nearest-earth-objects(1910-2024).csv'

### Exploratory Data Analysis

In [None]:
data.isna().sum()

neo_id                     0
name                       0
absolute_magnitude        28
estimated_diameter_min    28
estimated_diameter_max    28
orbiting_body              0
relative_velocity          0
miss_distance              0
is_hazardous               0
dtype: int64

In [None]:
data[data["absolute_magnitude"].isna()].head()

Unnamed: 0,neo_id,name,absolute_magnitude,estimated_diameter_min,estimated_diameter_max,orbiting_body,relative_velocity,miss_distance,is_hazardous
107508,3511355,(2010 DJ77),,,,Earth,69599.619497,41636680.0,False
111671,3511355,(2010 DJ77),,,,Earth,48866.251824,58768550.0,False
114171,3511355,(2010 DJ77),,,,Earth,50912.941751,7713979.0,False
116688,3511355,(2010 DJ77),,,,Earth,80266.699067,60884220.0,False
146059,3511355,(2010 DJ77),,,,Earth,70193.252023,42732360.0,False


In [None]:
data[np.logical_and(~data["absolute_magnitude"].isna(), data["neo_id"] == 3511355)]

Unnamed: 0,neo_id,name,absolute_magnitude,estimated_diameter_min,estimated_diameter_max,orbiting_body,relative_velocity,miss_distance,is_hazardous


* noticed that all objects with the neo_id == "3511355" has missing values and can't be handled 

In [None]:
data = data[data["neo_id"] != 3511355]

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 338173 entries, 0 to 338198
Data columns (total 9 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   neo_id                  338173 non-null  int64  
 1   name                    338173 non-null  object 
 2   absolute_magnitude      338171 non-null  float64
 3   estimated_diameter_min  338171 non-null  float64
 4   estimated_diameter_max  338171 non-null  float64
 5   orbiting_body           338173 non-null  object 
 6   relative_velocity       338173 non-null  float64
 7   miss_distance           338173 non-null  float64
 8   is_hazardous            338173 non-null  bool   
dtypes: bool(1), float64(5), int64(1), object(2)
memory usage: 23.5+ MB


In [None]:
data[data['name'].duplicated(keep=False)]

Unnamed: 0,neo_id,name,absolute_magnitude,estimated_diameter_min,estimated_diameter_max,orbiting_body,relative_velocity,miss_distance,is_hazardous
0,2162117,162117 (1998 SD15),19.140,0.394962,0.883161,Earth,71745.401048,5.814362e+07,False
1,2349507,349507 (2008 QY),18.500,0.530341,1.185878,Earth,109949.757148,5.580105e+07,True
2,2455415,455415 (2003 GA),21.450,0.136319,0.304818,Earth,24865.506798,6.720689e+07,False
3,3132126,(2002 PB),20.630,0.198863,0.444672,Earth,78890.076805,3.039644e+07,False
4,3557844,(2011 DW),22.700,0.076658,0.171412,Earth,56036.519484,6.311863e+07,False
...,...,...,...,...,...,...,...,...,...
338193,54336231,(2022 YJ2),22.140,0.099210,0.221840,Earth,41743.290048,3.770475e+07,False
338194,54403809,(2023 VS4),28.580,0.005112,0.011430,Earth,56646.985988,6.406548e+07,False
338195,54415298,(2023 XW5),28.690,0.004859,0.010865,Earth,21130.768947,2.948883e+07,False
338196,54454871,(2024 KJ7),21.919,0.109839,0.245607,Earth,11832.041031,5.346078e+07,False


In [None]:
data["orbiting_body"].value_counts()

orbiting_body
Earth    338173
Name: count, dtype: int64

* all the values are "earth" so there is no need to the column

In [None]:
data[data["absolute_magnitude"].isna()]

Unnamed: 0,neo_id,name,absolute_magnitude,estimated_diameter_min,estimated_diameter_max,orbiting_body,relative_velocity,miss_distance,is_hazardous
274352,3592397,(2010 AW108),,,,Earth,22559.617403,51496830.0,False
274626,54049909,(2010 CJ188),,,,Earth,61048.333919,46317940.0,False


In [None]:
data[data["relative_velocity"] == 22559.617403]

Unnamed: 0,neo_id,name,absolute_magnitude,estimated_diameter_min,estimated_diameter_max,orbiting_body,relative_velocity,miss_distance,is_hazardous


* There is no way to impute the missing values so we will drop them 

In [None]:
data[data.duplicated()]

Unnamed: 0,neo_id,name,absolute_magnitude,estimated_diameter_min,estimated_diameter_max,orbiting_body,relative_velocity,miss_distance,is_hazardous


### Feature Engineering

In [None]:
data = data[~data["absolute_magnitude"].isna()]

In [None]:
data.isna().sum()

neo_id                    0
name                      0
absolute_magnitude        0
estimated_diameter_min    0
estimated_diameter_max    0
orbiting_body             0
relative_velocity         0
miss_distance             0
is_hazardous              0
dtype: int64

In [None]:
data

Unnamed: 0,neo_id,name,absolute_magnitude,estimated_diameter_min,estimated_diameter_max,orbiting_body,relative_velocity,miss_distance,is_hazardous
0,2162117,162117 (1998 SD15),19.140,0.394962,0.883161,Earth,71745.401048,5.814362e+07,False
1,2349507,349507 (2008 QY),18.500,0.530341,1.185878,Earth,109949.757148,5.580105e+07,True
2,2455415,455415 (2003 GA),21.450,0.136319,0.304818,Earth,24865.506798,6.720689e+07,False
3,3132126,(2002 PB),20.630,0.198863,0.444672,Earth,78890.076805,3.039644e+07,False
4,3557844,(2011 DW),22.700,0.076658,0.171412,Earth,56036.519484,6.311863e+07,False
...,...,...,...,...,...,...,...,...,...
338194,54403809,(2023 VS4),28.580,0.005112,0.011430,Earth,56646.985988,6.406548e+07,False
338195,54415298,(2023 XW5),28.690,0.004859,0.010865,Earth,21130.768947,2.948883e+07,False
338196,54454871,(2024 KJ7),21.919,0.109839,0.245607,Earth,11832.041031,5.346078e+07,False
338197,54456245,(2024 NE),23.887,0.044377,0.099229,Earth,56198.382733,5.184742e+06,False


### Data Splitting

In [None]:
X = data[["absolute_magnitude", "estimated_diameter_min", "estimated_diameter_max", "relative_velocity", "miss_distance"]]
y = data["is_hazardous"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Model Training and Evaluation 

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier()

knn_clf.fit(X_train, y_train)

In [None]:
print(knn_clf.score(X_train, y_train))
print(knn_clf.score(X_test, y_test))
knn_clf.sa

0.8819190686003235
0.8588396483065883


In [None]:
import joblib

joblib.dump(knn_clf, 'scoreregression.pkl')

['scoreregression.pkl']