In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score


In [3]:
# Import the data from nasa.csv into a DataFrame
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
nasa_df = pd.read_csv('nasa.csv')

X = nasa_df[['Asc Node Longitude', 'Orbital Period', 'Perihelion Distance', 'Perihelion Arg', 'Aphelion Dist', 'Perihelion Time', 'Mean Anomaly', 'Mean Motion']]
y = nasa_df[['Hazardous']]

print(nasa_df.head())


   Neo Reference ID     Name  Absolute Magnitude  Est Dia in KM(min)  \
0           3703080  3703080                21.6            0.127220   
1           3723955  3723955                21.3            0.146068   
2           2446862  2446862                20.3            0.231502   
3           3092506  3092506                27.4            0.008801   
4           3514799  3514799                21.6            0.127220   

   Est Dia in KM(max)  Est Dia in M(min)  Est Dia in M(max)  \
0            0.284472         127.219879         284.472297   
1            0.326618         146.067964         326.617897   
2            0.517654         231.502122         517.654482   
3            0.019681           8.801465          19.680675   
4            0.284472         127.219879         284.472297   

   Est Dia in Miles(min)  Est Dia in Miles(max)  Est Dia in Feet(min)  ...  \
0               0.079051               0.176763            417.388066  ...   
1               0.090762        

In [4]:
is_null_counter = nasa_df.isnull().values.sum()
print("資料是否有空值:", is_null_counter)
print("資料筆數:", nasa_df.shape)
print("資料的欄位名稱，分別是:", nasa_df.keys())
print("第一筆的資料內容:", nasa_df.iloc[0,::])
print("第一筆的預測目標:", nasa_df['Hazardous'][0])


資料是否有空值: 0
資料筆數: (4687, 40)
資料的欄位名稱，分別是: Index(['Neo Reference ID', 'Name', 'Absolute Magnitude', 'Est Dia in KM(min)',
       'Est Dia in KM(max)', 'Est Dia in M(min)', 'Est Dia in M(max)',
       'Est Dia in Miles(min)', 'Est Dia in Miles(max)',
       'Est Dia in Feet(min)', 'Est Dia in Feet(max)', 'Close Approach Date',
       'Epoch Date Close Approach', 'Relative Velocity km per sec',
       'Relative Velocity km per hr', 'Miles per hour',
       'Miss Dist.(Astronomical)', 'Miss Dist.(lunar)',
       'Miss Dist.(kilometers)', 'Miss Dist.(miles)', 'Orbiting Body',
       'Orbit ID', 'Orbit Determination Date', 'Orbit Uncertainity',
       'Minimum Orbit Intersection', 'Jupiter Tisserand Invariant',
       'Epoch Osculation', 'Eccentricity', 'Semi Major Axis', 'Inclination',
       'Asc Node Longitude', 'Orbital Period', 'Perihelion Distance',
       'Perihelion Arg', 'Aphelion Dist', 'Perihelion Time', 'Mean Anomaly',
       'Mean Motion', 'Equinox', 'Hazardous'],
      dtype='ob

In [5]:
# 檢查每個欄位的資料型別
data_types = nasa_df.dtypes

# 找出非數值的欄位名稱
non_numeric_columns = data_types[data_types != 'float64'][data_types != 'int64'].index.tolist()

# 確保 'Hazardous' 不在非數值欄位名稱列表中
if 'Hazardous' in non_numeric_columns:
    non_numeric_columns.remove('Hazardous')

print("非數值的欄位名稱:", non_numeric_columns)

# 將 'Hazardous' 作為相依變數 y
y = nasa_df['Hazardous']

# 將其餘欄位作為獨立變數 X
X = nasa_df[['Asc Node Longitude', 'Orbital Period', 'Perihelion Distance', 'Perihelion Arg', 'Aphelion Dist', 'Perihelion Time', 'Mean Anomaly', 'Mean Motion']]

# 刪除識別碼或唯一標識符欄位
if 'ID' in X.columns:
    X = X.drop(columns=['ID'])

# 刪除非數值欄位（如果這些欄位不能直接用於模型中）
non_numeric_columns = [col for col in non_numeric_columns if col in X.columns]
X = X.drop(columns=non_numeric_columns)

print("獨立變數 X 的欄位名稱:", X.columns)
print("相依變數 y 的欄位名稱:", y.name)

非數值的欄位名稱: ['Close Approach Date', 'Orbiting Body', 'Orbit Determination Date', 'Equinox']
獨立變數 X 的欄位名稱: Index(['Asc Node Longitude', 'Orbital Period', 'Perihelion Distance',
       'Perihelion Arg', 'Aphelion Dist', 'Perihelion Time', 'Mean Anomaly',
       'Mean Motion'],
      dtype='object')
相依變數 y 的欄位名稱: Hazardous


In [6]:
X = nasa_df[['Asc Node Longitude', 'Orbital Period', 'Perihelion Distance', 'Perihelion Arg', 'Aphelion Dist', 'Perihelion Time', 'Mean Anomaly', 'Mean Motion']]
y = nasa_df['Hazardous']

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=0)
print(train_X.shape, test_X.shape)

(3749, 8) (938, 8)


In [7]:
model = KNeighborsClassifier(n_neighbors=2)
model.fit(train_X, train_y)

In [8]:
pred_y = model.predict(test_X)
print(accuracy_score(test_y, pred_y))
print(confusion_matrix(test_y, pred_y))

0.8550106609808102
[[773  20]
 [116  29]]
