In [30]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


### Feature Importance looking at all features together

In [41]:
# Load the merged dataset from a CSV in the working directory; the cells below
# read the `koi_*` columns from this frame.
# NOTE(review): the file name suggests median-imputed values — confirm upstream.
df = pd.read_csv("./NASA_Exoplant_df_merged_meadian.csv")

In [42]:
# Encode the string target `koi_pdisposition` as integers. The fitted encoder is
# kept so its `classes_` can label the classification report later on.
le_pdisposition = LabelEncoder() # 2 classes
df['koi_pdisposition_encode'] = le_pdisposition.fit_transform(df['koi_pdisposition'])

# Identifier and label columns that must never be fed to the model as inputs.
non_features = ['koi_pdisposition_encode', 'kepid', 'kepoi_name', 'koi_disposition', 'koi_pdisposition']
# Every remaining column name, in the frame's original column order, is a model input.
# (The earlier `features = df.drop(...)` assignment was dead code: it built a full
# DataFrame copy that was overwritten on the very next line.)
# NOTE(review): this keeps `koi_score` and the `koi_fpflag_*` columns, which a later
# cell groups as "disposition" features; they may largely encode the target — confirm
# that including them is intended.
features = [column for column in df.columns if column not in non_features]

Y = df['koi_pdisposition_encode']
X = df[features]

# 70/30 hold-out split with a fixed seed so the reported metrics are repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=100)

In [43]:
# Fit a random forest with library-default hyperparameters; the seed is fixed so
# repeated runs build the same trees. `fit` returns the estimator itself.
model = RandomForestClassifier(random_state=100).fit(X_train, Y_train)

# Score the held-out split.
Y_pred = model.predict(X_test)

print(classification_report(Y_test, Y_pred, target_names=le_pdisposition.classes_))
print(f"Accuracy: {accuracy_score(Y_test, Y_pred)}")

# Pair each input column with its impurity-based importance, most important first.
importances = model.feature_importances_
feature_imp_df = (
    pd.DataFrame({'Feature': features, 'Gini Importance': importances})
    .sort_values('Gini Importance', ascending=False)
)
print(feature_imp_df)

                precision    recall  f1-score   support

     CANDIDATE       0.99      0.99      0.99      1401
FALSE POSITIVE       0.99      0.99      0.99      1469

      accuracy                           0.99      2870
     macro avg       0.99      0.99      0.99      2870
  weighted avg       0.99      0.99      0.99      2870

Accuracy: 0.9878048780487805
              Feature  Gini Importance
0           koi_score         0.268889
1       koi_fpflag_nt         0.130610
3       koi_fpflag_co         0.108703
2       koi_fpflag_ss         0.061706
20           koi_prad         0.046773
21      koi_prad_err1         0.043332
22      koi_prad_err2         0.035491
4       koi_fpflag_ec         0.031059
30     koi_steff_err1         0.030671
5          koi_period         0.020238
24          koi_insol         0.017431
17          koi_depth         0.015045
31     koi_steff_err2         0.014558
25     koi_insol_err1         0.012224
15  koi_duration_err1         0.010191
6     ko

### Feature importance using transit-method features

In [37]:
# Reload the dataset from disk so this section starts from an unmodified frame
# (an earlier cell adds an encoded target column to `df`).
df = pd.read_csv("./NASA_Exoplant_df_merged_meadian.csv")

In [38]:
# Columns grouped here as disposition-related: a score plus four false-positive flags.
disposition_features = [
    'koi_score',
    'koi_fpflag_nt',
    'koi_fpflag_ss',
    'koi_fpflag_co',
    'koi_fpflag_ec',
]
# Transit-method columns available in the dataset.
# Transit features NOT in the dataset: koi_ror, koi_sma, koi_incl, koi_ingress, koi_num_transits
transit_features = [
    'koi_period',
    'koi_impact',
    'koi_duration',
    'koi_depth',
    'koi_model_snr',
    'koi_teq',
]

# Model inputs for this section: disposition columns first, then transit columns.
features = [*disposition_features, *transit_features]

In [39]:
# Encode the string target `koi_pdisposition` as integers. The fitted encoder is
# kept so its `classes_` can label the classification report later on.
le_pdisposition = LabelEncoder() # 2 classes
df['koi_pdisposition_encode'] = le_pdisposition.fit_transform(df['koi_pdisposition'])

Y = df['koi_pdisposition_encode']
X = df[features]

# Split BEFORE scaling. Fitting the scaler on the full table lets statistics of the
# held-out rows leak into preprocessing; the split itself depends only on the row
# count and the seed, so the same rows land in train/test as before.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=100)

# Learn the scaling parameters from the training rows only, then apply that same
# fitted transform to the held-out rows. Both results are NumPy arrays.
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [40]:
# Fit a random forest with library-default hyperparameters; the seed is fixed so
# repeated runs build the same trees. `fit` returns the estimator itself.
model = RandomForestClassifier(random_state=100).fit(X_train, Y_train)

# Score the held-out split.
Y_pred = model.predict(X_test)

print(classification_report(Y_test, Y_pred, target_names=le_pdisposition.classes_))
print(f"Accuracy: {accuracy_score(Y_test, Y_pred)}")

# Pair each selected feature name with its impurity-based importance, most important
# first. The names come from `features` because the inputs carry no column labels here.
importances = model.feature_importances_
feature_imp_df = (
    pd.DataFrame({'Feature': features, 'Gini Importance': importances})
    .sort_values('Gini Importance', ascending=False)
)
print(feature_imp_df)

                precision    recall  f1-score   support

     CANDIDATE       0.98      1.00      0.99      1401
FALSE POSITIVE       1.00      0.99      0.99      1469

      accuracy                           0.99      2870
     macro avg       0.99      0.99      0.99      2870
  weighted avg       0.99      0.99      0.99      2870

Accuracy: 0.9898954703832753
          Feature  Gini Importance
0       koi_score         0.392767
1   koi_fpflag_nt         0.170738
2   koi_fpflag_ss         0.148246
3   koi_fpflag_co         0.117842
8       koi_depth         0.034783
4   koi_fpflag_ec         0.030510
10        koi_teq         0.027793
5      koi_period         0.027202
9   koi_model_snr         0.017770
6      koi_impact         0.016200
7    koi_duration         0.016151


In [13]:
# Export a reduced table containing only the selected model inputs plus the
# identifier/label columns, for use outside this notebook.
df = pd.read_csv("./NASA_Exoplant_df_merged_meadian.csv")
disposition_features = ['koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec']
transit_features = ['koi_period', 'koi_impact', 'koi_duration', 'koi_depth', 'koi_model_snr', 'koi_teq']
labels = ['kepid', 'kepoi_name', 'koi_disposition', 'koi_pdisposition']
features = disposition_features + transit_features
# Column order in the export: model inputs first, then identifiers and labels.
features_labels = features + labels
# NOTE: despite the "transit" name, the exported frame also carries the
# disposition columns, because `features` includes both groups.
transit_df = df[features_labels]

# index=False keeps the pandas row index out of the written file.
transit_df.to_csv('NASA_Exoplanet_transit.csv', index=False)

In [8]:
# Sanity check: number of selected model inputs (disposition + transit columns).
len(features)

11

In [25]:
# Count rows per distinct value of the `koi_fpflag_ec` flag to see how balanced it is.
df['koi_fpflag_ec'].value_counts()

koi_fpflag_ec
0    8416
1    1148
Name: count, dtype: int64

In [28]:
# Frequency of each distinct `koi_depth` value, most common first.
# NOTE(review): a single value far more frequent than the rest may be an imputed
# fill value (the file name hints at median imputation) — confirm.
df['koi_depth'].value_counts()

koi_depth
421.0    370
134.0     25
116.0     25
126.0     23
120.0     22
        ... 
94.9       1
31.3       1
55.6       1
56.5       1
17.0       1
Name: count, Length: 2853, dtype: int64

In [None]:
# Frequency of each distinct `koi_model_snr` value, most common first.
# NOTE(review): a single value far more frequent than the rest may be an imputed
# fill value (the file name hints at median imputation) — confirm.
df['koi_model_snr'].value_counts()

koi_model_snr
23.0      374
9.8        49
9.2        49
9.7        48
10.1       46
         ... 
4.3         1
254.3       1
176.1       1
230.7       1
1406.8      1
Name: count, Length: 2740, dtype: int64

In [44]:
# Frequency of each distinct `koi_teq` value, most common first.
# NOTE(review): a single value far more frequent than the rest may be an imputed
# fill value (the file name hints at median imputation) — confirm.
df['koi_teq'].value_counts()

koi_teq
878.0     369
734.0      14
554.0      14
530.0      14
523.0      14
         ... 
3419.0      1
6757.0      1
2133.0      1
1802.0      1
3791.0      1
Name: count, Length: 2511, dtype: int64