## Comparison of different standard scikit-learn models for finding exoplanets using the NASA K2 database

In [68]:
import pandas as pd
import numpy as np
# Load the exported K2 table; comment="#" tells pandas to skip the "#"-prefixed
# header lines of the CSV.
df = pd.read_csv("k2pandc_2025.10.02_03.11.29.csv", comment="#")

# Keep only the rows whose disposition is neither CANDIDATE nor REFUTED.
is_excluded = df["disposition"].isin(["CANDIDATE", "REFUTED"])
df_filtered = df[~is_excluded]

# Later cells read the filtered table through the name `df`.
df = df_filtered
df


Unnamed: 0,pl_name,hostname,default_flag,disposition,disp_refname,sy_snum,sy_pnum,discoverymethod,disc_year,disc_facility,...,sy_vmagerr2,sy_kmag,sy_kmagerr1,sy_kmagerr2,sy_gaiamag,sy_gaiamagerr1,sy_gaiamagerr2,rowupdate,pl_pubdate,releasedate
0,K2-2016-BLG-0005L b,K2-2016-BLG-0005L,1,CONFIRMED,Specht et al. 2023,1.0,1.0,Microlensing,2023.0,K2,...,,,,,,,,2023-05-03,2023-04,2023-05-03
1,K2-2016-BLG-0005L b,K2-2016-BLG-0005L,0,CONFIRMED,Specht et al. 2023,1.0,1.0,Microlensing,2023.0,K2,...,,,,,,,,2023-05-03,2023-04,2023-05-03
2,HD 137496 c,HD 137496,1,CONFIRMED,Azevedo Silva et al. 2022,1.0,2.0,Radial Velocity,2021.0,Multiple Observatories,...,-0.006,8.244000,0.036,-0.036,9.77168,0.000601,-0.000601,2021-11-18,2022-01,2021-11-18
3,HD 224018 b,HD 224018,1,CONFIRMED,Damasso et al. 2025,1.0,3.0,Radial Velocity,2025.0,Roque de los Muchachos Observatory,...,-0.005,8.145001,0.027,-0.027,9.52137,0.000154,-0.000154,2025-09-17,2025-08,2025-09-17
4,HD 3167 d,HD 3167,1,CONFIRMED,Christiansen et al. 2017,1.0,4.0,Radial Velocity,2017.0,Multiple Observatories,...,-0.030,7.066000,0.020,-0.020,8.76221,0.000265,-0.000265,2023-04-17,2023-04,2023-04-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3999,WASP-85 A b,WASP-85 A,0,CONFIRMED,Mo&,,,,,,...,,,,,,,,,,
4000,Wolf 503 b,Wolf 503,1,CONFIRMED,Peterson et al. 2018,1.0,1.0,Transit,2018.0,K2,...,-0.030,7.617000,0.023,-0.023,9.89816,0.000337,-0.000337,2022-05-23,2021-12,2022-05-23
4001,Wolf 503 b,Wolf 503,0,CONFIRMED,Peterson et al. 2018,1.0,1.0,Transit,2018.0,K2,...,-0.030,7.617000,0.023,-0.023,9.89816,0.000337,-0.000337,2018-09-04,2018-11,2018-09-06
4002,Wolf 503 b,Wolf 503,0,CONFIRMED,Peterson et al. 2018,1.0,1.0,Transit,2018.0,K2,...,-0.030,7.617000,0.023,-0.023,9.89816,0.000337,-0.000337,2023-04-17,2023-04,2023-04-17


We start by excluding the CANDIDATE and REFUTED rows, because they won't help our model learn.

Now we'll try to figure out which features might be helpful.

In [69]:
# Candidate input columns for the model, listed one per line.
# The order matters: it becomes the column order of the feature matrix.
features = [
    # sy_* columns (system-level counts)
    "sy_snum",
    "sy_pnum",
    # pl_* columns (planet properties); note that pl_rade/pl_radj and
    # pl_bmasse/pl_bmassj look like the same quantity in two unit systems
    "pl_orbper",
    "pl_orbsmax",
    "pl_rade",
    "pl_radj",
    "pl_bmasse",
    "pl_bmassj",
    "pl_orbeccen",
    "pl_insol",
    "pl_eqt",
    # st_* columns (presumably host-star properties)
    "st_teff",
    "st_rad",
    "st_mass",
    "st_met",
    "st_logg",
    # sky position, distance and magnitudes
    "ra",
    "dec",
    "sy_dist",
    "sy_vmag",
    "sy_kmag",
    "sy_gaiamag",
]

These are all the numerical features that might carry useful information.
Now we have to prepare our feature matrix X and target vector y.

In [70]:
# Feature matrix: exactly the columns named in `features`, in that order.
# Plain column selection is used instead of DataFrame.filter(items=...),
# because filter() silently skips names that are not columns, so a typo in
# `features` would quietly shrink the feature set. Selecting with a list
# raises KeyError for any missing column instead.
X = df[features]

# Target vector: the disposition label of every remaining row.
y = df["disposition"]
y

0       CONFIRMED
1       CONFIRMED
2       CONFIRMED
3       CONFIRMED
4       CONFIRMED
          ...    
3999    CONFIRMED
4000    CONFIRMED
4001    CONFIRMED
4002    CONFIRMED
4003    CONFIRMED
Name: disposition, Length: 2608, dtype: object

We can see that there are a lot of NaN values in our dataset, so we'll replace each of them with the median of its column, which keeps the rows usable while limiting the influence of the missing entries on the model.

In [71]:
from sklearn.impute import SimpleImputer

# Replace every NaN with the median of its column.
# NOTE(review): the imputer is fitted on the whole table, before the
# train/test split performed later in the notebook, so test rows contribute
# to the medians. Fitting on the training part only would avoid that.
imputer = SimpleImputer(strategy="median")

# fit_transform() returns a plain array, so the frame has to be rebuilt.
# Pass index=X.index to keep the original row labels: without it the frame is
# renumbered 0..n-1 and no longer lines up by label with `y`, whose index
# still carries the labels of the filtered table.
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)
X

Unnamed: 0,sy_snum,sy_pnum,pl_orbper,pl_orbsmax,pl_rade,pl_radj,pl_bmasse,pl_bmassj,pl_orbeccen,pl_insol,...,st_rad,st_mass,st_met,st_logg,ra,dec,sy_dist,sy_vmag,sy_kmag,sy_gaiamag
0,1.0,1.0,4700.000000,4.16000,2.48000,0.220219,349.611248,1.100000,0.077,70.0000,...,0.860000,0.584,0.00,4.50000,269.879833,-27.607472,5200.0000,12.6915,10.471000,12.43640
1,1.0,1.0,4700.000000,4.11000,2.48000,0.220219,368.680952,1.160000,0.077,70.0000,...,0.860000,0.574,0.00,4.50000,269.879833,-27.607472,5260.0000,12.6915,10.471000,12.43640
2,1.0,2.0,479.900000,1.21630,2.48000,0.220219,2434.565598,7.660000,0.477,70.0000,...,1.590000,1.040,-0.03,4.05000,231.742054,-16.509001,155.3170,9.8990,8.244000,9.77168
3,1.0,3.0,10.641300,0.09520,0.91000,0.081185,4.100000,0.012900,0.060,70.0000,...,1.147000,1.013,0.05,4.32000,358.639006,-4.723336,105.4800,9.7150,8.145001,9.52137
4,1.0,4.0,8.411200,0.07630,2.48000,0.220219,4.330000,0.013624,0.120,70.0000,...,0.880000,0.837,0.04,4.50000,8.740149,4.380721,47.2899,8.9700,7.066000,8.76221
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2603,1.0,1.0,7.122605,0.06698,2.48000,0.220219,12.000000,0.037756,0.077,70.0000,...,0.860000,0.880,0.00,4.50000,170.103103,1.285868,231.8280,12.6915,10.471000,12.43640
2604,1.0,1.0,6.001270,0.05706,2.04300,0.182265,6.260000,0.019696,0.410,70.0000,...,0.689000,0.688,-0.47,4.62000,206.846198,-6.139337,44.5260,10.2700,7.617000,9.89816
2605,1.0,1.0,6.001180,0.05710,2.03000,0.181104,12.000000,0.037756,0.077,69.6000,...,0.690000,0.688,-0.47,4.62000,206.846198,-6.139337,44.5260,10.2700,7.617000,9.89816
2606,1.0,1.0,6.001270,0.05712,2.04300,0.182265,6.270000,0.019728,0.409,64.7000,...,0.689000,0.688,-0.47,4.50000,206.846198,-6.139337,44.5260,10.2700,7.617000,9.89816


In [72]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Hold out half of the rows for testing; stratifying on y keeps the class
# proportions the same in both halves.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    stratify=y,
    random_state=42,
)

# fit() returns the fitted estimator itself, so `model` is the trained forest.
forest = RandomForestClassifier(n_estimators=200, random_state=42)
model = forest.fit(X_train, y_train)

# Predictions are stored in `y_pred` for the metric cells further down.
y_pred = model.predict(X_test)

# The printed label is Polish for "Accuracy on the test set".
test_accuracy = model.score(X_test, y_test)
print("Dokładność na zbiorze testowym:", test_accuracy)

Dokładność na zbiorze testowym: 1.0


In [73]:
from sklearn.metrics import precision_score, recall_score, f1_score


# Score the test predictions with "CONFIRMED" as the positive class.
# The three metrics share one call signature, so they are evaluated in a
# single pass and unpacked in order.
precision, recall, f1 = (
    metric(y_test, y_pred, pos_label="CONFIRMED")
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")

Precision: 1.000, Recall: 1.000, F1: 1.000


Now we want to show which of our features matter most.

In [74]:
# Pair each importance score of the fitted forest with its column name.
fi = pd.Series(data=model.feature_importances_, index=X.columns)

# Show the ranking with the most important feature first.
fi_ranked = fi.sort_values(ascending=False)
print("\nFeature Importance (RandomForest):", fi_ranked, sep="\n")


Feature Importance (RandomForest):
sy_pnum        0.568766
sy_dist        0.064078
sy_kmag        0.056068
pl_radj        0.053004
pl_rade        0.046559
pl_orbper      0.041987
st_rad         0.036412
dec            0.028418
sy_gaiamag     0.023654
st_teff        0.016398
ra             0.014840
st_logg        0.014111
sy_vmag        0.010037
st_met         0.008393
st_mass        0.005781
pl_eqt         0.004261
pl_orbsmax     0.002481
pl_bmassj      0.001347
pl_insol       0.001134
pl_bmasse      0.001005
sy_snum        0.000884
pl_orbeccen    0.000382
dtype: float64


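We can see that sy_pnum was by far the most important feature. Combined with the perfect scores above, this suggests that the label may be leaking through sy_pnum (for example, if false positives are systematically recorded with a different planet count), so this should be checked before trusting the result.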

In [75]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix: rows are the true labels, columns the predicted labels,
# both in sorted label order (CONFIRMED first, then FALSE POSITIVE).
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Per-class precision / recall / F1 with support counts.
report = classification_report(y_test, y_pred)
print(report)

[[1158    0]
 [   0  146]]
                precision    recall  f1-score   support

     CONFIRMED       1.00      1.00      1.00      1158
FALSE POSITIVE       1.00      1.00      1.00       146

      accuracy                           1.00      1304
     macro avg       1.00      1.00      1.00      1304
  weighted avg       1.00      1.00      1.00      1304

