In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw Kepler cumulative CSV (path is relative to the notebook's directory).
raw_csv_path = '../data/raw/kepler_cumulative.csv'
df = pd.read_csv(raw_csv_path)

In [2]:
# Display the loaded frame; the notebook renders a truncated head/tail preview.
df

Unnamed: 0,koi_name,planet_name,disposition,false_positive_type,ml_fittype,star_id,num_of_objects_around_star,fp_not_transit,fp_stellar_eclipse,fp_centroid_offset,...,centroid_right_ascension_offset,centroid_declination_offset,planet_star_right_ascension_offset,planet_star_declination_offset,angular_offset_right_ascension,angular_offset_declination,angular_offset_sky,insolation_flux,star_density,disposition_score
0,K00752.01,Kepler-227 b,CONFIRMED,NO_COMMENT,LS+MCMC,10797460,2,0,0,0,...,0.430,0.940,-0.000200,-0.000550,-0.010,0.200,0.200,93.59,3.20796,1.000
1,K00752.02,Kepler-227 c,CONFIRMED,NO_COMMENT,LS+MCMC,10797460,2,0,0,0,...,-0.630,1.230,0.000660,-0.001050,0.390,0.000,0.390,9.11,3.02368,0.969
2,K00753.01,,CANDIDATE,DEEP_V_SHAPED,LS+MCMC,10811496,1,0,0,0,...,-0.021,-0.038,0.000700,0.000600,-0.025,-0.034,0.042,39.30,7.29555,0.000
3,K00754.01,,FALSE POSITIVE,MOD_ODDEVEN_DV---MOD_ODDEVEN_ALT---DEEP_V_SHAPED,LS+MCMC,10848459,1,0,1,0,...,-0.111,0.002,0.003020,-0.001420,-0.249,0.147,0.289,891.96,0.22080,0.000
4,K00755.01,Kepler-664 b,CONFIRMED,NO_COMMENT,LS+MCMC,10854555,1,0,0,0,...,-0.010,0.230,0.000080,-0.000070,0.030,-0.090,0.100,926.16,1.98635,1.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9559,K07985.01,,FALSE POSITIVE,MOD_SEC_DV---PLANET_OCCULT_DV---MOD_SEC_ALT---...,LS+MCMC,10090151,1,0,1,1,...,0.792,-0.484,-0.000237,0.000178,2.132,-1.757,2.763,4500.53,0.16318,0.000
9560,K07986.01,,CANDIDATE,NO_COMMENT,LS+MCMC,10128825,1,0,0,0,...,0.100,2.000,-0.000018,-0.000074,0.740,-0.250,0.780,1585.81,0.50770,0.497
9561,K07987.01,,FALSE POSITIVE,CENT_RESOLVED_OFFSET,LS+MCMC,10147276,1,0,0,1,...,7.200,-5.600,-0.000720,0.000500,3.410,-3.650,5.000,5713.41,8.97692,0.021
9562,K07988.01,,CANDIDATE,ALL_TRANS_CHASES---CENT_SATURATED,LS+MCMC,10155286,1,0,0,0,...,0.310,0.010,-0.000290,-0.000100,1.061,1.320,1.690,22.68,85.88623,0.092


In [3]:
# Remove the planet_name column from the working frame.
# Rebinding `df` (rather than inplace=True) combined with errors='ignore' makes this
# cell safe to re-run: a second execution is a no-op instead of raising KeyError
# because the column is already gone.
df = df.drop(columns='planet_name', errors='ignore')

In [4]:
# Row-wise sum of the four false-positive flag columns, stored as fp_total.
df['fp_total'] = df[['fp_not_transit', 'fp_stellar_eclipse',
                     'fp_centroid_offset', 'fp_contamination']].sum(axis=1)

In [5]:
# Frequency of each fp_total value.
# NOTE(review): the recorded output contains a single row with a total of 465, so at
# least one fp_* column holds a value other than 0/1 -- worth checking in the raw data.
df['fp_total'].value_counts()

0      4799
1      3230
2      1042
3       489
4         3
465       1
Name: fp_total, dtype: int64

In [6]:
# Build the multi-class subset: every CONFIRMED row, plus FALSE POSITIVE rows whose
# flag total is at most 1. The result is copied so later edits do not touch `df`.
is_confirmed = df['disposition'] == 'CONFIRMED'
is_false_positive = df['disposition'] == 'FALSE POSITIVE'
has_at_most_one_flag = df['fp_total'] <= 1
multi_class_df = df.loc[(has_at_most_one_flag & is_false_positive) | is_confirmed].copy()

In [7]:
# Print column dtypes and non-null counts for the multi-class subset before cleaning.
multi_class_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5976 entries, 0 to 9561
Data columns (total 59 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   koi_name                            5976 non-null   object 
 1   disposition                         5976 non-null   object 
 2   false_positive_type                 5336 non-null   object 
 3   ml_fittype                          5976 non-null   object 
 4   star_id                             5976 non-null   int64  
 5   num_of_objects_around_star          5976 non-null   int64  
 6   fp_not_transit                      5976 non-null   int64  
 7   fp_stellar_eclipse                  5976 non-null   int64  
 8   fp_centroid_offset                  5976 non-null   int64  
 9   fp_contamination                    5976 non-null   int64  
 10  right_ascension                     5976 non-null   float64
 11  declination                         5976 no

In [8]:
# Keep only the rows that have no missing value in any column.
# NOTE(review): per the info() output above, false_positive_type has nulls and is
# dropped from the frame further down, so some rows are discarded here because of a
# column that does not survive -- confirm this is intended.
multi_class_df = multi_class_df.dropna()

In [10]:
# Discard FALSE POSITIVE rows that have none of the four fp_* flags set: they carry no
# sub-class, so encode_disposition would return None for them.
#
# The flags are integer columns, so the test is written as an explicit `== 0`.
# Applying `~` to an int64 Series is a bitwise NOT (~0 == -1, ~1 == -2); combined with
# a boolean mask it only gives the right answer for values that are exactly 0 or 1,
# and would treat any even non-zero flag as "not set".
fp_flag_columns = ['fp_not_transit', 'fp_stellar_eclipse',
                   'fp_centroid_offset', 'fp_contamination']
is_false_positive = multi_class_df.disposition == 'FALSE POSITIVE'
no_flag_set = (multi_class_df[fp_flag_columns] == 0).all(axis=1)
multi_class_df = multi_class_df.loc[~(is_false_positive & no_flag_set)].copy()

In [11]:
def encode_disposition(row):
    """Map one row to an integer class label.

    Parameters
    ----------
    row : object exposing ``disposition`` and the four ``fp_*`` flag attributes
        (for example a row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    int or None
        0 when ``disposition`` is ``'CONFIRMED'``; otherwise the code of the first
        truthy flag, checked in this order: ``fp_not_transit`` -> 1,
        ``fp_stellar_eclipse`` -> 2, ``fp_centroid_offset`` -> 3,
        ``fp_contamination`` -> 4. ``None`` when the row is not confirmed and no
        flag is set.
    """
    if row.disposition == 'CONFIRMED':
        return 0

    # Order matters: the first raised flag wins when several are set.
    flag_codes = (
        ('fp_not_transit', 1),
        ('fp_stellar_eclipse', 2),
        ('fp_centroid_offset', 3),
        ('fp_contamination', 4),
    )
    for flag_name, code in flag_codes:
        if getattr(row, flag_name):
            return code

    # No flag raised on a non-confirmed row: nothing to encode.
    return None
    
# Label every row with its integer class; int() raises if encode_disposition
# returns None, so an unclassifiable row fails loudly here.
multi_class_df['classification'] = multi_class_df.apply(
    lambda record: int(encode_disposition(record)),
    axis = 1,
)

In [13]:
# Remove the identifier, free-text, and label-source columns now that the
# `classification` target has been derived from them.
# NOTE(review): disposition_score and star_id are not in this list and so remain in
# the frame; if it feeds a classifier, confirm they are handled later.
columns_to_discard = [
    'koi_name', 'false_positive_type', 'fp_total', 'disposition',
    'fp_not_transit', 'fp_stellar_eclipse', 'fp_centroid_offset', 'fp_contamination',
]
multi_class_df = multi_class_df.drop(columns=columns_to_discard)

In [14]:
# Print column dtypes and non-null counts again to confirm the cleaned frame's shape.
multi_class_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4260 entries, 0 to 9561
Data columns (total 52 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   ml_fittype                          4260 non-null   object 
 1   star_id                             4260 non-null   int64  
 2   num_of_objects_around_star          4260 non-null   int64  
 3   right_ascension                     4260 non-null   float64
 4   declination                         4260 non-null   float64
 5   g_band_mag                          4260 non-null   float64
 6   r_band_mag                          4260 non-null   float64
 7   i_band_mag                          4260 non-null   float64
 8   z_band_mag                          4260 non-null   float64
 9   j_band_mag                          4260 non-null   float64
 10  h_band_mag                          4260 non-null   float64
 11  k_band_mag                          4260 no

In [11]:
# Binary subset: keep only rows whose disposition is CONFIRMED or FALSE POSITIVE,
# copied so later edits do not touch `df`.
labelled_dispositions = ['CONFIRMED', 'FALSE POSITIVE']
binary_class_df = df.loc[df['disposition'].isin(labelled_dispositions)].copy()

In [12]:
# Display the binary subset; the notebook renders a truncated head/tail preview.
binary_class_df

Unnamed: 0,koi_name,planet_name,disposition,false_positive_type,ml_fittype,star_id,num_of_objects_around_star,fp_not_transit,fp_stellar_eclipse,fp_centroid_offset,...,centroid_declination_offset,planet_star_right_ascension_offset,planet_star_declination_offset,angular_offset_right_ascension,angular_offset_declination,angular_offset_sky,insolation_flux,star_density,disposition_score,fp_total
0,K00752.01,Kepler-227 b,CONFIRMED,NO_COMMENT,LS+MCMC,10797460,2,0,0,0,...,0.940,-0.000200,-0.000550,-0.010,0.200,0.200,93.59,3.20796,1.000,0
1,K00752.02,Kepler-227 c,CONFIRMED,NO_COMMENT,LS+MCMC,10797460,2,0,0,0,...,1.230,0.000660,-0.001050,0.390,0.000,0.390,9.11,3.02368,0.969,0
3,K00754.01,,FALSE POSITIVE,MOD_ODDEVEN_DV---MOD_ODDEVEN_ALT---DEEP_V_SHAPED,LS+MCMC,10848459,1,0,1,0,...,0.002,0.003020,-0.001420,-0.249,0.147,0.289,891.96,0.22080,0.000,1
4,K00755.01,Kepler-664 b,CONFIRMED,NO_COMMENT,LS+MCMC,10854555,1,0,0,0,...,0.230,0.000080,-0.000070,0.030,-0.090,0.100,926.16,1.98635,1.000,0
5,K00756.01,Kepler-228 d,CONFIRMED,NO_COMMENT,LS+MCMC,10872983,3,0,0,0,...,0.140,0.000040,,0.040,-0.070,0.080,114.81,0.67324,1.000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9557,K08198.01,,FALSE POSITIVE,ALL_TRANS_CHASES---INCONSISTENT_TRANS---CENT_F...,LS+MCMC,10205598,1,1,0,0,...,-0.300,0.000600,0.000300,2.360,-0.990,2.550,0.42,0.00771,0.008,1
9558,K07984.01,,FALSE POSITIVE,EPHEM_MATCH,LS+MCMC,10031643,1,0,0,0,...,1.200,-0.000080,-0.000100,1.260,-1.200,1.750,176.40,0.18863,0.000,1
9559,K07985.01,,FALSE POSITIVE,MOD_SEC_DV---PLANET_OCCULT_DV---MOD_SEC_ALT---...,LS+MCMC,10090151,1,0,1,1,...,-0.484,-0.000237,0.000178,2.132,-1.757,2.763,4500.53,0.16318,0.000,2
9561,K07987.01,,FALSE POSITIVE,CENT_RESOLVED_OFFSET,LS+MCMC,10147276,1,0,0,1,...,-5.600,-0.000720,0.000500,3.410,-3.650,5.000,5713.41,8.97692,0.021,1
