# Import Packages

In [1]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Load Dataframe

In [2]:
# Load in dataframe from csv
df = pd.read_csv('../data/raw/kepler_cumulative.csv')
df

Unnamed: 0,koi_name,disposition,false_positive_type,ml_fittype,num_of_objects_around_star,fp_not_transit,fp_stellar_eclipse,fp_centroid_offset,fp_contamination,right_ascension,...,centroid_declination,centroid_right_ascension_offset,centroid_declination_offset,planet_star_right_ascension_offset,planet_star_declination_offset,angular_offset_right_ascension,angular_offset_declination,angular_offset_sky,insolation_flux,star_density
0,K00752.01,CONFIRMED,NO_COMMENT,LS+MCMC,2,0,0,0,0,291.93423,...,48.141910,0.430,0.940,-0.000200,-0.000550,-0.010,0.200,0.200,93.59,3.20796
1,K00752.02,CONFIRMED,NO_COMMENT,LS+MCMC,2,0,0,0,0,291.93423,...,48.141990,-0.630,1.230,0.000660,-0.001050,0.390,0.000,0.390,9.11,3.02368
2,K00753.01,CANDIDATE,DEEP_V_SHAPED,LS+MCMC,1,0,0,0,0,297.00482,...,48.134120,-0.021,-0.038,0.000700,0.000600,-0.025,-0.034,0.042,39.30,7.29555
3,K00754.01,FALSE POSITIVE,MOD_ODDEVEN_DV---MOD_ODDEVEN_ALT---DEEP_V_SHAPED,LS+MCMC,1,0,1,0,0,285.53461,...,48.285210,-0.111,0.002,0.003020,-0.001420,-0.249,0.147,0.289,891.96,0.22080
4,K00755.01,CONFIRMED,NO_COMMENT,LS+MCMC,1,0,0,0,0,288.75488,...,48.226260,-0.010,0.230,0.000080,-0.000070,0.030,-0.090,0.100,926.16,1.98635
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9559,K07985.01,FALSE POSITIVE,MOD_SEC_DV---PLANET_OCCULT_DV---MOD_SEC_ALT---...,LS+MCMC,1,0,1,1,0,297.18875,...,47.093686,0.792,-0.484,-0.000237,0.000178,2.132,-1.757,2.763,4500.53,0.16318
9560,K07986.01,CANDIDATE,NO_COMMENT,LS+MCMC,1,0,0,0,0,286.50937,...,47.163770,0.100,2.000,-0.000018,-0.000074,0.740,-0.250,0.780,1585.81,0.50770
9561,K07987.01,FALSE POSITIVE,CENT_RESOLVED_OFFSET,LS+MCMC,1,0,0,1,0,294.16489,...,47.174720,7.200,-5.600,-0.000720,0.000500,3.410,-3.650,5.000,5713.41,8.97692
9562,K07988.01,CANDIDATE,ALL_TRANS_CHASES---CENT_SATURATED,LS+MCMC,1,0,0,0,0,296.76288,...,47.145142,0.310,0.010,-0.000290,-0.000100,1.061,1.320,1.690,22.68,85.88623


# Split Into Data into Subsets

In [3]:
# Creating a row representing the total number of false positive flags marked true for each record
df['fp_total'] = df.fp_not_transit + df.fp_stellar_eclipse + df.fp_centroid_offset + df.fp_contamination
df.fp_total.value_counts()

0      4799
1      3230
2      1042
3       489
4         3
465       1
Name: fp_total, dtype: int64

In [4]:
# Split df into two datasets - one for multiclass classification and one for binary class classification
# In order to have clearly disjoint classes in the multiclass case, choose only those with a confirmed disposition
# or those which have a false positive disposition and only 1 false positive flag set
multi_class_df = df.loc[((df.fp_total == 1) & (df.disposition == 'FALSE POSITIVE')) | (df.disposition == 'CONFIRMED')].copy()
binary_class_df = df.loc[(df.disposition == 'CONFIRMED') | (df.disposition == 'FALSE POSITIVE')].copy()

# Clean Multi-class Dataset

In [5]:
# Inspect multi_class_df
multi_class_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5881 entries, 0 to 9561
Data columns (total 57 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   koi_name                            5881 non-null   object 
 1   disposition                         5881 non-null   object 
 2   false_positive_type                 5335 non-null   object 
 3   ml_fittype                          5881 non-null   object 
 4   num_of_objects_around_star          5881 non-null   int64  
 5   fp_not_transit                      5881 non-null   int64  
 6   fp_stellar_eclipse                  5881 non-null   int64  
 7   fp_centroid_offset                  5881 non-null   int64  
 8   fp_contamination                    5881 non-null   int64  
 9   right_ascension                     5881 non-null   float64
 10  declination                         5881 non-null   float64
 11  g_band_mag                          5863 no

In [6]:
# Helper function which label encodes class based off of disposition and false positive flags
def encode_disposition(row):
    if row.disposition == 'CONFIRMED':
        return 0
    if row.fp_not_transit:
        return 1
    elif row.fp_stellar_eclipse:
        return 2
    elif row.fp_centroid_offset:
        return 3
    elif row.fp_contamination:
        return 4

# The data is relatively clean and none of the columns have good options for imputing - drop rows containing nans
# Create a classification column using above helper function, then drop unneeded columns
multi_class_df.dropna(inplace = True)
multi_class_df['classification'] = multi_class_df.apply(lambda x: int(encode_disposition(x)), axis = 1)
multi_class_df.drop(['koi_name', 'false_positive_type', 'fp_total', 'disposition',
                     'fp_not_transit', 'fp_stellar_eclipse', 'fp_centroid_offset', 'fp_contamination'],
                    axis = 1, inplace = True)
multi_class_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4260 entries, 0 to 9561
Data columns (total 50 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   ml_fittype                          4260 non-null   object 
 1   num_of_objects_around_star          4260 non-null   int64  
 2   right_ascension                     4260 non-null   float64
 3   declination                         4260 non-null   float64
 4   g_band_mag                          4260 non-null   float64
 5   r_band_mag                          4260 non-null   float64
 6   i_band_mag                          4260 non-null   float64
 7   z_band_mag                          4260 non-null   float64
 8   j_band_mag                          4260 non-null   float64
 9   h_band_mag                          4260 non-null   float64
 10  k_band_mag                          4260 non-null   float64
 11  kepler_band_mag                     4260 no

In [7]:
# Write dataframe to parquet file for further use
# multi_class_df.to_parquet('../data/cleaned/multi_class_data.parquet')

# Clean Binary Class Dataset

In [8]:
# Inspect binary_class_df
binary_class_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7510 entries, 0 to 9563
Data columns (total 57 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   koi_name                            7510 non-null   object 
 1   disposition                         7510 non-null   object 
 2   false_positive_type                 6826 non-null   object 
 3   ml_fittype                          7510 non-null   object 
 4   num_of_objects_around_star          7510 non-null   int64  
 5   fp_not_transit                      7510 non-null   int64  
 6   fp_stellar_eclipse                  7510 non-null   int64  
 7   fp_centroid_offset                  7510 non-null   int64  
 8   fp_contamination                    7510 non-null   int64  
 9   right_ascension                     7510 non-null   float64
 10  declination                         7510 non-null   float64
 11  g_band_mag                          7476 no

In [9]:
# Perform the same cleaning operations - drop unnecessary columns, drop rows with nans
binary_class_df.drop(['koi_name', 'false_positive_type', 'fp_total', 'fp_not_transit',
                      'fp_stellar_eclipse', 'fp_centroid_offset', 'fp_contamination'],
                    axis = 1, inplace = True)
binary_class_df.dropna(inplace = True)
binary_class_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5365 entries, 0 to 9563
Data columns (total 50 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   disposition                         5365 non-null   object 
 1   ml_fittype                          5365 non-null   object 
 2   num_of_objects_around_star          5365 non-null   int64  
 3   right_ascension                     5365 non-null   float64
 4   declination                         5365 non-null   float64
 5   g_band_mag                          5365 non-null   float64
 6   r_band_mag                          5365 non-null   float64
 7   i_band_mag                          5365 non-null   float64
 8   z_band_mag                          5365 non-null   float64
 9   j_band_mag                          5365 non-null   float64
 10  h_band_mag                          5365 non-null   float64
 11  k_band_mag                          5365 no

In [10]:
# Split into X and y dataframes to use LabelEncoder() on the target variable
# Reconvert back into dataframe - target variable 'disposition' encoded as 'CONFIRMED' = 0, 'FALSE POSITIVE' = 1
X_binary_class = binary_class_df.drop('disposition', axis = 1)
y_binary_class = binary_class_df.disposition

lec = LabelEncoder()

y_binary_class = pd.DataFrame(lec.fit_transform(y_binary_class),
                              index = y_binary_class.index,
                              columns = y_binary_class.to_frame().columns)

binary_class_df = pd.concat([X_binary_class, y_binary_class], axis = 1)
binary_class_df

Unnamed: 0,ml_fittype,num_of_objects_around_star,right_ascension,declination,g_band_mag,r_band_mag,i_band_mag,z_band_mag,j_band_mag,h_band_mag,...,centroid_right_ascension_offset,centroid_declination_offset,planet_star_right_ascension_offset,planet_star_declination_offset,angular_offset_right_ascension,angular_offset_declination,angular_offset_sky,insolation_flux,star_density,disposition
0,LS+MCMC,2,291.93423,48.141651,15.890,15.270,15.114,15.006,14.082,13.751,...,0.430,0.940,-0.000200,-0.000550,-0.010,0.200,0.200,93.59,3.20796,0
1,LS+MCMC,2,291.93423,48.141651,15.890,15.270,15.114,15.006,14.082,13.751,...,-0.630,1.230,0.000660,-0.001050,0.390,0.000,0.390,9.11,3.02368,0
3,LS+MCMC,1,285.53461,48.285210,16.100,15.554,15.382,15.266,14.326,13.911,...,-0.111,0.002,0.003020,-0.001420,-0.249,0.147,0.289,891.96,0.22080,1
4,LS+MCMC,1,288.75488,48.226200,16.015,15.468,15.292,15.241,14.366,14.064,...,-0.010,0.230,0.000080,-0.000070,0.030,-0.090,0.100,926.16,1.98635,0
8,LS+MCMC,1,298.86435,42.151569,12.999,12.609,12.515,12.480,11.659,11.415,...,-13.450,24.090,0.003032,-0.005549,-4.506,7.710,8.930,767.22,0.00485,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9557,LS+MCMC,1,290.93597,47.290730,15.534,14.863,14.643,14.487,13.487,13.066,...,-1.000,-0.300,0.000600,0.000300,2.360,-0.990,2.550,0.42,0.00771,1
9558,LS+MCMC,1,298.74921,46.973351,15.064,14.441,14.227,14.144,13.153,12.847,...,1.100,1.200,-0.000080,-0.000100,1.260,-1.200,1.750,176.40,0.18863,1
9559,LS+MCMC,1,297.18875,47.093819,14.082,13.501,13.299,13.177,12.200,11.814,...,0.792,-0.484,-0.000237,0.000178,2.132,-1.757,2.763,4500.53,0.16318,1
9561,LS+MCMC,1,294.16489,47.176281,15.853,15.347,15.185,15.158,14.220,13.913,...,7.200,-5.600,-0.000720,0.000500,3.410,-3.650,5.000,5713.41,8.97692,1


In [11]:
# Write dataframe to parquet file for further use
# binary_class_df.to_parquet('../data/cleaned/binary_class_data.parquet')