Este notebook realiza a limpeza da base de dados: tratamento de valores nulos, eliminação de colunas desnecessárias e padronização dos dados.

In [1]:
import pandas as pd
import os

In [2]:
# Execute ../utils/constants.py inside this notebook's namespace so the names it
# defines become available here (later cells use DATA_DIRPATH, which is
# presumably defined there — confirm).

%run ../utils/constants.py

In [3]:
# Path to the Kepler dataset CSV ("kepler_data.csv" inside DATA_DIRPATH).
BASEPATH = os.path.join(DATA_DIRPATH, "kepler_data.csv")

In [4]:
# Load the Kepler CSV into the working DataFrame.
df = pd.read_csv(BASEPATH)

In [5]:
# Discard every record whose disposition is CANDIDATE; all other rows are kept.
df = df.loc[df["koi_disposition"].ne("CANDIDATE")]

In [6]:
# Count how many rows remain for each disposition value after the filter.
df["koi_disposition"].value_counts()

koi_disposition
FALSE POSITIVE    4839
CONFIRMED         2743
Name: count, dtype: int64

In [7]:
# Columns to be removed: every column whose name contains "_err",
# followed by a fixed list of additional columns.

err_columns = [column for column in df.columns if "_err" in column]
other_columns = ["kepid", "kepoi_name", "kepler_name", "koi_pdisposition", "koi_score"]

removed_columns = [*err_columns, *other_columns]

In [8]:
# Drop the unneeded columns. The result goes into a new frame, so `df` keeps
# its full set of columns.

df_removed_columns = df.drop(columns=removed_columns)

In [156]:
# Show the remaining columns with their non-null counts and dtypes.
df_removed_columns.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7582 entries, 0 to 9563
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   koi_disposition    7582 non-null   object 
 1   koi_fpflag_nt      7582 non-null   int64  
 2   koi_fpflag_ss      7582 non-null   int64  
 3   koi_fpflag_co      7582 non-null   int64  
 4   koi_fpflag_ec      7582 non-null   int64  
 5   koi_period         7582 non-null   float64
 6   koi_time0bk        7582 non-null   float64
 7   koi_impact         7323 non-null   float64
 8   koi_duration       7582 non-null   float64
 9   koi_depth          7323 non-null   float64
 10  koi_prad           7323 non-null   float64
 11  koi_teq            7323 non-null   float64
 12  koi_insol          7363 non-null   float64
 13  koi_model_snr      7323 non-null   float64
 14  koi_tce_plnt_num   7296 non-null   float64
 15  koi_tce_delivname  7296 non-null   object 
 16  koi_steff          7323 non-n

In [9]:
# Checkpoint: write the column-reduced frame to "kepler_removed_columns.csv"
# in DATA_DIRPATH. index=False keeps the DataFrame index out of the file.
REMOVED_COLUMNS_FILEPATH = os.path.join(DATA_DIRPATH, "kepler_removed_columns.csv")
df_removed_columns.to_csv(REMOVED_COLUMNS_FILEPATH, index=False)

In [10]:
# Read the checkpoint back; from this cell on, `df` holds the column-reduced
# data with a fresh default index.
df = pd.read_csv(REMOVED_COLUMNS_FILEPATH)

In [11]:
# Remove every row that has a null in at least one column.
df = df.dropna(axis="index", how="any")

In [12]:
# Confirm that no column has nulls left and see how many rows survived.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7112 entries, 0 to 7581
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   koi_disposition    7112 non-null   object 
 1   koi_fpflag_nt      7112 non-null   int64  
 2   koi_fpflag_ss      7112 non-null   int64  
 3   koi_fpflag_co      7112 non-null   int64  
 4   koi_fpflag_ec      7112 non-null   int64  
 5   koi_period         7112 non-null   float64
 6   koi_time0bk        7112 non-null   float64
 7   koi_impact         7112 non-null   float64
 8   koi_duration       7112 non-null   float64
 9   koi_depth          7112 non-null   float64
 10  koi_prad           7112 non-null   float64
 11  koi_teq            7112 non-null   float64
 12  koi_insol          7112 non-null   float64
 13  koi_model_snr      7112 non-null   float64
 14  koi_tce_plnt_num   7112 non-null   float64
 15  koi_tce_delivname  7112 non-null   object 
 16  koi_steff          7112 non-n

In [13]:
# Checkpoint: write the null-free frame to "kepler_removed_null_instances.csv"
# in DATA_DIRPATH, without the DataFrame index.
REMOVED_NULL_INSTANCES_FILEPATH = os.path.join(DATA_DIRPATH, "kepler_removed_null_instances.csv")
df.to_csv(REMOVED_NULL_INSTANCES_FILEPATH, index=False)

In [14]:
# Path to the dataset with converted column types. No visible cell of this
# notebook writes this file, so it is presumably produced by a separate step
# that must run first — confirm.
# NOTE(review): "coverted" in the file name looks like a typo for "converted";
# it is left unchanged because it has to match the file that exists on disk.
CONVERTED_TYPES_PATH = os.path.join(DATA_DIRPATH, "kepler_coverted_types.csv")

In [15]:
# Replace `df` with the contents of the converted-types file; the frame built
# by the previous cells is no longer referenced by `df` after this line.
df = pd.read_csv(CONVERTED_TYPES_PATH)

In [16]:
# Drop the "koi_time0bk" and "koi_period" columns.

removed_columns = ["koi_time0bk", "koi_period"]

df = df.drop(columns=removed_columns)

In [17]:
# Drop "dec_str" and "ra_str": the columns that were converted from object
# to float, so these versions are no longer needed.

removed_columns = ["dec_str", "ra_str"]

df = df.drop(columns=removed_columns)

In [18]:
# Save the final frame, without the DataFrame index.
# NOTE(review): this reuses REMOVED_COLUMNS_FILEPATH, so it overwrites the
# checkpoint written earlier in this notebook with different content. Confirm
# that consumers of "kepler_removed_columns.csv" expect this final version.
df.to_csv(REMOVED_COLUMNS_FILEPATH, index=False)