Esse arquivo serve para realizar qualquer transformação de dados na base, ou seja, a normalização das colunas e conversão de valores simbólicos em numéricos

In [5]:
import pandas as pd
import os

In [6]:
# Importando as constantes de constant.py no notebook

%run ../utils/constants.py

In [7]:
REMOVED_NULL_INSTANCES_FILEPATH = os.path.join(DATA_DIRPATH, "kepler_removed_null_instances.csv")

In [8]:
df = pd.read_csv(REMOVED_NULL_INSTANCES_FILEPATH)

In [9]:
# Conversão da coluna "dec_str" de object para float (em decimais)

import re

def convertToFloatDegrees(data: str) -> float:

    degrees_pattern = re.compile(r"\d+(?=d)")
    minutes_pattern = re.compile(r"\d+(?=m)")
    seconds_pattern = re.compile(r"[\d\.]+(?=s)")

    degrees = float(degrees_pattern.search(data).group())
    minutes = float(minutes_pattern.search(data).group())
    seconds = float(seconds_pattern.search(data).group())

    return degrees + minutes / 60 + seconds / 3600

df["dec"] = df["dec_str"].apply(convertToFloatDegrees)

In [10]:
# Conversão da coluna "ra_str" de object para float

def convertToFloatHours(data: str) -> float:

    hours_pattern = re.compile(r"\d+(?=h)")
    minutes_pattern = re.compile(r"\d+(?=m)")
    seconds_pattern = re.compile(r"[\d\.]+(?=s)")

    hours = float(hours_pattern.search(data).group())
    minutes = float(minutes_pattern.search(data).group())
    seconds = float(seconds_pattern.search(data).group())

    return hours + minutes / 60 + seconds / 3600

df["ra"] = df["ra_str"].apply(convertToFloatHours)

In [11]:
CONVERTED_TYPES_FILEPATH = os.path.join(DATA_DIRPATH, "kepler_coverted_types.csv")
df.to_csv(CONVERTED_TYPES_FILEPATH, index=False)

Serão removidas as colunas "dec_str" e "ra_str" no arquivo "cleaning_data.ipynb"

In [12]:
REMOVED_COLUMNS_FILEPATH = os.path.join(DATA_DIRPATH, "kepler_removed_columns.csv")

In [13]:
df = pd.read_csv(REMOVED_COLUMNS_FILEPATH)

In [14]:
# Conversão da coluna "koi_tce_delivname" de object para int

mapper = {value: number for number, value in enumerate(df["koi_tce_delivname"].unique())}

df["koi_tce_delivname"] = df["koi_tce_delivname"].map(mapper)

In [15]:
df["koi_tce_delivname"].value_counts()

koi_tce_delivname
0    6672
1     395
2     286
3     229
Name: count, dtype: int64

In [16]:
df.to_csv(CONVERTED_TYPES_FILEPATH, index=False)

In [17]:
df = pd.read_csv(CONVERTED_TYPES_FILEPATH)

In [18]:
# Conversão da coluna "koi_disposition" de object para int

mapper = {value: number for number, value in enumerate(df["koi_disposition"].unique())}

df["koi_disposition"] = df["koi_disposition"].map(mapper)

In [19]:
df["koi_disposition"].value_counts()

koi_disposition
1    4839
0    2743
Name: count, dtype: int64

In [20]:
# Normalização das colunas float entre 0 e 1

from sklearn.preprocessing import normalize

df_float_only = df.select_dtypes("float64")

normalized_data = normalize(df_float_only)
float_columns = df_float_only.columns

print(normalized_data.shape)
print(float_columns.shape)

for i in range(0, normalized_data.shape[1]):

    df[float_columns[i]] = normalized_data[:, i]

ValueError: Input contains NaN.

In [None]:
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,...,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_tce_delivname,koi_steff,koi_slogg,koi_srad,koi_kepmag,dec,ra
0,0,0,0,0,0,0.001709,0.030725,2.6e-05,0.000533,0.110946,...,0.016862,0.00645,0.00018,0,0.982807,0.000805,0.000167,0.002765,0.008673,0.003506
1,0,0,0,0,0,0.009813,0.029306,0.000106,0.000813,0.157752,...,0.001643,0.004652,0.000361,0,0.983695,0.000806,0.000167,0.002768,0.008681,0.00351
2,1,0,1,0,0,0.000172,0.016863,0.000126,0.000238,0.799953,...,0.088316,0.050061,9.9e-05,0,0.574775,0.000452,7.8e-05,0.001544,0.004781,0.001885
3,0,0,0,0,0,0.000401,0.027266,0.000111,0.000263,0.095864,...,0.147166,0.006499,0.000159,0,0.958321,0.000705,0.000166,0.002464,0.007663,0.003059
4,0,0,0,0,0,0.001763,0.027204,8.5e-05,0.00073,0.241134,...,0.018244,0.010567,0.000159,0,0.960721,0.000713,0.000154,0.002497,0.007663,0.003139


In [None]:
NORMALIZED_FILEPATH = os.path.join(DATA_DIRPATH, "kepler_normalized.csv")

In [None]:
df.to_csv(NORMALIZED_FILEPATH, index=False)

Agora é possível construir o modelo de ML. Isso será feito no arquivo "data_mining.ipynb", que também será responsável por balancear a coluna-alvo