Esse arquivo serve para realizar qualquer transformação de dados na base, ou seja, a normalização das colunas e conversão de valores simbólicos em numéricos

In [1]:
import pandas as pd
import os

In [2]:
# Importando as constantes de constant.py no notebook

%run ../utils/constants.py

In [3]:
REMOVED_NULL_INSTANCES_FILEPATH = os.path.join(DATA_DIRPATH, "kepler_removed_null_instances.csv")

In [4]:
df = pd.read_csv(REMOVED_NULL_INSTANCES_FILEPATH)

In [5]:
# Conversão da coluna "dec_str" de object para float (em decimais)

import re

def convertToFloatDegrees(data: str) -> float:

    degrees_pattern = re.compile(r"\d+(?=d)")
    minutes_pattern = re.compile(r"\d+(?=m)")
    seconds_pattern = re.compile(r"[\d\.]+(?=s)")

    degrees = float(degrees_pattern.search(data).group())
    minutes = float(minutes_pattern.search(data).group())
    seconds = float(seconds_pattern.search(data).group())

    return degrees + minutes / 60 + seconds / 3600

df["dec"] = df["dec_str"].apply(convertToFloatDegrees)

In [6]:
# Conversão da coluna "ra_str" de object para float

def convertToFloatHours(data: str) -> float:

    hours_pattern = re.compile(r"\d+(?=h)")
    minutes_pattern = re.compile(r"\d+(?=m)")
    seconds_pattern = re.compile(r"[\d\.]+(?=s)")

    hours = float(hours_pattern.search(data).group())
    minutes = float(minutes_pattern.search(data).group())
    seconds = float(seconds_pattern.search(data).group())

    return hours + minutes / 60 + seconds / 3600

df["ra"] = df["ra_str"].apply(convertToFloatHours)

In [7]:
CONVERTED_TYPES_FILEPATH = os.path.join(DATA_DIRPATH, "kepler_coverted_types.csv")
df.to_csv(CONVERTED_TYPES_FILEPATH, index=False)

Serão removidas as colunas "dec_str" e "ra_str" no arquivo "cleaning_data.ipynb"

In [8]:
REMOVED_COLUMNS_FILEPATH = os.path.join(DATA_DIRPATH, "kepler_removed_columns.csv")

In [9]:
df = pd.read_csv(REMOVED_COLUMNS_FILEPATH)

In [10]:
# Conversão da coluna "koi_tce_delivname" de object para int

mapper = {value: number for number, value in enumerate(df["koi_tce_delivname"].unique())}

df["koi_tce_delivname"] = df["koi_tce_delivname"].map(mapper)

In [11]:
df["koi_tce_delivname"].value_counts()

koi_tce_delivname
0    6619
1     294
2     199
Name: count, dtype: int64

In [12]:
df.to_csv(CONVERTED_TYPES_FILEPATH, index=False)

In [13]:
df = pd.read_csv(CONVERTED_TYPES_FILEPATH)

In [14]:
# Conversão da coluna "koi_disposition" de object para int

mapper = {value: number for number, value in enumerate(df["koi_disposition"].unique())}

df["koi_disposition"] = df["koi_disposition"].map(mapper)

In [15]:
df["koi_disposition"].value_counts()

koi_disposition
1    4381
0    2731
Name: count, dtype: int64

In [16]:
# Normalização das colunas float entre 0 e 1

from sklearn.preprocessing import normalize

df_float_only = df.select_dtypes("float64")

normalized_data = normalize(df_float_only)
float_columns = df_float_only.columns

print(normalized_data.shape)
print(float_columns.shape)

for i in range(0, normalized_data.shape[1]):

    df[float_columns[i]] = normalized_data[:, i]

(7112, 14)
(14,)


In [17]:
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_tce_delivname,koi_steff,koi_slogg,koi_srad,koi_kepmag,dec,ra
0,0,0,0,0,0,2.6e-05,0.000533,0.110999,0.000407,0.14294,0.01687,0.006453,0.00018,0,0.983273,0.000805,0.000167,0.002766,0.008678,0.003508
1,0,0,0,0,0,0.000106,0.000813,0.157827,0.000511,0.079924,0.001644,0.004655,0.000361,0,0.984165,0.000806,0.000167,0.002769,0.008685,0.003511
2,1,0,1,0,0,0.000126,0.000238,0.800067,0.003313,0.138144,0.088329,0.050069,9.9e-05,0,0.574857,0.000452,7.8e-05,0.001545,0.004782,0.001885
3,0,0,0,0,0,0.000111,0.000263,0.0959,0.000437,0.223495,0.147221,0.006501,0.000159,0,0.958677,0.000705,0.000166,0.002465,0.007666,0.00306
4,0,0,0,0,0,8.6e-05,0.00073,0.241223,0.00062,0.132732,0.01825,0.010571,0.000159,0,0.961079,0.000713,0.000155,0.002498,0.007666,0.00314


In [18]:
NORMALIZED_FILEPATH = os.path.join(DATA_DIRPATH, "kepler_normalized.csv")

In [19]:
df.to_csv(NORMALIZED_FILEPATH, index=False)

Agora é possível construir o modelo de ML. Isso será feito no arquivo "data_mining.ipynb", que também será responsável por balancear a coluna-alvo