# Tratando Dados p/ ML

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the heart-disease dataset from disk and preview its first five rows.
heart_disease_df = pd.read_csv(filepath_or_buffer="./dataset/heart-disease.csv")
heart_disease_df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# Feature matrix: every column of the frame except the "target" label.
x = heart_disease_df.drop(columns=["target"])

In [4]:
# Label vector: the "target" column on its own.
y = heart_disease_df.loc[:, "target"]

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state pins the shuffle so that a fresh "Restart & Run All" reproduces
# the exact same partition instead of drawing a new one on every execution.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [6]:
# Report the shape of each partition, in the order
# (x_train, x_test, y_train, y_test).
tuple(split.shape for split in (x_train, x_test, y_train, y_test))

((242, 13), (61, 13), (242,), (61,))

## Como validar a divisão dos dados p/ Treino/Teste?

(Opcional)

In [7]:
# Expected partition sizes, derived from the total row count.
print(f'Quantidade Total de Dados: {x.shape[0]}')
print(f'Quantidade de Dados p/ Treino (80%): {round(x.shape[0] * 0.8)}')
print(f'Quantidade de Dados p/ Teste (20%): {round(x.shape[0] * 0.2)}')

# Actual partition sizes, read from the split itself rather than typed in by
# hand, so the check stays meaningful if the data or test_size ever changes.
n_train = x_train.shape[0]
n_test = x_test.shape[0]
print(f'Validação: {n_train} + {n_test} = {n_train + n_test}')

# The two partitions together must account for every row exactly once.
assert n_train + n_test == x.shape[0], "train/test sizes do not add up to the total"

Quantidade Total de Dados: 303
Quantidade de Dados p/ Treino (80%): 242
Quantidade de Dados p/ Teste (20%): 61
Validação: 242 + 61 = 303


## Etapas

### 1. Converter Dados p/ Linguagem de Máquina (Binário)

In [8]:
# Load the extended car-sales dataset from disk and preview its first five rows.
car_sales_df = pd.read_csv(filepath_or_buffer="./dataset/car-sales-extended.csv")
car_sales_df.head(n=5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [9]:
# Summarise each column's dtype and non-null count for the car-sales frame.
car_sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Make           1000 non-null   object
 1   Colour         1000 non-null   object
 2   Odometer (KM)  1000 non-null   int64 
 3   Doors          1000 non-null   int64 
 4   Price          1000 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 39.2+ KB


É necessário converter em binário todas as colunas categóricas, e não apenas as de tipo "object": "Doors", por exemplo, é int64, mas também é tratada como categoria.

In [10]:
# Separate the predictors (x) from the value being predicted (y, the price).
x = car_sales_df.drop(columns=["Price"])
y = car_sales_df.loc[:, "Price"]

#### 1. Converter categorias em binário (linguagem de máquina)

In [11]:
from sklearn.preprocessing import OneHotEncoder  # turns each category into a binary "one-hot" vector
from sklearn.compose import ColumnTransformer  # applies different transformations to subsets of a frame's columns

# Columns to treat as categorical. "Doors" holds integers in the raw data but
# is listed here too, so it is one-hot encoded just like the text columns.
categorical_features = ["Make", "Colour", "Doors"]

# Encoder with default settings: every distinct value found in a column
# becomes its own 0/1 indicator column.
# NOTE(review): OneHotEncoder on its own defaults to sparse output; whether the
# combined result of the ColumnTransformer below is sparse or dense depends on
# its sparse_threshold - confirm before relying on either.
one_hot = OneHotEncoder()

# Route the categorical columns through the encoder. remainder="passthrough"
# keeps every other column unchanged and places it after the encoded ones.
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)

# fit_transform learns the categories present in each listed column of x and
# returns the encoded feature matrix in a single call.
transformed_x = transformer.fit_transform(x)

In [12]:
pd.DataFrame(data=transformed_x)  # wrapped in a DataFrame for display only

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [13]:
from sklearn.model_selection import train_test_split

# Seed NumPy's global generator so the shuffle below is repeatable.
np.random.seed(42)

# 80/20 train/test partition of the encoded features and the price target.
x_train, x_test, y_train, y_test = train_test_split(
    transformed_x,
    y,
    test_size=0.2,
)

In [14]:
from sklearn.ensemble import RandomForestRegressor # bring in the estimator class

model = RandomForestRegressor() # instantiate the model with default hyper-parameters

model.fit(x_train, y_train) # train the model on the training split

In [15]:
model.score(x_train, y_train) # score on the training data; for a regressor this is R² (coefficient of determination), not classification accuracy

0.891612713353635

In [16]:
model.score(x_test, y_test) # score on the held-out test data the model was not trained on; for a regressor this is R² (coefficient of determination), not classification accuracy

0.3235867221569877