In [1]:
import pandas as pd

uri = "https://gist.githubusercontent.com/guilhermesilveira/e99a526b2e7ccc6c3b70f53db43a87d2/raw/1605fc74aa778066bf2e6695e24d53cf65f2f447/machine-learning-carros-simulacao.csv"
df = pd.read_csv(uri).drop(columns=["Unnamed: 0"], axis=1)
df

Unnamed: 0,preco,vendido,idade_do_modelo,km_por_ano
0,30941.02,1,18,35085.22134
1,40557.96,1,20,12622.05362
2,89627.50,0,12,11440.79806
3,95276.14,0,3,43167.32682
4,117384.68,1,4,12770.11290
...,...,...,...,...
9995,97112.86,0,12,25060.64248
9996,107424.63,1,16,21317.31764
9997,93856.99,0,4,20950.38812
9998,51250.57,1,7,16840.13376


In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

x = df[["preco", "idade_do_modelo", "km_por_ano"]]
y = df["vendido"]

SEED = 158020
np.random.seed(SEED)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)
print(f"Dados de Treino: {len(x_train)}\nDados de Teste: {len(x_test)}")

Dados de Treino: 7500
Dados de Teste: 2500


In [3]:
from sklearn.dummy import DummyClassifier

dummy_stratified = DummyClassifier()
dummy_stratified.fit(x_train, y_train)
acuracia = dummy_stratified.score(x_test, y_test) * 100

print("A Acurácia do Dummy Estratificado foi de %.2f%%" % acuracia)

A Acurácia do Dummy Estratificado foi de 58.00%


In [4]:
from sklearn.tree import DecisionTreeClassifier

SEED = 158020
np.random.seed(SEED)

model = DecisionTreeClassifier(max_depth=2)
model.fit(x_train, y_train)

predicts = model.predict(x_test)
acuracia = accuracy_score(y_test, predicts) * 100

print("A Acurácia da Árvore de Decisão foi de %.2f%%" % acuracia)

A Acurácia da Árvore de Decisão foi de 71.92%


In [5]:
SEED = 158020
np.random.seed(SEED)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)
print(f"Dados de Treino: {len(x_train)}\nDados de Teste: {len(x_test)}")

dummy_stratified = DummyClassifier()
dummy_stratified.fit(x_train, y_train)
acuracia = dummy_stratified.score(x_test, y_test) * 100

print("A Acurácia do Dummy Estratificado foi de %.2f%%" % acuracia)

model = DecisionTreeClassifier(max_depth=2)
model.fit(x_train, y_train)

predicts = model.predict(x_test)
acuracia = accuracy_score(y_test, predicts) * 100

print("A Acurácia da Árvore de Decisão foi de %.2f%%" % acuracia)

Dados de Treino: 7500
Dados de Teste: 2500
A Acurácia do Dummy Estratificado foi de 58.00%
A Acurácia da Árvore de Decisão foi de 71.92%


## Holdout

#### Usar dados isolados, baseia-se na sorte ou azar na separação de treino e teste.

## K-Fold

#### Quebrar o dataset para validação cruzada etc.

## Cross Validation

In [6]:
from sklearn.model_selection import cross_validate

SEED = 1234454
np.random.seed(SEED)

model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, x, y, cv=10, return_train_score=False)
media = results["test_score"].mean()
desvio_padrao = results["test_score"].std()

print(
    "Acurácia entre %.2f%% a %.2f%%"
    % ((media - 2 * desvio_padrao) * 100, (media + 2 * desvio_padrao) * 100)
)

Acurácia entre 74.24% a 77.32%


## Cross Validation with KFold

In [7]:
def print_results(results):
    media = results["test_score"].mean()
    desvio_padrao = results["test_score"].std()
    print(
        "Acurácia entre %.2f%% a %.2f%%"
        % ((media - 2 * desvio_padrao) * 100, (media + 2 * desvio_padrao) * 100)
    )

In [8]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

SEED = 1234454
np.random.seed(SEED)

cv = KFold(n_splits=10, shuffle=True)
model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, x, y, cv=cv, return_train_score=False)
print_results(results)

Acurácia entre 72.70% a 78.86%


## Cross Validation with KFold and Stratified Values

In [9]:
df_bad = df.sort_values(by="vendido", ascending=True)
x_bad = df_bad[["preco", "idade_do_modelo", "km_por_ano"]]
y_bad = df_bad["vendido"]
df_bad

Unnamed: 0,preco,vendido,idade_do_modelo,km_por_ano
4999,74023.29,0,12,24812.80412
5322,84843.49,0,13,23095.63834
5319,83100.27,0,19,36240.72746
5316,87932.13,0,16,32249.56426
5315,77937.01,0,15,28414.50704
...,...,...,...,...
5491,71910.43,1,9,25778.40812
1873,30456.53,1,6,15468.97608
1874,69342.41,1,11,16909.33538
5499,70520.39,1,16,19622.68262


In [10]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

SEED = 301
np.random.seed(SEED)

cv = KFold(n_splits=10)
model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, x_bad, y_bad, cv=cv, return_train_score=False)
print_results(results)

Acurácia entre 34.29% a 81.39%


In [11]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

SEED = 301
np.random.seed(SEED)

cv = KFold(n_splits=10, shuffle=True)
model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, x_bad, y_bad, cv=cv, return_train_score=False)
print_results(results)

Acurácia entre 72.30% a 79.26%


In [12]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

SEED = 301
np.random.seed(SEED)

cv = StratifiedKFold(n_splits=10, shuffle=True)
model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, x_bad, y_bad, cv=cv, return_train_score=False)
print_results(results)

Acurácia entre 73.55% a 78.01%


## Geração de Dados Aleatórios

In [13]:
np.random.seed(SEED)

df["modelo"] = df.idade_do_modelo + np.random.randint(-2, 3, size=10000)
df["modelo"] = df.modelo + abs(df.modelo.min()) + 1

df

Unnamed: 0,preco,vendido,idade_do_modelo,km_por_ano,modelo
0,30941.02,1,18,35085.22134,18
1,40557.96,1,20,12622.05362,24
2,89627.50,0,12,11440.79806,14
3,95276.14,0,3,43167.32682,6
4,117384.68,1,4,12770.11290,5
...,...,...,...,...,...
9995,97112.86,0,12,25060.64248,14
9996,107424.63,1,16,21317.31764,17
9997,93856.99,0,4,20950.38812,6
9998,51250.57,1,7,16840.13376,11


## Cross Validation com Grupos

In [18]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GroupKFold

SEED = 301
np.random.seed(SEED)

cv = GroupKFold(n_splits=10)
model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(
    model, x_bad, y_bad, groups=df.modelo, cv=cv, return_train_score=False
)
print_results(results)

Acurácia entre 73.67% a 77.90%


## Cross Validation com StardardScaller e Pipeline

In [20]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

SEED = 301
np.random.seed(SEED)

scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

model = SVC()
model.fit(x_train_scaled, y_train)

predicts = model.predict(x_test_scaled)
acuracia = accuracy_score(y_test, predicts) * 100

print("A Acurácia da Árvore de Decisão foi de %.2f%%" % acuracia)

A Acurácia da Árvore de Decisão foi de 74.40%


In [22]:
from sklearn.pipeline import Pipeline

scaler = StandardScaler()
model = SVC()

pipeline = Pipeline([("Transformador", scaler), ("Estimador", model)])

SEED = 301
np.random.seed(SEED)

cv = GroupKFold(n_splits=10)
model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(
    pipeline, x_bad, y_bad, groups=df.modelo, cv=cv, return_train_score=False
)
print_results(results)

Acurácia entre 74.28% a 79.08%
