In [1]:
import pandas as pd

url = "https://gist.githubusercontent.com/LuisFelipeFrancisco/8fb48d6e9feeacb63e4c1c6666353339/raw/8816772a35dc629c109ab32558a78cc4ed5083c6/car_prices.csv"

dados = pd.read_csv(url)

dados.head()

Unnamed: 0.1,Unnamed: 0,mileage_per_year,model_year,price,sold
0,0,21801,2000,30941.02,yes
1,1,7843,1998,40557.96,yes
2,2,7109,2006,89627.5,no
3,3,26823,2015,95276.14,no
4,4,7935,2014,117384.68,yes


In [2]:
rename = {
    "mileage_per_year": "milhas_por_ano",
    "model_year": "ano_do_modelo",
    "price": "preco",
    "sold": "vendido"
}

dados = dados.rename(columns = rename)

dados.head()

Unnamed: 0.1,Unnamed: 0,milhas_por_ano,ano_do_modelo,preco,vendido
0,0,21801,2000,30941.02,yes
1,1,7843,1998,40557.96,yes
2,2,7109,2006,89627.5,no
3,3,26823,2015,95276.14,no
4,4,7935,2014,117384.68,yes


In [3]:
change = {
    "no": 0,
    "yes": 1
}

dados.vendido = dados.vendido.map(change)

dados.head()

Unnamed: 0.1,Unnamed: 0,milhas_por_ano,ano_do_modelo,preco,vendido
0,0,21801,2000,30941.02,1
1,1,7843,1998,40557.96,1
2,2,7109,2006,89627.5,0
3,3,26823,2015,95276.14,0
4,4,7935,2014,117384.68,1


In [4]:
from datetime import datetime

ano_atual = datetime.today().year

dados["idade_do_modelo"] = ano_atual - dados.ano_do_modelo

dados.head()

Unnamed: 0.1,Unnamed: 0,milhas_por_ano,ano_do_modelo,preco,vendido,idade_do_modelo
0,0,21801,2000,30941.02,1,23
1,1,7843,1998,40557.96,1,25
2,2,7109,2006,89627.5,0,17
3,3,26823,2015,95276.14,0,8
4,4,7935,2014,117384.68,1,9


In [5]:
dados.milhas_por_ano.describe()

count    10000.000000
mean     14183.391200
std       5008.571422
min        363.000000
25%      10474.750000
50%      13418.500000
75%      17176.750000
max      39572.000000
Name: milhas_por_ano, dtype: float64

In [6]:
dados["km_por_ano"] = dados.milhas_por_ano * 1.60934

dados.head()

Unnamed: 0.1,Unnamed: 0,milhas_por_ano,ano_do_modelo,preco,vendido,idade_do_modelo,km_por_ano
0,0,21801,2000,30941.02,1,23,35085.22134
1,1,7843,1998,40557.96,1,25,12622.05362
2,2,7109,2006,89627.5,0,17,11440.79806
3,3,26823,2015,95276.14,0,8,43167.32682
4,4,7935,2014,117384.68,1,9,12770.1129


In [7]:
dados.km_por_ano.describe()

count    10000.000000
mean     22825.898794
std       8060.494332
min        584.190420
25%      16857.434165
50%      21594.928790
75%      27643.230845
max      63684.802480
Name: km_por_ano, dtype: float64

In [8]:
dados = dados.drop(columns = ["Unnamed: 0", "milhas_por_ano", "ano_do_modelo"], axis = 1)

dados.head()

Unnamed: 0,preco,vendido,idade_do_modelo,km_por_ano
0,30941.02,1,23,35085.22134
1,40557.96,1,25,12622.05362
2,89627.5,0,17,11440.79806
3,95276.14,0,8,43167.32682
4,117384.68,1,9,12770.1129


In [9]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

x = dados[["preco", "idade_do_modelo", "km_por_ano"]]
y = dados["vendido"]

SEED = 5
np.random.seed(SEED)

train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.25, stratify = y)

print(f"Treinaremos com {len(train_x)} elementos e testaremos com {len(test_x)} elementos")

model = LinearSVC()
model.fit(train_x, train_y)
predictions = model.predict(test_x)

accuracy = accuracy_score(test_y, predictions) * 100

print(f"A acurácia foi de {accuracy:.2f}%")


Treinaremos com 7500 elementos e testaremos com 2500 elementos
A acurácia foi de 46.88%




Dummy Classifiers

In [10]:
from sklearn.dummy import DummyClassifier

dummy_stratified = DummyClassifier()

dummy_stratified.fit(train_x, train_y)
dummy_stratified_predictions = dummy_stratified.predict(test_x)

accuracy = accuracy_score(test_y, dummy_stratified_predictions) * 100

print(f"A acurácia do dummy foi de {accuracy:.2f}% / strategy='stratified' (default)")

A acurácia do dummy foi de 58.00% / strategy='stratified' (default)


In [11]:
dummy_mostfrequent = DummyClassifier()

dummy_mostfrequent.fit(train_x, train_y)
dummy_mostfrequent_predictions = dummy_mostfrequent.predict(test_x)
dummy_mostfrequent_accuracy = dummy_mostfrequent.score(test_x, test_y) * 100 # score é o mesmo que accuracy_score

print(f"A acurácia do dummy foi de {dummy_mostfrequent_accuracy:.2f}% / strategy='most_frequent'")

A acurácia do dummy foi de 58.00% / strategy='most_frequent'


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

SEED = 5
np.random.seed(SEED)

raw_train_x, raw_test_x, train_y, test_y = train_test_split(x, y, test_size = 0.25, stratify = y)

print(f"Treinaremos com {len(train_x)} elementos e testaremos com {len(test_x)} elementos")

scaler = StandardScaler()
scaler.fit(raw_train_x)
train_x = scaler.transform(raw_train_x)
test_x = scaler.transform(raw_test_x)

model = SVC()
model.fit(train_x, train_y)
predictions = model.predict(test_x)

accuracy = accuracy_score(test_y, predictions) * 100

print(f"A acurácia foi de {accuracy:.2f}%")

Treinaremos com 7500 elementos e testaremos com 2500 elementos
A acurácia foi de 77.48%
