In [19]:
import pandas as pd
from datetime import datetime 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

#### Importação e análise descritiva da base

In [5]:
uri = 'https://gist.githubusercontent.com/guilhermesilveira/4d1d4a16ccbf6ea4e0a64a38a24ec884/raw/afd05cb0c796d18f3f5a6537053ded308ba94bf7/car-prices.csv'
df = pd.read_csv(uri)

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,mileage_per_year,model_year,price,sold
0,0,21801,2000,30941.02,yes
1,1,7843,1998,40557.96,yes
2,2,7109,2006,89627.5,no
3,3,26823,2015,95276.14,no
4,4,7935,2014,117384.68,yes


In [7]:
df.describe()

Unnamed: 0.1,Unnamed: 0,mileage_per_year,model_year,price
count,10000.0,10000.0,10000.0,10000.0
mean,4999.5,14183.3912,2004.1442,64842.373698
std,2886.89568,5008.571422,4.68758,25250.592393
min,0.0,363.0,1998.0,-840.36
25%,2499.75,10474.75,2000.0,44796.375
50%,4999.5,13418.5,2003.0,65225.685
75%,7499.25,17176.75,2007.0,85111.2375
max,9999.0,39572.0,2017.0,118929.72


In [8]:
df.sold.value_counts()

yes    5800
no     4200
Name: sold, dtype: int64

#### Transformação dos dados

In [9]:
replace_dic = {
    'no': 0,
    'yes': 1
}
df.sold = df.sold.map(replace_dic)
df.head()

Unnamed: 0.1,Unnamed: 0,mileage_per_year,model_year,price,sold
0,0,21801,2000,30941.02,1
1,1,7843,1998,40557.96,1
2,2,7109,2006,89627.5,0
3,3,26823,2015,95276.14,0
4,4,7935,2014,117384.68,1


In [10]:
# Convertendo ano do modelo por quantos anos tem o carro
corrent_year = datetime.today().year
df['years_old'] = corrent_year - df.model_year

df.head()

Unnamed: 0.1,Unnamed: 0,mileage_per_year,model_year,price,sold,years_old
0,0,21801,2000,30941.02,1,22
1,1,7843,1998,40557.96,1,24
2,2,7109,2006,89627.5,0,16
3,3,26823,2015,95276.14,0,7
4,4,7935,2014,117384.68,1,8


#### Separando a base entre preditores e classe

In [11]:
X = df[['mileage_per_year', 'price', 'years_old']]
y = df['sold']

#### Separando entre base de treino e teste

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [14]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7500, 3), (2500, 3), (7500,), (2500,))

In [16]:
scaler = StandardScaler()
scaler.fit(X_train, y_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#### Treinando o modelo

In [17]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

In [20]:
predictions = model.predict(X_test)
accuracy_score(y_test, predictions)

0.7176