In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [28]:
uri = "https://gist.githubusercontent.com/guilhermesilveira/e99a526b2e7ccc6c3b70f53db43a87d2/raw/1605fc74aa778066bf2e6695e24d53cf65f2f447/machine-learning-carros-simulacao.csv"

data = pd.read_csv(uri).drop(columns=["Unnamed: 0"], axis=1)

data.rename(columns={"preco": "price", "vendido":"sold", "idade_do_modelo": "model_year", "km_por_ano": "km_per_year"}, inplace=True)

data

Unnamed: 0,price,sold,model_year,km_per_year
0,30941.02,1,18,35085.22134
1,40557.96,1,20,12622.05362
2,89627.50,0,12,11440.79806
3,95276.14,0,3,43167.32682
4,117384.68,1,4,12770.11290
...,...,...,...,...
9995,97112.86,0,12,25060.64248
9996,107424.63,1,16,21317.31764
9997,93856.99,0,4,20950.38812
9998,51250.57,1,7,16840.13376


In [29]:
y= data["sold"]
x= data[["price","model_year","km_per_year"]]

In [30]:
SEED = 158020
np.random.seed(SEED)
train_x, test_x, train_y, test_y = train_test_split(x,y,test_size=0.25, stratify=y)

model = DecisionTreeClassifier(max_depth=2)
model.fit(train_x, train_y)
y_hat = model.predict(test_x)

accuracy = accuracy_score(test_y, y_hat) * 100

print("We'll train with %d elements and test with %d elements" % (len(train_x), len(test_x)))
print("The accurary is %.2f%%" % accuracy)

We'll train with 7500 elements and test with 2500 elements
The accurary is 71.92%


In [31]:
SEED = 5
np.random.seed(SEED)
train_x, test_x, train_y, test_y = train_test_split(x,y,test_size=0.25, stratify=y)

model = DecisionTreeClassifier(max_depth=2)
model.fit(train_x, train_y)
y_hat = model.predict(test_x)

accuracy = accuracy_score(test_y, y_hat) * 100

print("We'll train with %d elements and test with %d elements" % (len(train_x), len(test_x)))
print("The accurary is %.2f%%" % accuracy)

We'll train with 7500 elements and test with 2500 elements
The accurary is 76.84%


# Cross Validation

It's a method which you split your data in many parts (hold-out) and then you train and test with differents parts, multiple times, getting differents accuracys. Than you can use the median and the standard deviation between these values to get an accuracy interval.

In [32]:
from sklearn.model_selection import cross_validate

In [33]:
SEED = 158020
model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, x, y, cv=3)
mean = results['test_score'].mean()
std = results['test_score'].std()
print("Accuracy interval: %.2f - %.2f" % ((mean - 2*std)*100, (mean + 2*std)*100))

Accuracy interval: 74.99 - 76.57


In [34]:
SEED = 5
model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, x, y, cv=3)
mean = results['test_score'].mean()
std = results['test_score'].std()
print("Accuracy interval: %.2f - %.2f" % ((mean - 2*std)*100, (mean + 2*std)*100))

Accuracy interval: 74.99 - 76.57


In [35]:
SEED = 103847
model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, x, y, cv=3)
mean = results['test_score'].mean()
std = results['test_score'].std()
print("Accuracy interval (cv=3): %.2f - %.2f" % ((mean - 2*std)*100, (mean + 2*std)*100))

Accuracy interval (cv=3): 74.99 - 76.57


With this method, we obtain the same result regardless of the seed, but the result will vary depending on the number of splis (cv):

In [36]:
SEED = 103847
model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, x, y, cv=10)
mean = results['test_score'].mean()
std = results['test_score'].std()
print("Accuracy interval (cv=10): %.2f - %.2f" % ((mean - 2*std)*100, (mean + 2*std)*100))

Accuracy interval (cv=10): 74.24 - 77.32


In [37]:
SEED = 103847
model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, x, y, cv=5)
mean = results['test_score'].mean()
std = results['test_score'].std()
print("Accuracy interval (cv=5): %.2f - %.2f" % ((mean - 2*std)*100, (mean + 2*std)*100))

Accuracy interval (cv=5): 75.21 - 76.35


There are references that point out it is a good pratice to use cv between 5 and 10. The most common is using 5.

The cross validation doesnt shuffle our data before fold it, so we need to do that using Kfold:

In [38]:
def print_results(results):
    mean = results['test_score'].mean()
    std = results['test_score'].std()
    print("Accuracy mean: %.2f" % (mean * 100))
    print("Accuracy interval: %.2f - %.2f" % ((mean - 2*std)*100, (mean + 2*std)*100))

In [39]:
from sklearn.model_selection import KFold

SEED = 103847
np.random.seed(SEED)

cv = KFold(n_splits=10, shuffle=True, random_state=SEED)
model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, x, y, cv=cv)
print_results(results)

Accuracy mean: 75.79
Accuracy interval: 73.26 - 78.32


When we have a imbalance between classes (like all 0 in one side and all 1 in other), is good to use the Stratified KFold:

In [40]:
from sklearn.model_selection import StratifiedKFold

SEED = 103847
np.random.seed(SEED)

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, x, y, cv=cv)
print_results(results)

Accuracy mean: 75.78
Accuracy interval: 73.09 - 78.47


# Creating a random model column:

In [41]:
np.random.seed(SEED)
data["model"] = data.model_year + np.random.randint(-2,3, size=len(data))
data

Unnamed: 0,price,sold,model_year,km_per_year,model
0,30941.02,1,18,35085.22134,20
1,40557.96,1,20,12622.05362,18
2,89627.50,0,12,11440.79806,11
3,95276.14,0,3,43167.32682,5
4,117384.68,1,4,12770.11290,6
...,...,...,...,...,...
9995,97112.86,0,12,25060.64248,12
9996,107424.63,1,16,21317.31764,18
9997,93856.99,0,4,20950.38812,4
9998,51250.57,1,7,16840.13376,9


In [42]:
data.model.unique()

array([20, 18, 11,  5,  6,  9, 21, 17,  3, 12, 15,  4, 16, 19, 10, 14, 13,
        8,  7, 22,  1,  2,  0, -1], dtype=int64)

In [43]:
data.model = data.model + abs(data.model.min())
data

Unnamed: 0,price,sold,model_year,km_per_year,model
0,30941.02,1,18,35085.22134,21
1,40557.96,1,20,12622.05362,19
2,89627.50,0,12,11440.79806,12
3,95276.14,0,3,43167.32682,6
4,117384.68,1,4,12770.11290,7
...,...,...,...,...,...
9995,97112.86,0,12,25060.64248,13
9996,107424.63,1,16,21317.31764,19
9997,93856.99,0,4,20950.38812,5
9998,51250.57,1,7,16840.13376,10


In [44]:
data.model.unique()

array([21, 19, 12,  6,  7, 10, 22, 18,  4, 13, 16,  5, 17, 20, 11, 15, 14,
        9,  8, 23,  2,  3,  1,  0], dtype=int64)

## Cross validation with GroupKfold

In [46]:
from sklearn.model_selection import GroupKFold

SEED = 103847
np.random.seed(SEED)

cv = StratifiedKFold(n_splits=10)
model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, x, y, cv=cv, return_train_score=False, groups= data.model)
print_results(results)

Accuracy mean: 75.78
Accuracy interval: 74.24 - 77.32


## Cros validation with StandardScaler

In [48]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

SEED = 103847
np.random.seed(SEED)

scaler = StandardScaler()
scaler.fit(train_x)
scaled_train_x = scaler.transform(train_x)
scaled_test_x = scaler.transform(test_x)

model = SVC()
model.fit(scaled_train_x, train_y)
y_hat = model.predict(scaled_test_x)

accuracy = accuracy_score(test_y, y_hat) * 100
print("A acurácia foi de %.2f %%" % accuracy)

A acurácia foi de 77.48 %


In [53]:
from sklearn.model_selection import GroupKFold

SEED = 103847
np.random.seed(SEED)

cv = GroupKFold(n_splits=10)
model = SVC()
results = cross_validate(model, x, y, cv=cv, groups=data.model)
print_results(results)

Accuracy mean: 77.22
Accuracy interval: 72.37 - 82.07


In [54]:
scaler = StandardScaler()
scaler.fit(x)
scaled_x = scaler.transform(x)

In [55]:
from sklearn.pipeline import Pipeline

SEED = 103847
np.random.seed(SEED)

scaler = StandardScaler()
model = SVC()

pipeline = Pipeline([('transformation', scaler), ('estimator', model)])

cv = GroupKFold(n_splits=10)
model = SVC()
results = cross_validate(model, scaled_x, y, cv=cv, groups=data.model)
print_results(results)

Accuracy mean: 76.64
Accuracy interval: 72.39 - 80.90
