In [721]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score, mean_absolute_error

import pandas as pd
import numpy as np

## Load the data

In [722]:
# Read the processed laptop price table from its pickle file and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
dataset_path = "..//datasets/processed/laptop_price.pkl"
df = pd.read_pickle(dataset_path)
df.head()

Unnamed: 0,company,typename,inches,screen_resolution,cpu_manufacturer,cpu_frequency,gpu_manufacturer,ram_gb,memory_0_gb,memory_0_type,memory_1_gb,memory_1_type,opsys,weight_kg,price_euros
0,Apple,Ultrabook,13.3,2560x1600,Intel,2.3,Intel,8,128,ssd,0,0,macOS,1.37,1339.69
1,Apple,Ultrabook,13.3,1440x900,Intel,1.8,Intel,8,128,flash storage,0,0,macOS,1.34,898.94
2,HP,Notebook,15.6,1920x1080,Intel,2.5,Intel,8,256,ssd,0,0,Outros,1.86,575.0
3,Apple,Ultrabook,15.4,2880x1800,Intel,2.7,AMD,16,512,ssd,0,0,macOS,1.83,2537.45
4,Apple,Ultrabook,13.3,2560x1600,Intel,3.1,Intel,8,256,ssd,0,0,macOS,1.37,1803.6


In [723]:
# Feature matrix: every column of `df` except the regression target.
features = df.drop(columns='price_euros')

# Columns holding categorical values that must be turned into integers before fitting.
categorical_columns = [
    'company', 'typename', 'screen_resolution', 'cpu_manufacturer',
    'gpu_manufacturer', 'memory_0_type', 'memory_1_type', 'opsys',
]

# Integer-encode each categorical column with its own, independently fitted encoder.
# NOTE(review): LabelEncoder is documented for target labels; the integer codes it
# produces imply an ordering between categories that the linear and distance-based
# models below will treat as meaningful. Consider OrdinalEncoder/OneHotEncoder.
for item in categorical_columns:
    features[item] = LabelEncoder().fit_transform(features[item])

# Regression target.
target = df.price_euros

# Hold out 30% of the rows for evaluation. The fixed random_state makes the split,
# and therefore every metric reported below, reproducible across kernel restarts.
train_x, test_x, train_y, test_y = train_test_split(
    features, target, test_size=0.3, random_state=42
)

In [725]:
# Baseline comparison: fit each regressor with its default hyperparameters on the
# training split and report R2 and mean absolute error on the hold-out split.
# The randomized estimators get a fixed random_state so that the printed scores
# are reproducible on a fresh kernel.
models = {
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'KNeighbors Regressor': KNeighborsRegressor(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
    'Linear Regression': LinearRegression()
}

for name, model in models.items():
    model.fit(train_x, train_y)
    pred_y = model.predict(test_x)
    print(f'Algorithm: {name}')
    print(f'R2 score: {r2_score(test_y, pred_y):.02f}')
    print('Mean Absolute Error:', mean_absolute_error(test_y, pred_y))
    print('********************************************************')

Algorithm: Random Forest Regressor
R2 score: 0.82
Mean Absolute Error: 193.54309044726585
********************************************************
Algorithm: KNeighbors Regressor
R2 score: 0.68
Mean Absolute Error: 249.95386700767264
********************************************************
Algorithm: Decision Tree Regressor
R2 score: 0.69
Mean Absolute Error: 242.48589514066495
********************************************************
Algorithm: Linear Regression
R2 score: 0.67
Mean Absolute Error: 275.26580108264704
********************************************************


## Random Forest Regression

In [726]:
# Random Forest with hand-set hyperparameters, evaluated on the hold-out split.
# random_state is fixed because bootstrapping and per-split feature sampling are
# random; without it the reported scores change on every run.
model = RandomForestRegressor(
    bootstrap=True,
    max_depth=80,
    max_features=3,
    min_samples_leaf=3,
    min_samples_split=8,
    n_estimators=100,
    random_state=42,
)
model.fit(train_x, train_y)
pred_y = model.predict(test_x)
print(f'R2 score: {r2_score(test_y,pred_y):.02f}')
print('Mean Absolute Error:', mean_absolute_error(test_y, pred_y))

R2 score: 0.80
Mean Absolute Error: 200.95658639108356


In [727]:
# Disabled hyperparameter search for the Random Forest (kept for reference, not executed).
# Every value passed to RandomForestRegressor in the tuned cell above appears in this
# grid, so this search is presumably where those settings came from — TODO confirm.
# NOTE(review): the grid spans 1*4*2*3*3*4 = 288 combinations, each cross-validated,
# so re-enabling it makes a full notebook run slow.
# param_grid = {
#     'bootstrap': [True],
#     'max_depth': [80, 90, 100, 110],
#     'max_features': [2, 3],
#     'min_samples_leaf': [3, 4, 5],
#     'min_samples_split': [8, 10, 12],
#     'n_estimators': [100, 200, 300, 1000]
# }
# grid = GridSearchCV(RandomForestRegressor(), param_grid)
# grid.fit(train_x, train_y)
# grid.best_params_

## KNeighbors Regression

In [728]:
# 3-nearest-neighbours regressor: fit on the training split, score on the hold-out split.
model = KNeighborsRegressor(n_neighbors=3).fit(train_x, train_y)
pred_y = model.predict(test_x)
print(f'R2 score: {r2_score(test_y,pred_y):.02f}')
print(
    'Mean Absolute Error:',
    mean_absolute_error(y_true=test_y, y_pred=pred_y),
)

R2 score: 0.68
Mean Absolute Error: 253.52694799658994


In [729]:
# Disabled hyperparameter search for KNeighborsRegressor (kept for reference, not executed).
# It would try n_neighbors = 1..99; the value used in the cell above (3) lies in that
# range, so it was presumably selected by this search — TODO confirm.
# param_grid = dict(n_neighbors=np.arange(1, 100))
# grid = GridSearchCV(KNeighborsRegressor(), param_grid)
# grid.fit(train_x, train_y)
# grid.best_params_

## Decision Tree Regression

In [730]:
# Decision tree with hand-set hyperparameters, evaluated on the hold-out split.
# random_state is fixed because max_features='sqrt' makes the tree draw a random
# subset of features at every split; without a seed the scores change on every run.
model = DecisionTreeRegressor(
    max_depth=12,
    max_features='sqrt',
    max_leaf_nodes=90,
    min_samples_leaf=1,
    splitter='best',
    random_state=42,
)
model.fit(train_x, train_y)
pred_y = model.predict(test_x)
print(f'R2 score: {r2_score(test_y,pred_y):.02f}')
print('Mean Absolute Error:', mean_absolute_error(test_y, pred_y))

R2 score: 0.71
Mean Absolute Error: 258.53713820236214


In [731]:
# Disabled hyperparameter search for DecisionTreeRegressor (kept for reference, not executed).
# Every value passed to DecisionTreeRegressor in the cell above appears in this grid,
# so this search is presumably where those settings came from — TODO confirm.
# NOTE(review): the grid spans 2*7*10*2*9 = 2520 combinations, each cross-validated.
# param_grid={"splitter":["best","random"],
#            "max_depth" : [1,3,5,7,9,11,12],
#            "min_samples_leaf":[1,2,3,4,5,6,7,8,9,10],
#            "max_features":["log2","sqrt"],
#            "max_leaf_nodes":[10,20,30,40,50,60,70,80,90]
# }

# grid = GridSearchCV(DecisionTreeRegressor(), param_grid)
# grid.fit(train_x, train_y)
# grid.best_params_

## Linear Regression

In [732]:
# Ordinary least squares without an intercept term: fit on the training split,
# then score on the hold-out split.
model = LinearRegression(
    copy_X=True,
    fit_intercept=False,
    n_jobs=1,
    positive=False,
).fit(train_x, train_y)
pred_y = model.predict(test_x)
print(f'R2 score: {r2_score(test_y,pred_y):.02f}')
print(
    'Mean Absolute Error:',
    mean_absolute_error(y_true=test_y, y_pred=pred_y),
)

R2 score: 0.67
Mean Absolute Error: 274.93784437438467


In [733]:
# Disabled parameter search for LinearRegression (kept for reference, not executed).
# NOTE(review): copy_X (whether X is copied) and n_jobs (parallelism) are not model
# hyperparameters, so searching 99 n_jobs values presumably only multiplies the
# number of fits (2*2*99*2 = 792 combinations) without changing the fitted model;
# only fit_intercept and positive are expected to affect predictions — confirm
# before re-enabling.
# param_grid = {
#     'copy_X': [True, False],
#     'fit_intercept': [True, False],
#     'n_jobs': np.arange(1, 100),
#     'positive': [True, False]
# }

# grid = GridSearchCV(LinearRegression(), param_grid)
# grid.fit(train_x, train_y)
# grid.best_params_