# Regressão com Random Forest
Crie um modelo de Random Forest para prever o preço de carros. Compare os resultados com um modelo de regressão linear usando MSE e R².

Autor: Jardson Alves Ribeiro

In [1]:
# Import the required libraries.
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [5]:
# Load the car listings CSV file into a DataFrame.
CSV_PATH = '/content/carDetails.csv'
dataset = pd.read_csv(CSV_PATH, encoding='utf-8')

In [6]:
# Preview the first 10 rows of the dataset as loaded.
dataset.head(n=10)

Unnamed: 0,Make,Model,Price,Year,Kilometer,Fuel Type,Transmission,Location,Color,Owner,Seller Type,Engine,Max Power,Max Torque,Drivetrain,Length,Width,Height,Seating Capacity,Fuel Tank Capacity
0,Honda,Amaze 1.2 VX i-VTEC,505000,2017,87150,Petrol,Manual,Pune,Grey,First,Corporate,1198 cc,87 bhp @ 6000 rpm,109 Nm @ 4500 rpm,FWD,3990.0,1680.0,1505.0,5.0,35.0
1,Maruti Suzuki,Swift DZire VDI,450000,2014,75000,Diesel,Manual,Ludhiana,White,Second,Individual,1248 cc,74 bhp @ 4000 rpm,190 Nm @ 2000 rpm,FWD,3995.0,1695.0,1555.0,5.0,42.0
2,Hyundai,i10 Magna 1.2 Kappa2,220000,2011,67000,Petrol,Manual,Lucknow,Maroon,First,Individual,1197 cc,79 bhp @ 6000 rpm,112.7619 Nm @ 4000 rpm,FWD,3585.0,1595.0,1550.0,5.0,35.0
3,Toyota,Glanza G,799000,2019,37500,Petrol,Manual,Mangalore,Red,First,Individual,1197 cc,82 bhp @ 6000 rpm,113 Nm @ 4200 rpm,FWD,3995.0,1745.0,1510.0,5.0,37.0
4,Toyota,Innova 2.4 VX 7 STR [2016-2020],1950000,2018,69000,Diesel,Manual,Mumbai,Grey,First,Individual,2393 cc,148 bhp @ 3400 rpm,343 Nm @ 1400 rpm,RWD,4735.0,1830.0,1795.0,7.0,55.0
5,Maruti Suzuki,Ciaz ZXi,675000,2017,73315,Petrol,Manual,Pune,Grey,First,Individual,1373 cc,91 bhp @ 6000 rpm,130 Nm @ 4000 rpm,FWD,4490.0,1730.0,1485.0,5.0,43.0
6,Mercedes-Benz,CLA 200 Petrol Sport,1898999,2015,47000,Petrol,Automatic,Mumbai,White,Second,Individual,1991 cc,181 bhp @ 5500 rpm,300 Nm @ 1200 rpm,FWD,4630.0,1777.0,1432.0,5.0,
7,BMW,X1 xDrive20d M Sport,2650000,2017,75000,Diesel,Automatic,Coimbatore,White,Second,Individual,1995 cc,188 bhp @ 4000 rpm,400 Nm @ 1750 rpm,AWD,4439.0,1821.0,1612.0,5.0,51.0
8,Skoda,Octavia 1.8 TSI Style Plus AT [2017],1390000,2017,56000,Petrol,Automatic,Mumbai,White,First,Individual,1798 cc,177 bhp @ 5100 rpm,250 Nm @ 1250 rpm,FWD,4670.0,1814.0,1476.0,5.0,50.0
9,Nissan,Terrano XL (D),575000,2015,85000,Diesel,Manual,Mumbai,White,First,Individual,1461 cc,84 bhp @ 3750 rpm,200 Nm @ 1900 rpm,FWD,4331.0,1822.0,1671.0,5.0,50.0


In [None]:
# Removing unnecessary data:

In [7]:
# Data cleaning: drop the columns that will not be used as features.
# Remove all unused columns in a single call rather than one call per column.
unused_columns = ['Model', 'Location', 'Engine', 'Max Power', 'Max Torque']
dataset = dataset.drop(columns=unused_columns)

In [8]:
# Preview the first 10 rows of the dataset after the column removal.
dataset.head(n=10)

Unnamed: 0,Make,Price,Year,Kilometer,Fuel Type,Transmission,Color,Owner,Seller Type,Drivetrain,Length,Width,Height,Seating Capacity,Fuel Tank Capacity
0,Honda,505000,2017,87150,Petrol,Manual,Grey,First,Corporate,FWD,3990.0,1680.0,1505.0,5.0,35.0
1,Maruti Suzuki,450000,2014,75000,Diesel,Manual,White,Second,Individual,FWD,3995.0,1695.0,1555.0,5.0,42.0
2,Hyundai,220000,2011,67000,Petrol,Manual,Maroon,First,Individual,FWD,3585.0,1595.0,1550.0,5.0,35.0
3,Toyota,799000,2019,37500,Petrol,Manual,Red,First,Individual,FWD,3995.0,1745.0,1510.0,5.0,37.0
4,Toyota,1950000,2018,69000,Diesel,Manual,Grey,First,Individual,RWD,4735.0,1830.0,1795.0,7.0,55.0
5,Maruti Suzuki,675000,2017,73315,Petrol,Manual,Grey,First,Individual,FWD,4490.0,1730.0,1485.0,5.0,43.0
6,Mercedes-Benz,1898999,2015,47000,Petrol,Automatic,White,Second,Individual,FWD,4630.0,1777.0,1432.0,5.0,
7,BMW,2650000,2017,75000,Diesel,Automatic,White,Second,Individual,AWD,4439.0,1821.0,1612.0,5.0,51.0
8,Skoda,1390000,2017,56000,Petrol,Automatic,White,First,Individual,FWD,4670.0,1814.0,1476.0,5.0,50.0
9,Nissan,575000,2015,85000,Diesel,Manual,White,First,Individual,FWD,4331.0,1822.0,1671.0,5.0,50.0


In [10]:
# Encode the categorical (text) columns as integer codes so they can be used
# as model features.
def _factorize_column(column):
    """Replace ``dataset[column]`` with integer codes and return its unique values.

    The codes and the unique values both come from ``pd.factorize``; the
    returned uniques can be used to map a code back to its original label.
    """
    codes, uniques = pd.factorize(dataset[column])
    dataset[column] = codes
    return uniques


dataset_seller_mapping = _factorize_column('Seller Type')
dataset_owner_mapping = _factorize_column('Owner')
dataset_color_mapping = _factorize_column('Color')
dataset_transmission_mapping = _factorize_column('Transmission')
dataset_fuel_mapping = _factorize_column('Fuel Type')
dataset_make_mapping = _factorize_column('Make')
dataset_drive_mapping = _factorize_column('Drivetrain')


In [12]:
# Separate the target (the 'Price' column) from the predictor columns.
classe = dataset['Price']
features = dataset.drop(columns=['Price'])

In [13]:
# Hold out 30% of the rows for validation and train on the remaining 70%.
# A fixed random_state makes the shuffled split itself reproducible across
# runs; without it a different split is drawn on every execution.
features_treinamento, features_validacao, classe_treinamento, classe_validacao = train_test_split(
    features, classe, test_size=0.30, random_state=42
)

In [14]:
# Train the Random Forest regressor on the training split.
# random_state seeds the randomness used while building the trees, so
# refitting on the same training data yields the same model.
modelo_RFR = RandomForestRegressor(random_state=42)
modelo_RFR.fit(features_treinamento, classe_treinamento)

In [15]:
# Predict the target for the held-out validation features.
classe_predicao = modelo_RFR.predict(X=features_validacao)

In [16]:
# MSE metric: mean squared error between the true and predicted validation targets.
mse = mean_squared_error(classe_validacao, classe_predicao)
print(f"Média Quadrática de Erro: {mse:.2f}")

Média Quadrática de Erro: 330485932215.42


In [17]:
# R² metric: coefficient of determination on the validation split.
r2 = r2_score(classe_validacao, classe_predicao)
print(f"R² Score: {r2:.2f}")

R² Score: 0.92
