In [3]:
# Importing the libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [4]:
# Import the dataset 
dataset = pd.read_csv('Car_prices.csv')
dataset

Unnamed: 0.1,Unnamed: 0,mark,model,generation_name,year,mileage,vol_engine,fuel,city,province,price
0,0,opel,combo,gen-d-2011,2015,139568,1248,Diesel,Janki,Mazowieckie,35900
1,1,opel,combo,gen-d-2011,2018,31991,1499,Diesel,Katowice,Śląskie,78501
2,2,opel,combo,gen-d-2011,2015,278437,1598,Diesel,Brzeg,Opolskie,27000
3,3,opel,combo,gen-d-2011,2016,47600,1248,Diesel,Korfantów,Opolskie,30800
4,4,opel,combo,gen-d-2011,2014,103000,1400,CNG,Tarnowskie Góry,Śląskie,35900
...,...,...,...,...,...,...,...,...,...,...,...
117922,117922,volvo,xc-90,gen-ii-2014-xc-90,2020,40000,1969,Hybrid,Katowice,Śląskie,222790
117923,117923,volvo,xc-90,gen-ii-2014-xc-90,2017,51000,1969,Diesel,Chechło Pierwsze,Łódzkie,229900
117924,117924,volvo,xc-90,gen-ii-2014-xc-90,2016,83500,1969,Gasoline,Pruszcz Gdański,Pomorskie,135000
117925,117925,volvo,xc-90,gen-ii-2014-xc-90,2017,174000,1969,Diesel,Kalisz,Wielkopolskie,154500


In [5]:
# Create independant and dependant variables
X = dataset.iloc[:5000, 1:8].values
y = dataset.iloc[:5000, -1].values

In [6]:
X

array([['opel', 'combo', 'gen-d-2011', ..., 139568, 1248, 'Diesel'],
       ['opel', 'combo', 'gen-d-2011', ..., 31991, 1499, 'Diesel'],
       ['opel', 'combo', 'gen-d-2011', ..., 278437, 1598, 'Diesel'],
       ...,
       ['opel', 'corsa', 'gen-d-2006-2014', ..., 185800, 1229,
        'Gasoline'],
       ['opel', 'corsa', 'gen-d-2006-2014', ..., 158848, 1229,
        'Gasoline'],
       ['opel', 'corsa', 'gen-d-2006-2014', ..., 110000, 1398,
        'Gasoline']], dtype=object)

In [7]:
# Taking care of missing data
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 4:6])
X[:, 4:6] = imputer.transform(X[:, 4:6])

In [8]:
# Encode categorical data
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0, 1, 2, 6])], remainder='passthrough')
X = ct.fit_transform(X).toarray()

In [9]:
X

array([[1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 2.01500e+03,
        1.39568e+05, 1.24800e+03],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 2.01800e+03,
        3.19910e+04, 1.49900e+03],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 2.01500e+03,
        2.78437e+05, 1.59800e+03],
       ...,
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 2.00700e+03,
        1.85800e+05, 1.22900e+03],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 2.00800e+03,
        1.58848e+05, 1.22900e+03],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 2.01100e+03,
        1.10000e+05, 1.39800e+03]])

In [10]:
# Split data to train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [11]:
# Train the classifier
classifier = RandomForestRegressor(n_estimators=40)
classifier.fit(X_train, y_train)

RandomForestRegressor(n_estimators=40)

In [12]:
# Predict the test set
y_pred = classifier.predict(X_test)

In [13]:
print(pd.Series(y_test))

0      25300
1      14900
2      52500
3      60000
4      28900
       ...  
995    36900
996    13500
997     5400
998    22600
999    25900
Length: 1000, dtype: int64


In [14]:
print(pd.Series(y_pred))

0      27314.850
1      15927.425
2      54414.875
3      62624.975
4      31699.725
         ...    
995    31830.000
996    11214.950
997     5225.000
998    15554.825
999    25809.150
Length: 1000, dtype: float64


In [15]:
# Evaluating the model
RMSE = r2_score(y_test, y_pred)

print('The root mean squared error of the model is:', RMSE)

The root mean squared error of the model is: 0.9463687662366557
