In [1]:
# Load the libraries used by the notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the whole session, including
# deprecation and data-conversion warnings raised by libraries; consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Read the raw dataset from the CSV file next to the notebook and list
# the columns that are available
df = pd.read_csv('dataset.csv')
df.columns

Index(['name', 'description', 'make', 'model', 'year', 'price', 'engine',
       'cylinders', 'fuel', 'mileage', 'transmission', 'trim', 'body', 'doors',
       'exterior_color', 'interior_color', 'drivetrain'],
      dtype='object')

In [3]:
# Remove the columns that are not used further in this notebook
df = df.drop(columns=['name', 'description'])

In [4]:
# Count the missing values in every column
df.isna().sum()

make                0
model               0
year                0
price              23
engine              2
cylinders         105
fuel                7
mileage            34
transmission        2
trim                1
body                3
doors               7
exterior_color      5
interior_color     38
drivetrain          0
dtype: int64

In [5]:
# Dataset size as (rows, columns) before rows with a missing price are dropped
df.shape

(1002, 15)

In [6]:
# Rows without a price cannot be used for supervised training, so discard them
df = df.dropna(subset=['price'])

In [7]:
# Dataset size as (rows, columns) after rows with a missing price were dropped
df.shape

(979, 15)

In [8]:
# Impute the numeric columns: mileage with its median, cylinders and doors
# with their most frequent value.
#
# The fill is done with a single DataFrame.fillna call and the result is
# assigned back. Calling `df[col].fillna(..., inplace=True)` operates on a
# column selection; pandas deprecates that pattern and, with Copy-on-Write
# enabled, it no longer updates `df` at all.
numeric_fill_values = {
    'mileage': df['mileage'].median(),
    'cylinders': df['cylinders'].mode()[0],
    'doors': df['doors'].mode()[0],
}
df = df.fillna(numeric_fill_values)

# Remaining missing values per column
df.isnull().sum(axis=0)

make               0
model              0
year               0
price              0
engine             2
cylinders          0
fuel               7
mileage            0
transmission       2
trim               1
body               3
doors              0
exterior_color     5
interior_color    37
drivetrain         0
dtype: int64

In [9]:
# Fill missing values in the categorical columns with each column's mode
# (most frequent value).

cat_col = ['make', 'model', 'engine', 'fuel', 'transmission',
           'trim', 'body', 'exterior_color', 'interior_color', 'drivetrain']

# Build one {column: mode} mapping and fill through DataFrame.fillna, assigning
# the result back. `df[col].fillna(..., inplace=True)` works on a column
# selection; pandas deprecates that pattern and, with Copy-on-Write enabled,
# it leaves `df` unchanged.
df = df.fillna({col: df[col].mode()[0] for col in cat_col})

# Every column should now report zero missing values
df.isnull().sum(axis=0)

make              0
model             0
year              0
price             0
engine            0
cylinders         0
fuel              0
mileage           0
transmission      0
trim              0
body              0
doors             0
exterior_color    0
interior_color    0
drivetrain        0
dtype: int64

Missing-value handling is complete: no column contains null values any more.

In [10]:
# Separate the predictors from the target; the target is kept as a
# one-column DataFrame
X = df.drop(columns=['price'])
y = df['price'].to_frame()

In [11]:
# Names of the numeric feature columns
numerical_col = list(X.select_dtypes(include='number').columns)
numerical_col

['year', 'cylinders', 'mileage', 'doors']

In [12]:
# Names of the categorical (object-dtype) feature columns
categorical_col = list(X.select_dtypes(include=['object']).columns)
categorical_col

['make',
 'model',
 'engine',
 'fuel',
 'transmission',
 'trim',
 'body',
 'exterior_color',
 'interior_color',
 'drivetrain']

In [13]:
# Report how many distinct values each categorical column holds
for col in categorical_col:
    n_distinct = df[col].nunique(dropna=False)
    print(f'{col}: {n_distinct}')

make: 28
model: 151
engine: 100
fuel: 7
transmission: 38
trim: 197
body: 8
exterior_color: 262
interior_color: 90
drivetrain: 4


#### Low-cardinality nominal: at most 40 unique values in the column

#### High-cardinality nominal: more than 40 unique values in the column

#### High-cardinality columns are target-encoded instead of one-hot encoded, so the feature matrix does not gain one extra column per category.

In [14]:
# Low-cardinality nominal columns -> one-hot encoding
onehot_cols = ['make', 'fuel', 'transmission', 'body', 'drivetrain']

# High-cardinality nominal columns -> target encoding
target_cols = ['model', 'engine', 'trim', 'exterior_color', 'interior_color']

In [15]:
# Display the columns selected for one-hot encoding
onehot_cols

['make', 'fuel', 'transmission', 'body', 'drivetrain']

In [16]:
from sklearn.preprocessing import OneHotEncoder

In [17]:
from category_encoders import TargetEncoder

In [18]:
# One-hot encode the low-cardinality columns. The first category of each
# column is dropped and a dense array is returned so it can be wrapped in a
# DataFrame directly.
onehot_encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_col = onehot_encoder.fit_transform(X[onehot_cols])

# Label the dummy columns with the names generated by the encoder
encoded_df = pd.DataFrame(
    encoded_col,
    columns=onehot_encoder.get_feature_names_out(onehot_cols),
)

In [19]:
# Shape of the one-hot encoded block as (rows, dummy columns)
encoded_df.shape

(979, 80)

In [20]:
# Target-encode the high-cardinality categorical columns against the price.
# NOTE(review): the encoder is fitted on every row, before the train/test
# split performed later in the notebook, so price information from the
# held-out rows is part of these features; consider fitting on the training
# rows only.
target_encoder = TargetEncoder(cols=target_cols).fit(X[target_cols], y['price'])

# Encode, then switch to a fresh 0..n-1 index so the frame lines up
# positionally with the other feature blocks when they are concatenated
target_encoded_col = target_encoder.transform(X[target_cols]).reset_index(drop=True)

In [21]:
# The raw categorical columns are now represented by their one-hot and
# target-encoded versions, so remove the originals from X
X = X.drop(columns=onehot_cols + target_cols)


In [22]:
# Replace the index with a fresh 0..n-1 range so X aligns with the encoded frames
X = X.reset_index(drop=True)


In [23]:
# Assemble the final feature matrix: remaining numeric columns, one-hot
# dummies and target-encoded columns, placed side by side
feature_parts = [X, encoded_df, target_encoded_col]
input_feature = pd.concat(feature_parts, axis='columns')
input_feature.shape

(979, 89)

# Create model

In [24]:
#loading library
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [25]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible
x_train, x_test, y_train, y_test = train_test_split(
    input_feature,
    y,
    test_size=0.2,
    random_state=42,
)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((783, 89), (196, 89), (783, 1), (196, 1))

In [26]:
# Fit a random forest on the training rows and predict the held-out rows.
rfr = RandomForestRegressor(n_estimators=100, random_state=42)

# y_train is a one-column DataFrame. Flatten it to 1-D before fitting:
# passing a column vector makes scikit-learn emit a DataConversionWarning
# and reshape the target itself.
rfr.fit(x_train, np.ravel(y_train))
prediction_rfr = rfr.predict(x_test)

In [27]:
# Coefficient of determination (R^2) on the held-out rows
r2_score(y_true=y_test, y_pred=prediction_rfr)

0.9188678811695565

In [28]:
# Summary statistics of the target column
y.describe()

Unnamed: 0,price
count,979.0
mean,50202.9857
std,18700.392062
min,0.0
25%,36600.0
50%,47165.0
75%,58919.5
max,195895.0


In [29]:
# Root mean squared error on the held-out rows, in the same units as price
rmse = np.sqrt(mean_squared_error(y_test, prediction_rfr))
print(rmse)

4977.646864101206


In [30]:
# Express the RMSE as a percentage of the largest observed price.
# Formula used: RMSE / max(target) * 100
# The maximum is read from y instead of being typed in by hand, so the
# figure stays correct if the dataset changes.
max_target = y.to_numpy().max()
print(f'Percetage of error b/w actual and predicted price is: {np.sqrt(mean_squared_error(y_test, prediction_rfr)) / max_target * 100}')

Percetage of error b/w actual and predicted price is: 2.5409769846607655


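Note: the target column `price` ranges from 0 to 195,895.
The RMSE is about 2.5% of the maximum price, which is not bad. Because this ratio is normalised by the largest price, it understates the typical relative error: relative to the mean price of the dataset (about 50,203) the same RMSE is roughly 9.9%.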

In [31]:
# Persist the fitted model and both fitted encoders to the working directory
import joblib

# One file per artifact
joblib.dump(rfr, 'Random_forest_regressor.pkl')
joblib.dump(onehot_encoder, 'Onehot_encoder.pkl')
# Last expression of the cell: its return value is what the cell displays
joblib.dump(target_encoder, 'Target_encoder.pkl')

['Target_encoder.pkl']

## Project Completed By Deepak Kumar

# Thank you!