In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,StandardScaler


In [4]:
# Load the raw car listings from the CSV that sits next to the notebook.
data = pd.read_csv('car-details.csv')
# Preview a random handful of rows; the seed keeps the displayed sample
# identical across "Restart & Run All" executions.
data.sample(5, random_state=42)

Unnamed: 0,name,company,model,edition,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
6032,Honda City i-VTEC CVT ZX,Honda,City,i-VTEC CVT ZX,2017,First,Petrol,Individual,Automatic,20000,42.3,1497.0,117.6,145.0,5.0,1100000
5094,Mahindra Scorpio LX BSIV,Mahindra,Scorpio,LX BSIV,2014,First,Diesel,Individual,Manual,120000,28.31,2179.0,120.0,290.0,9.0,700000
1819,Mahindra TUV 300 T8,Mahindra,TUV,300 T8,2015,First,Diesel,Individual,Manual,58945,43.44,1493.0,100.0,240.0,7.0,700000
5919,Maruti Alto LXi,Maruti,Alto,LXi,2007,Third,Petrol,Individual,Manual,120000,46.28,796.0,46.3,62.0,5.0,85000
1977,Mahindra Scorpio LX,Mahindra,Scorpio,LX,2013,First,Diesel,Individual,Manual,110000,28.31,2179.0,120.0,290.0,9.0,520000


In [5]:
# Summarise every column: dtype and non-null count, plus overall memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6926 entries, 0 to 6925
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6926 non-null   object 
 1   company        6926 non-null   object 
 2   model          6926 non-null   object 
 3   edition        6926 non-null   object 
 4   year           6926 non-null   int64  
 5   owner          6926 non-null   object 
 6   fuel           6926 non-null   object 
 7   seller_type    6926 non-null   object 
 8   transmission   6926 non-null   object 
 9   km_driven      6926 non-null   int64  
 10  mileage_mpg    6718 non-null   float64
 11  engine_cc      6718 non-null   float64
 12  max_power_bhp  6717 non-null   float64
 13  torque_nm      6717 non-null   float64
 14  seats          6718 non-null   float64
 15  selling_price  6926 non-null   int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 865.9+ KB


In [6]:
# Count the missing values in each column.
data.isnull().sum()

name               0
company            0
model              0
edition            0
year               0
owner              0
fuel               0
seller_type        0
transmission       0
km_driven          0
mileage_mpg      208
engine_cc        208
max_power_bhp    209
torque_nm        209
seats            208
selling_price      0
dtype: int64

In [7]:
# (rows, columns) of the raw frame, before any columns or duplicates are dropped.
data.shape

(6926, 16)

In [8]:
# List the columns stored with the generic `object` dtype, one name per line.
object_columns = data.select_dtypes(include=['object']).columns
for column_name in object_columns:
    print(column_name)

name
company
model
edition
owner
fuel
seller_type
transmission


In [9]:
# Remove the three descriptive text columns; `company` is kept as the only
# make-level feature.
text_columns_to_drop = ['name', 'model', 'edition']
data = data.drop(labels=text_columns_to_drop, axis='columns')
data.head()

Unnamed: 0,company,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
0,Maruti,2014,First,Diesel,Individual,Manual,145500,55.0,1248.0,74.0,190.0,5.0,450000
1,Skoda,2014,Second,Diesel,Individual,Manual,120000,49.7,1498.0,103.52,250.0,5.0,370000
2,Honda,2006,Third,Petrol,Individual,Manual,140000,41.6,1497.0,78.0,124.544455,5.0,158000
3,Hyundai,2010,First,Diesel,Individual,Manual,127000,54.06,1396.0,90.0,219.66896,5.0,225000
4,Maruti,2007,First,Petrol,Individual,Manual,120000,37.84,1298.0,88.2,112.776475,5.0,130000


In [10]:
# Keep only the first occurrence of each fully identical row. Rows are compared
# on the columns that remain after the name/model/edition drop.
is_repeated_row = data.duplicated(keep='first')
data = data[~is_repeated_row]

In [11]:
# Sanity check: number of rows that still repeat an earlier row (expected 0
# once duplicates have been dropped).
data.duplicated().sum()

0

In [12]:
# Separate the target (`selling_price`) from the feature columns.
target_column = 'selling_price'
Y = data[target_column].copy()
X = data.drop(columns=[target_column])

print(X.shape, Y.shape)

(6907, 12) (6907,)


In [13]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)))

(5525, 12) (1382, 12) (5525,) (1382,)


In [14]:
# Split the feature names by dtype so each group can get its own preprocessing.
num_col = list(X_train.select_dtypes(include='number').columns)
cat_col = list(X_train.select_dtypes(include='object').columns)

print(num_col, cat_col, sep='\n')

['year', 'km_driven', 'mileage_mpg', 'engine_cc', 'max_power_bhp', 'torque_nm', 'seats']
['company', 'owner', 'fuel', 'seller_type', 'transmission']


In [24]:
# Numeric features: fill gaps with the column median, then standardise.
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical features: replace any missing value with the literal 'missing',
# then one-hot encode. Categories not seen during fit are ignored at predict
# time (handle_unknown='ignore') instead of raising.
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Route each column group (num_col / cat_col) through its own sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipe, num_col),
    ('cat', cat_pipe, cat_col)
])

# Small, shallow forest with a fixed seed for repeatable results.
regressor = RandomForestRegressor(
    n_estimators=10,
    max_depth=5,
    random_state=42
)

# Chain preprocessing and the regressor so both are fitted on the training data
# only and applied consistently at predict time. The preprocessor is NOT fitted
# separately beforehand: Pipeline.fit fits it as its first step, so a standalone
# fit_transform whose result is discarded would only repeat the same work.
rf_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', regressor)
])

rf_model.fit(X_train, Y_train)


In [25]:
# Predict on both splits with the fitted pipeline.
y_train_predict = rf_model.predict(X_train)
y_test_predict = rf_model.predict(X_test)

# RMSE = square root of the mean squared error, in the target's own units.
train_mse = mean_squared_error(Y_train, y_train_predict)
test_mse = mean_squared_error(Y_test, y_test_predict)
train_rmse = np.sqrt(train_mse)
test_rmse = np.sqrt(test_mse)

# Report train first, then test, so the two errors can be compared directly.
print(f'Train RMSE: {train_rmse:,.3f}')
print(f'Test RMSE: {test_rmse:,.3f}')


Train RMSE: 169,947.490
Test RMSE: 172,392.131


In [15]:
# Preview the first rows of the training features (target column excluded).
X_train.head()

Unnamed: 0,company,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats
3442,Mahindra,2012,Second,Diesel,Individual,Manual,120000,28.31,2179.0,120.0,290.0,8.0
1883,Tata,2011,First,Diesel,Individual,Manual,156000,27.19,2179.0,138.1,320.0,7.0
5200,Mahindra,2012,First,Diesel,Individual,Manual,120000,37.5,2523.0,62.1,195.0,7.0
969,Ford,2011,Third,Diesel,Individual,Manual,110000,41.84,1399.0,68.0,159.848395,5.0
4520,Tata,2017,First,Petrol,Individual,Manual,20000,56.03,1199.0,84.0,114.0,5.0
