In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

In [24]:
# Load the car listings; the path is resolved relative to the notebook's working directory.
df = pd.read_csv('car-details.csv')
# Seed the preview so a fresh Restart & Run All shows the same five rows every time
# (same seed value as the train/test split and the model further down).
df.sample(5, random_state=42)

Unnamed: 0,name,company,model,edition,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
3340,Hyundai i10 Asta Sunroof AT,Hyundai,i10,Asta Sunroof AT,2008,Second,Petrol,Individual,Automatic,120000,45.12,1197.0,80.0,111.79581,5.0,220000
1035,Mahindra Scorpio M2DI,Mahindra,Scorpio,M2DI,2011,Third,Diesel,Individual,Manual,120000,31.72,2179.0,115.0,277.5,5.0,350000
3413,Mahindra Bolero 2011-2019 Special Edition,Mahindra,Bolero,2011-2019 Special Edition,2015,First,Diesel,Individual,Manual,40000,37.5,2523.0,62.1,195.0,7.0,600000
5990,Maruti S-Cross 2017-2020 Zeta DDiS 200 SH,Maruti,S-Cross,2017-2020 Zeta DDiS 200 SH,2018,First,Diesel,Individual,Manual,35000,59.0,1248.0,88.5,200.0,5.0,899000
2841,Maruti Wagon R VXI AMT 1.2,Maruti,Wagon,R VXI AMT 1.2,2018,First,Petrol,Dealer,Automatic,32995,48.22,1197.0,81.8,113.0,5.0,445000


In [25]:
# Summarize each column's dtype and non-null count to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6926 entries, 0 to 6925
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6926 non-null   object 
 1   company        6926 non-null   object 
 2   model          6926 non-null   object 
 3   edition        6926 non-null   object 
 4   year           6926 non-null   int64  
 5   owner          6926 non-null   object 
 6   fuel           6926 non-null   object 
 7   seller_type    6926 non-null   object 
 8   transmission   6926 non-null   object 
 9   km_driven      6926 non-null   int64  
 10  mileage_mpg    6718 non-null   float64
 11  engine_cc      6718 non-null   float64
 12  max_power_bhp  6717 non-null   float64
 13  torque_nm      6717 non-null   float64
 14  seats          6718 non-null   float64
 15  selling_price  6926 non-null   int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 865.9+ KB


In [26]:
# Count missing values per column.
df.isna().sum()

name               0
company            0
model              0
edition            0
year               0
owner              0
fuel               0
seller_type        0
transmission       0
km_driven          0
mileage_mpg      208
engine_cc        208
max_power_bhp    209
torque_nm        209
seats            208
selling_price      0
dtype: int64

In [27]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(6926, 16)

In [28]:

# Profile every numeric column: number of distinct values, the distinct values
# themselves, and the relative frequency of each value.
numeric_frame = df.select_dtypes(include=['number'])
for column_name, series in numeric_frame.items():
    print(f"Column: {column_name}")
    print(f'Cardinality: {series.nunique()}')
    print(series.unique())
    print(series.value_counts(normalize=True))
    print()


Column: year
Cardinality: 29
[2014 2006 2010 2007 2017 2001 2011 2013 2005 2009 2016 2012 2002 2015
 2018 2003 2019 2008 2020 1999 2000 1983 2004 1996 1994 1995 1998 1997
 1991]
year
2017    0.116662
2016    0.100058
2015    0.098325
2018    0.087785
2012    0.087208
2014    0.083887
2013    0.081144
2011    0.079988
2010    0.054866
2019    0.050101
2009    0.034652
2008    0.029599
2007    0.025411
2006    0.017759
2005    0.013139
2020    0.009096
2004    0.008663
2003    0.006786
2002    0.003754
2000    0.002888
1999    0.002455
1997    0.001588
1998    0.001444
2001    0.001299
1996    0.000433
1994    0.000433
1995    0.000289
1983    0.000144
1991    0.000144
Name: proportion, dtype: float64

Column: km_driven
Cardinality: 921
[ 145500  120000  140000  127000   45000  175000    5000   90000  169000
   68000  100000   80000   40000   70000   53000   50000   72000   35000
   28000   25000    2388   16200   10000   15000   42000   60000   76000
   28900   86300   23300   32600   1

In [29]:
# Remove the free-text name/model/edition columns from the feature frame.
# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so without it
# a second execution would raise KeyError on the already-removed columns.
df = df.drop(columns=['name', 'model', 'edition'], errors='ignore')
df.head()

Unnamed: 0,company,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
0,Maruti,2014,First,Diesel,Individual,Manual,145500,55.0,1248.0,74.0,190.0,5.0,450000
1,Skoda,2014,Second,Diesel,Individual,Manual,120000,49.7,1498.0,103.52,250.0,5.0,370000
2,Honda,2006,Third,Petrol,Individual,Manual,140000,41.6,1497.0,78.0,124.544455,5.0,158000
3,Hyundai,2010,First,Diesel,Individual,Manual,127000,54.06,1396.0,90.0,219.66896,5.0,225000
4,Maruti,2007,First,Petrol,Individual,Manual,120000,37.84,1298.0,88.2,112.776475,5.0,130000


In [30]:
# Count fully identical rows before removing them. The count is printed explicitly
# because a notebook only echoes the last expression of a cell.
n_duplicates = df.duplicated().sum()
print(f"Duplicate rows removed: {n_duplicates}")
df = df.drop_duplicates()
# Sanity check shown as the cell output: no duplicates should remain.
df.duplicated().sum()

np.int64(0)

In [31]:
# Separate the prediction target (selling_price) from the feature columns.
y = df['selling_price'].copy()
X = df.drop(columns='selling_price')

print(X.shape, y.shape)

(6907, 12) (6907,)


In [32]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Report (features, target) shapes for the train split, then the test split.
for split_X, split_y in ((X_train, y_train), (X_test, y_test)):
    print(split_X.shape, split_y.shape)

(5525, 12) (5525,)
(1382, 12) (1382,)


In [33]:
# Derive both feature lists from the same frame (the training split) so the numeric
# and categorical groups are exact complements and cannot drift apart.
num_cols = X_train.select_dtypes(include=['number']).columns.tolist()
cat_cols = X_train.select_dtypes(exclude=['number']).columns.tolist()
print(num_cols)
print(cat_cols)

['year', 'km_driven', 'mileage_mpg', 'engine_cc', 'max_power_bhp', 'torque_nm', 'seats']
['company', 'owner', 'fuel', 'seller_type', 'transmission']


In [34]:
# Numeric branch: fill gaps with the column median, then standardize.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: label gaps with the literal 'missing', then one-hot encode.
# handle_unknown='ignore' keeps prediction from failing on categories not seen during fit.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Send each column list through its own branch.
preprocessor = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols),
])

# Random forest with a fixed seed so repeated fits give the same model.
regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Chain preprocessing and the regressor so both are fitted on the training split only.
rf_model = Pipeline([
    ('pre', preprocessor),
    ('reg', regressor),
])

rf_model.fit(X_train, y_train)


0,1,2
,steps,"[('pre', ...), ('reg', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [37]:
# Predict on both splits with the fitted pipeline (train first, then test).
y_train_pred, y_test_pred = (
    rf_model.predict(features) for features in (X_train, X_test)
)

# RMSE is the square root of the mean squared error, computed per split.
train_rmse, test_rmse = (
    np.sqrt(mean_squared_error(actual, predicted))
    for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)

# Report both errors side by side so the train/test gap is visible.
print(f"Train RMSE: {train_rmse:,.3f}")
print(f"Test RMSE:  {test_rmse:,.3f}")

Train RMSE: 64,431.270
Test RMSE:  128,048.721
