In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    root_mean_squared_error,
)

In [44]:
df = pd.read_csv("car-details.csv")
df.sample(5)

Unnamed: 0,name,company,model,edition,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
5291,Hyundai i10 Era,Hyundai,i10,Era,2009,First,Petrol,Individual,Manual,35000,46.56,1086.0,68.05,99.04,5.0,160000
2882,Ford Figo Diesel Titanium,Ford,Figo,Diesel Titanium,2011,Second,Diesel,Individual,Manual,70000,47.0,1399.0,68.0,160.0,5.0,204999
3451,Hyundai i10 Sportz 1.1L,Hyundai,i10,Sportz 1.1L,2008,Third,Petrol,Individual,Manual,100000,46.56,1086.0,68.05,99.04,5.0,180000
5559,Honda City i VTEC V,Honda,City,i VTEC V,2015,First,Petrol,Individual,Manual,80000,40.88,1497.0,117.3,145.0,5.0,650000
3895,Maruti Wagon R VXI BS IV,Maruti,Wagon,R VXI BS IV,2013,Second,Petrol,Individual,Manual,57000,48.2,998.0,67.04,90.0,5.0,300000


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6926 entries, 0 to 6925
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6926 non-null   object 
 1   company        6926 non-null   object 
 2   model          6926 non-null   object 
 3   edition        6926 non-null   object 
 4   year           6926 non-null   int64  
 5   owner          6926 non-null   object 
 6   fuel           6926 non-null   object 
 7   seller_type    6926 non-null   object 
 8   transmission   6926 non-null   object 
 9   km_driven      6926 non-null   int64  
 10  mileage_mpg    6718 non-null   float64
 11  engine_cc      6718 non-null   float64
 12  max_power_bhp  6717 non-null   float64
 13  torque_nm      6717 non-null   float64
 14  seats          6718 non-null   float64
 15  selling_price  6926 non-null   int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 865.9+ KB


In [21]:
df.isna().sum()

name               0
company            0
model              0
edition            0
year               0
owner              0
fuel               0
seller_type        0
transmission       0
km_driven          0
mileage_mpg      208
engine_cc        208
max_power_bhp    209
torque_nm        209
seats            208
selling_price      0
dtype: int64

In [22]:
df.shape

(6926, 16)

In [23]:
for col in df.select_dtypes(include="O").columns:
    print(f"column: {col}")
    print(f"cardinality: {df[col].nunique()}")
    print(df[col].unique())
    print(df.value_counts(normalize=True))
    print()

column: name
cardinality: 2058
['Maruti Swift Dzire VDI' 'Skoda Rapid 1.5 TDI Ambition'
 'Honda City 2017-2020 EXi' ... 'Tata Nexon 1.5 Revotorq XT'
 'Ford Freestyle Titanium Plus Diesel BSIV'
 'Toyota Innova 2.5 GX (Diesel) 8 Seater BS IV']
name                               company     model    edition              year  owner   fuel    seller_type  transmission  km_driven  mileage_mpg  engine_cc  max_power_bhp  torque_nm  seats  selling_price
Ambassador CLASSIC 1500 DSL AC     Ambassador  CLASSIC  1500 DSL AC          2000  Second  Diesel  Individual   Manual        90000      30.08        1489.0     35.5           72.9       5.0    75000            0.000149
Ambassador Classic 2000 DSZ AC PS  Ambassador  Classic  2000 DSZ AC PS       1994  Second  Diesel  Individual   Manual        100000     30.08        1995.0     52.0           106.0      5.0    99000            0.000149
Ambassador Grand 1500 DSZ BSIII    Ambassador  Grand    1500 DSZ BSIII       2008  Second  Diesel  Individual 

In [24]:
df = df.drop(columns=["name", "model", "edition"])
df.head()

Unnamed: 0,company,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
0,Maruti,2014,First,Diesel,Individual,Manual,145500,55.0,1248.0,74.0,190.0,5.0,450000
1,Skoda,2014,Second,Diesel,Individual,Manual,120000,49.7,1498.0,103.52,250.0,5.0,370000
2,Honda,2006,Third,Petrol,Individual,Manual,140000,41.6,1497.0,78.0,124.544455,5.0,158000
3,Hyundai,2010,First,Diesel,Individual,Manual,127000,54.06,1396.0,90.0,219.66896,5.0,225000
4,Maruti,2007,First,Petrol,Individual,Manual,120000,37.84,1298.0,88.2,112.776475,5.0,130000


In [25]:
df = df.drop_duplicates()

In [26]:
df.duplicated().sum()

np.int64(0)

In [27]:
X = df.drop(columns=["selling_price"])
y = df.selling_price.copy()
print(X.shape, y.shape)

(6907, 12) (6907,)


In [29]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(5525, 12) (1382, 12) (5525,) (1382,)


In [35]:
num_cols = X_train.select_dtypes(include="number").columns.tolist()
cat_cols = [col for col in X_train.columns if col not in num_cols]

print(num_cols)
print(cat_cols)

['year', 'km_driven', 'mileage_mpg', 'engine_cc', 'max_power_bhp', 'torque_nm', 'seats']
['company', 'owner', 'fuel', 'seller_type', 'transmission']


In [40]:
num_pipe = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

cat_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[("num", num_pipe, num_cols), ("cat", cat_pipe, cat_cols)]
)
preprocessor.fit_transform(X_train)

regressor = RandomForestRegressor(n_estimators=10, max_depth=5, random_state=42)

rf_model = Pipeline(steps=[("pre", preprocessor), ("reg", regressor)])

rf_model.fit(X_train, y_train)

0,1,2
,steps,"[('pre', ...), ('reg', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [46]:
y_train_pred = rf_model.predict(X_train)
train_rmse = root_mean_squared_error(y_train, y_train_pred)
print("Train RMSE:", train_rmse)

y_test_pred = rf_model.predict(X_test)
test_rmse = root_mean_squared_error(y_test, y_test_pred)
print("Test RMSE:", test_rmse)

Train RMSE: 169947.48964050272
Test RMSE: 172392.1313605195
