In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
# Load the training and test tables from CSV files in the working directory.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv("test.csv")

# A notebook cell renders only its final bare expression, so a standalone
# `train_data.head()` above `test_data.head()` is evaluated and thrown away.
# display() (a builtin inside IPython/Jupyter kernels) renders both previews.
display(train_data.head(), test_data.head())

Unnamed: 0,listing_id,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,...,dereg_value,mileage,omv,arf,opc_scheme,lifespan,eco_category,features,accessories,indicative_price
0,1303772,Honda Vezel 1.5A X,honda,vezel,4614,2015.0,,29-apr-2015,suv,parf car,...,9582.0,112000.0,19229.0,9229.0,,,uncategorized,"powerful 1.5l i-vtec engine producing 128bhp, ...","pioneer touch screen with reverse camera, 16"" ...",
1,1323166,Mazda 3 1.6A SP (COE till 10/2027),mazda,3,extremely well maintained and in pristine cond...,2007.0,,26-oct-2007,mid-sized sedan,"coe car, premium ad car, low mileage car",...,13644.0,120000.0,14347.0,15782.0,,,uncategorized,fuel efficient 1.6l 4-cylinder inline 16-valve...,"multi-function steering wheel, keyless entry, ...",
2,1308405,MINI Cooper S Countryman 2.0A,mini,cooper,1 owner! beautiful island blue color! eurokars...,2019.0,,27-mar-2020,sports car,parf car,...,54818.0,43000.0,39863.0,47809.0,,,uncategorized,"output of 141kw, 189bhp at 5000rpm to 6000rpm,...","18"" sports rims, sports leather seats, navigat...",
3,1216706,Toyota Vios 1.5A G,toyota,vios,fully agent maintain! genuine low mileage at 5...,2019.0,,28-jun-2019,mid-sized sedan,"parf car, premium ad car",...,26363.0,53300.0,15573.0,15573.0,,,uncategorized,"1.5l 4 cylinder 16 valves dohc vvt-i engine, 7...","push start button, toyota factory player, reve...",
4,1298206,Mazda 3 HB 1.5A,mazda,3,workshop check/sta evaluation available. accid...,2015.0,,19-nov-2015,hatchback,"parf car, premium ad car",...,15197.0,149000.0,18097.0,13097.0,,,uncategorized,1.5l 4 cylinder inline dohc 16 valves skyactiv...,factory fitted audio with audio & multi functi...,


In [3]:
# Report the row count of each table, then show the training column labels.
print(f'Training data number = {len(train_data)}')
print(f'Test data number = {len(test_data)}\n')
train_data.columns

Training data number = 25000
Test data number = 10000



Index(['listing_id', 'title', 'make', 'model', 'description', 'manufactured',
       'original_reg_date', 'reg_date', 'type_of_vehicle', 'category',
       'transmission', 'curb_weight', 'power', 'fuel_type', 'engine_cap',
       'no_of_owners', 'depreciation', 'coe', 'road_tax', 'dereg_value',
       'mileage', 'omv', 'arf', 'opc_scheme', 'lifespan', 'eco_category',
       'features', 'accessories', 'indicative_price', 'price'],
      dtype='object')

In [4]:
# Per-column missing-value counts: numeric training columns, object (text)
# training columns, and numeric test columns.
# Written as three bare expressions, only the last one would be rendered by
# the notebook; display() shows all three summaries.
display(
    train_data.select_dtypes(exclude=['object']).isna().sum(),
    train_data.select_dtypes(include=['object']).isna().sum(),
    test_data.select_dtypes(exclude=['object']).isna().sum(),
)

listing_id              0
manufactured            3
curb_weight           110
power                1086
engine_cap            235
no_of_owners            8
depreciation          201
coe                     0
road_tax             1082
dereg_value            83
mileage              2166
omv                    29
arf                    65
indicative_price    10000
dtype: int64

In [5]:
# Drop the 'listing_id' identifier and 'indicative_price' (all missing, per the
# original author's note). Reassigning instead of inplace=True, together with
# errors='ignore', keeps this cell safe to re-run: a second in-place drop would
# raise KeyError because the columns are already gone.
train_data = train_data.drop(columns=['listing_id', 'indicative_price'], errors='ignore')

# Per-column fill values for the numerical columns.
# The previous form `train_data[col].fillna(value, inplace=True)` is chained
# assignment: it operates on an intermediate Series, emits a FutureWarning in
# pandas 2.2 and silently leaves train_data unchanged under Copy-on-Write.
# A single DataFrame.fillna with a {column: value} mapping yields the same
# imputed values without relying on that behaviour.
# NOTE(review): these statistics are computed on the full training table,
# before the later train_test_split, so the hold-out rows influence them.
# NOTE(review): the target 'price' is imputed with its median as well; dropping
# rows without a target may be preferable - confirm intent.
# NOTE(review): test_data is not imputed in this cell.
numeric_fill_values = {
    'manufactured': train_data['manufactured'].median(),
    'curb_weight': train_data['curb_weight'].mean(),
    'power': train_data['power'].median(),
    'engine_cap': train_data['engine_cap'].mean(),
    'no_of_owners': train_data['no_of_owners'].mode()[0],  # most frequent value
    'depreciation': train_data['depreciation'].median(),
    'coe': train_data['coe'].median(),
    'road_tax': train_data['road_tax'].median(),
    'dereg_value': train_data['dereg_value'].median(),
    'mileage': train_data['mileage'].median(),
    'omv': train_data['omv'].median(),
    'arf': train_data['arf'].median(),
    'price': train_data['price'].median(),
}
train_data = train_data.fillna(numeric_fill_values)

# Any value still missing (i.e. in columns not listed above) becomes 0.
train_data = train_data.fillna(0)

In [6]:
# Keep only the columns whose dtype is not 'object' in both tables.
train_data = train_data.select_dtypes(exclude=['object'])
test_data = test_data.select_dtypes(exclude=['object'])

In [7]:
# Remaining missing values per column in each table.
# As two bare expressions only the test_data summary would be rendered, hiding
# the training-side result; display() shows both.
display(train_data.isna().sum(), test_data.isna().sum())

listing_id              0
manufactured            3
curb_weight           110
power                1086
engine_cap            235
no_of_owners            8
depreciation          201
coe                     0
road_tax             1082
dereg_value            83
mileage              2166
omv                    29
arf                    65
indicative_price    10000
dtype: int64

In [8]:
# Preview both tables after the object columns were removed.
# A bare `train_data.head()` followed by another expression is discarded by
# the notebook, so both previews are passed to display() explicitly.
display(train_data.head(), test_data.head())

Unnamed: 0,listing_id,manufactured,curb_weight,power,engine_cap,no_of_owners,depreciation,coe,road_tax,dereg_value,mileage,omv,arf,indicative_price
0,1303772,2015.0,1190.0,96.0,1496.0,2.0,17660.0,57199,682.0,9582.0,112000.0,19229.0,9229.0,
1,1323166,2007.0,1235.0,79.0,1598.0,1.0,10920.0,42564,1113.0,13644.0,120000.0,14347.0,15782.0,
2,1308405,2019.0,1535.0,141.0,1998.0,1.0,22120.0,32801,1210.0,54818.0,43000.0,39863.0,47809.0,
3,1216706,2019.0,1100.0,79.0,1496.0,3.0,13700.0,29159,682.0,26363.0,53300.0,15573.0,15573.0,
4,1298206,2015.0,1324.0,88.0,1496.0,3.0,14190.0,56001,682.0,15197.0,149000.0,18097.0,13097.0,


In [9]:
# Hold out 20% of the training table for evaluation; every column except
# 'price' is a feature and 'price' is the target. The fixed random_state makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_data.drop(columns=['price']),
    train_data['price'],
    test_size=0.2,
    random_state=35,
)

# Only the final bare expression (y_test) is rendered as the cell output.
X_train
X_test
y_train
y_test

17013     88800.0
6549       7800.0
18919     46800.0
9011      54000.0
1164      73800.0
           ...   
17306     51000.0
24494     46800.0
8800      14000.0
18904    510000.0
4326      42800.0
Name: price, Length: 5000, dtype: float64

In [10]:
def rmse_score(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return sqrt(mse)

In [11]:
# Print per-column NaN counts for the training and hold-out feature frames.
for label, frame in (("X_train", X_train), ("X_test", X_test)):
    print(f"NaNs in {label}:", np.isnan(frame).sum())

NaNs in X_train: manufactured    0
curb_weight     0
power           0
engine_cap      0
no_of_owners    0
depreciation    0
coe             0
road_tax        0
dereg_value     0
mileage         0
omv             0
arf             0
dtype: int64
NaNs in X_test: manufactured    0
curb_weight     0
power           0
engine_cap      0
no_of_owners    0
depreciation    0
coe             0
road_tax        0
dereg_value     0
mileage         0
omv             0
arf             0
dtype: int64


In [12]:
# Model 1: Lasso regression fitted on the training split, then used to predict
# the hold-out split. fit() returns the estimator, so the calls can be chained.
model1 = Lasso(
    alpha=0.7,
    max_iter=10000,
    warm_start=True,
    random_state=50,
    selection='random',
)
y_pred1 = model1.fit(X_train, y_train).predict(X_test)

In [13]:
# Model 2: bagging ensemble with max_samples=950 and a fixed seed, fitted on
# the training split and evaluated on the hold-out split.
model2 = BaggingRegressor(
    random_state=50,
    max_samples=950,
)
y_pred2 = model2.fit(X_train, y_train).predict(X_test)

In [14]:
# Model 3: random forest with max_features=0.43 and a fixed seed, fitted on
# the training split and evaluated on the hold-out split.
model3 = RandomForestRegressor(
    random_state=50,
    max_features=0.43,
    min_samples_leaf=1,
)
y_pred3 = model3.fit(X_train, y_train).predict(X_test)

In [15]:
# Model 4: gradient boosting trained directly on price. Hyperparameters are
# collected in a literal and unpacked into the constructor.
model4 = GradientBoostingRegressor(
    **{
        'random_state': 50,
        'min_samples_split': 6,
        'min_samples_leaf': 3,
        'max_depth': 4,
    }
)

# Fit on the training split and predict the hold-out split in one chain.
y_pred4 = model4.fit(X_train, y_train).predict(X_test)

In [16]:
# Residual stacking: a gradient-boosting model learns what the random forest
# (model3) gets wrong on the training split, and its predicted correction is
# added to the forest's hold-out predictions.
# NOTE(review): the residuals come from in-sample forest predictions, which
# may be smaller than the forest's errors on unseen data.
training_residuals = y_train - model3.predict(X_train)

# Fit the residual learner on a fresh estimator built from model4's
# hyperparameters. Calling model4.fit(...) here would refit the price model
# in place, so `model4` would silently stop being the model that produced
# y_pred4 - a hidden-state hazard when cells are re-run or model4 is reused.
# Any downstream code that needs the residual correction should use
# `residual_model`.
residual_model = GradientBoostingRegressor(**model4.get_params())
residual_model.fit(X_train, training_residuals)

pred_residuals = residual_model.predict(X_test)
y_pred5 = pred_residuals + model3.predict(X_test)

In [17]:
# Hold-out predictions in model order; list position + 1 is the method number
# printed by the evaluation cell.
y_pred = [
    y_pred1,  # Lasso
    y_pred2,  # BaggingRegressor
    y_pred3,  # RandomForestRegressor
    y_pred4,  # GradientBoostingRegressor
    y_pred5,  # RandomForest + gradient-boosted residual correction
]

In [18]:
# Naive reference: predict the training-split mean price for every hold-out row.
mean_price = y_train.mean()
baseline_predictions = [mean_price] * len(y_test)
# Use the notebook's rmse_score helper rather than repeating its formula inline.
baseline_rmse = rmse_score(y_test, baseline_predictions)

print(f"Baseline RMSE (Predicting Mean): {baseline_rmse:.2f}")

# Score each set of hold-out predictions; enumerate(start=1) supplies the
# method number that was previously tracked with a manual counter.
for method, predictions in enumerate(y_pred, start=1):
    r2_test = r2_score(y_test, predictions)
    rmse = rmse_score(y_test, predictions)
    print(f"R-squared (Test): {r2_test}")
    print(f"RMSE{method}: {rmse}\n")

Baseline RMSE (Predicting Mean): 170308.83
R-squared (Test): 0.9145353541444644
RMSE1: 49780.49403032417

R-squared (Test): 0.9075785293640352
RMSE2: 51766.927714613485

R-squared (Test): 0.9683655095279439
RMSE3: 30286.310851068363

R-squared (Test): 0.9677196818671912
RMSE4: 30593.90098008701

R-squared (Test): 0.9758164917243549
RMSE5: 26480.46490239439

