<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [743]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn import svm
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import Lasso
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor,RandomForestClassifier
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostRegressor, VotingRegressor
from sklearn.datasets import make_regression
from sklearn.svm import SVR
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder,LabelEncoder
from sklearn.impute import SimpleImputer

In [744]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv("test.csv")

train_data.head()
test_data.head()

Unnamed: 0,listing_id,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,...,dereg_value,mileage,omv,arf,opc_scheme,lifespan,eco_category,features,accessories,indicative_price
0,1303772,Honda Vezel 1.5A X,honda,vezel,4614,2015.0,,29-apr-2015,suv,parf car,...,9582.0,112000.0,19229.0,9229.0,,,uncategorized,"powerful 1.5l i-vtec engine producing 128bhp, ...","pioneer touch screen with reverse camera, 16"" ...",
1,1323166,Mazda 3 1.6A SP (COE till 10/2027),mazda,3,extremely well maintained and in pristine cond...,2007.0,,26-oct-2007,mid-sized sedan,"coe car, premium ad car, low mileage car",...,13644.0,120000.0,14347.0,15782.0,,,uncategorized,fuel efficient 1.6l 4-cylinder inline 16-valve...,"multi-function steering wheel, keyless entry, ...",
2,1308405,MINI Cooper S Countryman 2.0A,mini,cooper,1 owner! beautiful island blue color! eurokars...,2019.0,,27-mar-2020,sports car,parf car,...,54818.0,43000.0,39863.0,47809.0,,,uncategorized,"output of 141kw, 189bhp at 5000rpm to 6000rpm,...","18"" sports rims, sports leather seats, navigat...",
3,1216706,Toyota Vios 1.5A G,toyota,vios,fully agent maintain! genuine low mileage at 5...,2019.0,,28-jun-2019,mid-sized sedan,"parf car, premium ad car",...,26363.0,53300.0,15573.0,15573.0,,,uncategorized,"1.5l 4 cylinder 16 valves dohc vvt-i engine, 7...","push start button, toyota factory player, reve...",
4,1298206,Mazda 3 HB 1.5A,mazda,3,workshop check/sta evaluation available. accid...,2015.0,,19-nov-2015,hatchback,"parf car, premium ad car",...,15197.0,149000.0,18097.0,13097.0,,,uncategorized,1.5l 4 cylinder inline dohc 16 valves skyactiv...,factory fitted audio with audio & multi functi...,


In [745]:
def categorical(train_data, column, t, other=None):
    if t=='One':
        enc = OneHotEncoder(sparse_output=False)
        enc_array = enc.fit_transform(train_data[[column]])
        print("Encoded array:\n", enc_array)
        enc_frame = pd.DataFrame(enc_array, columns=enc.get_feature_names_out([column]))
        print("Encoded DataFrame:\n", enc_frame)
        train_data = pd.concat([train_data.reset_index(drop=True), enc_frame.reset_index(drop=True)], axis=1)
        train_data.drop(columns=[column],inplace=True)
        print("Final DataFrame:\n", train_data)
    elif t=='Ord':
        enc = OrdinalEncoder(categories=other)
        train_data[column+"_ord"] = enc.fit_transform(train_data[[column]])
        train_data.drop(columns=[column],inplace=True)
    elif t=='drop':
        train_data.drop(columns=[column],inplace=True)
    return train_data

In [746]:
def fill(train_data, column, t,other=None):
    if t=='mean':
        if other!=None:
            train_data=train_data[train_data[column]>=other]
        train_data[column].fillna(train_data[column].mean(), inplace=True)
    elif t=='median':
        if other!=None:
            train_data=train_data[train_data[column]>=other]
        train_data[column].fillna(train_data[column].median(), inplace=True)
    elif t=='0':
        train_data[column].fillna(0, inplace=True)
    elif t=='None':
        train_data[column].fillna('None', inplace=True)
    elif t=='mode':
        train_data[column].fillna(train_data[column].mode()[0], inplace=True)
    elif t=='imp':
        imputer = SimpleImputer(strategy='most_frequent')
        train_data[column] = imputer.fit_transform(train_data[[column]])
        if other!=None:
            train_data=train_data[train_data[column]>=other]  
    elif t=='clear':
        idx = train_data[train_data[column]==0].index
        train_data.drop(idx, inplace=True)
    elif t=='exact':
        train_data[column].fillna(other, inplace=True)
    return train_data

In [747]:
nan_all_data = (train_data.isnull().sum())*100/train_data.shape[0]
nan_all_data= nan_all_data.drop(nan_all_data[nan_all_data== 0].index).sort_values(ascending=False)
nan_all_data
miss_df = pd.DataFrame({'Missing Ratio' :nan_all_data})
miss_df

Unnamed: 0,Missing Ratio
indicative_price,100.0
opc_scheme,99.352
original_reg_date,98.98
lifespan,90.684
fuel_type,76.484
mileage,21.216
accessories,15.252
power,10.56
road_tax,10.528
make,5.264


In [748]:
train_data=fill(train_data,'power','imp',40)
# train_data=fill(train_data,'fuel_type','None')
train_data=fill(train_data,'mileage','median',100)
train_data=fill(train_data,'road_tax','imp',100)
train_data=fill(train_data,'make','None')
train_data=fill(train_data,'make','clear')
train_data=fill(train_data,'engine_cap','exact',0)
train_data=fill(train_data,'depreciation','median',8000)
train_data=fill(train_data,'curb_weight','median',800)
train_data=fill(train_data,'dereg_value','median',1000)
# train_data=fill(train_data,'dereg_value','clear')
train_data=fill(train_data,'arf','imp',800)
train_data=fill(train_data,'omv','imp',8000)
train_data=fill(train_data,'no_of_owners','exact',1)
train_data=fill(train_data,'manufactured','imp')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data[column].fillna(train_data[column].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[column].fillna(train_data[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplac

In [749]:
nan_all_data = (train_data.isnull().sum())*100/train_data.shape[0]
nan_all_data= nan_all_data.drop(nan_all_data[nan_all_data== 0].index).sort_values(ascending=False)
nan_all_data
miss_df = pd.DataFrame({'Missing Ratio' :nan_all_data})
miss_df

Unnamed: 0,Missing Ratio
indicative_price,100.0
original_reg_date,99.734748
opc_scheme,99.464191
lifespan,96.546419
fuel_type,80.986737
accessories,9.193634
features,1.676393
description,1.517241


In [750]:
train_nan=train_data[['power','engine_cap','fuel_type']]
train_complete = train_nan[train_nan['fuel_type'].notnull()]
train_missing = train_nan[train_nan['fuel_type'].isnull()]
X1 = train_complete[['power', 'engine_cap']]
y1 = train_complete['fuel_type']
label_encoder = LabelEncoder()
y11 = label_encoder.fit_transform(y1)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y11, test_size=0.2, random_state=50)

In [751]:
model0 = RandomForestClassifier(n_estimators=100, random_state=50,max_depth=12)
model0.fit(X_train1, y_train1)
r2_test = r2_score(y_test1, model0.predict(X_test1))
print(f"R-squared (Test): {r2_test}")


R-squared (Test): 0.9763513635071585


In [752]:
X_missing = train_missing[['power', 'engine_cap']]

predicted_labels = model0.predict(X_missing)
train_data.loc[train_data['fuel_type'].isnull(), 'fuel_type'] = label_encoder.inverse_transform(predicted_labels)
print(train_data['fuel_type'].isnull().sum())


0


In [753]:
train_data=categorical(train_data,'listing_id','drop')
train_data=categorical(train_data,'title','drop')
# train_data=categorical(train_data,'make','One')
train_data=categorical(train_data,'make','drop')
train_data=categorical(train_data,'model','drop')
train_data=categorical(train_data,'description','drop')
# train_data=categorical(train_data,'manufactured','One')

train_data=categorical(train_data,'original_reg_date','drop') #
train_data=categorical(train_data,'reg_date','drop') #
# train_data=categorical(train_data,'type_of_vehicle','One') 
train_data=categorical(train_data,'type_of_vehicle','drop') 
# train_data=categorical(train_data,'category','One') 
train_data=categorical(train_data,'category','drop') 
# train_data=categorical(train_data,'transmission','One') 
train_data=categorical(train_data,'transmission','drop') 
train_data=categorical(train_data,'fuel_type','One') 
train_data=categorical(train_data,'no_of_owners','Ord',[['1','2','3','4','5','6']]) 
train_data=categorical(train_data,'opc_scheme','drop')
train_data=categorical(train_data,'lifespan','drop') 
train_data=categorical(train_data,'eco_category','drop')
train_data=categorical(train_data,'features','drop')
train_data=categorical(train_data,'accessories','drop')
train_data=categorical(train_data,'indicative_price','drop')

Encoded array:
 [[0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 ...
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]]
Encoded DataFrame:
        fuel_type_diesel  fuel_type_diesel-electric  fuel_type_electric  \
0                   0.0                        0.0                 0.0   
1                   0.0                        0.0                 0.0   
2                   0.0                        0.0                 0.0   
3                   1.0                        0.0                 0.0   
4                   1.0                        0.0                 0.0   
...                 ...                        ...                 ...   
18845               1.0                        0.0                 0.0   
18846               1.0                        0.0                 0.0   
18847               0.0                        0.0                 0.0   
18848               0.0                        0.0                 0.0   
18849               0.0             

In [754]:
nan_all_data = (train_data.isnull().sum())*100/train_data.shape[0]
nan_all_data= nan_all_data.drop(nan_all_data[nan_all_data== 0].index).sort_values(ascending=False)
nan_all_data
miss_df = pd.DataFrame({'Missing Ratio' :nan_all_data})
miss_df

Unnamed: 0,Missing Ratio


In [755]:
train_data.head()

Unnamed: 0,manufactured,curb_weight,power,engine_cap,depreciation,coe,road_tax,dereg_value,mileage,omv,arf,price,fuel_type_diesel,fuel_type_diesel-electric,fuel_type_electric,fuel_type_petrol,fuel_type_petrol-electric,no_of_owners_ord
0,2018.0,1884.0,280.0,2995.0,34270.0,48011,2380.0,103323.0,96000.0,88906.0,132031.0,193788.0,0.0,0.0,0.0,0.0,1.0,1.0
1,2017.0,1465.0,135.0,1991.0,21170.0,47002,1202.0,45179.0,85680.0,40678.0,43950.0,96800.0,0.0,0.0,0.0,0.0,1.0,1.0
2,2007.0,1648.0,118.0,2354.0,12520.0,50355,2442.0,16003.0,138000.0,27994.0,30794.0,39800.0,0.0,0.0,0.0,0.0,1.0,2.0
3,2008.0,1195.0,80.0,1598.0,10140.0,27571,1113.0,12184.0,160000.0,16084.0,16084.0,44800.0,1.0,0.0,0.0,0.0,0.0,2.0
4,2006.0,1660.0,183.0,2995.0,13690.0,48479,3570.0,9138.0,183000.0,50414.0,55456.0,25800.0,1.0,0.0,0.0,0.0,0.0,5.0


In [756]:
def remove_out(train_data, column):
    low=train_data[column].quantile(0)
    high=train_data[column].quantile(0.99)
    IQR=high-low
    l=low-1.3*IQR
    h=high+1.3*IQR
    print(column,low,high,train_data[(train_data[column]>=l)
        &(train_data[column]<=h)]['price'].count()/train_data['price'].count())
    return train_data[(train_data[column]>=l) &(train_data[column]<=h)]

In [757]:
remain=1
# cols=['price','omv','mileage','dereg_value','arf','depreciation'] #power
cols=['omv','power','dereg_value','arf','depreciation']
for col in cols:
    train_data1=remove_out(train_data,col)
    remain*=train_data1['price'].count()/train_data.count()
    train_data=train_data1

print(remain)

omv 8077.0 288656.5799999997 0.9994164456233422
power 47.0 456.0 1.0
dereg_value 1170.0 320867.5599999991 0.998885291151335
arf 801.0 385848.369999999 0.9996280157296206
depreciation 8040.0 100552.00000000026 0.9987241507628515
manufactured                 0.996658
curb_weight                  0.996658
power                        0.996658
engine_cap                   0.996658
depreciation                 0.996658
coe                          0.996658
road_tax                     0.996658
dereg_value                  0.996658
mileage                      0.996658
omv                          0.996658
arf                          0.996658
price                        0.996658
fuel_type_diesel             0.996658
fuel_type_diesel-electric    0.996658
fuel_type_electric           0.996658
fuel_type_petrol             0.996658
fuel_type_petrol-electric    0.996658
no_of_owners_ord             0.996658
dtype: float64


In [758]:
X_train, X_test, y_train, y_test = train_test_split(train_data.drop(columns = ['price']), train_data['price'], test_size=0.2, random_state=35)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [759]:
def rmse_score(y_true, y_pred):
    return sqrt(mean_squared_error(y_true, y_pred))

In [760]:
model4 = GradientBoostingRegressor(
    random_state=50, 
    min_samples_split = 6, 
    min_samples_leaf =4, 
    max_depth = 7
)

model4.fit(X_train, y_train) 

y_pred4 = model4.predict(X_test)
r2_test = r2_score(y_test, y_pred4)
rmse = rmse_score(y_test, y_pred4)
print(f"R-squared (Test): {r2_test}")
print(f"RMSE: {rmse}\n")

R-squared (Test): 0.9898075017383441
RMSE: 13260.82092677762



In [761]:
model13 = RandomForestRegressor(n_estimators=50,random_state=50,  min_samples_split=6,max_features='sqrt', 
                               min_samples_leaf=2,
                               max_depth=50,
                               min_impurity_decrease=0.3,)
model13.fit(X_train, y_train)
y_pred13 = model13.predict(X_test)
r2_test = r2_score(y_test, y_pred13)
rmse = rmse_score(y_test, y_pred13)
print(f"R-squared (Test): {r2_test}")
print(f"RMSE: {rmse}\n")

R-squared (Test): 0.9821531476806598
RMSE: 17547.32429186008



In [764]:
model7= AdaBoostRegressor(estimator=model4,random_state=50, n_estimators=20,loss='square',learning_rate=0.21)
model7.fit(X_train, y_train) 
y_pred7 = model7.predict(X_test)
r2_test = r2_score(y_test, y_pred7)
rmse = rmse_score(y_test, y_pred7)
print(f"R-squared (Test): {r2_test}")
print(f"RMSE: {rmse}\n")

R-squared (Test): 0.9918461881126275
RMSE: 11860.703007865974



In [None]:
model8= AdaBoostRegressor(estimator=model13,random_state=50, n_estimators=20,loss='square',learning_rate=0.21)
model8.fit(X_train, y_train) 
y_pred8 = model8.predict(X_test)
r2_test = r2_score(y_test, y_pred8)
rmse = rmse_score(y_test, y_pred8)
print(f"R-squared (Test): {r2_test}")
print(f"RMSE: {rmse}\n")

In [None]:
test_data = pd.read_csv("test.csv")
nan_all_data = (test_data.isnull().sum())*100/test_data.shape[0]
nan_all_data= nan_all_data.drop(nan_all_data[nan_all_data== 0].index).sort_values(ascending=False)
nan_all_data
miss_df = pd.DataFrame({'Missing Ratio' :nan_all_data})
miss_df

In [None]:
# test_data=fill(test_data,'fuel_type','exact','petrol')
test_data=fill(test_data,'mileage','mean')
test_data=fill(test_data,'power','imp')
test_data=fill(test_data,'road_tax','imp')
test_data=fill(test_data,'make','None')
test_data=fill(test_data,'make','clear')
test_data=fill(test_data,'engine_cap','exact',0)
test_data=fill(test_data,'depreciation','mean')
test_data=fill(test_data,'curb_weight','mean')
test_data=fill(test_data,'dereg_value','mean')
# train_data=fill(train_data,'dereg_value','clear')
test_data=fill(test_data,'arf','imp')
test_data=fill(test_data,'omv','imp')
test_data=fill(test_data,'no_of_owners','exact',1)
test_data=fill(test_data,'manufactured','imp')

In [None]:
test_missing = test_data[test_data['fuel_type'].isnull()]
X_missing = test_missing[['power', 'engine_cap']]
predicted_labels = model0.predict(X_missing)
test_data.loc[test_data['fuel_type'].isnull(), 'fuel_type'] = label_encoder.inverse_transform(predicted_labels)

In [None]:

test_data=categorical(test_data,'listing_id','drop')
test_data=categorical(test_data,'title','drop')
# test_data=categorical(test_data,'make','One')
test_data=categorical(test_data,'make','drop')
test_data=categorical(test_data,'model','drop')
test_data=categorical(test_data,'description','drop')
# test_data=categorical(test_data,'manufactured','One')

test_data=categorical(test_data,'original_reg_date','drop') #
test_data=categorical(test_data,'reg_date','drop') #
# test_data=categorical(test_data,'type_of_vehicle','One') 
test_data=categorical(test_data,'type_of_vehicle','drop') 
# test_data=categorical(test_data,'category','One') 
test_data=categorical(test_data,'category','drop') 
# test_data=categorical(test_data,'transmission','One') 
test_data=categorical(test_data,'transmission','drop') 
test_data=categorical(test_data,'fuel_type','One') 
test_data=categorical(test_data,'no_of_owners','Ord',[['1','2','3','4','5','6']]) 
test_data=categorical(test_data,'opc_scheme','drop')
test_data=categorical(test_data,'lifespan','drop') 
test_data=categorical(test_data,'eco_category','drop')
test_data=categorical(test_data,'features','drop')
test_data=categorical(test_data,'accessories','drop')
test_data=categorical(test_data,'indicative_price','drop')

In [None]:
test_data.head()

In [None]:
train_data.head()

In [None]:
scaler = StandardScaler()
test_data_y = scaler.fit_transform(test_data)
models=[model7]
y_pred_y=0
for model in models:
    y_pred_y = y_pred_y+model.predict(test_data_y)
    print(model)
y_pred_y=y_pred_y
result_df = pd.DataFrame({
    'Id': test_data.index,
    'Predicted': y_pred_y
})

result_df.to_csv('predicted_results.csv', index=False)