In [144]:
import numpy as np
import pandas as pd
from category_encoders import BinaryEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer

In [145]:
# Show every column when a frame is rendered; these tables are wide.
pd.set_option('display.max_columns', None)

# Raw competition data; the paths are relative to the notebook's working directory.
train_data = pd.read_csv('Data/train.csv')
test_data = pd.read_csv("Data/test.csv")

# Only the last expression of a cell is rendered automatically, so a bare
# `train_data.head()` placed above `test_data.head()` is evaluated and thrown
# away. display() (provided by IPython inside a notebook) renders both previews.
display(train_data.head(), test_data.head())

Unnamed: 0,listing_id,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,transmission,curb_weight,power,fuel_type,engine_cap,no_of_owners,depreciation,coe,road_tax,dereg_value,mileage,omv,arf,opc_scheme,lifespan,eco_category,features,accessories,indicative_price
0,1303772,Honda Vezel 1.5A X,honda,vezel,4614,2015.0,,29-apr-2015,suv,parf car,auto,1190.0,96.0,,1496.0,2.0,17660.0,57199,682.0,9582.0,112000.0,19229.0,9229.0,,,uncategorized,"powerful 1.5l i-vtec engine producing 128bhp, ...","pioneer touch screen with reverse camera, 16"" ...",
1,1323166,Mazda 3 1.6A SP (COE till 10/2027),mazda,3,extremely well maintained and in pristine cond...,2007.0,,26-oct-2007,mid-sized sedan,"coe car, premium ad car, low mileage car",auto,1235.0,79.0,,1598.0,1.0,10920.0,42564,1113.0,13644.0,120000.0,14347.0,15782.0,,,uncategorized,fuel efficient 1.6l 4-cylinder inline 16-valve...,"multi-function steering wheel, keyless entry, ...",
2,1308405,MINI Cooper S Countryman 2.0A,mini,cooper,1 owner! beautiful island blue color! eurokars...,2019.0,,27-mar-2020,sports car,parf car,auto,1535.0,141.0,,1998.0,1.0,22120.0,32801,1210.0,54818.0,43000.0,39863.0,47809.0,,,uncategorized,"output of 141kw, 189bhp at 5000rpm to 6000rpm,...","18"" sports rims, sports leather seats, navigat...",
3,1216706,Toyota Vios 1.5A G,toyota,vios,fully agent maintain! genuine low mileage at 5...,2019.0,,28-jun-2019,mid-sized sedan,"parf car, premium ad car",auto,1100.0,79.0,,1496.0,3.0,13700.0,29159,682.0,26363.0,53300.0,15573.0,15573.0,,,uncategorized,"1.5l 4 cylinder 16 valves dohc vvt-i engine, 7...","push start button, toyota factory player, reve...",
4,1298206,Mazda 3 HB 1.5A,mazda,3,workshop check/sta evaluation available. accid...,2015.0,,19-nov-2015,hatchback,"parf car, premium ad car",auto,1324.0,88.0,,1496.0,3.0,14190.0,56001,682.0,15197.0,149000.0,18097.0,13097.0,,,uncategorized,1.5l 4 cylinder inline dohc 16 valves skyactiv...,factory fitted audio with audio & multi functi...,


In [146]:
# Row counts of both splits, followed by the training column index as the
# cell's displayed value.
print('Training data number = {}'.format(len(train_data)))
print('Test data number = {}\n'.format(len(test_data)))
train_data.columns

Training data number = 25000
Test data number = 10000



Index(['listing_id', 'title', 'make', 'model', 'description', 'manufactured',
       'original_reg_date', 'reg_date', 'type_of_vehicle', 'category',
       'transmission', 'curb_weight', 'power', 'fuel_type', 'engine_cap',
       'no_of_owners', 'depreciation', 'coe', 'road_tax', 'dereg_value',
       'mileage', 'omv', 'arf', 'opc_scheme', 'lifespan', 'eco_category',
       'features', 'accessories', 'indicative_price', 'price'],
      dtype='object')

In [147]:
from numpy import ndarray
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.optimize import curve_fit


def Gauss(x, A, B, C):
    """Evaluate the Gaussian curve ``A * exp(-(x - B)^2 / (2 * C^2))`` at ``x``.

    Parameters
    ----------
    x : scalar or array
        Point(s) at which the curve is evaluated.
    A : float
        Peak height of the curve.
    B : float
        Position of the peak.
    C : float
        Width parameter; it only enters squared, so its sign does not matter.

    Returns
    -------
    The curve value(s), with the same shape as ``x``.
    """
    offset = x - B
    exponent = -(offset**2) / (2 * C**2)
    return A * np.exp(exponent)

class TruncStandardScaler(BaseEstimator, TransformerMixin):
    """Map values through a Gaussian curve fitted to their histogram.

    ``fit`` histograms the data and fits ``Gauss`` (height, mean, scale) to the
    bin counts. ``transform`` then replaces every value ``x`` by
    ``Gauss(x, height_, mean_, scale_)``.

    Attributes (``None`` until ``fit`` has been called)
    ---------------------------------------------------
    height_ : fitted peak height of the curve.
    mean_   : fitted peak position.
    scale_  : fitted width parameter.
    """

    def __init__(self):
        self.height_ = None
        self.mean_ = None
        self.scale_ = None

    def _check_is_fitted(self):
        """Raise ``AssertionError`` unless all three curve parameters are set."""
        assert self.height_ is not None, 'You must fit the scaler before transforming data'
        assert self.mean_ is not None, 'You must fit the scaler before transforming data'
        assert self.scale_ is not None, 'You must fit the scaler before transforming data'

    def transform(self, X):
        """Return the fitted Gaussian evaluated at every element of ``X``.

        The input is not modified; the result is a new float array.
        Raises ``AssertionError`` if the scaler has not been fitted.
        """
        self._check_is_fitted()
        return Gauss(np.asarray(X, dtype=float), self.height_, self.mean_, self.scale_)

    def inverse_transform(self, X):
        """Map curve values back to the data axis.

        The Gaussian is symmetric around ``mean_``, so it has two pre-images for
        every value; the square root below is non-negative, hence this always
        returns the one at or above ``mean_``. Values larger than ``height_``
        make the logarithm negative and come back as NaN.

        Raises ``AssertionError`` if the scaler has not been fitted.
        """
        self._check_is_fitted()

        # Work on a float copy: with an integer input the small epsilon written
        # below would be truncated back to 0 and the division would still blow up.
        _X = np.array(X, dtype=float)
        _X[_X == 0] = 0.000001  # avoid dividing by zero inside the logarithm
        return np.sqrt(2 * self.scale_**2 * np.log(self.height_ / _X)) + self.mean_

    def fit(self, X, bins=10000):
        """Fit the Gaussian curve to the histogram of ``X``.

        Parameters
        ----------
        X : array-like
            Data to histogram; it is flattened first.
        bins : int or sequence, default 10000
            Forwarded to ``np.histogram``.

        Returns
        -------
        self, so calls can be chained (``scaler.fit(X).transform(X)``).
        """
        values = np.asarray(X, dtype=float).ravel()
        hist, binedge = np.histogram(values, bins=bins)
        centers = (binedge[:-1] + binedge[1:]) / 2  # midpoint of every bin

        # curve_fit starts from all-ones when no initial guess is given, which is
        # far from the data for most real columns. Seed it from the histogram:
        # tallest bin for height/position, sample spread for the width.
        peak = int(np.argmax(hist))
        spread = values.std() if values.size else 0.0
        if not spread > 0:
            # Constant (or empty) input: fall back to one bin width so the
            # initial width is never zero.
            spread = binedge[1] - binedge[0]
        initial_guess = [hist[peak], centers[peak], spread]

        params, _ = curve_fit(Gauss, centers, hist, p0=initial_guess)
        self.height_, self.mean_, self.scale_ = params
        return self

    def fit_transform(self, X):
        """Fit on ``X`` and return ``transform(X)``."""
        return self.fit(X).transform(X)

In [148]:
# preprocess
# Columns removed from the training frame here and from the test frame later.
# Entries that are commented out are the columns that stay in the data.
drop_columns = [
    "listing_id",
    "title",
    "make",
    # "model",
    "description",
    # "manufactured",
    "original_reg_date",
    "reg_date",
    # "type_of_vehicle",
    # "category",
    # "transmission",
    "curb_weight",
    # "power",
    "fuel_type",
    # "engine_cap",
    # "no_of_owners",
    # "depreciation",
    "coe",
    "road_tax",
    # "dereg_value",
    "mileage",
    # "omv",
    # "arf",
    "opc_scheme",
    "lifespan",
    "eco_category",
    "features",
    "accessories",
    "indicative_price",
    # "price",
]

# Rebuild the frame instead of mutating it in place. With `inplace=True` the
# second execution of this cell failed with a KeyError because the columns were
# already gone; `errors="ignore"` skips columns that are no longer present, so
# the cell can be re-run. Side effect to be aware of: a misspelled name in
# drop_columns is skipped silently as well.
train_data = (
    train_data
    .drop(columns=drop_columns, errors="ignore")
    .dropna()                 # keep only rows with every remaining column present
    .reset_index(drop=True)   # contiguous index, needed for the axis=1 concats below
)
print("train_data.shape: {}".format(train_data.shape))
print(train_data.columns)


# Separate the model inputs from the regression target.
train_X = train_data.drop(columns=["price"])
train_Y = train_data["price"]


def _fit_minmax_column(frame, column):
    """Min-max scale ``frame[column]`` in place and return the fitted scaler."""
    scaler = MinMaxScaler()
    frame[column] = scaler.fit_transform(frame[column].values.reshape(-1, 1))
    return scaler


# One fitted scaler per numeric feature. Each is bound to its own module-level
# name because the submission cell applies the same scalers to the test data.
manufactured_scaler = _fit_minmax_column(train_X, "manufactured")
power_scaler = _fit_minmax_column(train_X, "power")
engine_capacity_scaler = _fit_minmax_column(train_X, "engine_cap")
depresiation_scaler = _fit_minmax_column(train_X, "depreciation")
dereg_value_scaler = _fit_minmax_column(train_X, "dereg_value")
omv_scaler = _fit_minmax_column(train_X, "omv")
arf_scaler = _fit_minmax_column(train_X, "arf")

# Binary flag for the gearbox; any value other than "auto"/"manual" becomes NaN.
transmission_codes = {"auto": 1, "manual": 0}
train_X["transmission"] = train_X["transmission"].map(transmission_codes)

# Binary-encode the car model: pop() removes the raw column and hands it to the
# encoder, then the encoded columns are appended on the right.
model_binary_encoder = BinaryEncoder()
model_labels = model_binary_encoder.fit_transform(train_X.pop("model"))
train_X = pd.concat([train_X, model_labels], axis=1)

# Same treatment for the vehicle type.
type_of_vehicle_encoder = BinaryEncoder()
type_labels = type_of_vehicle_encoder.fit_transform(train_X.pop("type_of_vehicle"))
train_X = pd.concat([train_X, type_labels], axis=1)


def _parse_categories(raw):
    """Split a comma-separated category string into a list of stripped labels.

    Tokens that are exactly "" or "-" are skipped; the comparison happens
    before stripping, as in the original expression.
    """
    labels = []
    for token in raw.split(","):
        if token in ("", "-"):
            continue
        labels.append(token.strip())
    return labels


# Multi-hot encode the category lists, one "category_<label>" column per label.
category_encoder = MultiLabelBinarizer()
category_lists = train_X.pop("category").map(_parse_categories)
category_labels = pd.DataFrame(
    category_encoder.fit_transform(category_lists).astype(np.float64),
    columns=["category_" + label for label in category_encoder.classes_],
)
train_X = pd.concat([train_X, category_labels], axis=1)


# Scale the target as a single-column array; price_scaler is kept so that
# predictions can be mapped back to prices later.
price_scaler = MinMaxScaler()
train_Y = price_scaler.fit_transform(train_Y.to_numpy().reshape(-1, 1))

# Hold out 5% of the rows for validation, with a fixed seed.
train_X, valid_X, train_Y, valid_Y = train_test_split(
    train_X,
    train_Y,
    test_size=0.05,
    random_state=35,
)

# Report the shape of each of the four parts.
for part_name, part in (
    ("train_X", train_X),
    ("train_Y", train_Y),
    ("valid_X", valid_X),
    ("valid_Y", valid_Y),
):
    print("{} shape = {}".format(part_name, part.shape))

train_data.shape: (21440, 13)
Index(['model', 'manufactured', 'type_of_vehicle', 'category', 'transmission',
       'power', 'engine_cap', 'no_of_owners', 'depreciation', 'dereg_value',
       'omv', 'arf', 'price'],
      dtype='object')
train_X shape = (20368, 37)
train_Y shape = (20368, 1)
valid_X shape = (1072, 37)
valid_Y shape = (1072, 1)


In [149]:
from sklearn.ensemble import *
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import *
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error


# Candidate regressors. Only the random forest (model4) is enabled; the others
# are kept commented out alongside their matching train_and_eval calls.

# Linear models
# model1 = Ridge()
# model2 = Lasso()
# model3 = ElasticNet()

# Tree ensembles
model4 = RandomForestRegressor(
    random_state=22,  # fixed seed
    n_jobs=15,        # number of parallel jobs handed to scikit-learn
)
# model5 = GradientBoostingRegressor()
# model6 = AdaBoostRegressor()

# Other model families
# model7 = MLPRegressor()
# model8 = GaussianProcessRegressor()

In [150]:
def train_and_eval(model, train_X, train_Y, valid_X, valid_Y, scaler=None):
    """Fit ``model`` on the training split and report its validation RMSE.

    Targets and predictions are mapped back through ``scaler`` before the error
    is computed, so the RMSE is expressed in the scaler's original units.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    train_X, valid_X : feature tables for the two splits.
    train_Y, valid_Y : scaled targets; flattened / reshaped as needed, so both
        1-D and single-column 2-D inputs are accepted.
    scaler : fitted scaler with ``inverse_transform``, optional
        Defaults to the module-level ``price_scaler``.

    Returns
    -------
    float
        The validation RMSE (also printed).
    """
    if scaler is None:
        # Backward-compatible default: the scaler fitted in the preprocessing cell.
        scaler = price_scaler

    model.fit(train_X, np.ravel(train_Y))

    # Undo the target scaling on both predictions and ground truth.
    pred_Y = scaler.inverse_transform(model.predict(valid_X).reshape(-1, 1))
    valid_Y = scaler.inverse_transform(np.asarray(valid_Y).reshape(-1, 1))

    rmse = np.sqrt(mean_squared_error(valid_Y, pred_Y))
    print('{} RMSE = {}'.format(str(model), rmse))
    # Returned as well as printed so that several models can be compared in code.
    return rmse
    
# Evaluate the enabled model on the held-out split. The commented calls belong
# to the candidate models that are currently disabled.
# train_and_eval(model1, train_X, train_Y, valid_X, valid_Y)
# train_and_eval(model2, train_X, train_Y, valid_X, valid_Y)
# train_and_eval(model3, train_X, train_Y, valid_X, valid_Y)
train_and_eval(
    model=model4,
    train_X=train_X,
    train_Y=train_Y,
    valid_X=valid_X,
    valid_Y=valid_Y,
)
# train_and_eval(model5, train_X, train_Y, valid_X, valid_Y)
# train_and_eval(model6, train_X, train_Y, valid_X, valid_Y)
# train_and_eval(model7, train_X, train_Y, valid_X, valid_Y)
# train_and_eval(model8, train_X, train_Y, valid_X, valid_Y)




RandomForestRegressor(n_jobs=15, random_state=22) RMSE = 13944.843840041862


In [151]:
# Snapshot of the test frame; the submission cell starts from this copy each
# time it runs, so its transformations never stack up on `test_data`.
test_data_copy = test_data.copy(deep=True)

In [158]:
# generate submission
# Start from the snapshot so this cell can be re-run.
test_data = test_data_copy.copy()

# Fill values are taken from the training frame only. engine_cap uses the mean
# and no_of_owners the most frequent value; every other column uses the median.
# Columns listed in drop_columns need no imputation because they are removed below.
fill_values = {
    "manufactured": train_data["manufactured"].median(),
    "power": train_data["power"].median(),
    "engine_cap": train_data["engine_cap"].mean(),
    "no_of_owners": train_data["no_of_owners"].mode()[0],
    "depreciation": train_data["depreciation"].median(),
    "dereg_value": train_data["dereg_value"].median(),
    "omv": train_data["omv"].median(),
    "arf": train_data["arf"].median(),
}

# `test_data[col].fillna(..., inplace=True)` is chained in-place assignment: it
# acts on an intermediate Series, triggers a FutureWarning in recent pandas and
# does not update the frame once copy-on-write is enabled. A single
# DataFrame.fillna with a per-column mapping avoids that.
test_data = test_data.fillna(value=fill_values).drop(columns=drop_columns)

# Apply the scalers fitted on the training data to the matching test columns
# (transform only, nothing is re-fitted here).
fitted_scalers = {
    "manufactured": manufactured_scaler,
    "power": power_scaler,
    "engine_cap": engine_capacity_scaler,
    "depreciation": depresiation_scaler,
    "dereg_value": dereg_value_scaler,
    "omv": omv_scaler,
    "arf": arf_scaler,
}
for column, scaler in fitted_scalers.items():
    test_data[column] = scaler.transform(test_data[column].values.reshape(-1, 1))

# Test rows cannot be dropped (every listing needs a prediction), so gaps in
# the categorical inputs are filled here rather than removed.

# Gearbox flag: values that are missing or not "auto"/"manual" map to NaN and
# are replaced by the code of the most frequent gearbox in the training frame.
transmission_codes = {"auto": 1, "manual": 0}
default_transmission = transmission_codes[train_data["transmission"].mode()[0]]
test_data["transmission"] = (
    test_data["transmission"].map(transmission_codes).fillna(default_transmission)
)

# Re-use the encoders fitted on the training data (transform only).
model_labels = model_binary_encoder.transform(test_data["model"])
test_data = pd.concat([test_data.drop(columns=["model"]), model_labels], axis=1)

type_labels = type_of_vehicle_encoder.transform(test_data["type_of_vehicle"])
test_data = pd.concat([test_data.drop(columns=["type_of_vehicle"]), type_labels], axis=1)

# A missing category would crash on `c.split`; treat it as an empty string,
# which parses to an empty label list (all category columns 0).
test_data["category"] = test_data["category"].fillna("").map(
    lambda c: [_c.strip() for _c in c.split(",") if _c != "" and _c != "-"]
)

category_labels = pd.DataFrame(
    category_encoder.transform(test_data["category"]).astype(np.float64),
    columns=["category_" + c for c in category_encoder.classes_],
)
test_data = pd.concat([test_data.drop(columns=["category"]), category_labels], axis=1)





AttributeError: 'numpy.ndarray' object has no attribute 'values'

In [46]:
# Predict on the prepared test features, then undo the target scaling so the
# submission holds prices rather than scaled values.
scaled_predictions = model4.predict(test_data).reshape(-1, 1)
test_pred_Y = price_scaler.inverse_transform(scaled_predictions)

# Ids are the row positions 0..n-1; the file is written without the index.
res_df = pd.DataFrame(
    {
        "Id": range(0, test_pred_Y.shape[0]),
        "Predicted": test_pred_Y.ravel(),
    }
)
res_df.to_csv('result_1.csv', index=False)