Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle

In [4]:
from lightgbm import LGBMRegressor

In [3]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.3.3-py3-none-win_amd64.whl (1.0 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.3


Import Dataset

In [5]:
data = pd.read_csv('autos_preprocessed.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,price,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
0,1,18300.0,coupe,2011.0,manual,190.0,not-declared,125000,5.0,diesel,audi,Yes
1,2,9800.0,suv,2004.0,automatic,163.0,grand,125000,8.0,diesel,jeep,not-declared
2,3,1500.0,small car,2001.0,manual,75.0,golf,150000,6.0,petrol,volkswagen,No
3,4,3600.0,small car,2008.0,manual,69.0,fabia,90000,7.0,diesel,skoda,No
4,5,650.0,limousine,1995.0,manual,102.0,3er,150000,10.0,petrol,bmw,Yes


Label Encoding

In [6]:
labels = ['gearbox', 'notRepairedDamage', 'model', 'brand', 'fuelType', 'vehicleType']

mapper = {}
for i in labels:
    mapper[i] = LabelEncoder()
    mapper[i].fit(data[i])
    tr = mapper[i].transform(data[i])
    np.save(str('classes'+i+'.npy'), mapper[i].classes_)
    data.loc[:, i+'_labels'] = pd.Series(tr, index=data.index)
    
labeled = data[['price', 'yearOfRegistration','powerPS','kilometer','monthOfRegistration']
                  +[x+"_labels" for x in labels]]

print(labeled.columns)

Index(['price', 'yearOfRegistration', 'powerPS', 'kilometer',
       'monthOfRegistration', 'gearbox_labels', 'notRepairedDamage_labels',
       'model_labels', 'brand_labels', 'fuelType_labels',
       'vehicleType_labels'],
      dtype='object')


Different Metrics Evaluation


In [7]:
def find_scores(Y_actual, Y_pred, X_train):
    scores = dict()
    mae = mean_absolute_error(Y_actual, Y_pred)
    mse = mean_squared_error(Y_actual, Y_pred)
    rmse = np.sqrt(mse)
    rmsle = np.log(rmse)
    r2 = r2_score(Y_actual, Y_pred)
    n, k = X_train.shape
    adj_r2_score = 1 - ((1-r2)*(n-1)/(n-k-1))
    
    scores['mae']=mae
    scores['mse']=mse
    scores['rmse']=rmse
    scores['rmsle']=rmsle
    scores['r2']=r2
    scores['adj_r2_score']=adj_r2_score
    
    return scores

Train Test Split

In [8]:
X = labeled.iloc[:,1:].values
Y = labeled.iloc[:,0].values.reshape(-1,1)

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=42)


Predictive Modeling


LGBM Regressor

*Best Parameters*

booster-"gbtree"
metric-rmse
learning_rate-0.07
n_estimators-300
objective-root_mean_squared_error
reg_sqrt-True
random_state-42

In [10]:
model = LGBMRegressor(boosting_type="gbdt",learning_rate=0.07,metric="rmse",n_estimators=300,objective="root_mean_squared_error",random_state=42,reg_sqrt=True)

model.fit(X_train, Y_train)

Y_pred = model.predict(X_test)

find_scores(Y_test, Y_pred, X_train)

  return f(*args, **kwargs)


{'mae': 1320.6988287013735,
 'mse': 9115858.255127603,
 'rmse': 3019.2479618487123,
 'rmsle': 8.01276306010308,
 'r2': 0.8685679338456337,
 'adj_r2_score': 0.8685601636509441}

Save Model

In [11]:
pickle.dump(model, open('resale_model.sav', 'wb'))