In [1]:
import json
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [2]:
# Load the raw listings; the file is comma-separated, has its header on the
# first row and is decoded as Latin-1.
df = pd.read_csv(
    "./autos.csv",
    sep=",",
    header=0,
    encoding="Latin1",
)

In [3]:
# How many listings exist per seller category?
df['seller'].value_counts()

privat        371525
gewerblich         3
Name: seller, dtype: int64

In [4]:
# Discard the 'gewerblich' seller rows, then remove the `seller` column itself.
df = df.loc[df['seller'] != 'gewerblich'].drop(columns='seller')

In [5]:
# How many listings exist per offer type?
df['offerType'].value_counts()

Angebot    371513
Gesuch         12
Name: offerType, dtype: int64

In [6]:
# Discard the 'Gesuch' rows, then remove the `offerType` column itself.
df = df.loc[df['offerType'] != 'Gesuch'].drop(columns='offerType')

In [7]:
# Number of missing values in each column.
df.isna().sum()

dateCrawled                0
name                       0
price                      0
abtest                     0
vehicleType            37862
yearOfRegistration         0
gearbox                20203
powerPS                    0
model                  20481
kilometer                  0
monthOfRegistration        0
fuelType               33379
brand                      0
notRepairedDamage      72053
dateCreated                0
nrOfPictures               0
postalCode                 0
lastSeen                   0
dtype: int64

In [8]:
# Derive the vehicle age as the difference between the fixed reference year
# 2022 and the registration year.
df = df.assign(yearsOld=2022 - df['yearOfRegistration'])

In [9]:
# Keep only the target (`price`) and the columns used as model features.
df = df.loc[
    :,
    [
        'price',
        'vehicleType',
        'yearsOld',
        'gearbox',
        'powerPS',
        'kilometer',
        'fuelType',
        'brand',
        'notRepairedDamage',
    ],
]

In [10]:
# Remove every row that has a missing value in any of the remaining columns.
df = df.dropna(axis=0, how='any')

In [11]:
# Collapse listings that agree on all of the columns below, keeping the first
# occurrence of each group.
# NOTE(review): `yearsOld` is not part of the subset, so rows that differ only
# in age are treated as duplicates -- confirm this is intended.
df = df.drop_duplicates(
    subset=[
        'price',
        'vehicleType',
        'gearbox',
        'powerPS',
        'kilometer',
        'fuelType',
        'brand',
        'notRepairedDamage',
    ],
    keep='first',
)

In [12]:
# Translate the German category labels to English.
#
# Each column is rebuilt and assigned back to the frame instead of calling
# `df.<column>.replace(..., inplace=True)`. That form mutates a Series obtained
# from the frame by attribute access, which pandas reports as chained
# assignment and which leaves `df` unchanged when Copy-on-Write is active.
# Labels not listed in a mapping are kept as they are.
df = df.assign(
    gearbox=df['gearbox'].replace(
        {'manuell': 'manual', 'automatik': 'automatic'}
    ),
    fuelType=df['fuelType'].replace(
        {'benzin': 'petrol', 'andere': 'others', 'elektro': 'electric'}
    ),
    vehicleType=df['vehicleType'].replace(
        {
            'kleinwagen': 'compact',
            'cabrio': 'convertible',
            'kombi': 'combination',
            'andere': 'others',
        }
    ),
    notRepairedDamage=df['notRepairedDamage'].replace(
        {'ja': 'Yes', 'nein': 'No'}
    ),
)

In [13]:
# Work on a reproducible random subsample: 9.3% of the rows, seeded with 42.
df = df.sample(random_state=42, frac=0.093)
df

Unnamed: 0,price,vehicleType,yearsOld,gearbox,powerPS,kilometer,fuelType,brand,notRepairedDamage
141976,12900,limousine,13,manual,120,60000,petrol,audi,No
293573,34000,limousine,12,automatic,0,150000,diesel,bmw,No
299514,1400,combination,19,automatic,136,150000,petrol,peugeot,No
76260,16800,combination,12,manual,184,150000,diesel,bmw,No
330662,3350,limousine,19,manual,73,5000,petrol,citroen,No
...,...,...,...,...,...,...,...,...,...
178696,4700,compact,11,manual,60,150000,petrol,volkswagen,No
123311,2500,combination,17,manual,105,150000,lpg,volkswagen,No
134075,17900,convertible,11,automatic,160,50000,petrol,volkswagen,No
192738,1300,combination,24,manual,0,150000,petrol,audi,Yes


In [14]:
# Number of distinct values in each column.
df.nunique()

price                1816
vehicleType             8
yearsOld               72
gearbox                 2
powerPS               362
kilometer              13
fuelType                7
brand                  40
notRepairedDamage       2
dtype: int64

In [15]:
# Separate the regression target (`price`) from the feature columns.
Y = df['price']
X = df.drop(columns='price')

In [16]:
# Show the dtype of each feature column.
X.dtypes

vehicleType          object
yearsOld              int64
gearbox              object
powerPS               int64
kilometer             int64
fuelType             object
brand                object
notRepairedDamage    object
dtype: object

In [17]:
# Integer-encode the categorical feature columns and record, per column, which
# original label was assigned to which integer code.
mapping = {}
encoder = LabelEncoder()
for column in ('vehicleType', 'gearbox', 'fuelType', 'brand', 'notRepairedDamage'):
    # The shared encoder is refitted on every column, so `classes_` always
    # describes the column that was just transformed.
    X[column] = encoder.fit_transform(X[column])
    mapping[column] = {label: code for code, label in enumerate(encoder.classes_)}


In [18]:
# Persist the label -> code mapping as JSON in `mapping.json`.
with open('mapping.json', 'w') as mapping_file:
    mapping_file.write(json.dumps(mapping))

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible. `train_test_split` is imported in the notebook's import cell
# together with the other dependencies instead of in the middle of the analysis.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Configure the LightGBM regressor and fit it on the training split.
model = LGBMRegressor(
    boosting_type="gbdt",
    objective="root_mean_squared_error",
    metric="rmse",
    learning_rate=0.07,
    n_estimators=300,
    reg_sqrt=True,
    random_state=42,
)
model.fit(X_train, y_train)

# Predict prices for the held-out split and report the R^2 score.
y_pred = model.predict(X_test)
r2_score(y_test, y_pred)

0.8285239256911172

In [21]:
# Serialize the fitted model to `model.pkl`. The `with` block guarantees the
# file handle is flushed and closed, which a bare `open(...)` passed straight
# into `pickle.dump` does not.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)