# Importing the libraries

In [1]:
# importing libraries

import pandas as pd
import numpy as np
import matplotlib as plt 
from sklearn.preprocessing import LabelEncoder
import pickle

In [2]:
#importing dataset

df=pd.read_csv("Data/autos.csv",header=0,sep=",",encoding="Latin1")

# Cleaning the dataset

In [3]:
df.dtypes

dateCrawled            object
name                   object
seller                 object
offerType              object
price                   int64
abtest                 object
vehicleType            object
yearOfRegistration      int64
gearbox                object
powerPS                 int64
model                  object
kilometer               int64
monthOfRegistration     int64
fuelType               object
brand                  object
notRepairedDamage      object
dateCreated            object
nrOfPictures            int64
postalCode              int64
lastSeen               object
dtype: object

In [4]:
print(df.seller.value_counts())
df[df.seller!='gewerblich']
df=df.drop('seller',1)

print(df.offerType.value_counts())
df[df.offerType!='Gesuch']
df=df.drop('offerType',1)

privat        371525
gewerblich         3
Name: seller, dtype: int64


  df=df.drop('seller',1)


Angebot    371516
Gesuch         12
Name: offerType, dtype: int64


  df=df.drop('offerType',1)


In [5]:
print(df.shape)
df=df[(df.powerPS>50) & (df.powerPS)<900]
print(df.shape)
df=df[(df.yearOfRegistration>=1950)&(df.yearOfRegistration<2017)]
print(df.shape)

(371528, 18)
(371528, 18)
(356559, 18)


In [6]:
df.drop(['name','abtest','dateCrawled','nrOfPictures','lastSeen',
'postalCode','dateCreated'],axis='columns',inplace=True)

In [7]:
new_df=df.copy()
new_df=new_df.drop_duplicates(['price','vehicleType','yearOfRegistration',
'gearbox','powerPS','model','kilometer','monthOfRegistration','fuelType',
'notRepairedDamage'])

In [8]:
new_df.gearbox.replace(('manuell','automatik'),('manual', 'automatic'), inplace=True)
new_df.fuelType.replace(('benzin','andere','elektro'),('petrol','others','electric'), inplace=True)
new_df.vehicleType.replace(('kleinwagen','cabrio','kombi','andere'),('small car','convertible','combination','others'),inplace=True)
new_df.notRepairedDamage.replace(('ja','nein'),('Yes','No'),inplace=True)

In [9]:
new_df = new_df[(new_df.price >= 100) & (new_df.price <= 150000)]

new_df['notRepairedDamage'].fillna(value='not-declared', inplace=True)
new_df['fuelType'].fillna(value='not-declared', inplace=True)
new_df['gearbox'].fillna(value='not-declared', inplace=True)
new_df['vehicleType'].fillna(value='not-declared', inplace=True)
new_df['model'].fillna(value='not-declared', inplace=True)

In [10]:
new_df.to_csv('autos_preprocessed.csv')

In [11]:
new_df

Unnamed: 0,price,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
0,480,not-declared,1993,manual,0,golf,150000,0,petrol,volkswagen,not-declared
1,18300,coupe,2011,manual,190,not-declared,125000,5,diesel,audi,Yes
2,9800,suv,2004,automatic,163,grand,125000,8,diesel,jeep,not-declared
3,1500,small car,2001,manual,75,golf,150000,6,petrol,volkswagen,No
4,3600,small car,2008,manual,69,fabia,90000,7,diesel,skoda,No
...,...,...,...,...,...,...,...,...,...,...,...
371523,2200,not-declared,2005,not-declared,0,not-declared,20000,1,not-declared,sonstige_autos,not-declared
371524,1199,convertible,2000,automatic,101,fortwo,125000,3,petrol,smart,No
371525,9200,bus,1996,manual,102,transporter,150000,3,diesel,volkswagen,No
371526,3400,combination,2002,manual,100,golf,150000,6,diesel,volkswagen,not-declared


In [12]:
labels = ['gearbox', 'notRepairedDamage','model','brand','fuelType','vehicleType']

mapper = {}
for i in labels:
  mapper[i] = LabelEncoder()
  mapper[i].fit(new_df[i])
  tr = mapper[i].transform(new_df[i])
  np.save(str('classes'+i+'.npy'), mapper[i].classes_)
  print(i,":",mapper[i])
  new_df.loc[:,i+"_labels"] = pd.Series(tr, index = new_df.index)

labeled = new_df[['price', 'yearOfRegistration','powerPS','kilometer','monthOfRegistration'] + [x+"_labels" for x in labels]]

print(labeled.columns)


gearbox : LabelEncoder()
notRepairedDamage : LabelEncoder()
model : LabelEncoder()
brand : LabelEncoder()
fuelType : LabelEncoder()
vehicleType : LabelEncoder()
Index(['price', 'yearOfRegistration', 'powerPS', 'kilometer',
       'monthOfRegistration', 'gearbox_labels', 'notRepairedDamage_labels',
       'model_labels', 'brand_labels', 'fuelType_labels',
       'vehicleType_labels'],
      dtype='object')


# Splitting Data Into Independent And Dependent Variables

In [13]:
Y = labeled.iloc[:,0].values
X = labeled.iloc[:,1:].values

Y = Y.reshape(-1,1)

In [14]:
from sklearn.model_selection import cross_val_score, train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=3)

In [15]:
X_train,X_test,Y_train,Y_test

(array([[  2009,    101,  40000, ...,     36,      7,      4],
        [  1998,    115, 150000, ...,     10,      5,      1],
        [  2003,    109, 150000, ...,      1,      7,      7],
        ...,
        [  2005,    209, 150000, ...,     39,      7,      8],
        [  2007,    143, 150000, ...,      2,      7,      4],
        [  1999,    136, 150000, ...,      2,      7,      4]], dtype=int64),
 array([[  2006,    140, 100000, ...,     24,      7,      4],
        [  2001,    179, 150000, ...,      1,      1,      1],
        [  1999,    211, 150000, ...,     24,      7,      1],
        ...,
        [  2003,    113, 150000, ...,     27,      4,      7],
        [  1998,    140, 150000, ...,     39,      7,      4],
        [  1994,     75, 150000, ...,     38,      7,      2]], dtype=int64),
 array([[ 7499],
        [  450],
        [ 2990],
        ...,
        [10500],
        [ 6995],
        [ 1899]], dtype=int64),
 array([[5990],
        [2999],
        [ 899],
        ..

# Building the model

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

rf_regressor = RandomForestRegressor(n_estimators=1000, max_depth=10, random_state=34)

rf_regressor.fit(X_train, np.ravel(Y_train, order='C'))

In [18]:
y_pred = rf_regressor.predict(X_test)

print(r2_score(Y_test, y_pred))

0.8191322832483275


In [20]:
filename='resale_model.sav'
pickle.dump(rf_regressor,open(filename,'wb'))

# Accuracy metrics

In [21]:
from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(Y_test, y_pred))

1655.534681561534


In [22]:
from sklearn.metrics import mean_squared_error

print(mean_squared_error(Y_test, y_pred))

11832644.335139675


In [23]:
from sklearn.metrics import mean_squared_error

root_mean_squared_error = mean_squared_error(Y_test, y_pred, squared=False)
print(root_mean_squared_error)

3439.861092419238


In [24]:
from sklearn.metrics import r2_score

print(r2_score(Y_test, y_pred))

0.8191322832483275
