# Importing required libraries

In [87]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle

# Read and Cleaning the dataset

In [88]:
df=pd.read_csv("E:/studies/my ibm/dataset/Dataset.csv", header=0, sep=',', encoding='Latin1',)

In [89]:
df.head()

Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode,lastSeen
0,2016-03-24 11:52:17,Golf_3_1.6,privat,Angebot,480,test,,1993,manuell,0,golf,150000,0,benzin,volkswagen,,2016-03-24 00:00:00,0,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,A5_Sportback_2.7_Tdi,privat,Angebot,18300,test,coupe,2011,manuell,190,,125000,5,diesel,audi,ja,2016-03-24 00:00:00,0,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,"Jeep_Grand_Cherokee_""Overland""",privat,Angebot,9800,test,suv,2004,automatik,163,grand,125000,8,diesel,jeep,,2016-03-14 00:00:00,0,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,GOLF_4_1_4__3TÜRER,privat,Angebot,1500,test,kleinwagen,2001,manuell,75,golf,150000,6,benzin,volkswagen,nein,2016-03-17 00:00:00,0,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,Skoda_Fabia_1.4_TDI_PD_Classic,privat,Angebot,3600,test,kleinwagen,2008,manuell,69,fabia,90000,7,diesel,skoda,nein,2016-03-31 00:00:00,0,60437,2016-04-06 10:17:21


In [90]:
df.shape

(371528, 20)

In [91]:
df.drop(['name', 'abtest', 'dateCrawled', 'nrOfPictures', 'lastSeen',
         'postalCode','dateCreated','offerType'], axis='columns',inplace=True)

In [92]:
df = df[(df.yearOfRegistration >= 1950) & (df.yearOfRegistration < 2017)]

In [93]:
df.shape

(356559, 12)

In [94]:
new_df = df.copy()

In [95]:
new_df = new_df.drop_duplicates ([ 'price', 'vehicleType', 'yearOfRegistration'
                                  ,'gearbox', 'powerPS', 'model', 'kilometer', 'monthOfRegistration', 'fuelType'
                                  ,'notRepairedDamage'])

In [96]:
new_df.gearbox.replace(('manuell', 'automatik'), ('manual', 'automatic'), inplace=True)
new_df.fuelType.replace(('benzin', 'andere', 'elektro'), ('petrol', 'others', 'electric'), inplace=True)
new_df.vehicleType.replace(('kleinwagen', 'cabrio', 'kombi', 'andere'), 
                           ('small car', 'convertible', 'combination', 'others'), inplace=True)
new_df.notRepairedDamage.replace(('ja', 'nein'), ('Yes', 'No'),inplace=True)

In [98]:
new_df = new_df[(new_df.price >= 100) & (new_df.price <= 150000)]
new_df = new_df[(new_df.powerPS > 50) & (new_df.powerPS < 900)]

# Preprocessing the Data

In [99]:
new_df[ 'fuelType'].fillna(value='not-declared', inplace=True)
new_df[ 'gearbox'].fillna(value='not-declared', inplace=True)
new_df[ 'vehicleType'].fillna (value='not-declared', inplace=True)
new_df['model'].fillna(value='not-declared',inplace=True)

In [100]:
new_df.to_csv("preprocessed.csv")

In [101]:
labels = ['gearbox', 'notRepairedDamage', 'model', 'brand', 'fuelType', 'vehicleType']

In [102]:
mapper = {}
for i in labels:
    mapper[i]=LabelEncoder()
    mapper[i].fit(new_df[i])
    tr = mapper[i].transform(new_df[i])
    np.save(str('classes'+i+ '.npy'), mapper[i].classes_)
    print(i, ":",mapper[i])
    new_df.loc[:, i + '_labels'] = pd.Series (tr, index=new_df.index)

gearbox : LabelEncoder()
notRepairedDamage : LabelEncoder()
model : LabelEncoder()
brand : LabelEncoder()
fuelType : LabelEncoder()
vehicleType : LabelEncoder()


In [103]:
labeled=new_df[ ['price'
                 ,'yearOfRegistration'
                 ,'powerPS'
                 ,'kilometer'
                 ,'monthOfRegistration'
                ]
               + [x+"_labels" for x in labels]]

In [104]:
print(labeled.columns)

Index(['price', 'yearOfRegistration', 'powerPS', 'kilometer',
       'monthOfRegistration', 'gearbox_labels', 'notRepairedDamage_labels',
       'model_labels', 'brand_labels', 'fuelType_labels',
       'vehicleType_labels'],
      dtype='object')


# Splitting data into Dependent and Independent variables

In [105]:
Y = labeled.iloc[:,0].values.reshape(-1,1)
X = labeled.iloc[:,1:].values

In [106]:
print(Y)

[[18300]
 [ 9800]
 [ 1500]
 ...
 [ 9200]
 [ 3400]
 [28990]]


In [107]:
print(X)

[[  2011    190 125000 ...      1      1      3]
 [  2004    163 125000 ...     14      1      8]
 [  2001     75 150000 ...     38      7      7]
 ...
 [  1996    102 150000 ...     38      1      0]
 [  2002    100 150000 ...     38      1      1]
 [  2013    320  50000 ...      2      7      4]]


In [108]:
from sklearn.model_selection import cross_val_score, train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=20)

In [109]:
X_train.shape

(167146, 10)

In [110]:
X_test.shape

(111432, 10)

In [111]:
Y_train.shape

(167146, 1)

In [112]:
Y_test.shape

(111432, 1)

# Model Building

In [114]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import math

In [115]:
def find_scores(Y_actual, Y_pred, X_train):
    MSE=mean_squared_error(Y_actual,Y_pred)
    print("MSE:",MSE)
    RMSE=math.sqrt(MSE)
    print("RMSE:",RMSE)
    r2=r2_score(Y_actual,Y_pred)
    print("R2_score:",r2)
    Adjusted_R2=1-(1-r2*((X_test.shape[0]-1)/(X_test.shape[0]-X_test.shape[1]-1)))
    print("Adjusted R2:",Adjusted_R2)
#     plt.figure(figsize=(10,5))
#     plt.plot(Y_pred[0:20])
#     plt.plot(np.array(Y_test[0:20]))
#     plt.legend(["predicted","actual"])
#     plt.show()

# Choose the Appropriate Model

## Model 1 : Random Forest Regressor

In [116]:
from sklearn.ensemble import RandomForestRegressor

In [117]:
def random_forest_regressor(n_estimators,max_samples,criterion,bootstrap,random_state):
        
    model = RandomForestRegressor(
      n_estimators = n_estimators,
      criterion = criterion,
      bootstrap = bootstrap,
      max_samples = max_samples,
      random_state = random_state)
    
    model.fit(X_train, np.ravel(Y_train,order='C'))
    
    Y_pred = model.predict(X_test)
    
    find_scores(Y_test, Y_pred, X_train)

In [119]:
random_forest_regressor(100,0.3,'squared_error',True,40)

MSE: 10429809.195812337
RMSE: 3229.521511898061
R2_score: 0.8558564517160402
Adjusted R2: 0.855933264565657


In [120]:
random_forest_regressor(300,0.3,'squared_error',True,40)

MSE: 10441510.240895698
RMSE: 3231.332579741011
R2_score: 0.8556947392508089
Adjusted R2: 0.8557715375867824


In [121]:
random_forest_regressor(200,0.3,'squared_error',True,40)

MSE: 10441214.518756213
RMSE: 3231.286820874342
R2_score: 0.8556988262324297
Adjusted R2: 0.8557756249352085


In [122]:
random_forest_regressor(100,0.4,'squared_error',True,40)

MSE: 10375074.1361412
RMSE: 3221.036189821716
R2_score: 0.8566129090556145
Adjusted R2: 0.8566897897970417


In [123]:
random_forest_regressor(100,0.45,'squared_error',True,40)

MSE: 10299063.708894264
RMSE: 3209.2154351015865
R2_score: 0.8576633992883937
Adjusted R2: 0.8577403743109917


## Model 2 : Bagging Regressor

In [124]:
from sklearn.ensemble import BaggingRegressor

In [128]:
def bagging_regressor(n_estimators,bootstrap,max_samples,random_state):
    
    model = BaggingRegressor(
      n_estimators=n_estimators, 
      bootstrap=bootstrap,
      max_samples=max_samples,
      random_state=random_state)
    
    model.fit(X_train,np.ravel(Y_train,order='C'))
    
    Y_pred = model.predict(X_test)
    
    find_scores(Y_test, Y_pred, X_train)

In [129]:
bagging_regressor(100,True,0.4,40)

MSE: 10391568.470448954
RMSE: 3223.5955810940295
R2_score: 0.8563849516856314
Adjusted R2: 0.8564618119679557


In [130]:
bagging_regressor(200,True,0.4,40)

MSE: 10348619.50812465
RMSE: 3216.9270287223876
R2_score: 0.8569785211084571
Adjusted R2: 0.8570554346634519


In [131]:
bagging_regressor(300,True,0.4,40)

MSE: 10312980.19702098
RMSE: 3211.382910370699
R2_score: 0.8574710686387552
Adjusted R2: 0.8575480263997373


In [132]:
bagging_regressor(300,True,0.3,40)

MSE: 10459870.329386981
RMSE: 3234.1722788662605
R2_score: 0.8554409965166654
Adjusted R2: 0.8555177720793077


# Check the Metrics of the Model

In [135]:
model = RandomForestRegressor(
      n_estimators = 100,
      criterion = 'squared_error',
      bootstrap = True,
      max_samples = 0.45,
      random_state = 40)
    
model.fit(X_train, np.ravel(Y_train,order='C'))

In [136]:
Y_pred = model.predict(X_test)
find_scores(Y_test, Y_pred, X_train)

MSE: 10299063.708894264
RMSE: 3209.2154351015865
R2_score: 0.8576633992883937
Adjusted R2: 0.8577403743109917
