In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as s
import pickle

import warnings
warnings.filterwarnings('ignore')

import datetime as dt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from sklearn.preprocessing import OrdinalEncoder
from scipy.stats import shapiro

import AlgoData as alg

In [22]:
df = alg.ReadTrainData('train_final.csv', raw=False)
df.drop('Unnamed: 0', axis=1, inplace=True)

(18707, 18)


In [23]:
df.columns

Index(['index', 'Price', 'Levy', 'Prod. year', 'Leather interior',
       'Engine volume', 'Mileage', 'Cylinders', 'Airbags', 'Turbo',
       'Mileage_BIN', 'EngineVolume_BIN', 'Manufacturer', 'Model', 'Category',
       'Fuel type', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel',
       'Color'],
      dtype='object')

In [24]:
X = df.drop(['Price'], axis=1)
y = df['Price']

In [25]:
#Randomized Search CV for searching the best parameters

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 10, 15, 20, 25, 30], 'min_samples_split': [2, 5, 10, 15, 100], 'min_samples_leaf': [1, 2, 5, 10]}


In [26]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
rf = RandomForestRegressor()

In [27]:
rf_random_model = RandomizedSearchCV(estimator = rf,
                                     param_distributions = random_grid,
                                    scoring='neg_mean_squared_error',
                                    n_iter = 10,
                                    cv = 5,
                                    verbose=2,
                                    random_state=42,
                                    n_jobs = 1)

In [28]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=10)

In [29]:
y.describe()

count    18707.000000
mean         9.111365
std          1.404353
min          4.605170
25%          8.665958
50%          9.495294
75%         10.007780
max         11.507310
Name: Price, dtype: float64

In [30]:
rf_random_model.fit(Xtrain, ytrain)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   8.7s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   7.7s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   7.7s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   7.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   7.8s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time=  12.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time=  12.1s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimator

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=1,
                   param_distributions={'max_depth': [5, 10, 15, 20, 25, 30],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 5, 10],
                                        'min_samples_split': [2, 5, 10, 15,
                                                              100],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000, 1100,
                                                         1200]},
                   random_state=42, scoring='neg_mean_squared_error',
                   verbose=2)

In [31]:
#beta RF final 0.0.4
f = open('beta_RF_final_0_0_4.bin', 'wb')
pickle.dump(rf_random_model, f)

In [32]:
y_pred = rf_random_model.predict(Xtest)

In [33]:
MAPE = mean_absolute_percentage_error(np.exp(ytest), np.exp(y_pred))

In [34]:
MAPE


1.0342339718007947

In [35]:
MAE = mean_absolute_error(np.exp(ytest), np.exp(y_pred))
MAE

4365.877551867809

In [36]:
data = {
    'YTrue': np.exp(ytest),
    'YPred' : np.exp(y_pred)
}
 
# Create the pandas DataFrame
df = pd.DataFrame(data=data)
 
# print dataframe.
df

Unnamed: 0,YTrue,YPred
6217,17562.0,21119.977939
6075,21326.0,20980.594420
3031,40390.0,46712.336195
7241,157.0,260.456054
2762,44517.0,42625.291040
...,...,...
13697,17249.0,14755.042874
16424,16621.0,13231.156543
357,18817.0,15453.545458
15287,470.0,572.680033


In [None]:
df.reset_index()
df.to_csv('TestvsPred.csv')
