In [108]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor


warnings.filterwarnings("ignore")

X = pd.read_csv("../Cleaning/new_clean_data.csv", index_col=0)

X.dropna(axis=0, subset=["price"], inplace=True)
y = X["price"]
X.drop(["price"], axis=1, inplace=True)

# Feature engineering


## Selecting categorical values


In [109]:
categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]
categorical_cols


['brand', 'model', 'location', 'body_type', 'fuel', 'transmission', 'color']

## Selecting numeric values


In [110]:
numerical_cols = [
    cname for cname in X.columns if X[cname].dtype in ["int64", "float64"]
]
numerical_cols


['mileage', 'year', 'power', 'engine_size']

In [111]:
from scipy.stats import chi2_contingency

data = pd.concat([X, y], axis=1)
for col in categorical_cols:
    csq = chi2_contingency(pd.crosstab(data["price"], data[col]))
    print(f"P-value (price, {col}): ", csq[1])


P-value (price, brand):  0.0
P-value (price, model):  0.0
P-value (price, location):  1.4718723489924792e-08
P-value (price, body_type):  1.9711353417429463e-15
P-value (price, fuel):  0.0
P-value (price, transmission):  1.7114094636921104e-217
P-value (price, color):  1.0


In [112]:
categorical_cols

['brand', 'model', 'location', 'body_type', 'fuel', 'transmission', 'color']

# Preprocessing


## Label Encoding


In [113]:
from joblib import dump
from sklearn import preprocessing

features = ["brand", "model", "location", "color"]
les = {}

for f in features:
    les[f] = preprocessing.LabelEncoder()
    les[f] = les[f].fit(X[f])
    X[f] = les[f].transform(X[f])
    dump(les[f], f"{f}_encoder.pkl")


## Splitting data into Train, Validation and Test data


In [114]:
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X_train_full, y_train, test_size=0.25, random_state=0
)


## Keep selected columns only


In [115]:
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()
categorical_cols


['brand', 'model', 'location', 'body_type', 'fuel', 'transmission', 'color']

## Handling missing values


In [116]:
# imputer for num data
numerical_transformer = SimpleImputer(strategy="mean")
imputed_num_train = pd.DataFrame(
    numerical_transformer.fit_transform(X_train[numerical_cols])
)

imputed_num_valid = pd.DataFrame(
    numerical_transformer.transform(X_valid[numerical_cols])
)

imputed_num_test = pd.DataFrame(numerical_transformer.transform(X_test[numerical_cols]))
# imputer for cat data
categorical_transformer = SimpleImputer(strategy="most_frequent")
imputed_cat_train = pd.DataFrame(
    categorical_transformer.fit_transform(X_train[categorical_cols])
)

imputed_cat_valid = pd.DataFrame(
    categorical_transformer.transform(X_valid[categorical_cols])
)

imputed_cat_test = pd.DataFrame(
    categorical_transformer.transform(X_test[categorical_cols])
)

imputed_num_train.columns = numerical_cols
imputed_num_valid.columns = numerical_cols
imputed_num_test.columns = numerical_cols

imputed_cat_train.columns = categorical_cols
imputed_cat_valid.columns = categorical_cols
imputed_cat_test.columns = categorical_cols

X_train = pd.concat([imputed_num_train, imputed_cat_train], axis=1)
X_valid = pd.concat([imputed_num_valid, imputed_cat_valid], axis=1)
X_test = pd.concat([imputed_num_test, imputed_cat_test], axis=1)

# One-hot encode the data


In [117]:
import joblib

cols = ["body_type", "fuel", "transmission"]
OH_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
OH_X_train = pd.DataFrame(OH_encoder.fit_transform(X_train[cols]))
OH_X_valid = pd.DataFrame(OH_encoder.transform(X_valid[cols]))
OH_X_test = pd.DataFrame(OH_encoder.transform(X_test[cols]))
OH_X_train.index = X_train.index
OH_X_valid.index = X_valid.index
OH_X_test.index = X_test.index

num_X_train = X_train.drop(cols, axis=1)
num_X_valid = X_valid.drop(cols, axis=1)
num_X_test = X_test.drop(cols, axis=1)
X_train = pd.concat([num_X_train, OH_X_train], axis=1)
X_valid = pd.concat([num_X_valid, OH_X_valid], axis=1)
X_test = pd.concat([num_X_test, OH_X_test], axis=1)
joblib.dump(OH_encoder, "encoder.pkl")
for f in features:
    X_train[f] = X_train[f].astype(np.int0)
    X_valid[f] = X_valid[f].astype(np.int0)
    X_test[f] = X_test[f].astype(np.int0)


In [118]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3038 entries, 0 to 3037
Data columns (total 25 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   mileage      3038 non-null   float64
 1   year         3038 non-null   float64
 2   power        3038 non-null   float64
 3   engine_size  3038 non-null   float64
 4   brand        3038 non-null   int64  
 5   model        3038 non-null   int64  
 6   location     3038 non-null   int64  
 7   color        3038 non-null   int64  
 8   0            3038 non-null   float64
 9   1            3038 non-null   float64
 10  2            3038 non-null   float64
 11  3            3038 non-null   float64
 12  4            3038 non-null   float64
 13  5            3038 non-null   float64
 14  6            3038 non-null   float64
 15  7            3038 non-null   float64
 16  8            3038 non-null   float64
 17  9            3038 non-null   float64
 18  10           3038 non-null   float64
 19  11    

# Modeling


## Random Forest


In [119]:
from sklearn.ensemble import RandomForestRegressor

forest_model = RandomForestRegressor(random_state=0)
X_train.columns = X_train.columns.astype(str)
X_valid.columns = X_valid.columns.astype(str)
forest_model.fit(X_train, y_train)

preds = forest_model.predict(X_valid)
print(r2_score(y_valid, preds))

0.75024562923658


In [120]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split
max_features = ["auto", "sqrt"]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {
    "n_estimators": n_estimators,
    "max_features": max_features,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "bootstrap": bootstrap,
}

In [121]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=10,
    cv=2,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search model
rf_random.fit(X_train, y_train)
rf_random.best_params_

Fitting 2 folds for each of 10 candidates, totalling 20 fits


{'n_estimators': 2000,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 50,
 'bootstrap': True}

In [122]:
best_random_rf_params = {
    "n_estimators": 800,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "max_features": "auto",
    "max_depth": 100,
    "bootstrap": True,
}


In [123]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search
param_grid = {
    "bootstrap": [True],
    "max_depth": [80, 90, 100, 110],
    "max_features": [2, 3],
    "min_samples_leaf": [1, 2, 3],
    "min_samples_split": [8, 10, 12],
    "n_estimators": [400, 600, 800, 1000],
}
# Create a based model
rf = RandomForestRegressor(random_state=0)
# Instantiate the grid search model
grid_search = GridSearchCV(
    estimator=rf, param_grid=param_grid, cv=2, n_jobs=-1, verbose=2
)
grid_search.fit(X_train, y_train)
grid_search.best_params_


Fitting 2 folds for each of 288 candidates, totalling 576 fits


{'bootstrap': True,
 'max_depth': 80,
 'max_features': 3,
 'min_samples_leaf': 1,
 'min_samples_split': 8,
 'n_estimators': 800}

In [124]:
best_cv_param = {
    "bootstrap": True,
    "max_depth": 100,
    "max_features": 3,
    "min_samples_leaf": 1,
    "min_samples_split": 8,
    "n_estimators": 600,
}
forest_model = RandomForestRegressor(
    n_estimators=600,
    min_samples_split=8,
    min_samples_leaf=1,
    max_features=3,
    max_depth=100,
    bootstrap=True,
    random_state=0,
)
forest_model.fit(X_train, y_train)
preds = forest_model.predict(X_valid)
print(r2_score(y_valid, preds))

0.6497527944300071


## XGBRegressor


In [125]:
xgb = XGBRegressor(
    n_estimators=1000,
    early_stopping_rounds=5,
    learning_rate=0.05,
    n_jobs=4,
    random_state=0,
)
xgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

In [126]:
predictions = xgb.predict(X_valid)
acc = r2_score(y_valid, predictions)
print(acc)


0.785310980337202


In [127]:
from sklearn.model_selection import GridSearchCV


xgb1 = XGBRegressor()
parameters = {
    "nthread": [4],
    "objective": ["reg:squarederror"],
    "learning_rate": [0.03, 0.05, 0.07],
    "max_depth": [5, 6, 7],
    "min_child_weight": [4],
    "subsample": [0.7],
    "colsample_bytree": [0.7],
    "n_estimators": [500],
}

xgb_grid = GridSearchCV(xgb1, parameters, cv=2, n_jobs=5, verbose=True)

xgb_grid.fit(X_train, y_train)

print(xgb_grid.best_score_)
print(xgb_grid.best_params_)


Fitting 2 folds for each of 9 candidates, totalling 18 fits
0.7707711088144954
{'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 6, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:squarederror', 'subsample': 0.7}


In [128]:
best_xgb_param = {
    "colsample_bytree": 0.7,
    "learning_rate": 0.03,
    "max_depth": 5,
    "min_child_weight": 4,
    "n_estimators": 500,
    "nthread": 4,
    "objective": "reg:squarederror",
    "silent": 1,
    "subsample": 0.7,
}
xgb = XGBRegressor(
    colsample_bytree=0.7,
    learning_rate=0.03,
    max_depth=5,
    min_child_weight=4,
    n_estimators=500,
    nthread=4,
    objective="reg:squarederror",
    subsample=0.7,
    random_state=0,
)
xgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

In [129]:
predictions = xgb.predict(X_valid)
acc = r2_score(y_valid, predictions)
print(acc)

0.7825085265045226


## Ridge


In [131]:
from sklearn.linear_model import Ridge


params = {
    "alpha": [
        0.0001,
        0.001,
        0.01,
        0.05,
        0.1,
        0.2,
        0.3,
        0.4,
        0.5,
        0.6,
        0.7,
        0.8,
        0.9,
        1.0,
        2.0,
        3.0,
        4.0,
        5.0,
        6.0,
        7.0,
        8.0,
        9.0,
        10.0,
        20,
        50,
        100,
        500,
        1000,
    ]
}

ridge = Ridge(random_state=0)
folds = 5
grid_cv_model = GridSearchCV(
    estimator=ridge, param_grid=params, cv=folds, return_train_score=True, verbose=1
)
grid_cv_model.fit(X_train, y_train)


Fitting 5 folds for each of 28 candidates, totalling 140 fits


In [132]:
print(grid_cv_model.best_score_)

0.5466226344614252


## Decision Tree


In [133]:
from sklearn.tree import DecisionTreeRegressor


dt = DecisionTreeRegressor()

params = {"max_depth": [1, 2, 3, 5, 10, 15, 16, 17, 20]}


grid = GridSearchCV(estimator=dt, param_grid=params, cv=3)

tree_reg = grid.fit(X_train, y_train)


In [134]:
print(grid.best_params_)
grid.best_score_


{'max_depth': 10}


0.596015360482875

In [135]:
from sklearn.linear_model import LinearRegression


model = LinearRegression()
parameters = {"fit_intercept": [False], "copy_X": [True, False]}
gs = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)
lin_reg = gs.fit(X_train, y_train)


Fitting 3 folds for each of 2 candidates, totalling 6 fits


In [136]:
gs.best_score_

0.5544526767013861

# XGBoost is the most performant of these models so we will use it


In [137]:
xgb = XGBRegressor(
    n_estimators=1000,
    early_stopping_rounds=5,
    learning_rate=0.05,
    n_jobs=4,
    random_state=0,
)
xgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
predictions = xgb.predict(X_test)
acc = r2_score(y_test, predictions)
print(acc)

0.7872309885789903


# Saving the model


In [138]:
xgb.save_model("model.xgb")