In [161]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from datetime import datetime
from xgboost import XGBRegressor

X = pd.read_csv("../Cleaning/new_clean_data.csv", index_col=0)

X.dropna(axis=0, subset=["price"], inplace=True)
y = X["price"]
X.drop(["price"], axis=1, inplace=True)


# Feature engineering


## Selecting categorical values


In [162]:
categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]
categorical_cols


['brand', 'model', 'location', 'body_type', 'fuel', 'transmission', 'color']

## Selecting numeric values


In [163]:
numerical_cols = [
    cname for cname in X.columns if X[cname].dtype in ["int64", "float64"]
]
numerical_cols


['mileage', 'year', 'power', 'engine_size']

In [164]:
from datetime import datetime

now = datetime.now()
# X["age"] = now.year - X["year"] + 1
# X["mileage_over_year"] = X["mileage"] + X["age"]


In [165]:
from scipy.stats import chi2_contingency

data = pd.concat([X, y], axis=1)
for col in categorical_cols:
    csq = chi2_contingency(pd.crosstab(data["price"], data[col]))
    print(f"P-value (price, {col}): ", csq[1])


P-value (price, brand):  0.0
P-value (price, model):  0.0
P-value (price, location):  1.471872348992479e-08
P-value (price, body_type):  1.9711353417429463e-15
P-value (price, fuel):  0.0
P-value (price, transmission):  1.7114094636921104e-217
P-value (price, color):  1.0


In [166]:
categorical_cols

['brand', 'model', 'location', 'body_type', 'fuel', 'transmission', 'color']

# Preprocessing


## Splitting data into Train, Validation and Test data


In [167]:
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X_train_full, y_train, test_size=0.25, random_state=0
)


## Keep selected columns only


In [168]:
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()


## Handling missing values


In [169]:
# imputer for num data
numerical_transformer = SimpleImputer(strategy="mean")
imputed_num_train = pd.DataFrame(
    numerical_transformer.fit_transform(X_train[numerical_cols])
)

imputed_num_valid = pd.DataFrame(
    numerical_transformer.transform(X_valid[numerical_cols])
)

imputed_num_test = pd.DataFrame(numerical_transformer.transform(X_test[numerical_cols]))
# imputer for cat data
categorical_transformer = SimpleImputer(strategy="most_frequent")
imputed_cat_train = pd.DataFrame(
    categorical_transformer.fit_transform(X_train[categorical_cols])
)

imputed_cat_valid = pd.DataFrame(
    categorical_transformer.transform(X_valid[categorical_cols])
)

imputed_cat_test = pd.DataFrame(
    categorical_transformer.transform(X_test[categorical_cols])
)

imputed_num_train.columns = numerical_cols
imputed_num_valid.columns = numerical_cols
imputed_num_test.columns = numerical_cols

imputed_cat_train.columns = categorical_cols
imputed_cat_valid.columns = categorical_cols
imputed_cat_test.columns = categorical_cols

X_train = pd.concat([imputed_num_train, imputed_cat_train], axis=1)
X_valid = pd.concat([imputed_num_valid, imputed_cat_valid], axis=1)
X_test = pd.concat([imputed_num_test, imputed_cat_test], axis=1)

# One-hot encode the data


In [170]:
OH_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
OH_X_train = pd.DataFrame(OH_encoder.fit_transform(X_train[categorical_cols]))
OH_X_valid = pd.DataFrame(OH_encoder.transform(X_valid[categorical_cols]))
OH_X_test = pd.DataFrame(OH_encoder.transform(X_test[categorical_cols]))
OH_X_train.index = X_train.index
OH_X_valid.index = X_valid.index
OH_X_test.index = X_test.index

num_X_train = X_train.drop(categorical_cols, axis=1)
num_X_valid = X_valid.drop(categorical_cols, axis=1)
num_X_test = X_test.drop(categorical_cols, axis=1)
X_train = pd.concat([num_X_train, OH_X_train], axis=1)
X_valid = pd.concat([num_X_valid, OH_X_valid], axis=1)
X_test = pd.concat([num_X_test, OH_X_test], axis=1)




# Modeling


## Random Forest


In [171]:
from sklearn.ensemble import RandomForestRegressor

forest_model = RandomForestRegressor(random_state=7)
X_train.columns = X_train.columns.astype(str)
X_valid.columns = X_valid.columns.astype(str)
forest_model.fit(X_train, y_train)

preds = forest_model.predict(X_valid)
print(r2_score(y_valid, preds))

0.7504133999023445


In [172]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split
max_features = ["auto", "sqrt"]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {
    "n_estimators": n_estimators,
    "max_features": max_features,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "bootstrap": bootstrap,
}

In [173]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=10,
    cv=2,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search model
rf_random.fit(X_train, y_train)
rf_random.best_params_

Fitting 2 folds for each of 10 candidates, totalling 20 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=   1.3s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=   1.4s


  warn(
  warn(
  warn(


[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   1.6s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   2.6s


  warn(


[CV] END bootstrap=False, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=600; total time=   5.5s
[CV] END bootstrap=False, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=600; total time=   6.9s
[CV] END bootstrap=False, max_depth=60, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   8.4s
[CV] END bootstrap=False, max_depth=60, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   8.6s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=1400; total time=  17.3s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=1400; total time=  17.4s
[CV] END bootstrap=False, max_depth=60, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=600; total time=  28.1s
[CV] END bootstrap=False, max_depth=6

  warn(


{'n_estimators': 2000,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 50,
 'bootstrap': True}

In [174]:
best_random_rf_params = {
    "n_estimators": 800,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "max_features": "auto",
    "max_depth": 100,
    "bootstrap": True,
}


In [175]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search
param_grid = {
    "bootstrap": [True],
    "max_depth": [80, 90, 100, 110],
    "max_features": [2, 3],
    "min_samples_leaf": [1, 2, 3],
    "min_samples_split": [8, 10, 12],
    "n_estimators": [400, 600, 800, 1000],
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(
    estimator=rf, param_grid=param_grid, cv=2, n_jobs=-1, verbose=2
)
grid_search.fit(X_train, y_train)
grid_search.best_params_


Fitting 2 folds for each of 288 candidates, totalling 576 fits
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=1, min_samples_split=8, n_estimators=400; total time=   4.7s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=1, min_samples_split=8, n_estimators=400; total time=   4.7s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   5.0s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=   5.0s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=1, min_samples_split=10, n_estimators=600; total time=   5.3s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=1, min_samples_split=8, n_estimators=600; total time=   6.0s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=1, min_samples_split=8, n_estimators=600; total time=   6.7s
[CV

{'bootstrap': True,
 'max_depth': 100,
 'max_features': 3,
 'min_samples_leaf': 1,
 'min_samples_split': 8,
 'n_estimators': 600}

In [176]:
best_cv_param = {
    "bootstrap": True,
    "max_depth": 100,
    "max_features": 3,
    "min_samples_leaf": 1,
    "min_samples_split": 8,
    "n_estimators": 600,
}
forest_model = RandomForestRegressor(
    n_estimators=600,
    min_samples_split=8,
    min_samples_leaf=1,
    max_features=3,
    max_depth=100,
    bootstrap= True,
)
forest_model.fit(X_train, y_train)
preds = forest_model.predict(X_valid)
print(r2_score(y_valid, preds))


0.6106198416525495


## XGBRegressor


In [177]:
xgb = XGBRegressor(
    n_estimators=1000, early_stopping_rounds=5, learning_rate=0.05, n_jobs=4
)
xgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

In [178]:
predictions = xgb.predict(X_valid)
acc = r2_score(y_valid, predictions)
print(acc)


0.7715243464460869


In [179]:
from sklearn.model_selection import GridSearchCV


xgb1 = XGBRegressor()
parameters = {
    "nthread": [4],
    "objective": ["reg:linear"],
    "learning_rate": [0.03, 0.05, 0.07],
    "max_depth": [5, 6, 7],
    "min_child_weight": [4],
    "subsample": [0.7],
    "colsample_bytree": [0.7],
    "n_estimators": [500],
}

xgb_grid = GridSearchCV(xgb1, parameters, cv=2, n_jobs=5, verbose=True)

xgb_grid.fit(X_train, y_train)

print(xgb_grid.best_score_)
print(xgb_grid.best_params_)


Fitting 2 folds for each of 9 candidates, totalling 18 fits
0.7819658575716842
{'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'subsample': 0.7}


In [180]:
best_xgb_param = {
    "colsample_bytree": 0.7,
    "learning_rate": 0.03,
    "max_depth": 5,
    "min_child_weight": 4,
    "n_estimators": 500,
    "nthread": 4,
    "objective": "reg:linear",
    "silent": 1,
    "subsample": 0.7,
}
xgb = XGBRegressor(
    colsample_bytree=0.7,
    learning_rate=0.03,
    max_depth=5,
    min_child_weight=4,
    n_estimators=500,
    nthread=4,
    objective="reg:linear",
    subsample=0.7,
)
xgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)




In [181]:
predictions = xgb.predict(X_valid)
acc = r2_score(y_valid, predictions)
print(acc)

0.7767935322847682


## Ridge


In [182]:
from sklearn.linear_model import Ridge


params = {
    "alpha": [
        0.0001,
        0.001,
        0.01,
        0.05,
        0.1,
        0.2,
        0.3,
        0.4,
        0.5,
        0.6,
        0.7,
        0.8,
        0.9,
        1.0,
        2.0,
        3.0,
        4.0,
        5.0,
        6.0,
        7.0,
        8.0,
        9.0,
        10.0,
        20,
        50,
        100,
        500,
        1000,
    ]
}

ridge = Ridge()
folds = 5
grid_cv_model = GridSearchCV(
    estimator=ridge, param_grid=params, cv=folds, return_train_score=True, verbose=1
)
grid_cv_model.fit(X_train, y_train)


Fitting 5 folds for each of 28 candidates, totalling 140 fits


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [183]:
print(grid_cv_model.best_score_)

0.6891855544227831


## decision tree


In [184]:
from sklearn.tree import DecisionTreeRegressor


dt = DecisionTreeRegressor()

params = {"max_depth": [1, 2, 3, 5, 10, 15, 16, 17, 20]}


grid = GridSearchCV(estimator=dt, param_grid=params, cv=3)

tree_reg = grid.fit(X_train, y_train)


In [185]:
print(grid.best_params_)
grid.best_score_


{'max_depth': 15}


0.6704771258519971

In [186]:
from sklearn.linear_model import LinearRegression


model = LinearRegression()
parameters = {"fit_intercept": [False], "copy_X": [True, False]}
gs = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)
lin_reg = gs.fit(X_train, y_train)


Fitting 3 folds for each of 2 candidates, totalling 6 fits


In [187]:
gs.best_score_

-15730146.154580802

# XGBoost is the most performant of these models so we will use it


In [188]:
xgb = XGBRegressor(
    n_estimators=1000, early_stopping_rounds=5, learning_rate=0.05, n_jobs=4
)
xgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
predictions = xgb.predict(X_test)
acc = r2_score(y_test, predictions)
print(acc)

0.778584023057209


# Saving the model


In [189]:
xgb.save_model("model.xgb")