In [1]:
import sys

sys.path.append("..")

In [2]:
import pandas as pd

from src.transformers import *
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    BaggingRegressor,
)
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [3]:
train = pd.read_csv("../data/processed/train.csv", sep=",")
test = pd.read_csv("../data/processed/test.csv", sep=",")

print("The shape of train is {}".format(train.shape))
print("The shape of test is {}".format(test.shape))

The shape of train is (16780, 44)
The shape of test is (5000, 43)


In [5]:
class PostProcessing(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.columns_to_drop = [
            "listing_id",
            "title",
            "make",
            "model",
            "description",
            "original_reg_date",
            "reg_date",
            "category",
            "lifespan",
            "features",
            "accessories",
            "eco_category",
            "indicative_price",
            "reg_date_year",
            "reg_date_month",
            "coe_start_date",
            "coe_start_year",
            "coe_rebate",
            "dereg_value_computed",
        ]
        pass

    def fit(self, X):
        return self

    def transform(self, X):
        df = X.copy()
        df = df.drop(self.columns_to_drop, axis=1, errors="ignore")
        return df

In [6]:
ct = ColumnTransformer(
    [
        (
            "ohe",
            OneHotEncoder(sparse=False, handle_unknown="ignore"),
            [
                "fuel_type",
                "type_of_vehicle",
                "transmission",
                "make_model",
                "opc_scheme",
            ],
        ),
    ],
    remainder="passthrough",
)

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

pipe = Pipeline(
    [
        ("add_cat", SplitValuesToColumn("category")),
        ("post_processing", PostProcessing()),
        ("normalization", ct),
    ]
)

In [8]:
trainX = train.loc[:, train.columns != "price"]
trainY = train.loc[:, "price"]

X_train = pipe.fit_transform(trainX)
# X_test = pipe.transform(test)
y_train = trainY
X_train.shape

(16780, 131)

In [9]:
X_test = pipe.fit_transform(test)
X_test.shape

(5000, 119)

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=72
)

# X_train.reset_index(inplace=True, drop=True)
# X_test.reset_index(inplace=True, drop=True)

# y_train.reset_index(inplace=True, drop=True)
# y_test.reset_index(inplace=True, drop=True)

In [11]:
rfr_hpo = {
    "max_depth": [40, 50, 60, 100],
    "max_features": ["sqrt"],
    "n_estimators": [100, 200, 400],
}

gb_hpo = {
    "n_estimators": [200, 400],
    "learning_rate": [0.01, 0.1],
    "max_depth": [40, 60, 100],
    "max_features": ["sqrt"],
}
xgb_hpo = {"n_estimators": [200, 400, 600], "max_depth": [40, 60, 100]}


space = dict()
space["solver"] = ["svd", "cholesky", "lsqr", "sag"]
space["alpha"] = loguniform(1e-5, 100)
space["fit_intercept"] = [True, False]
space["normalize"] = [True, False]

# import xgboost as xgb

model = RandomForestRegressor
# model = GradientBoostingRegressor()
# model = Ridge()
# model = xgb.XGBRegressor
rf_random = RandomizedSearchCV(
    estimator=model(),
    param_distributions=rfr_hpo,
    n_iter=20,
    cv=3,
    verbose=3,
    random_state=42,
    n_jobs=-1,
)

rf_random.fit(X_train, y_train)
# best_model = RandomForestRegressor(**rf_random.best_params_)
# best_model = xgb.XGBRegressor(n_estimators=200,
#                              eta=0.2, gamma=1, max_depth=50,
#                              reg_lambda=1, n_jobs=-1)

Fitting 3 folds for each of 12 candidates, totalling 36 fits




RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=20,
                   n_jobs=-1,
                   param_distributions={'max_depth': [40, 50, 60, 100],
                                        'max_features': ['sqrt'],
                                        'n_estimators': [100, 200, 400]},
                   random_state=42, verbose=3)

In [13]:
rf_random.best_params_

{'n_estimators': 400, 'max_features': 'sqrt', 'max_depth': 50}

In [14]:
best_model = RandomForestRegressor(**rf_random.best_params_)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_val)
mean_squared_error(y_val, y_pred, squared=False)

25167.52420288357

In [68]:
y_pred = best_model.predict(X_test)
# df = DataFrame(y_pred, columns=["Predicted"], )
# df.index.rename("Id", inplace=True)
# df.to_csv("dump.csv")