In [1]:
import sys

# Put the parent directory on the module search path; the `src.transformers`
# import in the next cell presumably relies on this (assumes the notebook is
# run from a sub-folder of the project root — confirm).
sys.path.append("..")

In [27]:
import pandas as pd

from src.transformers import *
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    BaggingRegressor,
)
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [3]:
# Read the pre-processed train / test CSV files.
train = pd.read_csv("../data/processed/train.csv", sep=",")
test = pd.read_csv("../data/processed/test.csv", sep=",")

# Quick sanity check on the dimensions of both frames.
print(f"The shape of train is {train.shape}")
print(f"The shape of test is {test.shape}")

The shape of train is (16780, 51)
The shape of test is (5000, 50)


In [4]:
class PostProcessing(BaseEstimator, TransformerMixin):
    """Add derived features and drop the columns that are not fed to the models.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    works on a copy of its input.

    Parameters
    ----------
    reference_year : int or None, default=None
        Year from which ``last_reg_age`` is measured. ``None`` uses the
        current calendar year at transform time (the original behaviour);
        pass a fixed year to make the feature reproducible across runs.
    """

    def __init__(self, reference_year=None):
        self.reference_year = reference_year
        # Columns removed at the end of `transform`. Names missing from the
        # input are skipped (the drop uses errors="ignore"), so the same list
        # works for frames with and without the "price" target column.
        self.columns_to_drop = [
            "listing_id",
            "title",
            "model",
            "description",
            "original_reg_date",
            "reg_date",
            "category",
            "lifespan",
            "features",
            "accessories",
            "eco_category",
            "indicative_price",
            "reg_date_year",
            "reg_date_month",
            "coe_start_date",
            "coe_start_year",
            "coe_rebate",
            "dereg_value_computed",
            "manufactured",
            "mileage", "make_model", "engine_cap_range", "price"
        ]

    def fit(self, X, y=None):
        """No-op fit; returns ``self``.

        ``y`` is accepted (and ignored) so the transformer also works when a
        pipeline is fitted with a target, e.g. ``pipe.fit(X, y)``.
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the derived columns added and the
        ``columns_to_drop`` removed.

        Added columns:
        - ``last_reg_age``: reference year minus the year of ``reg_date``.
        - ``mileage_log``: natural log of ``mileage`` (``np.log`` yields
          ``-inf`` for a zero mileage and NaN for a missing one).
        """
        df = X.copy()
        if self.reference_year is None:
            reference_year = datetime.now().year
        else:
            reference_year = self.reference_year
        # Both derived features must be computed before the drop below,
        # because `reg_date` and `mileage` are themselves in columns_to_drop.
        df.loc[:, 'last_reg_age'] = reference_year - pd.to_datetime(df.reg_date).dt.year
        df.loc[:, 'mileage_log'] = np.log(df.mileage)
        df = df.drop(self.columns_to_drop, axis=1, errors="ignore")
        return df

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Feature pipeline; the steps run in the order listed.
# SplitValuesToColumn and OheCategorical are not defined in this notebook
# (presumably provided by the `src.transformers` star import — confirm).
pipe = Pipeline(
    steps=[
        ("add_cat", SplitValuesToColumn("category")),
        ("post_processing", PostProcessing()),
        (
            "ohe",
            OheCategorical(
                [
                    "type_of_vehicle",
                    "transmission",
                    "brand_rank",
                    "opc_scheme",
                    "fuel_type",
                    "make",
                ]
            ),
        ),
    ]
)

In [6]:
# Target vector: the "price" column of the training frame.
trainY = train["price"]
# Feature source: the full training frame (same object as `train`, not a copy).
# "price" is still present here; PostProcessing lists it in columns_to_drop.
trainX = train

In [7]:
# X_train = pipe.fit_transform(trainX)
# X_test = pipe.transform(test)
# y_train=trainY

In [8]:
# list(X_train.columns)

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled data for evaluation; the seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    trainX,
    trainY,
    test_size=0.2,
    random_state=72,
)

# Replace the shuffled original row labels with a fresh 0..n-1 index on every
# split so features and targets stay positionally aligned.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [10]:
# Fit the feature pipeline on the training split only, then apply the fitted
# pipeline to the hold-out split.
# NOTE(review): both names are rebound to the transformed frames, so this cell
# is not idempotent — re-run the split cell before re-running this one.
X_train = pipe.fit_transform(X_train)
X_test = pipe.transform(X_test)

print("X_Train:{}".format(X_train.shape))

X_Train:(13424, 142)


In [34]:
# Candidate regressors and the hyper-parameter space searched for each one.
# Every entry holds: a display name, the estimator CLASS (instantiated by the
# search loop) and the parameter grid / distributions for that estimator.
models = [
    {
        "name": RandomForestRegressor.__name__,
        "model": RandomForestRegressor,
        "params": {
            "max_depth": [40, 50, 60, 100],
            "max_features": ["sqrt"],
            "n_estimators": [100, 200, 400],
        },
    },
    {
        "name": GradientBoostingRegressor.__name__,
        "model": GradientBoostingRegressor,
        "params": {
            "n_estimators": [1000],
            "learning_rate": [0.05],
            "max_depth": [5],
            "max_features": ["sqrt"],
        },
    },
    {
        "name": Ridge.__name__,
        "model": Ridge,
        "params": {
            "solver": ["svd", "cholesky", "lsqr", "sag"],
            # Continuous distribution: sampled by the randomized search.
            "alpha": loguniform(1e-5, 100),
            "fit_intercept": [True, False],
            # NOTE(review): `normalize` was removed from Ridge in
            # scikit-learn 1.2 — this entry only works on older versions.
            "normalize": [True, False],
        },
    },
    {
        "name": KNeighborsRegressor.__name__,
        "model": KNeighborsRegressor,
        "params": {
            "n_neighbors": [1, 3, 5],
        },
    },
]

In [37]:
# Tune every candidate with a randomized search and remember the winner.
#
# `best_model` holds the estimator CLASS (not a fitted search object) because
# the next cell instantiates it with `best_model(**best_params)`. The winning
# score and name are therefore tracked in their own variables: the class has
# no `best_score_` attribute to compare against.
best_model = None
best_params = {}
best_score = None
best_name = None

for model in models:
    print(f"*** Training Model: {model['name']}")
    # No `scoring` is passed, so each candidate is ranked with the estimator's
    # default scorer (R^2 for scikit-learn regressors); higher is better.
    model_tuning = RandomizedSearchCV(
        estimator=model['model'](),
        param_distributions=model['params'],
        n_iter=10,
        cv=3,
        verbose=3,
        random_state=42,
        n_jobs=-1,
    )

    model_tuning.fit(X_train, y_train)
    print(f"Best Params {model_tuning.best_params_}")
    print(f"Best Score {model_tuning.best_score_}")
    print("\n\n")

    # Strictly-better comparison: on a tie the earlier candidate is kept.
    if best_score is None or best_score < model_tuning.best_score_:
        best_model = model['model']
        best_params = model_tuning.best_params_
        best_score = model_tuning.best_score_
        best_name = model['name']

# Report the actual winner, not whichever model the loop visited last.
print(f"Best Model: {best_name}")

*** Training Model: KNeighborsRegressor
Fitting 3 folds for each of 3 candidates, totalling 9 fits




Best Params {'n_neighbors': 3}
Best Score 0.8981495329092977



Best Model: KNeighborsRegressor


In [41]:
# Build the winning estimator from its class and tuned hyper-parameters.
# The guard keeps the cell re-runnable: once `best_model` has been rebound to
# an instance, calling it again would raise a TypeError.
if isinstance(best_model, type):
    best_model = best_model(**best_params)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)
# RMSE on the hold-out split. The root is taken explicitly instead of passing
# `squared=False`, which newer scikit-learn releases deprecate and remove.
mean_squared_error(y_test, y_pred) ** 0.5

40351.816134538254

In [None]:
# Predict on the real (unlabelled) test set. `X_test` is the validation split
# carved out of `train`, so its length does not match `test.index`; the test
# frame has to go through the already-fitted feature pipeline instead.
test_features = pipe.transform(test)
y_pred = best_model.predict(test_features)
my_submission = pd.DataFrame({'Id': list(test.index), 'Predicted': y_pred})
my_submission.to_csv('submission.csv', index=False)

In [42]:
# plot feature importance

%matplotlib inline
import matplotlib.pyplot as plt

forest_importances = pd.Series(best_model.feature_importances_, index=list(X_train.columns))
forest_importances = forest_importances[forest_importances > 0.001]

plt.figure(figsize=(20,8))
forest_importances.plot.bar()

AttributeError: 'KNeighborsRegressor' object has no attribute 'feature_importances_'

In [None]:
# Recompute predictions for `X_test`, i.e. the hold-out split created by
# train_test_split and transformed by `pipe` — not the competition `test` frame.
y_pred = best_model.predict(X_test)
# df = DataFrame(y_pred, columns=["Predicted"], )
# df.index.rename("Id", inplace=True)
# df.to_csv("dump.csv")