In [1]:
import pickle

import catboost as cb
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn_pandas import DataFrameMapper, CategoricalImputer, FunctionTransformer
from sklearn.preprocessing import StandardScaler, LabelBinarizer, PolynomialFeatures, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor 


In [9]:
# Load the cars dataset from CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='cars1.csv')


In [11]:
# Summarize column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61316 entries, 0 to 61315
Data columns (total 15 columns):
price           61316 non-null int64
year            61316 non-null float64
manufacturer    61316 non-null object
model           61316 non-null object
condition       61316 non-null object
cylinders       61316 non-null object
fuel            61316 non-null object
odometer        61316 non-null float64
title_status    61316 non-null object
transmission    61316 non-null object
vin             61316 non-null object
drive           61316 non-null object
size            61316 non-null object
type            61316 non-null object
paint_color     61316 non-null object
dtypes: float64(2), int64(1), object(12)
memory usage: 7.0+ MB


In [None]:
# Keep only rows whose price is strictly above 100.
# NOTE(review): presumably this removes placeholder / near-zero prices — confirm the threshold.
df = df.loc[df['price'] > 100]

In [None]:
# Keep only rows whose year is strictly after 2000.
df = df.loc[df['year'] > 2000]

In [None]:
# Drop rows whose odometer reading is one million or more.
# NOTE(review): presumably these are data-entry errors — confirm the cutoff.
df = df.loc[df['odometer'] < 1_000_000]

In [None]:
# Integer-encode the `model` column. The encoder is fitted on the whole frame
# (this cell runs before the train/test split), so every label that can appear
# in either split receives a code.
# NOTE(review): the codes are arbitrary integers, so downstream models will
# treat `model` as an ordered numeric feature — confirm that is acceptable.
le = LabelEncoder()
# `assign` builds a new frame instead of writing a column into `df`, which at
# this point is a boolean-filtered slice; `df['model'] = ...` on such a slice
# raises pandas' SettingWithCopyWarning.
df = df.assign(model=le.fit_transform(df['model']))

In [None]:
# Separate the prediction target from the feature columns, then hold out a test split.
target = 'price'
X = df.drop(columns=[target])
y = df[target]
# random_state pins the shuffle so the split is reproducible; the test size is
# left at train_test_split's default.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [None]:
def _binarized(column):
    """Build a mapper entry that runs `column` through CategoricalImputer, then LabelBinarizer."""
    return (column, [CategoricalImputer(), LabelBinarizer()])


# Per-column preprocessing applied to the feature frame; df_out=True makes the
# mapper return a DataFrame rather than a bare array.
# Disabled entries: region (LabelBinarizer), vin (StandardScaler),
# state (CategoricalImputer + LabelBinarizer).
# NOTE(review): `drive` is present in the data but has no entry here — confirm
# that leaving it out is intended.
mapper = DataFrameMapper([
    (['year'], StandardScaler()),
    _binarized('manufacturer'),
    ('model', [CategoricalImputer()]),  # imputed only; values are used as-is afterwards
    _binarized('cylinders'),
    _binarized('fuel'),
    (['odometer'], [SimpleImputer(), StandardScaler()]),
    _binarized('title_status'),
    _binarized('transmission'),
    _binarized('size'),
    _binarized('type'),
    _binarized('paint_color'),
    _binarized('condition'),
], df_out=True)


In [None]:
# Fit the mapper on the training features only, then reuse the fitted
# transformers on the test features so both frames share the same encoding.
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

In [None]:
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor
# from sklearn.svm import SVR

In [None]:


# linreg = LinearRegression()
# linreg.fit(Z_train, y_train)
# linscore = linreg.score(Z_train, y_train)
# linscore_test = linreg.score(Z_test, y_test)
# print(f'linreg r2 score - Train: {round(linscore,3)}, - Test: {round(linscore_test,3)}')
# #print(f'{linreg.})


# decreg = DecisionTreeRegressor()
# decreg.fit(Z_train, y_train)
# decscore = decreg.score(Z_train, y_train)
# decscore_test = decreg.score(Z_test, y_test)
# print(f'decreg r2 score - Train: {round(decscore,3)}, - Test: {round(decscore_test,3)}')

# ranreg = RandomForestRegressor(max_depth=2, random_state=0)
# ranreg.fit(Z_train, y_train)
# ranscore = ranreg.score(Z_train, y_train)
# ranscore_test = ranreg.score(Z_test, y_test)
# print(f'ranreg r2 score - Train: {round(ranscore,3)}, - Test: {round(ranscore_test,3)}')

# bagreg = BaggingRegressor()
# bagreg.fit(Z_train, y_train)
# bagscore = bagreg.score(Z_train, y_train)
# bagscore_test = bagreg.score(Z_test, y_test)
# print(f'bagreg r2 score - Train: {round(bagscore,3)}, - Test: {round(bagscore_test,3)}')



# extreg = ExtraTreesRegressor()
# extreg.fit(Z_train, y_train)
# extscore = extreg.score(Z_train, y_train)
# extscore_test = extreg.score(Z_test, y_test)
# print(f'ExtraTree r2 score - Train: {round(extscore,3)}, - Test: {round(extscore_test,3)}')

In [None]:
# Grid-search CatBoost hyper-parameters with 3-fold cross-validation on the
# mapped training features.
# verbose=0 silences CatBoost's per-iteration training log, which otherwise
# floods the cell output (notably during the final refit on the training set).
model = cb.CatBoostRegressor(verbose=0)
params = {
    'iterations': [10, 500],
    'learning_rate': [0.1, 0.5],
    'depth': [4, 10],
}
# n_jobs=-1 runs the candidate fits in parallel; verbose=1 prints
# scikit-learn's own progress summary.
grid = GridSearchCV(model, params, cv=3, n_jobs=-1, verbose=1)
grid.fit(Z_train, y_train)
print(grid.best_score_)
print(grid.best_params_)

# GridSearchCV refits the best configuration on all of Z_train by default.
# Keep that estimator and score it on the held-out split, which was otherwise
# never evaluated (CatBoostRegressor.score reports R^2).
best_model = grid.best_estimator_
print(f'CatBoost test score: {best_model.score(Z_test, y_test)}')


In [None]:
# # GridSearch
# grid = {'learning_rate': [0.03, 0.1, 0.5],
#         'depth': [4, 6, 10],
#         'l2_leaf_reg': [1, 5, 9],
#         'early_stopping_rounds': [200]}
#
#
#
# grid_search_result = model.grid_search(grid,
#                                        X=Z_train,
#                                        y=y_train,
#                                        plot=False)