In [879]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder
from category_encoders import TargetEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor


In [880]:
# Read the CSV at data_path into the notebook-level frame `df`,
# then show the first five rows as the cell output.
data_path = "./data/lot42_vectorized.csv"
df = pd.read_csv(filepath_or_buffer=data_path)
df.head()

Unnamed: 0,name,price,brand,item_condition,shipping,located_in,return_policy,category,money_back,seller_item_sold,trending,price_log,price_boxcox
0,43,14.48,3.360062,3.346548,1.0,3.272318,1,23,1,203000.0,1,2.739549,3.48899
1,2,29.19,3.770229,3.346548,1.0,3.893614,1,63,1,49000.0,1,3.407511,4.739931
2,3,15.89,3.694664,3.346548,1.0,3.893614,1,74,1,207000.0,1,2.826722,3.645312
3,1,33.23,3.78657,4.136576,1.0,3.893614,1,167,1,49000.0,1,3.533102,4.990177
4,49,15.81,3.750213,3.890525,1.0,3.893614,1,339,1,326000.0,0,2.821974,3.63675


In [881]:
# A bare expression is only echoed when it is the last statement of a cell,
# so the shape is printed explicitly; the maximum price remains the cell output.
print(df.shape)
df["price"].max()

190.15

In [882]:
# The raw price and its two transforms are targets and must not be used as features.
target_columns = ['price', 'price_log', 'price_boxcox']
# "Unnamed: N" is the name pandas gives to a header-less CSV column; here it
# appears to be a row index written out by an earlier export (0, 1, 2, ... in
# df.head()). It says nothing about the item, so keep it out of the features.
index_artifact_columns = [c for c in df.columns if str(c).startswith('Unnamed:')]

X = df.drop(columns=target_columns + index_artifact_columns)
# The models are trained on the log-scale price.
y = df['price_log']

In [895]:
def calculate_cv_scores(models, X=None, y=None, cv=5,
                        scoring='neg_mean_squared_error'):
    """Cross-validate every model and collect its per-fold scores.

    Parameters
    ----------
    models : iterable of (name, estimator) pairs
        Each estimator is handed to ``cross_val_score`` as-is.
    X, y : optional
        Features and target to cross-validate on. When omitted, the
        notebook-level ``X_train`` / ``y_train`` are used; they are looked up
        at call time, so they only need to exist before the function is called.
    cv : optional
        Forwarded to ``cross_val_score`` (default 5).
    scoring : optional
        Forwarded to ``cross_val_score`` (default ``'neg_mean_squared_error'``).

    Returns
    -------
    dict
        Maps each model name to whatever ``cross_val_score`` returned for it.
        If a name occurs more than once, the last result wins.
    """
    if X is None:
        X = X_train
    if y is None:
        y = y_train
    return {
        name: cross_val_score(model, X, y, cv=cv, scoring=scoring)
        for name, model in models
    }

# Hold out 20% of the rows for the final evaluation; the split is seeded so the
# same rows end up in each part on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
# random_state is pinned here too: with subsample < 1 and max_features < 1 the
# gradient-boosting fit draws random rows/features, so without a seed its scores
# would change between runs while every other seeded model stays fixed.
model_gb = GradientBoostingRegressor(
    alpha=0.95, learning_rate=0.1, loss="huber", max_depth=9,
    max_features=0.8500000000000001, min_samples_leaf=10,
    min_samples_split=18, n_estimators=100,
    subsample=0.9000000000000001, random_state=42)
model_xg = XGBRegressor(n_estimators=100, learning_rate=0.1,
                        max_depth=5, subsample=0.8, random_state=42)
model_lr = LinearRegression()

# (display name, estimator) pairs shared by the CV and hold-out evaluations.
models = [
    ('Random Forest', model_rf),
    ('Gradient Boosting', model_gb),
    ('XG Boost', model_xg),
    ('Linear Regression', model_lr),
]

# Per-fold negative MSE for each model (values closer to 0 are better).
scores = calculate_cv_scores(models)
for key, value in scores.items():
    print(f'{key}: {value}')

Random Forest: [-0.20554655 -0.20098906 -0.20325429 -0.21073819 -0.21327445]
Gradient Boosting: [-0.18949392 -0.1825316  -0.19019264 -0.18829721 -0.20048785]
XG Boost: [-0.20684124 -0.19667603 -0.20105978 -0.20276344 -0.21463628]
Linear Regression: [-0.32286777 -0.30628199 -0.31049213 -0.31963723 -0.33519343]
Decision Tree: [-0.32694136 -0.34127407 -0.35508836 -0.35086763 -0.33989758]


In [884]:
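# Hyper-parameter search for the gradient-boosting model; kept for reference but disabled.
# NOTE: the grid below has 3**7 = 2187 candidates, i.e. 10935 fits with cv=5.
# from sklearn.model_selection import GridSearchCV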

# param_grid = {
#     'n_estimators': [80, 100, 120],
#     'learning_rate': [0.05, 0.1, 0.15],
#     'max_depth': [7, 9, 11],
#     'min_samples_split': [16, 18, 20],
#     'min_samples_leaf': [8, 10, 12],
#     'max_features': [0.75, 0.85, 0.95],
#     'subsample': [0.85, 0.9, 0.95],
# }

# grid_search = GridSearchCV(model_gb, param_grid, cv=5, scoring='neg_mean_squared_error')
# grid_search.fit(X_train, y_train)

# print(f'Best parameters: {grid_search.best_params_}')
# print(f'Best score: {grid_search.best_score_}')

In [885]:
# # Get the best parameters
# best_params = grid_search.best_params_

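# # Create a new model with the best parameters
# # NOTE: best_params only contains the keys listed in param_grid, so the
# # loss="huber" and alpha=0.95 settings of the original model_gb would be
# # dropped here; grid_search.best_estimator_ keeps them.
# model_gb = GradientBoostingRegressor(**best_params)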

# # Train the model
# model_gb.fit(X_train, y_train)

In [886]:
def train_test(models, data=None, n_preview=10):
    """Fit each model on the training split and score it on the held-out split.

    Parameters
    ----------
    models : iterable of (name, estimator) pairs
        Every estimator is fitted in place via ``fit`` and then asked to
        ``predict`` on the test features.
    data : optional 4-tuple ``(X_train, X_test, y_train, y_test)``
        Same order as returned by ``train_test_split``. When omitted, the
        notebook-level variables of those names are used (looked up at call
        time).
    n_preview : int, optional
        Number of leading test rows kept in the actual-vs-predicted preview
        (default 10).

    Returns
    -------
    scores_log : dict
        name -> sqrt(mean_squared_log_error) on the targets/predictions exactly
        as given. In this notebook the target is already ``price_log``, so the
        metric's own log(1 + x) is applied on top of the log-scale values.
    scores_actual : dict
        name -> sqrt(mean_squared_log_error) after mapping targets and
        predictions back through ``np.expm1``.
    actual_vs_predicted : dict
        name -> list of ``(y_true, y_pred, expm1(y_true), expm1(y_pred))``
        tuples for the first ``n_preview`` test rows.

    Notes
    -----
    NOTE(review): per the scikit-learn docs, ``mean_squared_log_error`` is not
    defined for negative values, so a model that predicts a negative
    log-price is expected to raise here -- confirm against the installed version.
    """
    if data is None:
        # Fall back to the split defined at notebook level.
        data = (X_train, X_test, y_train, y_test)
    X_tr, X_te, y_tr, y_te = data

    # The back-transformed targets do not depend on the model: compute them once.
    y_te_actual = np.expm1(y_te)
    # Positional preview of the first rows, regardless of any pandas index labels.
    y_te_head = np.asarray(y_te)[:n_preview]
    y_te_head_actual = np.expm1(y_te_head)

    scores_log = {}
    scores_actual = {}
    actual_vs_predicted = {}
    for name, model in models:
        model.fit(X_tr, y_tr)
        y_pred = np.asarray(model.predict(X_te))
        y_pred_actual = np.expm1(y_pred)

        scores_log[name] = np.sqrt(mean_squared_log_error(y_te, y_pred))
        scores_actual[name] = np.sqrt(
            mean_squared_log_error(y_te_actual, y_pred_actual))
        actual_vs_predicted[name] = list(zip(
            y_te_head, y_pred[:n_preview],
            y_te_head_actual, y_pred_actual[:n_preview]))
    return scores_log, scores_actual, actual_vs_predicted

In [894]:
# Fit every model on the training split and score it on the held-out split.
log_scores, actual_scores, a = train_test(models)
# The preview gets its own name: binding it to `df` would replace the loaded
# dataset, and re-running the cell that builds X / y from `df` would then fail.
gb_preview = pd.DataFrame(
    a["Gradient Boosting"],
    columns=['Actual Log', 'Predicted Log', "Actual Price", "Predicted Price"])

print("SCORING FOR LOGGED PRICE:")
for k, v in log_scores.items():
    print(f'{k}: {v}')
print("\n")
print("SCORING FOR ACTUAL PRICE:")
for k, v in actual_scores.items():
    print(f'{k}: {v}')

# First rows of the gradient-boosting actual-vs-predicted comparison.
print(gb_preview.head(10))

SCORING FOR LOGGED PRICE:
Random Forest: 0.10155678614015011
Gradient Boosting: 0.09917262638479557
XG Boost: 0.1033098267075964
Linear Regression: 0.12707003457317223


SCORING FOR ACTUAL PRICE:
Random Forest: 0.44460984593134084
Gradient Boosting: 0.4275538697061751
XG Boost: 0.447464344095693
Linear Regression: 0.5664374620743207
   Actual Log  Predicted Log  Actual Price  Predicted Price
0    3.991389       4.019068         53.13        54.649216
1    5.078668       4.997543        159.56       147.048944
2    5.150629       4.894355        171.54       132.533855
3    3.400197       3.315847         28.97        26.545710
4    4.051437       4.249632         56.48        69.079645
5    2.325325       2.778543          9.23        15.095550
6    2.829678       2.630095         15.94        12.875083
7    4.374750       4.308196         78.42        73.306335
8    3.480932       3.452671         31.49        30.584649
9    2.747912       2.797132         14.61        15.397549
