In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor

In [2]:
train = pd.read_csv("data/train_final.csv")
test = pd.read_csv("data/test_final.csv")
target = pd.read_csv("data/target.csv")

In [3]:
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rating,15730.0,4.012873,0.29844,0.0,3.9,4.0,4.2,5.0
actprice1,15730.0,1369.286777,1240.900227,42.0,699.0,999.0,1299.0,13499.0
norating1,15730.0,2925.868404,11605.447095,0.0,50.0,267.0,1379.25,289973.0
noreviews1,15730.0,408.397266,1737.2696,0.0,7.0,38.0,198.0,45448.0
star_5f,15730.0,1599.965035,6233.811271,0.0,30.0,153.0,789.0,151193.0
star_4f,15730.0,662.491545,2843.657075,0.0,12.0,61.0,304.0,74037.0
star_3f,15730.0,360.168023,1416.859022,0.0,7.0,34.0,172.0,34978.0
star_2f,15730.0,155.085188,558.650254,0.0,3.0,17.0,77.0,11705.0
star_1f,15730.0,275.500572,958.589075,0.0,6.0,30.0,140.0,18060.0
fulfilled1,15730.0,0.601526,0.4896,0.0,0.0,1.0,1.0,1.0


In [4]:
test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rating,5244.0,4.01558,0.311346,0.0,3.9,4.0,4.2,5.0
actprice1,5244.0,1378.657895,1280.63007,139.0,699.0,999.0,1299.0,15999.0
norating1,5244.0,2988.580092,12881.253715,1.0,66.0,307.0,1428.0,289973.0
noreviews1,5244.0,415.491037,1910.726669,0.0,9.0,44.0,214.25,45448.0
star_5f,5244.0,1545.586003,6544.085444,0.0,32.0,156.0,756.5,151193.0
star_4f,5244.0,639.785469,2991.065223,0.0,12.0,61.0,310.25,74037.0
star_3f,5244.0,356.356789,1632.732834,0.0,7.0,34.0,166.0,49924.0
star_2f,5244.0,154.139969,611.006799,0.0,3.0,16.0,75.0,12629.0
star_1f,5244.0,260.807018,1017.789008,0.0,5.0,26.0,120.25,23139.0
fulfilled1,5244.0,0.6045,0.489004,0.0,0.0,1.0,1.0,1.0


In [5]:
train.dtypes

title          object
Rating        float64
maincateg      object
platform       object
actprice1       int64
norating1     float64
noreviews1    float64
star_5f       float64
star_4f       float64
star_3f       float64
star_2f         int64
star_1f       float64
fulfilled1      int64
combo            bool
category       object
dtype: object

In [6]:
train.columns

Index(['title', 'Rating', 'maincateg', 'platform', 'actprice1', 'norating1',
       'noreviews1', 'star_5f', 'star_4f', 'star_3f', 'star_2f', 'star_1f',
       'fulfilled1', 'combo', 'category'],
      dtype='object')

In [7]:
train.drop("title", axis=1, inplace=True)
# test_id = test.id
test.drop("title", axis=1, inplace=True)

X = train
y = target
assert X.shape[1] == test.shape[1], "X and test have different number of columns"

In [8]:
columns_to_one_hot = ["maincateg", "category", "platform", "combo"]
columns_to_standardize = ["actprice1", 'norating1', 'noreviews1', 'star_5f', 
    'star_4f', 'star_3f', 'star_2f', 'star_1f',]

In [5]:
X = pd.get_dummies(X)
test = pd.get_dummies(test)

In [6]:
assert X.shape[1] == test.shape[1], "X and test have different number of columns"

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=0.2, random_state=42)

In [16]:
# X_dev, X_test, y_dev, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [36]:
len(X_train), len(X_test)

(12584, 3146)

In [37]:
label_encoder = LabelEncoder()
one_hot_encoder = OneHotEncoder(sparse=False)
standardizer = StandardScaler()

preprocessor = ColumnTransformer(
    [
        ("one_hot_encoder", one_hot_encoder, columns_to_one_hot),
        ("standardizer", standardizer, columns_to_standardize)],
    remainder="passthrough"
)

In [38]:
preprocessor.fit(X_train)
X_train = preprocessor.transform(X_train)
# X_dev = preprocessor.transform(X_dev)
X_test = preprocessor.transform(X_test)
test = preprocessor.transform(test)

In [12]:
test = test.values

In [13]:
assert X_train.shape[1] == test.shape[1], "X and test have different number of columns"

In [15]:
X_train.min(axis=0), test.min(axis=0), X_train.max(axis=0), test.max(axis=0)

(array([0.0, 172, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0.0, 0, False, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=object),
 array([0.0, 139, 1.0, 0.0, 0.0, 0.0, 0.0, 0, 0.0, 0, False, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=object),
 array([5.0, 12999, 289973.0, 45448.0, 151193.0, 74037.0, 34978.0, 11705,
        18060.0, 1, True, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       dtype=object),
 array([5.0, 15999, 289973.0, 45448.0, 151193.0, 74037.0, 49924.0, 12629,
        23139.0, 1, True, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       dtype=object))

# Models

## Base Model

In [16]:
lr = LinearRegression(positive=True)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print("Linear Regression On Test Set:")
print("R2:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", mean_squared_error(y_test, y_pred)**0.5)
print("\n")
print("Linear Regression On Train Set:")
y_pred_t = lr.predict(X_train)
print("R2:", r2_score(y_train, y_pred_t))
print("MAE:", mean_absolute_error(y_train, y_pred_t))
print("RMSE:", mean_squared_error(y_train, y_pred_t)**0.5)

Linear Regression On Test Set:
R2: 0.8201633973943215
MAE: 181.5800380187771
RMSE: 288.3441190186154


Linear Regression On Train Set:
R2: 0.801435715703484
MAE: 182.75084391538357
RMSE: 285.8505664645249


In [17]:
xgbr = XGBRegressor(n_estimators=1000, max_depth=4, learning_rate=0.1)
xgbr.fit(X_train, y_train)
y_pred = xgbr.predict(X_test)
print("XGBR On test Set:")
print("R2:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", mean_squared_error(y_test, y_pred)**0.5)
print("\n")
print("XGBR On Train Set:")
y_pred_t = xgbr.predict(X_train)
print("R2:", r2_score(y_train, y_pred_t))
print("MAE:", mean_absolute_error(y_train, y_pred_t))
print("RMSE:", mean_squared_error(y_train, y_pred_t)**0.5)

XGBR On test Set:
R2: 0.9078777366448381
MAE: 110.76745518164807
RMSE: 206.3737012467632


XGBR On Train Set:
R2: 0.9688170369830018
MAE: 75.62243969366192
RMSE: 113.27841915234771


In [18]:
rfr = RandomForestRegressor(n_estimators=1000, max_depth=6, n_jobs=4)
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)
print("rfr On Test Set:")
print("R2:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", mean_squared_error(y_test, y_pred)**0.5)
print("\n")
print("rfr On Train Set:")
y_pred_t = rfr.predict(X_train)
print("R2:", r2_score(y_train, y_pred_t))
print("MAE:", mean_absolute_error(y_train, y_pred_t))
print("RMSE:", mean_squared_error(y_train, y_pred_t)**0.5)

  rfr.fit(X_train, y_train)


rfr On Test Set:
R2: 0.8641390326374818
MAE: 155.40081706345157
RMSE: 250.62212305629595


rfr On Train Set:
R2: 0.8624653423296297
MAE: 152.63929967269434
RMSE: 237.90006747301493


In [34]:
y_pred = rfr.predict(X_test)
print("rfr On Test Set:")
print("R2:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", mean_squared_error(y_test, y_pred)**0.5)
print("\n")
print("rfr On Train Set:")
y_pred_t = rfr.predict(X_train)
print("R2:", r2_score(y_train, y_pred_t))
print("MAE:", mean_absolute_error(y_train, y_pred_t))
print("RMSE:", mean_squared_error(y_train, y_pred_t)**0.5)

rfr On Test Set:
R2: 0.8843644285706092
MAE: 140.37400816642264
RMSE: 231.21594415304895


rfr On Train Set:
R2: 0.9038352083942
MAE: 130.08123969009722
RMSE: 198.92828440371628


In [30]:
svr = SVR()
svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)
print("svr On Test Set:")
print("R2:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("\n")
print("svr On Train Set:")
y_pred_t = svr.predict(X_train)
print("R2:", r2_score(y_train, y_pred_t))
print("MAE:", mean_absolute_error(y_train, y_pred_t))
print("MSE:", mean_squared_error(y_train, y_pred_t))

svr On Test Set:
R2: 0.41497833269179396
MAE: 257.3077333123637
MSE: 270468.10485040746


svr On Train Set:
R2: 0.4266329229337835
MAE: 244.90816648362159
MSE: 235944.43125092195


Let's do a grid search for the xgbr.

In [17]:
from sklearn.model_selection import GridSearchCV
params = {
    "n_estimators": [1500, 2000, 3000],
    "max_depth": [2, 4, 8],
    "learning_rate": [0.1,0.2, 0.3],
    "booster": ["gbtree"],
}

base_model = XGBRegressor()
grid_search = GridSearchCV(base_model, params, cv=3, n_jobs=-1, verbose=10)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV 1/3; 1/27] START booster=gbtree, learning_rate=0.1, max_depth=2, n_estimators=1500[CV 3/3; 1/27] START booster=gbtree, learning_rate=0.1, max_depth=2, n_estimators=1500

[CV 2/3; 3/27] START booster=gbtree, learning_rate=0.1, max_depth=2, n_estimators=3000
[CV 3/3; 2/27] START booster=gbtree, learning_rate=0.1, max_depth=2, n_estimators=2000
[CV 2/3; 2/27] START booster=gbtree, learning_rate=0.1, max_depth=2, n_estimators=2000
[CV 1/3; 3/27] START booster=gbtree, learning_rate=0.1, max_depth=2, n_estimators=3000
[CV 2/3; 1/27] START booster=gbtree, learning_rate=0.1, max_depth=2, n_estimators=1500
[CV 1/3; 2/27] START booster=gbtree, learning_rate=0.1, max_depth=2, n_estimators=2000
[CV 2/3; 1/27] END booster=gbtree, learning_rate=0.1, max_depth=2, n_estimators=1500;, score=0.866 total time=  32.0s
[CV 3/3; 3/27] START booster=gbtree, learning_rate=0.1, max_depth=2, n_estimators=3000
[CV 3/3; 1/27] END booster=gbtree, lea

GridSearchCV(cv=3,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, predictor=None,
                                    random_state=None, reg_alpha=None,
                                    reg_lambda=None, scale_pos_weight=None,
       

In [18]:
grid_search.best_score_

0.8895123915370542

In [16]:
xgbr = XGBRegressor(learning_rate=0.3, max_depth=4, n_estimators=1000, booster="gbtree")
xgbr.fit(X_train, y_train)
y_pred = xgbr.predict(X_test)
print("XGBR On Test Set:")
print("R2:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("\n")
print("XGBR On Train Set:")
y_pred_t = xgbr.predict(X_train)
print("R2:", r2_score(y_train, y_pred_t))
print("MAE:", mean_absolute_error(y_train, y_pred_t))
print("MSE:", mean_squared_error(y_train, y_pred_t))

XGBR On Test Set:
R2: 0.9094067675501246
MAE: 97.86719559986578
MSE: 41883.20067824357


XGBR On Train Set:
R2: 0.991287236873442
MAE: 37.40795654774622
MSE: 3585.360971610742


In [21]:
import numpy as np
y_pred = np.abs(xgbr.predict(X_test))
print("XGBR On Test Set:")
print("R2:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("\n")
print("XGBR On Train Set:")
y_pred_t = np.abs(xgbr.predict(X_train))
print("R2:", r2_score(y_train, y_pred_t))
print("MAE:", mean_absolute_error(y_train, y_pred_t))
print("MSE:", mean_squared_error(y_train, y_pred_t))

XGBR On Test Set:
R2: 0.9094067675501246
MAE: 97.86719559986578
MSE: 41883.20067824357


XGBR On Train Set:
R2: 0.991287236873442
MAE: 37.40795654774622
MSE: 3585.360971610742


In [22]:
preds = xgbr.predict(test)

In [19]:
sample = pd.read_csv('data/Sample__submission.csv')

In [23]:
import numpy as np
preds = rfr.predict(test)
sample['price1'] = np.abs(preds)
sample['price1'].describe()

count    5244.000000
mean      691.047048
std       600.439637
min       228.101400
25%       444.744498
50%       456.417740
75%       617.053036
max      4824.775648
Name: price1, dtype: float64

In [24]:
sample.to_csv('data/submission_3.csv', index=False)

In [28]:
X_train.shape

(12584, 27)

In [26]:
!kaggle competitions submit -c sa2022 -f data/submission_3.csv -m ""


  0%|          | 0.00/82.3k [00:00<?, ?B/s]
 10%|▉         | 8.00k/82.3k [00:00<00:01, 65.0kB/s]
100%|██████████| 82.3k/82.3k [00:05<00:00, 14.8kB/s]


Successfully submitted to Final Capstone Project
Successfully submitted to Final Capstone Project


  0%|          | 0.00/127k [00:00<?, ?B/s]
  6%|▋         | 8.00k/127k [00:00<00:01, 64.0kB/s]
100%|██████████| 127k/127k [00:05<00:00, 23.4kB/s] 



