In [32]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor

In [4]:
train = pd.read_csv("data/train_prepared.csv")
test = pd.read_csv("data/test_prepared.csv")

In [5]:
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,15730.0,10479.541577,6080.166276,3.0,5212.0,10458.5,15766.75,20973.0
Rating,15730.0,4.012873,0.29844,0.0,3.9,4.0,4.2,5.0
ActingPrice,15730.0,1369.286777,1240.900227,42.0,699.0,999.0,1299.0,13499.0
Offer,15730.0,46.802491,19.268675,0.0,35.94,50.07,60.16,88.93
NumberOfRatings,15730.0,3074.681055,11915.323486,0.0,64.0,311.0,1527.75,289973.0
NumberOfReviews,15730.0,426.252702,1753.202101,0.0,9.0,44.0,215.0,45448.0
5Star,15730.0,1599.965035,6233.811271,0.0,30.0,153.0,789.0,151193.0
4Star,15730.0,662.491545,2843.657075,0.0,12.0,61.0,304.0,74037.0
3Star,15730.0,360.168023,1416.859022,0.0,7.0,34.0,172.0,34978.0
2Star,15730.0,155.085188,558.650254,0.0,3.0,17.0,77.0,11705.0


In [6]:
test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,5244.0,10507.372616,5978.658892,0.0,5286.75,10561.5,15636.75,20964.0
Rating,5244.0,4.01558,0.311346,0.0,3.9,4.0,4.2,5.0
ActingPrice,5244.0,1378.657895,1280.63007,139.0,699.0,999.0,1299.0,15999.0
NumberOfRatings,5244.0,2988.580092,12881.253715,1.0,66.0,307.0,1428.0,289973.0
NumberOfReviews,5244.0,415.491037,1910.726669,0.0,9.0,44.0,214.25,45448.0
5Star,5244.0,1545.586003,6544.085444,0.0,32.0,156.0,756.5,151193.0
4Star,5244.0,639.785469,2991.065223,0.0,12.0,61.0,310.25,74037.0
3Star,5244.0,356.356789,1632.732834,0.0,7.0,34.0,166.0,49924.0
2Star,5244.0,154.139969,611.006799,0.0,3.0,16.0,75.0,12629.0
1Star,5244.0,260.807018,1017.789008,0.0,5.0,26.0,120.25,23139.0


In [7]:
train.dtypes

ID                   int64
Rating             float64
MainCategory        object
Plateform           object
ActingPrice          int64
Offer              float64
NumberOfRatings    float64
NumberOfReviews    float64
5Star              float64
4Star              float64
3Star              float64
2Star                int64
1Star                int64
Fulfilled            int64
Combo                 bool
Category            object
dtype: object

In [8]:
train.columns

Index(['ID', 'Rating', 'MainCategory', 'Plateform', 'ActingPrice', 'Offer',
       'NumberOfRatings', 'NumberOfReviews', '5Star', '4Star', '3Star',
       '2Star', '1Star', 'Fulfilled', 'Combo', 'Category'],
      dtype='object')

In [9]:
train.drop("ID", axis=1, inplace=True)
test_id = test.ID
test.drop("ID", axis=1, inplace=True)

X = train.drop(["Offer"], axis=1)
y = train["Offer"]
assert X.shape[1] == test.shape[1], "X and test have different number of columns"

In [10]:
columns_to_one_hot = ["MainCategory", "Category", "Plateform", "Combo"]
columns_to_standardize = ["ActingPrice", 'NumberOfRatings', 'NumberOfReviews', '5Star', 
    '4Star', '3Star', '2Star', '1Star',]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [14]:
len(X_train),len(X_test)

(14157, 1573)

In [15]:
label_encoder = LabelEncoder()
one_hot_encoder = OneHotEncoder(sparse=False)
standardizer = StandardScaler()

preprocessor = ColumnTransformer(
    [
        ("one_hot_encoder", one_hot_encoder, columns_to_one_hot),
        ("standardizer", standardizer, columns_to_standardize)],
    remainder="passthrough"
)

In [16]:
preprocessor.fit(X_train)
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)
test_final = preprocessor.transform(test)

In [18]:
assert X_train.shape[1] == test_final.shape[1], "X_train and test have different number of columns"

# Models

## Base Model

In [21]:
def evaluate(model, on="test"):
    if on == "test":
        X, y = X_test, y_test
    else:
        X, y = X_train, y_train
    y_pred = model.predict(X)
    print("-----------------------------------------------------")
    print("R2:", r2_score(y, y_pred))
    print("MSE:", mean_squared_error(y, y_pred))
    print("MAE:", mean_absolute_error(y, y_pred))
    print("RMSE:", mean_squared_error(y, y_pred) ** 0.5)
    print("-----------------------------------------------------")
    return y_pred

In [23]:
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = evaluate(lr)

-----------------------------------------------------
R2: 0.15869169954340134
MSE: 295.99595434190053
MAE: 13.854730510966307
RMSE: 17.2045329591332
-----------------------------------------------------


In [48]:
xgbr = XGBRegressor(n_estimators=2000, max_depth=5, learning_rate=0.05)
xgbr.fit(X_train, y_train)
y_pred = evaluate(xgbr, on="test")
y_pred_t = evaluate(xgbr, on="train")

-----------------------------------------------------
R2: 0.7471895302116716
MSE: 88.94584331571191
MAE: 6.429136706880908
RMSE: 9.431110396751377
-----------------------------------------------------
-----------------------------------------------------
R2: 0.9009702235747971
MSE: 36.978122139662645
MAE: 4.215822702831295
RMSE: 6.080963915339627
-----------------------------------------------------


In [49]:
xgbr = XGBRegressor(n_estimators=2000, max_depth=10, learning_rate=0.1)
xgbr.fit(X_train, y_train)
y_pred = evaluate(xgbr, on="test")
y_pred_t = evaluate(xgbr, on="train")

-----------------------------------------------------
R2: 0.8174701793192585
MSE: 64.21913160601139
MAE: 4.026656538875659
RMSE: 8.013684022096914
-----------------------------------------------------
-----------------------------------------------------
R2: 0.9969790497357569
MSE: 1.1280351413637908
MAE: 0.22154867483060048
RMSE: 1.0620899874133976
-----------------------------------------------------


In [54]:
rfr = RandomForestRegressor(max_depth=20, n_estimators=4000, random_state=42)
rfr.fit(X_train, y_train)
y_pred = evaluate(rfr, on="test")
y_pred_t = evaluate(rfr, on="train")

-----------------------------------------------------
R2: 0.7897493656146988
MSE: 73.97209458422282
MAE: 5.3898286764214305
RMSE: 8.60070314475641
-----------------------------------------------------
-----------------------------------------------------
R2: 0.9496040834761641
MSE: 18.81804063211415
MAE: 2.898149001973174
RMSE: 4.33797655965476
-----------------------------------------------------


In [52]:
rfr = RandomForestRegressor(max_depth=10, n_estimators=3000, random_state=42)
rfr.fit(X_train, y_train)
y_pred = evaluate(rfr, on="test")
y_pred_t = evaluate(rfr, on="train")

-----------------------------------------------------
R2: 0.5923970749471805
MSE: 143.40618858518383
MAE: 9.001710172436413
RMSE: 11.975232297754555
-----------------------------------------------------
-----------------------------------------------------
R2: 0.6483821554259253
MSE: 131.29553627706684
MAE: 8.650465239083692
RMSE: 11.458426431105924
-----------------------------------------------------


In [35]:
svr = SVR(kernel="rbf", C=1e3, gamma=0.2)
svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)
print("svr On Test Set:")
print("R2:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", mean_squared_error(y_test, y_pred)**0.5)
print("\n")
print("svr On Train Set:")
y_pred_t = svr.predict(X_train)
print("R2:", r2_score(y_train, y_pred_t))
print("MAE:", mean_absolute_error(y_train, y_pred_t))
print("RMSE:", mean_squared_error(y_train, y_pred_t)**0.5)

svr On Test Set:
R2: 0.8587907093645688
MAE: 137.74941703270449
RMSE: 255.50752140688238


svr On Train Set:
R2: 0.8514367193952317
MAE: 134.092849309882
RMSE: 247.25451472893542


Let's do a grid search for the xgbr.

In [17]:
from sklearn.model_selection import GridSearchCV
params = {
    "n_estimators": [1500, 2000, 3000],
    "max_depth": [2, 4, 8],
    "learning_rate": [0.1,0.2, 0.3],
    "booster": ["gbtree"],
}

base_model = XGBRegressor()
grid_search = GridSearchCV(base_model, params, cv=3, n_jobs=-1, verbose=10)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV 1/3; 1/27] START booster=gbtree, learning_rate=0.1, max_depth=2, n_estimators=1500[CV 3/3; 1/27] START booster=gbtree, learning_rate=0.1, max_depth=2, n_estimators=1500

[CV 2/3; 3/27] START booster=gbtree, learning_rate=0.1, max_depth=2, n_estimators=3000
[CV 3/3; 2/27] START booster=gbtree, learning_rate=0.1, max_depth=2, n_estimators=2000
[CV 2/3; 2/27] START booster=gbtree, learning_rate=0.1, max_depth=2, n_estimators=2000
[CV 1/3; 3/27] START booster=gbtree, learning_rate=0.1, max_depth=2, n_estimators=3000
[CV 2/3; 1/27] START booster=gbtree, learning_rate=0.1, max_depth=2, n_estimators=1500
[CV 1/3; 2/27] START booster=gbtree, learning_rate=0.1, max_depth=2, n_estimators=2000
[CV 2/3; 1/27] END booster=gbtree, learning_rate=0.1, max_depth=2, n_estimators=1500;, score=0.866 total time=  32.0s
[CV 3/3; 3/27] START booster=gbtree, learning_rate=0.1, max_depth=2, n_estimators=3000
[CV 3/3; 1/27] END booster=gbtree, lea

GridSearchCV(cv=3,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, predictor=None,
                                    random_state=None, reg_alpha=None,
                                    reg_lambda=None, scale_pos_weight=None,
       

In [18]:
grid_search.best_score_

0.8895123915370542

In [38]:
xgbr = XGBRegressor(learning_rate=0.3, max_depth=4, n_estimators=1000, booster="gbtree")
xgbr.fit(X_train, y_train)
y_pred = xgbr.predict(X_test)
print("XGBR On Test Set:")
print("R2:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", mean_squared_error(y_test, y_pred)**0.5)
print("\n")
print("XGBR On Train Set:")
y_pred_t = xgbr.predict(X_train)
print("R2:", r2_score(y_train, y_pred_t))
print("MAE:", mean_absolute_error(y_train, y_pred_t))
print("RMSE:", mean_squared_error(y_train, y_pred_t)**0.5)

XGBR On Test Set:
R2: 0.9094067675501246
MAE: 97.86719559986578
RMSE: 204.65385576197573


XGBR On Train Set:
R2: 0.991287236873442
MAE: 37.40795654774622
RMSE: 59.877883827092134


In [21]:
import numpy as np
y_pred = np.abs(xgbr.predict(X_test))
print("XGBR On Test Set:")
print("R2:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("\n")
print("XGBR On Train Set:")
y_pred_t = np.abs(xgbr.predict(X_train))
print("R2:", r2_score(y_train, y_pred_t))
print("MAE:", mean_absolute_error(y_train, y_pred_t))
print("MSE:", mean_squared_error(y_train, y_pred_t))

XGBR On Test Set:
R2: 0.9094067675501246
MAE: 97.86719559986578
MSE: 41883.20067824357


XGBR On Train Set:
R2: 0.991287236873442
MAE: 37.40795654774622
MSE: 3585.360971610742


In [56]:
preds = rfr.predict(test_final)
# preds = np.abs(preds)

In [58]:
sample = pd.read_csv('data/Sample__submission.csv')

In [59]:
sample.head()

Unnamed: 0,id,price1
0,19841,193
1,19157,340
2,19445,637
3,8046,679
4,4571,190


In [60]:
act_price = test["ActingPrice"]
act_price

0        999
1        499
2        999
3       2999
4        999
        ... 
5239     699
5240    1993
5241     999
5242     499
5243     749
Name: ActingPrice, Length: 5244, dtype: int64

In [61]:
sub = pd.DataFrame({"id": test_id, "price2": preds})
sub

Unnamed: 0,id,price2
0,2242,57.936235
1,20532,43.638880
2,10648,55.079625
3,20677,60.798651
4,12593,60.309087
...,...,...
5239,14033,46.558344
5240,297,56.468695
5241,18733,59.660029
5242,6162,51.778841


In [62]:
sub["price2"] = (act_price-sub["price2"]*act_price/100).astype(int)
sub.head()

Unnamed: 0,id,price2
0,2242,420
1,20532,281
2,10648,448
3,20677,1175
4,12593,396


In [63]:
sub = pd.merge(sample, sub, on="id")
sub.drop("price1", axis=1, inplace=True)
sub.columns = ["id", "price1"]
sub.head()

Unnamed: 0,id,price1
0,19841,793
1,19157,637
2,19445,200
3,8046,422
4,4571,1380


In [64]:
sub.to_csv("data/submission_using_offer.csv", index=False)

In [65]:
!kaggle competitions submit -c sa2022 -f data/submission_using_offer.csv -m ""

Successfully submitted to Final Capstone Project


  0%|          | 0.00/54.4k [00:00<?, ?B/s]
 15%|█▍        | 8.00k/54.4k [00:00<00:00, 67.6kB/s]
100%|██████████| 54.4k/54.4k [00:05<00:00, 9.68kB/s]



