In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor

In [13]:
train = pd.read_csv("data/train_prepared.csv")
test = pd.read_csv("data/test_prepared.csv")

In [14]:
train.dtypes

id              int64
Rating        float64
maincateg      object
platform       object
price1          int64
actprice1       int64
norating1     float64
noreviews1    float64
star_5f       float64
star_4f       float64
star_3f       float64
star_2f         int64
star_1f         int64
fulfilled1      int64
combo            bool
category       object
dtype: object

In [4]:
train.columns

Index(['id', 'Rating', 'maincateg', 'platform', 'price1', 'actprice1',
       'norating1', 'noreviews1', 'star_5f', 'star_4f', 'star_3f', 'star_2f',
       'star_1f', 'fulfilled1', 'combo', 'category'],
      dtype='object')

In [15]:
train.drop("id", axis=1, inplace=True)
test_id = test.id
test.drop("id", axis=1, inplace=True)

X = train.drop(["price1"], axis=1)
y = train["price1"]
assert X.shape[1] == test.shape[1], "X and test have different number of columns"

In [16]:
columns_to_one_hot = ["maincateg", "category", "platform", "combo"]
columns_to_standardize = ["actprice1", 'norating1', 'noreviews1', 'star_5f', 
    'star_4f', 'star_3f', 'star_2f', 'star_1f',]

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
label_encoder = LabelEncoder()
one_hot_encoder = OneHotEncoder(sparse=False)
standardizer = StandardScaler()

preprocessor = ColumnTransformer(
    [
        ("one_hot_encoder", one_hot_encoder, columns_to_one_hot),
        ("standardizer", standardizer, columns_to_standardize)],
    remainder="passthrough"
)

In [19]:
preprocessor.fit(X_train)
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)
test = preprocessor.fit_transform(test)

In [22]:
assert X_train.shape[1] == test.shape[1], "X and test have different number of columns"

# Models

## Base Model

In [24]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print("Linear Regression On Test Set:")
print("R2:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("\n")
print("Linear Regression On Train Set:")
y_pred_t = lr.predict(X_train)
print("R2:", r2_score(y_train, y_pred_t))
print("MAE:", mean_absolute_error(y_train, y_pred_t))
print("MSE:", mean_squared_error(y_train, y_pred_t))

Linear Regression On Test Set:
R2: 0.8221663908651199
MAE: 180.54676573426573
MSE: 82216.3039921726


Linear Regression On Train Set:
R2: 0.804004965782383
MAE: 181.19131436745073
MSE: 80653.2825587055


In [26]:
xgbr = XGBRegressor(n_estimators=1000, max_depth=4, learning_rate=0.1)
xgbr.fit(X_train, y_train)
y_pred = xgbr.predict(X_test)
print("XGBR On Test Set:")
print("R2:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("\n")
print("XGBR On Train Set:")
y_pred_t = xgbr.predict(X_train)
print("R2:", r2_score(y_train, y_pred_t))
print("MAE:", mean_absolute_error(y_train, y_pred_t))
print("MSE:", mean_squared_error(y_train, y_pred_t))

XGBR On Test Set:
R2: 0.9085747914684733
MAE: 110.39584102012212
MSE: 42267.84112262313


XGBR On Train Set:
R2: 0.9687509104324868
MAE: 75.77798978644272
MSE: 12859.211768562198


In [29]:
rfr = RandomForestRegressor(n_estimators=1000, max_depth=8)
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)
print("rfr On Test Set:")
print("R2:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("\n")
print("rfr On Train Set:")
y_pred_t = rfr.predict(X_train)
print("R2:", r2_score(y_train, y_pred_t))
print("MAE:", mean_absolute_error(y_train, y_pred_t))
print("MSE:", mean_squared_error(y_train, y_pred_t))

rfr On Test Set:
R2: 0.8843525531555992
MAE: 140.4658951892472
MSE: 53466.30309047111


rfr On Train Set:
R2: 0.9036228966954832
MAE: 130.1663404402116
MSE: 39659.830036194115


In [30]:
svr = SVR()
svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)
print("svr On Test Set:")
print("R2:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("\n")
print("svr On Train Set:")
y_pred_t = svr.predict(X_train)
print("R2:", r2_score(y_train, y_pred_t))
print("MAE:", mean_absolute_error(y_train, y_pred_t))
print("MSE:", mean_squared_error(y_train, y_pred_t))

svr On Test Set:
R2: 0.41497833269179396
MAE: 257.3077333123637
MSE: 270468.10485040746


svr On Train Set:
R2: 0.4266329229337835
MAE: 244.90816648362159
MSE: 235944.43125092195


Let's do a grid search for the xgbr.

In [31]:
from sklearn.model_selection import GridSearchCV
params = {
    "n_estimators": [500, 1000, 2000],
    "max_depth": [2, 4, 8],
    "learning_rate": [0.01, 0.1, 0.3],
    "booster": ["gbtree", "gblinear", "dart"],
}

base_model = XGBRegressor()
grid_search = GridSearchCV(base_model, params, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
