In [314]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from category_encoders import TargetEncoder
import numpy as np
import pandas as pd


In [315]:
df = pd.read_csv("./data/data1.csv")
print(df["price"].max())
df.head()

51021.85


Unnamed: 0,name,price,item_condition,shipping,located_in,return_policy,category,money_back,seller_item_sold,trending
0,"Nike Dunk Low Retro ""White Black Panda"" Shoes ...",133.1,1,1,30,0,"Clothing, Shoes & Accessories/Men/Men's Shoes/...",1,32000,1
1,Rolex Daytona Cosmograph Auto 40mm Yellow Gold...,51021.85,7,1,30,1,"Clothing, Shoes & Accessories/Jewelry & Watche...",1,17000,0
2,Women's Casual Comfort Mid Calf Knee High Roun...,35.33,4,0,30,0,"Clothing, Shoes & Accessories/Women/Women's Sh...",1,326000,1
3,G-Unit Style heavy weigh Tank Top Square Cut ...,16.03,3,0,30,0,"Clothing, Shoes & Accessories/Men/Men's Clothi...",1,207000,1
4,Men's 100% Cotton Big Polka Dot Design Spread...,29.44,3,1,30,1,"Clothing, Shoes & Accessories/Men/Men's Clothi...",1,48000,1


In [316]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

tfid = TfidfVectorizer(stop_words='english')
X = tfid.fit_transform(df['name'])
k_means = KMeans(n_clusters=10, random_state=0)
k_means.fit(X)
df['name'] = k_means.labels_
df.head()



Unnamed: 0,name,price,item_condition,shipping,located_in,return_policy,category,money_back,seller_item_sold,trending
0,4,133.1,1,1,30,0,"Clothing, Shoes & Accessories/Men/Men's Shoes/...",1,32000,1
1,4,51021.85,7,1,30,1,"Clothing, Shoes & Accessories/Jewelry & Watche...",1,17000,0
2,6,35.33,4,0,30,0,"Clothing, Shoes & Accessories/Women/Women's Sh...",1,326000,1
3,4,16.03,3,0,30,0,"Clothing, Shoes & Accessories/Men/Men's Clothi...",1,207000,1
4,1,29.44,3,1,30,1,"Clothing, Shoes & Accessories/Men/Men's Clothi...",1,48000,1


In [317]:
df.to_csv("./data/data_tpot.csv", index=False)
X = df.drop(['price'], axis=1)
y = df['price']

In [318]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [319]:
enc = TargetEncoder(cols=['category'])
X_train = enc.fit_transform(X_train, y_train)
X_test = enc.transform(X_test)
X_train

Unnamed: 0,name,item_condition,shipping,located_in,return_policy,category,money_back,seller_item_sold,trending
5070,4,3,0,11,1,56.384293,1,70,0
1374,4,4,1,30,1,73.150728,1,3200,0
1720,4,1,1,30,1,57.739749,1,45,0
9085,1,3,0,30,1,43.729098,1,152000,1
0,4,1,1,30,0,141.070639,1,32000,1
...,...,...,...,...,...,...,...,...,...
5734,0,1,1,46,1,74.659410,1,139000,1
5191,4,4,0,4,1,65.634130,1,51000,0
5390,4,1,0,30,1,73.150728,1,4000,1
860,4,4,1,30,1,57.739749,1,16000,1


In [320]:
X_train.head()

Unnamed: 0,name,item_condition,shipping,located_in,return_policy,category,money_back,seller_item_sold,trending
5070,4,3,0,11,1,56.384293,1,70,0
1374,4,4,1,30,1,73.150728,1,3200,0
1720,4,1,1,30,1,57.739749,1,45,0
9085,1,3,0,30,1,43.729098,1,152000,1
0,4,1,1,30,0,141.070639,1,32000,1


In [321]:
# model = RandomForestRegressor(n_estimators=100, random_state=42)
from xgboost import XGBRegressor

# model = XGBRegressor(n_estimators=100, random_state=42)
model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, subsample=0.8, random_state=42)



In [322]:
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
cv_scores

array([  -15836.84712474,    -6735.56627051,    -7880.67369057,
       -1730377.56886907,    -7735.17956936])

In [323]:
cv_rmse = np.sqrt(-cv_scores)
print(f'CV RMSE: {cv_rmse.mean()} ± {cv_rmse.std()}')


CV RMSE: 340.01524512845737 ± 487.957785248974


In [324]:
model.fit(X_train, y_train)


In [325]:
y_pred = model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Test RMSE: {test_rmse}')


Test RMSE: 151.83036010277334
