In [76]:
import pandas as pd
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt
import re
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder
import seaborn as sns
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from xgboost import XGBRegressor


In [77]:
data_path = "./data/lot31_statistical.csv"
df = pd.read_csv(data_path)
df.head(5)

Unnamed: 0,name,price,brand,item_condition,shipping,located_in,return_policy,category,money_back,seller_item_sold,trending,price_boxcox,price_log
0,Mens Jogging Bottoms Elasticated Trouser Jogge...,14.48,Crazy Girl Ltd,New with tags,(approx C $10.20),United Kingdom,14 days return . Buyer pays for return shippi...,"Clothing, Shoes & Accessories/Men/Men's Clothi...",Yes,203K,Yes/3,3.545886,2.739549
1,Men's 100% Cotton Big Polka Dot Design Spread...,29.19,George/Fortino Landi,New with tags,(approx C $7.30),United States,30 days return . Buyer pays for return shippi...,"Clothing, Shoes & Accessories/Men/Men's Clothi...",Yes,49K,Yes/56,4.83976,3.407511
2,G-Unit Style heavy weigh Tank Top Square Cut ...,15.89,Basix,New with tags,(approx C $25.88),United States,30 days return . Buyer pays for return shippi...,"Clothing, Shoes & Accessories/Men/Men's Clothi...",Yes,207K,Yes/5,3.707012,2.826722
3,Men's Fashion Oxford Faux Leather Dress Shoes...,33.23,Milano Moda & Alberto Fellini,New with box,(approx C $56.04),United States,30 days return . Buyer pays for return shippi...,"Clothing, Shoes & Accessories/Men/Men's Shoes/...",Yes,49K,Yes/715,5.099772,3.533102
4,Women's Cute Caged Peep Toe Low High Platform ...,15.81,Top Moda,New without box,(approx C $30.72),United States,30 days return . Buyer pays for return shippi...,"Clothing, Shoes & Accessories/Women/Women's Sh...",Yes,326K,No,3.698182,2.821974


In [78]:
df.shape

(15917, 13)

In [79]:
tfid = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfid.fit_transform(df['name'])
k_means = KMeans(n_clusters=50, random_state=0)
k_means.fit(tfidf_matrix)
df['name'] = k_means.labels_
df.head()



Unnamed: 0,name,price,brand,item_condition,shipping,located_in,return_policy,category,money_back,seller_item_sold,trending,price_boxcox,price_log
0,19,14.48,Crazy Girl Ltd,New with tags,(approx C $10.20),United Kingdom,14 days return . Buyer pays for return shippi...,"Clothing, Shoes & Accessories/Men/Men's Clothi...",Yes,203K,Yes/3,3.545886,2.739549
1,44,29.19,George/Fortino Landi,New with tags,(approx C $7.30),United States,30 days return . Buyer pays for return shippi...,"Clothing, Shoes & Accessories/Men/Men's Clothi...",Yes,49K,Yes/56,4.83976,3.407511
2,7,15.89,Basix,New with tags,(approx C $25.88),United States,30 days return . Buyer pays for return shippi...,"Clothing, Shoes & Accessories/Men/Men's Clothi...",Yes,207K,Yes/5,3.707012,2.826722
3,23,33.23,Milano Moda & Alberto Fellini,New with box,(approx C $56.04),United States,30 days return . Buyer pays for return shippi...,"Clothing, Shoes & Accessories/Men/Men's Shoes/...",Yes,49K,Yes/715,5.099772,3.533102
4,1,15.81,Top Moda,New without box,(approx C $30.72),United States,30 days return . Buyer pays for return shippi...,"Clothing, Shoes & Accessories/Women/Women's Sh...",Yes,326K,No,3.698182,2.821974


##### Binarizing return_policy, money_back, trending

In [80]:
df['return_policy'] = df['return_policy'].apply(lambda x: 0 if 'no' in x.lower() else 1)
df['money_back'] = df['money_back'].apply(lambda x: 1 if 'yes' in x.lower() else 0)
df['trending'] = df['trending'].apply(lambda x: 1 if 'yes' in x.lower() else 0)
df.head()

Unnamed: 0,name,price,brand,item_condition,shipping,located_in,return_policy,category,money_back,seller_item_sold,trending,price_boxcox,price_log
0,19,14.48,Crazy Girl Ltd,New with tags,(approx C $10.20),United Kingdom,1,"Clothing, Shoes & Accessories/Men/Men's Clothi...",1,203K,1,3.545886,2.739549
1,44,29.19,George/Fortino Landi,New with tags,(approx C $7.30),United States,1,"Clothing, Shoes & Accessories/Men/Men's Clothi...",1,49K,1,4.83976,3.407511
2,7,15.89,Basix,New with tags,(approx C $25.88),United States,1,"Clothing, Shoes & Accessories/Men/Men's Clothi...",1,207K,1,3.707012,2.826722
3,23,33.23,Milano Moda & Alberto Fellini,New with box,(approx C $56.04),United States,1,"Clothing, Shoes & Accessories/Men/Men's Shoes/...",1,49K,1,5.099772,3.533102
4,1,15.81,Top Moda,New without box,(approx C $30.72),United States,1,"Clothing, Shoes & Accessories/Women/Women's Sh...",1,326K,0,3.698182,2.821974


#### Dealing with Category

In [81]:
enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

df['category'] = enc.fit_transform(df[['category']]).astype(int)
kv_pair_ic = {category: i for i, category in enumerate(enc.categories_[0])}
print(kv_pair_ic)
print(len(enc.categories_[0]))
df.head()

{"Business & Industrial/Facility Maintenance & Safety/Personal Protective Equipment (PPE)/Protective Jackets/Clothing, Shoes & Accessories/Men/Men's Clothing/Men/Men's Clothing/Activewear/Hoodies & Sweatshirts": 0, "Business & Industrial/Facility Maintenance & Safety/Personal Protective Equipment (PPE)/Protective Jackets/Clothing, Shoes & Accessories/Men/Men's Clothing/Men/Men's Clothing/Shirts/Casual Button-Down Shirts": 1, "Business & Industrial/Facility Maintenance & Safety/Personal Protective Equipment (PPE)/Protective Pants/Clothing, Shoes & Accessories/Men/Men's Clothing/Pants": 2, "Business & Industrial/Facility Maintenance & Safety/Personal Protective Equipment (PPE)/Protective Suits & Coveralls/Clothing, Shoes & Accessories/Men/Men's Clothing/Pants": 3, 'Business & Industrial/Facility Maintenance & Safety/Personal Protective Equipment (PPE)/Work Boots & Shoes': 4, "Clothing, Shoes & Accessories/Kids/Boys/Boys' Shoes/Clothing, Shoes & Accessories/Women/Women's Shoes/Athletic Sh

Unnamed: 0,name,price,brand,item_condition,shipping,located_in,return_policy,category,money_back,seller_item_sold,trending,price_boxcox,price_log
0,19,14.48,Crazy Girl Ltd,New with tags,(approx C $10.20),United Kingdom,1,21,1,203K,1,3.545886,2.739549
1,44,29.19,George/Fortino Landi,New with tags,(approx C $7.30),United States,1,58,1,49K,1,4.83976,3.407511
2,7,15.89,Basix,New with tags,(approx C $25.88),United States,1,69,1,207K,1,3.707012,2.826722
3,23,33.23,Milano Moda & Alberto Fellini,New with box,(approx C $56.04),United States,1,159,1,49K,1,5.099772,3.533102
4,1,15.81,Top Moda,New without box,(approx C $30.72),United States,1,326,1,326K,0,3.698182,2.821974


#### Binarizing the shipping values

In [82]:
df.loc[~df['shipping'].astype(str).str.contains('\$'), 'shipping'] = 0
df.loc[df['shipping'].astype(str).str.contains('\$'), 'shipping'] = 1
df['shipping'] = df['shipping'].astype(float)
df.head()

Unnamed: 0,name,price,brand,item_condition,shipping,located_in,return_policy,category,money_back,seller_item_sold,trending,price_boxcox,price_log
0,19,14.48,Crazy Girl Ltd,New with tags,1.0,United Kingdom,1,21,1,203K,1,3.545886,2.739549
1,44,29.19,George/Fortino Landi,New with tags,1.0,United States,1,58,1,49K,1,4.83976,3.407511
2,7,15.89,Basix,New with tags,1.0,United States,1,69,1,207K,1,3.707012,2.826722
3,23,33.23,Milano Moda & Alberto Fellini,New with box,1.0,United States,1,159,1,49K,1,5.099772,3.533102
4,1,15.81,Top Moda,New without box,1.0,United States,1,326,1,326K,0,3.698182,2.821974


### Dealing with seller_item_sold

In [83]:
# For "seller_positive_feedback"
def to_numbers(short):
    if 'K' in short:
        return float(short.replace('K', '')) * 1000
    elif 'M' in short:
        return float(short.replace('M', '')) * 1000000
    else:
        return float(short)

df["seller_item_sold"] = df["seller_item_sold"].apply(to_numbers)

In [84]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

### Target Encoder for item_condition

In [85]:
encoder = TargetEncoder()
encoder.fit(train_df['item_condition'], train_df['price_log'])

train_df['item_condition'] = encoder.transform(train_df['item_condition'])
val_df['item_condition'] = encoder.transform(val_df['item_condition'])

In [86]:
train_df.head()

Unnamed: 0,name,price,brand,item_condition,shipping,located_in,return_policy,category,money_back,seller_item_sold,trending,price_boxcox,price_log
2879,7,22.15,Gymshark,3.347088,1.0,United Kingdom,0,49,1,9700.0,0,4.30838,3.141995
12167,23,23.92,Mossimo,4.175701,0.0,United States,1,308,1,1600.0,0,4.453427,3.215671
11129,7,46.45,Belle by Kim Gravel,3.232247,1.0,United States,1,209,1,12000.0,0,5.804033,3.859677
1665,2,59.83,Chaco,3.890444,1.0,United States,0,321,1,1900.0,0,6.368899,4.108083
12146,44,10.63,Gildan,3.347088,1.0,United States,0,66,1,3700.0,1,3.031218,2.453588


In [87]:
encoder.fit(train_df['located_in'], train_df['price_log'])

train_df['located_in'] = encoder.transform(train_df['located_in'])
val_df['located_in'] = encoder.transform(val_df['located_in'])
train_df.head()

Unnamed: 0,name,price,brand,item_condition,shipping,located_in,return_policy,category,money_back,seller_item_sold,trending,price_boxcox,price_log
2879,7,22.15,Gymshark,3.347088,1.0,3.285029,0,49,1,9700.0,0,4.30838,3.141995
12167,23,23.92,Mossimo,4.175701,0.0,3.909273,1,308,1,1600.0,0,4.453427,3.215671
11129,7,46.45,Belle by Kim Gravel,3.232247,1.0,3.909273,1,209,1,12000.0,0,5.804033,3.859677
1665,2,59.83,Chaco,3.890444,1.0,3.909273,0,321,1,1900.0,0,6.368899,4.108083
12146,44,10.63,Gildan,3.347088,1.0,3.909273,0,66,1,3700.0,1,3.031218,2.453588


In [88]:
encoder.fit(train_df['brand'], train_df['price_log'])

train_df['brand'] = encoder.transform(train_df['brand'])
val_df['brand'] = encoder.transform(val_df['brand'])
train_df.head()

Unnamed: 0,name,price,brand,item_condition,shipping,located_in,return_policy,category,money_back,seller_item_sold,trending,price_boxcox,price_log
2879,7,22.15,3.731692,3.347088,1.0,3.285029,0,49,1,9700.0,0,4.30838,3.141995
12167,23,23.92,3.757199,4.175701,0.0,3.909273,1,308,1,1600.0,0,4.453427,3.215671
11129,7,46.45,3.807702,3.232247,1.0,3.909273,1,209,1,12000.0,0,5.804033,3.859677
1665,2,59.83,3.69091,3.890444,1.0,3.909273,0,321,1,1900.0,0,6.368899,4.108083
12146,44,10.63,2.944644,3.347088,1.0,3.909273,0,66,1,3700.0,1,3.031218,2.453588


In [89]:
train_df.dtypes

name                  int32
price               float64
brand               float64
item_condition      float64
shipping            float64
located_in          float64
return_policy         int64
category              int64
money_back            int64
seller_item_sold    float64
trending              int64
price_boxcox        float64
price_log           float64
dtype: object

In [90]:
df.to_csv("./data/lot32_train.csv", index=False)

In [91]:
X = train_df.drop(['price'], axis=1)
y = train_df['price']

In [92]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=100, random_state=42)

# model2 = XGBRegressor(n_estimators=100, random_state=42)
model2 = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, subsample=0.8, random_state=42)
cv_scores1 = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
cv_scores2 = cross_val_score(model2, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print(cv_scores1, cv_scores2)

[-0.00186526 -0.00181041 -0.00175911 -0.00183137 -0.00129548] [-0.009476   -0.00831309 -0.00986023 -0.01131091 -0.00752522]


In [93]:
cv_xg_rmse = np.sqrt(-cv_scores1)
print(f'XGBOOST CV RMSE: {cv_xg_rmse.mean()} ± {cv_xg_rmse.std()}')

XGBOOST CV RMSE: 0.04129329551316421 ± 0.0026809779857223676


In [94]:
cv_rf_rmse = np.sqrt(-cv_scores2)
print(f'XGBOOST CV RMSE: {cv_rf_rmse.mean()} ± {cv_rf_rmse.std()}')

XGBOOST CV RMSE: 0.09618407374076687 ± 0.00676118915090328


In [95]:
model.fit(X_train, y_train)

In [96]:
model2.fit(X_train, y_train)

In [97]:
y_pred = model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Test RMSE: {test_rmse}')
print("RANDOM_FOREST_ACCURACY: ", (100 * (1 - test_rmse)))

Test RMSE: 0.036297769197085966
RANDOM_FOREST_ACCURACY:  96.37022308029141


In [98]:
y_pred = model2.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Test RMSE: {test_rmse}')
print("XGBOOST_ACCURACY: ", (100 * (1 - test_rmse)))

Test RMSE: 0.07695422818153434
XGBOOST_ACCURACY:  92.30457718184657


In [99]:
from sklearn.linear_model import LinearRegression

m = LinearRegression()
m.fit(X_train, y_train)
y_pred = m.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("LIN_ACCURACY: ", (100 * (1 - rmse)))

print("Test RMSE:", rmse)


LIN_ACCURACY:  -205.31080187630843
Test RMSE: 3.0531080187630844


In [100]:
SMOT, instead of SMOT We can use variational auto encoders, can it be replacement of GANS or Is GAN USING VAE??

Object `VAE` not found.
