In [59]:
import pandas as pd
from category_encoders import TargetEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder


In [60]:
# Read the cleaned listings CSV into a DataFrame.
data_path = "./data/lot51_cleaned.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first five rows (the default for head()) as a quick sanity check.
df.head()

Unnamed: 0,name,price,brand,item_condition,shipping,located_in,return_policy,category,money_back,seller_item_sold,trending,price_boxcox,price_log
0,Mens Jogging Bottoms Elasticated Trouser Jogge...,14.48,Crazy Girl Ltd,New with tags,1.0,United Kingdom,1,"Clothing, Shoes & Accessories/Men/Men's Clothi...",1,203000.0,1,3.447473,2.739549
1,Men's 100% Cotton Big Polka Dot Design Spread...,29.19,George/Fortino Landi,New with tags,1.0,United States,1,"Clothing, Shoes & Accessories/Men/Men's Clothi...",1,49000.0,1,4.667444,3.407511
2,G-Unit Style heavy weigh Tank Top Square Cut ...,15.89,Basix,New with tags,1.0,United States,1,"Clothing, Shoes & Accessories/Men/Men's Clothi...",1,207000.0,1,3.600318,2.826722
3,Men's Fashion Oxford Faux Leather Dress Shoes...,33.23,Milano Moda & Alberto Fellini,New with box,1.0,United States,1,"Clothing, Shoes & Accessories/Men/Men's Shoes/...",1,49000.0,1,4.910672,3.533102
4,Women's Cute Caged Peep Toe Low High Platform ...,15.81,Top Moda,New without box,1.0,United States,1,"Clothing, Shoes & Accessories/Women/Women's Sh...",1,326000.0,0,3.591949,2.821974


In [61]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

(31170, 13)

In [64]:
# Replace the free-text `name` column with a cluster id: titles are turned
# into TF-IDF vectors (English stop words removed) and grouped into 50
# clusters with k-means.
#
# NOTE: this cell overwrites `name` with integers, so it is not re-runnable
# on its own -- reload `df` from the CSV before executing it a second time,
# otherwise the vectorizer would be handed cluster ids instead of titles.
tfid = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfid.fit_transform(df['name'])

# n_init is pinned explicitly. scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so leaving it implicit makes the cluster assignment
# depend on the installed library version even though random_state is fixed.
k_means = KMeans(n_clusters=50, n_init=10, random_state=0)

# fit_predict fits the model and returns the same labels as `labels_`.
df['name'] = k_means.fit_predict(tfidf_matrix)
df.head()



Unnamed: 0,name,price,brand,item_condition,shipping,located_in,return_policy,category,money_back,seller_item_sold,trending,price_boxcox,price_log
0,11,14.48,Crazy Girl Ltd,New with tags,1.0,United Kingdom,1,"Clothing, Shoes & Accessories/Men/Men's Clothi...",1,203000.0,1,3.447473,2.739549
1,16,29.19,George/Fortino Landi,New with tags,1.0,United States,1,"Clothing, Shoes & Accessories/Men/Men's Clothi...",1,49000.0,1,4.667444,3.407511
2,43,15.89,Basix,New with tags,1.0,United States,1,"Clothing, Shoes & Accessories/Men/Men's Clothi...",1,207000.0,1,3.600318,2.826722
3,36,33.23,Milano Moda & Alberto Fellini,New with box,1.0,United States,1,"Clothing, Shoes & Accessories/Men/Men's Shoes/...",1,49000.0,1,4.910672,3.533102
4,17,15.81,Top Moda,New without box,1.0,United States,1,"Clothing, Shoes & Accessories/Women/Women's Sh...",1,326000.0,0,3.591949,2.821974


### Label Encoder for category

In [65]:
# Map each distinct category path string to an integer id and store the ids
# back in the `category` column.
label_encoder = LabelEncoder()
category_ids = label_encoder.fit_transform(df['category'])
df['category'] = category_ids

df.head(5)

Unnamed: 0,name,price,brand,item_condition,shipping,located_in,return_policy,category,money_back,seller_item_sold,trending,price_boxcox,price_log
0,11,14.48,Crazy Girl Ltd,New with tags,1.0,United Kingdom,1,26,1,203000.0,1,3.447473,2.739549
1,16,29.19,George/Fortino Landi,New with tags,1.0,United States,1,73,1,49000.0,1,4.667444,3.407511
2,43,15.89,Basix,New with tags,1.0,United States,1,86,1,207000.0,1,3.600318,2.826722
3,36,33.23,Milano Moda & Alberto Fellini,New with box,1.0,United States,1,187,1,49000.0,1,4.910672,3.533102
4,17,15.81,Top Moda,New without box,1.0,United States,1,395,1,326000.0,0,3.591949,2.821974


### Target Encoder for item_condition, brand and located_in

In [66]:
# Target-encode the remaining text columns against the log-transformed price.
# One encoder instance is refit per column, so after this cell `encoder`
# holds the fit for the last column in the tuple.
#
# NOTE(review): the encoder is fit on every row of `df` using a price-derived
# target; if a train/test split happens later this may leak target
# information into the features -- confirm against the modelling notebook.
encoder = TargetEncoder()

for column in ('item_condition', 'brand', 'located_in'):
    encoder.fit(df[column], df['price_log'])
    df[column] = encoder.transform(df[column])

In [67]:
# List the column dtypes to check that the encoded columns are now numeric.
df.dtypes

name                  int32
price               float64
brand               float64
item_condition      float64
shipping            float64
located_in          float64
return_policy         int64
category              int64
money_back            int64
seller_item_sold    float64
trending              int64
price_boxcox        float64
price_log           float64
dtype: object

In [68]:
# Write the encoded frame to disk; index=False leaves the row index out of
# the file.
df.to_csv(path_or_buf="./data/lot51_vectorized.csv", index=False)