In [3]:
import pandas as pd

# The CSV ships without a header row, so the two column names are supplied here.
df = pd.read_csv(
    'ecommerce_dataset.csv',
    header=None,
    names=['category', 'description'],
)
print(df.shape)
df.head()

(50425, 2)


Unnamed: 0,category,description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [4]:
# Class distribution of the target column (rows per category).
df['category'].value_counts()

Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: category, dtype: int64

In [5]:
# Drop every row that has a missing value in any column.
# Rebinding the result (instead of inplace=True) is the idiomatic pandas form:
# it has no performance cost and keeps the step chainable.
df = df.dropna()

In [7]:
# Row/column count after dropping missing values (compare with the shape printed at load time).
df.shape

(50424, 2)

In [9]:
# Replace the one category name that contains spaces, because the label is later
# written as a single whitespace-delimited token in the fastText training file.
# Assign the result back to the column: calling replace(..., inplace=True) on
# `df.category` mutates an intermediate Series, which raises a FutureWarning on
# pandas 2.x and silently leaves `df` unchanged under Copy-on-Write.
df['category'] = df['category'].replace('Clothing & Accessories', 'Clothing_Accessories')
df['category'].unique()

array(['Household', 'Books', 'Clothing_Accessories', 'Electronics'],
      dtype=object)

In [10]:
# Add the '__label__' prefix that fastText uses by convention to mark labels.
# Any prefix already present is stripped first, so re-running this cell cannot
# produce '__label____label__...' values.
df['category'] = '__label__' + (
    df['category'].astype(str).str.replace(r'^(?:__label__)+', '', regex=True)
)
df.head()

Unnamed: 0,category,description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...


In [21]:
# Join label and description into one "<label> <text>" string per row.
df['category_description'] = df['category'].str.cat(df['description'], sep=' ')
df.head()

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__Household Paper Plane Design Framed W...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__Household SAF 'Floral' Framed Paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__Household SAF 'UV Textured Modern Art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__Household SAF Flower Print Framed Pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__Household Incredible Gifts India Wood...


In [22]:
# Show the merged text of the row whose index label is 0.
df.loc[0, 'category_description']

'__label__Household Paper Plane Design Framed Wall Hanging Motivational Office Decor Art Prints (8.7 X 8.7 inch) - Set of 4 Painting made up in synthetic frame with uv textured print which gives multi effects and attracts towards it. This is an special series of paintings which makes your wall very beautiful and gives a royal touch. This painting is ready to hang, you would be proud to possess this unique painting that is a niche apart. We use only the most modern and efficient printing technology on our prints, with only the and inks and precision epson, roland and hp printers. This innovative hd printing technique results in durable and spectacular looking prints of the highest that last a lifetime. We print solely with top-notch 100% inks, to achieve brilliant and true colours. Due to their high level of uv resistance, our prints retain their beautiful colours for many years. Add colour and style to your living space with this digitally printed painting. Some are for pleasure and so

In [23]:
#preprocessing

import re

text = "  VIKI's | Bookcase/Bookshelf (3-Shelf/Shleve, White) | ? . hi"

def preprocess(text):
    """Normalize a raw string into a single lowercase, single-spaced line.

    Steps:
      1. Every character that is not a word character, whitespace or an
         apostrophe is replaced by a space (underscores are word characters,
         so a leading ``__label__`` tag is preserved).
      2. Every run of whitespace -- spaces, tabs and line breaks -- is
         collapsed to one space, so the result never spans several lines.
      3. Leading/trailing whitespace is stripped and the text is lowercased.

    Args:
        text: the string to normalize.

    Returns:
        The normalized string.
    """
    # Punctuation and symbols become spaces so adjacent words are not glued together.
    text = re.sub(r"[^\w\s']", ' ', text)
    # Collapse *all* whitespace, not only spaces: an embedded tab or newline
    # would otherwise survive and split one example across lines on export.
    text = re.sub(r'\s+', ' ', text)
    # strip() removes both leading and trailing whitespace.
    return text.strip().lower()

In [24]:
# Sanity check of preprocess() on a sample string with punctuation, mixed case and extra spaces.
preprocess("  VIKI's | Bookcase/Bookshelf (3-Shelf/Shleve, White) | ? . hi")

"viki's bookcase bookshelf 3 shelf shleve white hi"

In [25]:
# Normalize every merged "<label> <text>" string; note that the label part is lowercased too.
df['category_description'] = df['category_description'].apply(preprocess)

In [26]:
# Inspect the first rows to confirm category_description now holds the normalized text.
df.head()

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__household paper plane design framed w...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__household saf 'floral' framed paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__household saf 'uv textured modern art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__household saf flower print framed pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__household incredible gifts india wood...


In [27]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so a fresh kernel run reproduces the same split
# (and therefore the same exported files and metrics). stratify keeps the
# category proportions equal in both partitions, since the classes are imbalanced.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['category'],
)

In [28]:
# Sizes of the two partitions produced by the split.
train.shape, test.shape

((40339, 3), (10085, 3))

In [29]:
# Peek at the training partition (rows keep their original index labels).
train.head()

Unnamed: 0,category,description,category_description
8944,__label__Household,Multifunction Bathroom Passage Bright Night Li...,__label__household multifunction bathroom pass...
6419,__label__Household,Home Desirica Polyester Blend Beautiful Curtai...,__label__household home desirica polyester ble...
35884,__label__Clothing_Accessories,Levi's Men's Casual Shirt Levis are the pures...,__label__clothing_accessories levi's men's cas...
35173,__label__Clothing_Accessories,DFY Men's Track Jacket DFY sweat - garmets spe...,__label__clothing_accessories dfy men's track ...
39697,__label__Clothing_Accessories,Scott Women's Premium Cotton Pullover Hoodie S...,__label__clothing_accessories scott women's pr...


In [31]:
# Peek at the held-out partition.
test.head()

Unnamed: 0,category,description,category_description
6505,__label__Household,Yellow Weaves™ Cotton Canvas Decorative Cushio...,__label__household yellow weaves cotton canvas...
40076,__label__Electronics,AmazonBasics USB Type-C to USB 3.1 Gen1 Female...,__label__electronics amazonbasics usb type c t...
13896,__label__Household,BNBrights Cast Iron Mini Electric Tandoor Comb...,__label__household bnbrights cast iron mini el...
15108,__label__Household,ANSIO Car Air Purifier HEPA Active Carbon Wash...,__label__household ansio car air purifier hepa...
15761,__label__Household,V-Guard Superflo Pedestal Fan with Timer (Yell...,__label__household v guard superflo pedestal f...


In [34]:
# Write one "<label> <text>" string per line, without index or header row,
# to the files that are later passed to fastText.
train[['category_description']].to_csv('ecommerce.train', index=False, header=False)
test[['category_description']].to_csv('ecommerce.test', index=False, header=False)

In [35]:
import fasttext

# Train a fastText classifier on the exported training file. train_supervised is the
# classification entry point; fastText's unsupervised mode is the one for word embeddings.
# Only the input path is passed, so every other setting is the library default.
model = fasttext.train_supervised(input = 'ecommerce.train')
# Evaluate on the held-out file. The recorded output is a 3-tuple whose first item equals
# the test-set size; presumably (sample count, precision@1, recall@1) -- confirm in the fastText docs.
model.test('ecommerce.test')

(10085, 0.9707486365889936, 0.9707486365889936)

In [36]:
# Spot-check with an electronics-style description. The input is already lowercase and
# punctuation-free, matching the normalization applied to the training text.
model.predict('wintech assemble desktop pc cpu 500 gb sata hdd 4 gb ram intel c2d processor')

(('__label__electronics',), array([0.98775059]))

In [37]:
# Spot-check with a short book title, again written in the normalized lowercase form.
model.predict('think and grow rich')

(('__label__books',), array([1.00000906]))