In [1]:
import pandas as pd


In [29]:
df = pd.read_csv('Ecommerce_data.csv')
print(df.shape)
df.head()

(24000, 2)


Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [31]:
df.dropna(inplace = True)
df.shape

(24000, 2)

In [33]:
df.label.replace("Clothing & Accessories", "Clothing_Accesories", inplace = True)
df.label.unique()

array(['Household', 'Electronics', 'Clothing_Accesories', 'Books'],
      dtype=object)

In [35]:
df['label'] = "__label__" + df['label'].astype(str) # For fasttext I need to add prefix
df.head() 

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,__label__Household
1,"Contrast living Wooden Decorative Box,Painted ...",__label__Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,__label__Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,__label__Clothing_Accesories
4,Indira Designer Women's Art Mysore Silk Saree ...,__label__Clothing_Accesories


In [48]:
df['label_description'] = df['label'] + " " + df['Text'] # Now i merge two columns into one
df.head()

Unnamed: 0,Text,label,label_description
0,Urban Ladder Eisner Low Back Study-Office Comp...,__label__Household,__label__Household Urban Ladder Eisner Low Bac...
1,"Contrast living Wooden Decorative Box,Painted ...",__label__Household,__label__Household Contrast living Wooden Deco...
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,__label__Electronics,__label__Electronics IO Crest SY-PCI40010 PCI ...
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,__label__Clothing_Accesories,__label__Clothing_Accesories ISAKAA Baby Socks...
4,Indira Designer Women's Art Mysore Silk Saree ...,__label__Clothing_Accesories,__label__Clothing_Accesories Indira Designer W...


In [44]:
import re

def preprocess(text):
  text = re.sub(r'[^\w\s\']', ' ', text) # Now i do preprocessing using regular expression, so i clean text from singns and no words
  text = re.sub(r' +', ' ', text) # I also clear white spaces
  return text.strip().lower() # remove spaces and make words lower

In [49]:
preprocess("  VIKI's | Bookcase/Bookshelf (3-Shelf/Shelve, White) | ? . hi")

"viki's bookcase bookshelf 3 shelf shelve white hi"

In [50]:
df['label_description'] = df['label_description'].map(preprocess)

In [52]:
df.head()

Unnamed: 0,Text,label,label_description
0,Urban Ladder Eisner Low Back Study-Office Comp...,__label__Household,__label__household urban ladder eisner low bac...
1,"Contrast living Wooden Decorative Box,Painted ...",__label__Household,__label__household contrast living wooden deco...
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,__label__Electronics,__label__electronics io crest sy pci40010 pci ...
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,__label__Clothing_Accesories,__label__clothing_accesories isakaa baby socks...
4,Indira Designer Women's Art Mysore Silk Saree ...,__label__Clothing_Accesories,__label__clothing_accesories indira designer w...


In [53]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size = 0.2, random_state = 0)

In [54]:
train.shape, test.shape

((19200, 3), (4800, 3))

In [55]:
train.head()

Unnamed: 0,Text,label,label_description
15613,Yellow Chimes Circles of Love Stylish Crystal ...,__label__Household,__label__household yellow chimes circles of lo...
17719,Borosil Glass Aarti Diya (Golden) Make Borosil...,__label__Household,__label__household borosil glass aarti diya go...
16235,The Alchemist Pocket Edition Review ‘His books...,__label__Books,__label__books the alchemist pocket edition re...
467,DALUCI 20Pcs Flash Speedlite Color Gel Filters...,__label__Electronics,__label__electronics daluci 20pcs flash speedl...
21093,Enraciner Stylish Mini Clip MP3 Player with Mi...,__label__Electronics,__label__electronics enraciner stylish mini cl...


In [56]:
train.to_csv("ecommerce.train", columns = ["label_description"], index = False, header = False)
test.to_csv("ecommerce.test", columns = ["label_description"], index = False, header = False)

In [57]:
import fasttext

model = fasttext.train_supervised(input = "ecommerce.train") # Now I gonna train my model for classification
model.test("ecommerce.test")

(4797, 0.9672712111736502, 0.9672712111736502)

In [58]:
model.predict("wintech assemble desktop pc cpu 500 gb sata hdd 4 gb ram intel c2d processor 3")

(('__label__electronics',), array([0.99785078]))

In [66]:
model.predict("think and grow rich deluxe edition")

(('__label__books',), array([0.99999797]))

In [59]:
model.predict("ockey men's cotton t shirt fabric details 80 cotton 20 polyester super combed cotton rich fabric")

(('__label__clothing_accesories',), array([1.00000989]))

In [70]:
model.get_nearest_neighbors("processor")

[(0.9952085018157959, '415'),
 (0.9952083826065063, 'z6z13a'),
 (0.9951889514923096, 'd20'),
 (0.9951818585395813, 'insist'),
 (0.9951778650283813, 'ttr'),
 (0.9951778650283813, 'incode'),
 (0.9951741099357605, 'onetm'),
 (0.9951741099357605, 'convient'),
 (0.9951741099357605, 'maximized'),
 (0.9951543211936951, 'crackers')]

In [78]:
model.get_nearest_neighbors("cotton")

[(0.9996951818466187, 'suave'),
 (0.9995565414428711, 'pencils'),
 (0.9991541504859924, 'streets'),
 (0.9987474679946899, 'jute'),
 (0.998738169670105, 'preferable'),
 (0.9983108043670654, 'lxw'),
 (0.9977372884750366, 'gathers'),
 (0.9970248341560364, 'hurry'),
 (0.9966501593589783, 'throwing'),
 (0.9964810609817505, 'confines')]