In [56]:
import pandas as pd

df = pd.read_csv('ecommerce_dataset.csv',names = ['category','description'],header=None)
print(df.shape)
df.head()

(50425, 2)


Unnamed: 0,category,description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [57]:
df.category.value_counts()

category
Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: count, dtype: int64

In [58]:
df.isnull().sum()

category       0
description    1
dtype: int64

In [59]:
df.dropna(inplace = True)

In [60]:
df.isnull().sum()

category       0
description    0
dtype: int64

In [61]:
#replace the name of the column e.eg. remove spaces and put underscore BUT why????
# in fastext we need to specify label in a specific format
df.category.replace('Clothing & Accessories','Clothing_Accessories',inplace = True)
df.category.unique()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.category.replace('Clothing & Accessories','Clothing_Accessories',inplace = True)


array(['Household', 'Books', 'Clothing_Accessories', 'Electronics'],
      dtype=object)

In [62]:
# the format 
df['category'] = "__label__" + df['category'].astype(str)
df.head()

Unnamed: 0,category,description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...


In [63]:
df['category_description'] = df['category'] + ' ' +df['description']
df.head()

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__Household Paper Plane Design Framed W...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__Household SAF 'Floral' Framed Paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__Household SAF 'UV Textured Modern Art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__Household SAF Flower Print Framed Pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__Household Incredible Gifts India Wood...


In [64]:
import re

text = "  VIKI's | Bookcase/Bookshelf (3-Shelf/Shelve, White) | ? . hi"
text = re.sub(r'[^\w\s\']',' ',text).lower()
text = re.sub(r'\s+',' ',text)
text

" viki's bookcase bookshelf 3 shelf shelve white hi"

In [65]:
def preprocess(text):
    text = re.sub(r'[^\w\s\']',' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip().lower() 

In [66]:
df['category_description'] = df['category_description'].map(preprocess)
df.head()

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__household paper plane design framed w...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__household saf 'floral' framed paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__household saf 'uv textured modern art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__household saf flower print framed pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__household incredible gifts india wood...


In [67]:
from sklearn.model_selection import train_test_split

In [68]:
train,test= train_test_split(df,test_size=0.2)

In [69]:
train.shape , test.shape

((40339, 3), (10085, 3))

In [70]:
train.to_csv("ecommerce.train" , columns = ['category_description'] , index = False , header = None )
test.to_csv("ecommerce.test" , columns = ['category_description'] , index = False , header = None )

In [71]:
import fasttext

model = fasttext.train_supervised(input="ecommerce.train")

In [72]:
model.test('ecommerce.test')

(10083, 0.9698502429832391, 0.9698502429832391)

In [73]:
model.predict('ockey men\'s cotton t shirt fabric details 80 cotton 20 polyester super combed cotton rich fabric')

(('__label__clothing_accessories',), array([1.00001001]))

In [74]:
model.predict("think and grow rich deluxe edition")


(('__label__books',), array([1.00000989]))

In [75]:
dir(model)

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_labels',
 '_words',
 'bucket',
 'dim',
 'epoch',
 'f',
 'get_analogies',
 'get_dimension',
 'get_input_matrix',
 'get_input_vector',
 'get_label_id',
 'get_labels',
 'get_line',
 'get_meter',
 'get_nearest_neighbors',
 'get_output_matrix',
 'get_sentence_vector',
 'get_subword_id',
 'get_subwords',
 'get_word_id',
 'get_word_vector',
 'get_words',
 'is_quantized',
 'label',
 'labels',
 'loss',
 'lr',
 'lrUpdateRate',
 'maxn',
 'minCount',
 'minCountLabel',
 'minn',
 'neg',
 'predict',
 'pretrainedVectors',
 'quantize',
 'save_model',
 'set_args',
 'set_matrices',
 't',
 

In [76]:
model.get_nearest_neighbors("painting")


[(0.9983071684837341, 'vacuum'),
 (0.9967771172523499, 'temperature'),
 (0.9967284798622131, 'microwave'),
 (0.9964706301689148, 'furniture'),
 (0.9963348507881165, 'cake'),
 (0.9963142275810242, 'rack'),
 (0.9963034391403198, 'alarm'),
 (0.9957993030548096, 'usha'),
 (0.9957658648490906, 'cooking'),
 (0.9956832528114319, 'extended')]