### Text Classification Using FastText

Install fasttext

In [13]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━[0m [32m41.0/73.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313499 sha256=d7cc4

#### Prepare Data

In [1]:
import pandas as pd

df= pd.read_csv("Datasets/ecommerce_dataset.csv", names=["category", "description"], header=None)

print(df.shape)
df.head()

(50425, 2)


Unnamed: 0,category,description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [2]:
df.category.value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
Household,19313
Books,11820
Electronics,10621
Clothing & Accessories,8671


In [3]:
df.dropna(inplace=True)
df.shape

(50424, 2)

In [4]:
df.category.replace("Clothing & Accessories", "Clothing_Accessories", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.category.replace("Clothing & Accessories", "Clothing_Accessories", inplace=True)


In [5]:
df.category.unique()

array(['Household', 'Books', 'Clothing_Accessories', 'Electronics'],
      dtype=object)

Add __label__ as prefix for text classification with fastText

In [6]:
df['category'] = '__label__' + df['category'].astype(str)

In [7]:
df.head()

Unnamed: 0,category,description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...


In [8]:
df['category_description'] = df['category'] + ' ' + df['description']

In [9]:
df.head()

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__Household Paper Plane Design Framed W...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__Household SAF 'Floral' Framed Paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__Household SAF 'UV Textured Modern Art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__Household SAF Flower Print Framed Pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__Household Incredible Gifts India Wood...


#### Preprocessing data

\s = whitespace

^\w = not a word

\ ' = exclude '

' +' = 1 or more of ' '

In [10]:
import re

text = "  VIKI's | Bookcase/Bookshelf (3-Shelf/Shelve, White) | ? . hi"
text = re.sub(r'[^\w\s\']',' ', text) # remove unnecessary symbol/punctuation
text = re.sub(' +', ' ', text) # remove extra white space
text.strip().lower()

"viki's bookcase bookshelf 3 shelf shelve white hi"

preprocess function

In [11]:
def preprocess(text):
    text = re.sub(r'[^\w\s\']',' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip().lower()

In [12]:
df['category_description'] = df['category_description'].map(preprocess)
df.head()

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__household paper plane design framed w...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__household saf 'floral' framed paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__household saf 'uv textured modern art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__household saf flower print framed pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__household incredible gifts india wood...


#### Train Test Split

In [13]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2)

In [14]:
train.shape, test.shape

((40339, 3), (10085, 3))

In [15]:
train.to_csv("Datasets/ecommerce.train.csv", columns=["category_description"], index=False, header=False)
test.to_csv("Datasets/ecommerce.test.csv", columns=["category_description"], index=False, header=False)

#### Train model

In [16]:
import fasttext

model = fasttext.train_supervised(input="Datasets/ecommerce.train.csv")
model.test("Datasets/ecommerce.test.csv")

(10085, 0.9679722359940506, 0.9679722359940506)

(size of test sample, precision, recall)

10085 = size of test sample

0.9706494794248884 = precision

0.9706494794248884 = recall

In [18]:
model.predict(["wintech assemble desktop pc cpu 500 gb sata hdd 4 gb ram intel c2d processor 3"])

([['__label__electronics']], [array([0.9979638], dtype=float32)])

In [19]:
model.predict(["ockey men's cotton t shirt fabric details 80 cotton 20 polyester super combed cotton rich fabric"])

([['__label__clothing_accessories']], [array([1.00001], dtype=float32)])

In [20]:
model.predict(["think and grow rich deluxe edition"])

([['__label__books']], [array([1.0000099], dtype=float32)])

In [27]:
model.get_nearest_neighbors("computer")

[(0.9983075857162476, 'headers'),
 (0.9982072710990906, 'shoved'),
 (0.9981251358985901, 'postcard'),
 (0.9980930685997009, 'expense'),
 (0.9979454278945923, 'hyundai'),
 (0.9978104829788208, 'numerical'),
 (0.9977319240570068, 'hcl'),
 (0.9977213144302368, 'blotches'),
 (0.9976576566696167, 'useless'),
 (0.9975681900978088, 'recurrent')]

In [29]:
model.get_nearest_neighbors("smartphone")

[(0.99254310131073, 'unto'),
 (0.9925255179405212, 'thats'),
 (0.9920874238014221, 'kuwait'),
 (0.9917961955070496, 'meridian'),
 (0.9917893409729004, 'convent'),
 (0.9916715621948242, '250g'),
 (0.9913493394851685, 'onwards'),
 (0.9912425875663757, 'hq'),
 (0.991237223148346, 'msg'),
 (0.991001307964325, '1680d')]