In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
corpus = [
    "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    "I am eating biryani and you are eating grapes"
]

In [6]:
#let's create the vectorizer and fit the corpus and transform them accordingly
v = TfidfVectorizer()
transformed = v.fit_transform(corpus)

In [7]:
print(v.vocabulary_)

{'thor': 25, 'eating': 10, 'pizza': 22, 'loki': 17, 'is': 16, 'ironman': 15, 'ate': 7, 'already': 0, 'apple': 5, 'announcing': 4, 'new': 20, 'iphone': 14, 'tomorrow': 26, 'tesla': 24, 'model': 19, 'google': 12, 'pixel': 21, 'microsoft': 18, 'surface': 23, 'amazon': 2, 'eco': 11, 'dot': 9, 'am': 1, 'biryani': 8, 'and': 3, 'you': 27, 'are': 6, 'grapes': 13}


In [10]:
#let's print the idf of each word:

all_feature_names = v.get_feature_names_out()
all_feature_names

array(['already', 'am', 'amazon', 'and', 'announcing', 'apple', 'are',
       'ate', 'biryani', 'dot', 'eating', 'eco', 'google', 'grapes',
       'iphone', 'ironman', 'is', 'loki', 'microsoft', 'model', 'new',
       'pixel', 'pizza', 'surface', 'tesla', 'thor', 'tomorrow', 'you'],
      dtype=object)

In [11]:
for word in all_feature_names:
    indx = v.vocabulary_.get(word)
    idf_score = v.idf_[indx]
    print(f"{word} {idf_score}")

already 2.386294361119891
am 2.386294361119891
amazon 2.386294361119891
and 2.386294361119891
announcing 1.2876820724517808
apple 2.386294361119891
are 2.386294361119891
ate 2.386294361119891
biryani 2.386294361119891
dot 2.386294361119891
eating 1.9808292530117262
eco 2.386294361119891
google 2.386294361119891
grapes 2.386294361119891
iphone 2.386294361119891
ironman 2.386294361119891
is 1.1335313926245225
loki 2.386294361119891
microsoft 2.386294361119891
model 2.386294361119891
new 1.2876820724517808
pixel 2.386294361119891
pizza 2.386294361119891
surface 2.386294361119891
tesla 2.386294361119891
thor 2.386294361119891
tomorrow 1.2876820724517808
you 2.386294361119891


In [13]:
#let's print the transformed output from tf-idf
print(transformed.toarray())

[[0.24266547 0.         0.         0.         0.         0.
  0.         0.24266547 0.         0.         0.40286636 0.
  0.         0.         0.         0.24266547 0.11527033 0.24266547
  0.         0.         0.         0.         0.72799642 0.
  0.         0.24266547 0.         0.        ]
 [0.         0.         0.         0.         0.30652086 0.5680354
  0.         0.         0.         0.         0.         0.
  0.         0.         0.5680354  0.         0.26982671 0.
  0.         0.         0.30652086 0.         0.         0.
  0.         0.         0.30652086 0.        ]
 [0.         0.         0.         0.         0.30652086 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.26982671 0.
  0.         0.5680354  0.30652086 0.         0.         0.
  0.5680354  0.         0.30652086 0.        ]
 [0.         0.         0.         0.         0.30652086 0.
  0.         0.         0.         0.         0.         0.
  0.

Problem Statement: Given a description about a product sold on e-commerce website, classify it in one of the 4 categories

Dataset Credits: https://www.kaggle.com/datasets/saurabhshahane/ecommerce-text-classification

This data consists of two columns.
Text	Label
Indira Designer Women's Art Mysore Silk Saree With Blouse Piece (Star-Red) This Saree Is Of Art Mysore Silk & Comes With Blouse Piece.	Clothing & Accessories
IO Crest SY-PCI40010 PCI RAID Host Controller Card Brings new life to any old desktop PC. Connects up to 4 SATA II high speed SATA hard disk drives. Supports Windows 8 and Server 2012	Electronics
Operating Systems in Depth About the Author Professor Doeppner is an associate professor of computer science at Brown University. His research interests include mobile computing in education, mobile and ubiquitous computing, operating systems and distribution systems, parallel computing, and security.	Books
*Text*: Description of an item sold on e-commerce website
*Label*: Category of that item. Total 4 categories: "Electronics", "Household", "Books" and "Clothing & Accessories", which almost cover 80% of any E-commerce website.

In [14]:
import pandas as pd

In [15]:
df = pd.read_csv("Ecommerce_data.csv")
df.head()

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [16]:
df.isnull().any()

Text     False
label    False
dtype: bool

In [17]:
df.label.value_counts()

label
Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: count, dtype: int64

The dataset is a balanced dataset as all the 4 classes have the same number of data

In [18]:
df['category']=df['label'].map({
    'Household': 0,
    'Electronics': 1,
    'Clothing & Accessories': 2,
    'Books': 3

})
    

In [19]:
df.head()

Unnamed: 0,Text,label,category
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,1
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,2
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,2


In [22]:
#Let's do the preprocessing like lemmatization and removing the stop words and punctuation marks
import spacy
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    return " ".join(filtered_tokens)
    

In [23]:
df['preprocessed_text'] = df['Text'].apply(preprocess)


In [24]:
df.head()

Unnamed: 0,Text,label,category,preprocessed_text
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0,Urban Ladder Eisner low Study Office Computer ...
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0,contrast live Wooden Decorative Box Painted Bo...
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,1,IO Crest SY PCI40010 PCI raid Host Controller ...
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,2,ISAKAA Baby Socks bear 8 Years- Pack 4 6 8 12 ...
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,2,Indira Designer Women Art Mysore Silk Saree Bl...


In [25]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train, y_test = train_test_split(df['preprocessed_text'], df['category'], test_size = 0.2,
                                                  random_state = 22,
                                                  stratify = df['category'])

In [26]:
X_train.shape , X_test.shape

((19200,), (4800,))

In [27]:
y_train.value_counts()

category
2    4800
0    4800
1    4800
3    4800
Name: count, dtype: int64

y_test.value_counts()

We see that all the classes have been divided equally in both the training and test datasets

In [29]:
# Now let's build a model
# Importing the required libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [30]:
model_nb = Pipeline([
    ('tf_idf_vectorizer',TfidfVectorizer()),
    ('nb',MultinomialNB())
])

model_nb.fit(X_train, y_train)
y_pred_nb = model_nb.predict(X_test)

In [31]:
print(classification_report(y_test, y_pred_nb))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95      1200
           1       0.96      0.97      0.97      1200
           2       0.98      0.99      0.98      1200
           3       0.99      0.94      0.96      1200

    accuracy                           0.96      4800
   macro avg       0.97      0.96      0.96      4800
weighted avg       0.97      0.96      0.96      4800



In [32]:
model_rf = Pipeline([
    ('tf_idf_vectorizer',TfidfVectorizer()),
    ('rf',RandomForestClassifier(n_estimators=50))
])

model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)

In [33]:
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.96      0.97      0.96      1200
           1       0.98      0.97      0.97      1200
           2       0.98      0.98      0.98      1200
           3       0.98      0.97      0.97      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800



In [34]:
model_knn = Pipeline([
    ('tf_idf_vectorizer',TfidfVectorizer()),
    ('KNN',KNeighborsClassifier())
])

model_knn.fit(X_train, y_train)
y_pred_knn = model_knn.predict(X_test)

In [35]:
print(classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

           0       0.93      0.96      0.95      1200
           1       0.96      0.96      0.96      1200
           2       0.98      0.97      0.97      1200
           3       0.97      0.94      0.96      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



In [36]:
# Let's check some predictions
X_test[0:5]

2231     Lenovo 310 20iap AIO 19.5 inch desktop cel J33...
4113     HeadTurners Milky White Cricket Dress White Cr...
12199    DearJoy Cotton toddler Training Seat Baby Safe...
15020    White Willow Memory Foam Pillow 21.5"x13.5"x4 ...
21084                Concise   General    Knowledge   2019
Name: preprocessed_text, dtype: object

In [38]:
X_test.loc[2231]

'Lenovo 310 20iap AIO 19.5 inch desktop cel J3355/4GB/1TB Windows 10 Home Integrated Graphics Black Size name:2017 Model    Ready box plug play||Single cord PC day day computing||light weight modern design weight 3kg*||integrated web cam Mic Speakers WiFi Bluetooth||19.5 Screen photo frame stand design||7th Generation processor integrated Graphics round performance'

In [39]:
y_test[0:5]

2231     1
4113     2
12199    0
15020    0
21084    3
Name: category, dtype: int64

In [40]:
y_pred_rf[0:5]

array([1, 2, 0, 0, 3], dtype=int64)

In [41]:
y_pred_nb[0:5]

array([1, 2, 0, 0, 3], dtype=int64)

In [42]:
y_pred_knn[0:5]

array([1, 3, 0, 0, 3], dtype=int64)

In [43]:
X_test.loc[4113]

'HeadTurners Milky White Cricket Dress White Cricket T Shirt Trousers Combo Uniform Dress men Boys kid'

In [47]:
df.head(10)

Unnamed: 0,Text,label,category,preprocessed_text
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0,Urban Ladder Eisner low Study Office Computer ...
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0,contrast live Wooden Decorative Box Painted Bo...
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,1,IO Crest SY PCI40010 PCI raid Host Controller ...
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,2,ISAKAA Baby Socks bear 8 Years- Pack 4 6 8 12 ...
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,2,Indira Designer Women Art Mysore Silk Saree Bl...
5,Selfie: How We Became So Self-Obsessed and Wha...,Books,3,Selfie self obsessed
6,Quantum QHM8810 Keyboard with Mouse (Black) Ul...,Electronics,1,Quantum QHM8810 Keyboard Mouse Black Ultra sli...
7,Y&S Uv Protected Non Polarized Wayfarer Boy's ...,Clothing & Accessories,2,Y&S Uv Protected Non polarized Wayfarer Boy Gi...
8,HP external USB DVD Drive DVDRW DVD-ROM A2U56A...,Electronics,1,hp external usb DVD Drive DVDRW DVD ROM A2U56A...
9,Fujifilm Instax Mini Monochrome Film (10 Sheet...,Books,3,fujifilm Instax Mini Monochrome Film 10 sheet ...
