Text Classification based on the products

In [1]:
import spacy
nlp=spacy.load('en_core_web_sm')
stopwords = nlp.Defaults.stop_words

In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report,confusion_matrix

Reading the data from CSV file

In [3]:
train=pd.read_csv("/content/drive/MyDrive/Colab Notebooks/val_10k.csv")
train.columns

Index(['productId', 'Title', 'userId', 'Helpfulness', 'Score', 'Time', 'Text',
       'Cat1', 'Cat2', 'Cat3'],
      dtype='object')

In [4]:
train=train.drop(labels=['productId','Title','userId','Helpfulness','Score','Time','Cat2','Cat3'],axis=1)
print(train.isnull().sum())
train.head()

Text    0
Cat1    0
dtype: int64


Unnamed: 0,Text,Cat1
0,We've only had it installed about 2 weeks. So ...,pet supplies
1,My bunny had a hard time eating this because t...,pet supplies
2,would never in a million years have guessed th...,health personal care
3,"Being the jerky fanatic I am, snackmasters han...",grocery gourmet food
4,Wondered how quick my dog would catch on to th...,pet supplies


In [5]:
train.Cat1.value_counts()

health personal care    2992
beauty                  2135
toys games              1759
pet supplies            1576
grocery gourmet food     840
baby products            698
Name: Cat1, dtype: int64

In [6]:
x_train,x_test,y_train,y_test=train_test_split(train['Text'],train['Cat1'],test_size=0.3)
print(x_train.shape)
print(x_test.shape)

(7000,)
(3000,)


Text Preprocessing

In [7]:
def text_preprocessing(sentence):
  doc=nlp(sentence);
  tokens=[];
  for word in doc:
    if not word.is_punct and not word in stopwords:
      if word.lemma!='-PRON':
        temp=word.lemma_.lower().strip()
      else:
        temp=word.lower()
      tokens.append(temp);
  return tokens

Text Vectorization

In [8]:
tfidf=TfidfVectorizer(tokenizer=text_preprocessing)

Machine Learning using SVM

In [10]:
classifier=LinearSVC()
clf=Pipeline([('tfidf',tfidf),('clf',classifier)])
clf.fit(x_train,y_train)
y_pred=clf.predict(x_test)

In [14]:
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

                      precision    recall  f1-score   support

       baby products       0.72      0.49      0.58       224
              beauty       0.77      0.81      0.79       625
grocery gourmet food       0.79      0.66      0.72       239
health personal care       0.71      0.79      0.75       916
        pet supplies       0.88      0.82      0.85       473
          toys games       0.80      0.82      0.81       523

            accuracy                           0.77      3000
           macro avg       0.78      0.73      0.75      3000
        weighted avg       0.77      0.77      0.77      3000

[[109  19   0  48  12  36]
 [  3 508   3  96   2  13]
 [  1  15 157  50   6  10]
 [ 20  92  31 720  20  33]
 [  6  19   2  43 387  16]
 [ 12  10   5  52  14 430]]
