In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Toy corpus for the TF-IDF demo: each string is one document.
# "is", "announcing", "new" and "tomorrow" recur across documents, while the
# product and company names each appear in only one.
corpus = [
    "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    "I am eating biryani and you are eating grapes"
]

In [2]:
# Learn the vocabulary and IDF weights from the toy corpus
# (fit() returns the fitted vectorizer itself, so it can be bound directly).
v = TfidfVectorizer().fit(corpus)

# Encode the same documents as a TF-IDF matrix (one row per document).
transformed_output = v.transform(corpus)

# Term -> column index mapping learned during fit.
print(v.vocabulary_)

{'thor': 25, 'eating': 10, 'pizza': 22, 'loki': 17, 'is': 16, 'ironman': 15, 'ate': 7, 'already': 0, 'apple': 5, 'announcing': 4, 'new': 20, 'iphone': 14, 'tomorrow': 26, 'tesla': 24, 'model': 19, 'google': 12, 'pixel': 21, 'microsoft': 18, 'surface': 23, 'amazon': 2, 'eco': 11, 'dot': 9, 'am': 1, 'biryani': 8, 'and': 3, 'you': 27, 'are': 6, 'grapes': 13}


In [3]:
# Show every vocabulary term next to its learned IDF weight.
# get_feature_names_out() lists terms in column-index order, the same order
# as v.idf_, so the two arrays can be walked in lockstep.
all_feature_names = v.get_feature_names_out()
for word, idf in zip(all_feature_names, v.idf_):
    # A separator is printed between term and score; without one the output
    # runs together (e.g. "already2.386294361119891").
    print(f"{word} : {idf}")

already2.386294361119891
am2.386294361119891
amazon2.386294361119891
and2.386294361119891
announcing1.2876820724517808
apple2.386294361119891
are2.386294361119891
ate2.386294361119891
biryani2.386294361119891
dot2.386294361119891
eating1.9808292530117262
eco2.386294361119891
google2.386294361119891
grapes2.386294361119891
iphone2.386294361119891
ironman2.386294361119891
is1.1335313926245225
loki2.386294361119891
microsoft2.386294361119891
model2.386294361119891
new1.2876820724517808
pixel2.386294361119891
pizza2.386294361119891
surface2.386294361119891
tesla2.386294361119891
thor2.386294361119891
tomorrow1.2876820724517808
you2.386294361119891


In [4]:

# Dense view of the TF-IDF vectors for the first two documents only
# (slice the sparse matrix first, then densify just those rows).
print(transformed_output[:2].toarray())

[[0.24266547 0.         0.         0.         0.         0.
  0.         0.24266547 0.         0.         0.40286636 0.
  0.         0.         0.         0.24266547 0.11527033 0.24266547
  0.         0.         0.         0.         0.72799642 0.
  0.         0.24266547 0.         0.        ]
 [0.         0.         0.         0.         0.30652086 0.5680354
  0.         0.         0.         0.         0.         0.
  0.         0.         0.5680354  0.         0.26982671 0.
  0.         0.         0.30652086 0.         0.         0.
  0.         0.         0.30652086 0.        ]]


### Case study on ecommerce data
#### Given a description of a product sold on an e-commerce website, classify it into one of the 4 categories

In [6]:
import pandas as pd
# Load the e-commerce product descriptions; the path is relative, so the CSV
# must sit in the notebook's working directory.
df=pd.read_csv("Ecommerce_data.csv")

In [7]:
# (rows, columns) of the loaded frame, then a preview of its first five rows
# (five is the default for head()).
print(df.shape)
df.head()

(24000, 2)


Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [8]:
# How many descriptions fall into each category.
df.label.value_counts()

label
Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: count, dtype: int64

- **Text:** Description of an item sold on e-commerce website
- **Label:** Category of that item. There are 4 categories in total: "Electronics", "Household", "Books" and "Clothing & Accessories", which together cover almost 80% of any e-commerce website.

- From the above, we can see that all the labels (classes) occur an equal number of times (6000 each), so the dataset is perfectly balanced. There is no class imbalance, and hence no need to apply balancing techniques such as undersampling or oversampling.

In [11]:
# Integer code assigned to each product category.
label_to_num = {
    'Household': 0,
    'Books': 1,
    'Electronics': 2,
    'Clothing & Accessories': 3,
}

# Numeric target column derived from the text label.
df['label_num'] = df['label'].map(label_to_num)

In [12]:
# Preview the frame with the new label_num column (head() shows five rows by default).
df.head()

Unnamed: 0,Text,label,label_num
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,2
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,3
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,3


In [13]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing.
# Stratifying on label_num keeps the proportion of each class the same in the
# training and test sets; without it the random split could leave X_train
# imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    df['Text'],
    df['label_num'],
    test_size=0.2,
    random_state=2022,
    stratify=df['label_num'],
)

In [14]:
# Number of descriptions in the training split.
print(X_train.shape)

(19200,)


In [19]:
# Number of descriptions in the held-out test split.
print(X_test.shape)

(4800,)


In [20]:
# Per-class counts in the training labels (checks that the stratified split stayed balanced).
y_train.value_counts()

label_num
0    4800
2    4800
3    4800
1    4800
Name: count, dtype: int64

- X_train and X_test contain data from the Text column (features), which are the product descriptions or any other text information.
- y_train and y_test contain data from the label_num column (labels), which are the numeric representations of the product categories.

In [22]:
# Per-class counts in the test labels (checks that the stratified split stayed balanced).
y_test.value_counts()

label_num
0    1200
2    1200
3    1200
1    1200
Name: count, dtype: int64

### use KNeighbors classifier

In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# TF-IDF features fed into a k-nearest-neighbours classifier (default settings).
knn_steps = [
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('KNN', KNeighborsClassifier()),
]
clf = Pipeline(knn_steps)

# Pipeline.fit returns the fitted pipeline, so training and prediction chain.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the held-out test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95      1200
           1       0.97      0.95      0.96      1200
           2       0.97      0.97      0.97      1200
           3       0.97      0.98      0.97      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



In [25]:
# First five true labels of the test split.
y_test[:5]

20706    0
19166    2
15209    3
2462     1
6621     3
Name: label_num, dtype: int64

In [26]:
# First five predicted labels, to compare against y_test[:5].
y_pred[:5]

array([0, 2, 3, 1, 0], dtype=int64)

**Only the last of these five predictions is wrong (predicted 0, actual 3).**

### Use Naive Bayes Algorithm

In [34]:
from sklearn.naive_bayes import MultinomialNB

# Same TF-IDF front end, this time with a multinomial Naive Bayes classifier.
nb_steps = [
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Multi NB', MultinomialNB()),
]
clf = Pipeline(nb_steps)

# Train on the training split, then predict the held-out descriptions.
y_pred = clf.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      1200
           1       0.98      0.92      0.95      1200
           2       0.97      0.97      0.97      1200
           3       0.97      0.99      0.98      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



In [37]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on TF-IDF features.

#1. create a pipeline object
clf = Pipeline([
     # No ngram_range is passed, so the vectorizer uses its default setting.
     ('vectorizer_tfidf', TfidfVectorizer()),
     # Seed the forest so the fitted model and the report are reproducible
     # across re-runs; 2022 is the same seed used for train_test_split.
     ('Random Forest', RandomForestClassifier(random_state=2022))
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1200
           1       0.97      0.98      0.97      1200
           2       0.98      0.97      0.97      1200
           3       0.98      0.98      0.98      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800



#### Use text preprocessing to remove stop words and punctuation marks, and apply lemmatization

In [42]:
import spacy
# Load spaCy's 'en_core_web_sm' pipeline; preprocess() below relies on it for
# the stop-word / punctuation flags and the lemma of each token.
nlp=spacy.load('en_core_web_sm')

def preprocess(text):
    """Return `text` with stop words and punctuation dropped and the rest lemmatized.

    The text is run through the module-level spaCy pipeline `nlp`; the lemmas
    of the tokens that survive the filter are joined with single spaces.
    """
    kept_lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    ]
    return " ".join(kept_lemmas)

# Clean every description with preprocess() and store the result in a new column.
# Each row is passed through the spaCy pipeline one at a time.
df['preprocessed_txt'] = df['Text'].apply(preprocess) 

In [43]:
# Preview the frame, which now includes the preprocessed_txt column.
df.head()

Unnamed: 0,Text,label,label_num,preprocessed_txt
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0,Urban Ladder Eisner low Study Office Computer ...
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0,contrast live Wooden Decorative Box Painted Bo...
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,2,IO Crest SY PCI40010 PCI raid Host Controller ...
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,3,ISAKAA Baby Socks bear 8 Years- Pack 4 6 8 12 ...
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,3,Indira Designer Women Art Mysore Silk Saree Bl...


In [44]:
# A single description is a plain Python str, which has no `.shape`
# attribute; len() gives its size in characters instead.
len(df.Text[0])

In [None]:
# Length in characters of the first preprocessed description
# (a str has no `.shape`, so len() is used).
len(df.preprocessed_txt[0])
