In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Define the corpus.
# A list (not a set) is used on purpose: a list keeps insertion order, so
# row i of the TF-IDF matrix always corresponds to corpus[i]. A set literal
# iterates in an order driven by string hashing, which can change between
# kernel restarts and makes the outputs below non-reproducible.
corpus = [
    "Thor eating pizza,Lokki is eating pizzza,Ironmam ate pizza already",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    "I am eating birayani and you are eating graphes",
]

In [2]:
# Build the TF-IDF vectorizer and turn the corpus into a document-term matrix.
v = TfidfVectorizer()

# fit_transform() learns the vocabulary and IDF weights from the corpus and
# returns the weighted matrix in a single call; it is equivalent to calling
# v.fit(corpus) followed by v.transform(corpus) on the same documents.
transform_output = v.fit_transform(corpus)

In [3]:
# Print the fitted vocabulary as a dict of {word: index}.
print(v.vocabulary_)


{'thor': 24, 'eating': 10, 'pizza': 21, 'lokki': 17, 'is': 16, 'pizzza': 22, 'ironmam': 15, 'ate': 7, 'already': 0, 'am': 1, 'birayani': 8, 'and': 3, 'you': 26, 'are': 6, 'graphes': 13, 'tesla': 23, 'announcing': 4, 'model': 18, 'tomorrow': 25, 'amazon': 2, 'new': 19, 'eco': 11, 'dot': 9, 'google': 12, 'pixel': 20, 'apple': 5, 'iphone': 14}


In [4]:
# Print the IDF (inverse document frequency) weight learned for each word.

# Feature names are returned ordered by their vocabulary index, which is the
# same order in which v.idf_ stores the weights, so both can be walked in
# lockstep instead of looking up every index by hand.
all_feature_names = v.get_feature_names_out()

for term, term_idf in zip(all_feature_names, v.idf_):
    print(f"{term}:{term_idf}")

already:2.252762968495368
am:2.252762968495368
amazon:2.252762968495368
and:2.252762968495368
announcing:1.336472236621213
apple:2.252762968495368
are:2.252762968495368
ate:2.252762968495368
birayani:2.252762968495368
dot:2.252762968495368
eating:1.8472978603872037
eco:2.252762968495368
google:2.252762968495368
graphes:2.252762968495368
iphone:2.252762968495368
ironmam:2.252762968495368
is:1.1541506798272583
lokki:2.252762968495368
model:2.252762968495368
new:1.5596157879354227
pixel:2.252762968495368
pizza:2.252762968495368
pizzza:2.252762968495368
tesla:2.252762968495368
thor:2.252762968495368
tomorrow:1.336472236621213
you:2.252762968495368


In [5]:
import os

import pandas as pd

# Location of the e-commerce dataset. The default is the original local path;
# set the ECOMMERCE_DATA_CSV environment variable to run the notebook on a
# machine where the file lives somewhere else, without editing this cell.
DATA_PATH = os.environ.get(
    "ECOMMERCE_DATA_CSV",
    "C:/8-text_mining/text_mining/Ecommerce_data.csv",
)

# Read the data into a pandas DataFrame.
df = pd.read_csv(DATA_PATH)
print(df.shape)  # (number of rows, number of columns)

# Show the first 5 rows for a quick overview of the data (Text, label).
df.head(5)


(24000, 2)


Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [6]:
# Check the distribution of labels (number of rows per label value).
df["label"].value_counts()

label
Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: count, dtype: int64

In [7]:
# Balanced vs. imbalanced data:
# every label occurs the same number of times (6000 each), so the classes are perfectly balanced.
# There is no class-imbalance problem and hence no need to apply a balancing technique.
# Next, add a new column which gives a unique number to each of these labels.


In [8]:
# Add a new column which gives a unique number to each of these labels.

# Define the label mapping (keys must match the label strings exactly).
label_mapping = {
    'Household': 0,
    'Books': 1,
    'Electronics': 2,
    'Clothing & Accessories': 3,
}

# Series.map() silently produces NaN for any value missing from the mapping
# (which would also turn the column into floats), so fail loudly here rather
# than passing a partially encoded target on to the model.
unmapped_labels = df.loc[~df['label'].isin(list(label_mapping)), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Labels without a numeric mapping: {list(unmapped_labels)}")

# Map the labels to numeric values.
df['label_num'] = df['label'].map(label_mapping)

# Display the first 5 rows of the DataFrame to confirm the mapping.
df.head(5)

Unnamed: 0,Text,label,label_num
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,2
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,3
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,3


## Train test split

In [9]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets.
text_features = df["Text"]        # features (text data)
numeric_labels = df["label_num"]  # target labels (numeric)

X_train, X_test, y_train, y_test = train_test_split(
    text_features,
    numeric_labels,
    test_size=0.2,            # 20% of the data is held out for testing
    random_state=2022,        # seed for reproducibility
    stratify=numeric_labels,  # maintain the proportion of each label in both sets
)

In [10]:
# Report how many samples ended up in each split.
train_shape, test_shape = X_train.shape, X_test.shape
print("Shape of X_train:", train_shape)
print("Shape of X_test:", test_shape)

Shape of X_train: (19200,)
Shape of X_test: (4800,)


In [11]:
# Preview the first rows of the training texts.
X_train.head()

15820    IRIS Furniture Children Deluxe Spiderman Toddl...
23224    Godox CB-09 Hard Carrying Storage Suitcase Car...
4638     Ugreen All in 1 USB 3.0 Card Reader USB Memory...
15245    Spread Spain Metallic Gold Bar Trolley/Kitchen...
5378     Chromozome Men's Calf Socks (Pack of 3) (SX-3 ...
Name: Text, dtype: object

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# 1. Assemble the pipeline: TF-IDF features feeding a k-nearest-neighbours classifier.
pipeline_steps = [
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('KNN', KNeighborsClassifier()),
]
clf = Pipeline(pipeline_steps)

# 2. + 3. Fit on the training split (fit() returns the fitted pipeline itself),
# then predict the labels of the test split and keep them in y_pred.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# 4. Print the classification report (precision / recall / f1 per class).
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.95      0.96      0.95      1200
           1       0.97      0.95      0.96      1200
           2       0.97      0.97      0.97      1200
           3       0.97      0.98      0.97      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



In [13]:
# Preview the first five test texts.
X_test.head()

20706    Lal Haveli Designer Handmade Patchwork Decorat...
19166    GOTOTOP Classical Retro Cotton & PU Leather Ne...
15209    FabSeasons Camouflage Polyester Multi Function...
2462     Indian Superfoods: Change the Way You Eat Revi...
6621     Milton Marvel Insulated Steel Casseroles, Juni...
Name: Text, dtype: object

In [14]:
# True numeric labels of the first five test texts.
y_test.head()

20706    0
19166    2
15209    3
2462     1
6621     3
Name: label_num, dtype: int64

In [15]:
# Predicted labels for the first five test texts; compare with y_test[:5].
y_pred[:5]

array([0, 2, 3, 1, 0], dtype=int64)

##### Data Acquisition --> Text Extraction & Cleanup --> Pre-Processing --> Feature Engineering --> Model Building --> Evaluation --> Deployment --> Monitor & Update