In [8]:
#! pip install -q pytorch-lightning
#! pip install -q bs4
#! pip install -q transform

In [9]:
# Import all libraries
import pandas as pd
import numpy as np
import re
from dataset import PCDataset

# Huggingface transformers
import transformers
from transformers import BertModel,BertTokenizer, get_linear_schedule_with_warmup

import torch
from torch import nn ,cuda
from torch.utils.data import DataLoader,Dataset,RandomSampler, SequentialSampler
from torch.optim import AdamW

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

#handling html data
#from bs4 import BeautifulSoup

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
%matplotlib inline
from sklearn.preprocessing import MultiLabelBinarizer

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [10]:
df = pd.read_csv('pivot_table1.csv')
df.head()

Unnamed: 0,Product Name,Audio & Electronics,Automotive Tools,Body Parts,Exterior,Interior,Lighting,Parts
0,"""8 468"" Emblems",0,0,1,1,0,0,0
1,"""8 LS1"" Emblems",0,0,1,1,0,0,0
2,"""A"" Letter Black Tailgate Emblem (68245565AE)",0,0,1,1,0,0,0
3,"""A"" Letter Black Tailgate Emblem (68282757AA)",0,0,1,1,0,0,0
4,"""A"" Letter Chrome Tailgate Emblem (68245543AE)",0,0,1,1,0,0,0


In [11]:
'''def pre_process(text):

  text = BeautifulSoup(text).get_text()
  
  # fetch alphabetic characters
  text = re.sub("[^a-zA-Z]", " ", text)

  # convert text to lower case
  text = text.lower()

  # split text into tokens to remove whitespaces
  tokens = text.split()

  return " ".join(tokens)

SyntaxError: incomplete input (1623205809.py, line 1)

In [None]:
'''df['Clean_Name'] = df['Product Name'].apply(pre_process)


In [None]:
'''df.head()

In [12]:
import pandas as pd

# Assuming 'df_products' is your initial products DataFrame

# Melt the DataFrame to go from wide to long format
df_long = pd.melt(df, id_vars=['Product Name'], 
                  var_name='Category', value_name='Is_Tagged')

# Filter out the rows where 'Is_Tagged' is 0, as these are not actual tags
df_long = df_long[df_long['Is_Tagged'] == 1]

# Group by 'Product Name' and aggregate the category names into lists to form the 'tags'
df_tags = df_long.groupby('Product Name')['Category'].apply(list).reset_index(name='tags')

# Your 'df_tags' DataFrame now has the 'Product Name' and a 'tags' column similar to your second DataFrame


In [13]:
df_tags.head()

Unnamed: 0,Product Name,tags
0,"""8 468"" Emblems","[Body Parts, Exterior]"
1,"""8 LS1"" Emblems","[Body Parts, Exterior]"
2,"""A"" Letter Black Tailgate Emblem (68245565AE)","[Body Parts, Exterior]"
3,"""A"" Letter Black Tailgate Emblem (68282757AA)","[Body Parts, Exterior]"
4,"""A"" Letter Chrome Tailgate Emblem (68245543AE)","[Body Parts, Exterior]"


In [14]:
X = df_tags['Product Name']

In [15]:
lst_top_tags = ['Audio & Electronics', 'Automotive Tools', 'Body Parts', 'Exterior', 'Interior', 'Lighting', 'Parts']


In [16]:
df = df_tags

In [17]:
# Filter out records ( values in clean_body and tags) that have atleast one of the top tags

x=[] # To store the filtered clean_body values
y=[] # to store the corresponding tags

for i in range(len(df['tags'])):
    temp=[]
    for tag in df['tags'][i]:
        if tag in lst_top_tags:
            temp.append(tag)

    if(len(temp)>0):
        x.append(df['Product Name'][i])
        y.append(temp)


In [18]:
# Encode the tags(labels) in a binary format in order to be used for training
mlb = MultiLabelBinarizer()
 
yt = mlb.fit_transform(y)
yt.shape

(23423, 7)

In [19]:
# Getting a sense of how the tags data looks like
print(yt[0])
print(mlb.inverse_transform(yt[0].reshape(1,-1)))
print(mlb.classes_)

[0 0 1 1 0 0 0]
[('Body Parts', 'Exterior')]
['Audio & Electronics' 'Automotive Tools' 'Body Parts' 'Exterior'
 'Interior' 'Lighting' 'Parts']


In [20]:
from sklearn.model_selection import train_test_split
# First Split for Train and Test
x_train,x_test,y_train,y_test = train_test_split(x, yt, test_size=0.1, random_state=RANDOM_SEED,shuffle=True)
# Next split Train in to training and validation
x_tr,x_val,y_tr,y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=RANDOM_SEED,shuffle=True)


In [21]:
len(x_tr) ,len(x_val), len(x_test)


(16864, 4216, 2343)

In [22]:
import pytorch_lightning as pl
from torch.utils.data import DataLoader

class PCDataModule(pl.LightningDataModule):
    
    def __init__(self, x_tr, y_tr, x_val, y_val, x_test, y_test, tokenizer, batch_size=16, max_token_len=200):
        super().__init__()
        self.tr_text = x_tr
        self.tr_label = y_tr
        self.val_text = x_val
        self.val_label = y_val
        self.test_text = x_test
        self.test_label = y_test
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_token_len = max_token_len

    def setup(self, stage=None):
        # Setting up datasets based on the stage of the training/validation/testing
        if stage == 'fit' or stage is None:
            self.train_dataset = PCDataset(product=self.tr_text, tags=self.tr_label, tokenizer=self.tokenizer, max_len=self.max_token_len)
            self.val_dataset = PCDataset(product=self.val_text, tags=self.val_label, tokenizer=self.tokenizer, max_len=self.max_token_len)
        if stage == 'test' or stage is None:
            self.test_dataset = PCDataset(product=self.test_text, tags=self.test_label, tokenizer=self.tokenizer, max_len=self.max_token_len)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=11)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=16)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=16)


In [23]:
# Initialize the Bert tokenizer
BERT_MODEL_NAME = "bert-base-cased" # we will use the BERT base model(the smaller one)
Bert_tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

In [24]:
# Initialize the parameters that will be use for training
N_EPOCHS = 18
BATCH_SIZE = 32
MAX_LEN = 300
LR = 2e-05

In [25]:
# Instantiate and set up the data_module
PCdata_module = PCDataModule(x_tr,y_tr,x_val,y_val,x_test,y_test,Bert_tokenizer,BATCH_SIZE,MAX_LEN)
PCdata_module.setup()

In [26]:
class ProductClassifier(pl.LightningModule):
    # Set up the classifier
    def __init__(self, n_classes=7, steps_per_epoch=None, n_epochs=3, lr=2e-5 ):
        super().__init__()

        self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
        self.classifier = nn.Linear(self.bert.config.hidden_size,n_classes) # outputs = number of labels
        self.steps_per_epoch = steps_per_epoch
        self.n_epochs = n_epochs
        self.lr = lr
        self.criterion = nn.BCEWithLogitsLoss()
        
    def forward(self,input_ids, attn_mask):
        output = self.bert(input_ids = input_ids ,attention_mask = attn_mask)
        output = self.classifier(output.pooler_output)
                
        return output
    
    
    def training_step(self,batch,batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']
        
        outputs = self(input_ids,attention_mask)
        loss = self.criterion(outputs,labels)
        self.log('train_loss',loss , prog_bar=True,logger=True)
        
        return {"loss" :loss, "predictions":outputs, "labels": labels }


    def validation_step(self,batch,batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']
        
        outputs = self(input_ids,attention_mask)
        loss = self.criterion(outputs,labels)
        self.log('val_loss',loss , prog_bar=True,logger=True)
        
        return loss

    def test_step(self,batch,batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']
        
        outputs = self(input_ids,attention_mask)
        loss = self.criterion(outputs,labels)
        self.log('test_loss',loss , prog_bar=True,logger=True)
        
        return loss
    
    
    def configure_optimizers(self):
        optimizer = AdamW(self.parameters() , lr=self.lr)
        warmup_steps = self.steps_per_epoch//3
        total_steps = self.steps_per_epoch * self.n_epochs - warmup_steps

        scheduler = get_linear_schedule_with_warmup(optimizer,warmup_steps,total_steps)

        return [optimizer], [scheduler]
    

In [27]:
# Instantiate the classifier model
steps_per_epoch = len(x_tr)//BATCH_SIZE
model = ProductClassifier(n_classes=7, steps_per_epoch=steps_per_epoch,n_epochs=N_EPOCHS,lr=LR)


In [28]:
#Initialize Pytorch Lightning callback for Model checkpointing

checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',# monitored quantity
    filename='PC-{epoch:02d}-{val_loss:.2f}',
    save_top_k=3, #  save the top 3 models
    mode='min', # mode of the monitored quantity  for optimization
)
# Instantiate the Model Trainer
trainer = pl.Trainer(max_epochs = N_EPOCHS, callbacks=[checkpoint_callback])


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [29]:
trainer.fit(model, PCdata_module)


Missing logger folder: /Users/maryamahmadi/Documents/GitHub/ML-Driven-E-commerce-Categorization-with-Web-Scraping/lightning_logs

  | Name       | Type              | Params
-------------------------------------------------
0 | bert       | BertModel         | 108 M 
1 | classifier | Linear            | 5.4 K 
2 | criterion  | BCEWithLogitsLoss | 0     
-------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
433.263   Total estimated model params size (MB)


Sanity Checking: |                                        | 0/? [00:00<?, ?it/s]

/opt/anaconda3/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/opt/anaconda3/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:436: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.


Training: |                                               | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=18` reached.


In [30]:
# Retreive the checkpoint path for best model
best_model_path = checkpoint_callback.best_model_path
model = ProductClassifier.load_from_checkpoint(best_model_path)
best_model_path 

'/Users/maryamahmadi/Documents/GitHub/ML-Driven-E-commerce-Categorization-with-Web-Scraping/lightning_logs/version_0/checkpoints/PC-epoch=17-val_loss=0.12.ckpt'

In [31]:
# Evaluate the model performance on the test dataset
trainer.test(model,datamodule= PCdata_module)

/opt/anaconda3/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Testing: |                                                | 0/? [00:00<?, ?it/s]

[{'test_loss': 0.10831587761640549}]

In [32]:
# Visualize the logs using tensorboard.
%load_ext tensorboard
%tensorboard --logdir lightning_logs/

In [33]:
len(y_test), len(x_test)


(2343, 2343)

In [34]:
from torch.utils.data import TensorDataset

# Tokenize all product titles in x_test
input_ids = []
attention_masks = []


for pro in x_test:
    encoded_pro =  Bert_tokenizer.encode_plus(
                    pro,
                    None,
                    add_special_tokens=True,
                    max_length= MAX_LEN,
                    padding = 'max_length',
                    return_token_type_ids= False,
                    return_attention_mask= True,
                    truncation=True,
                    return_tensors = 'pt'      
    )
    
    # Add the input_ids from encoded product title to the list.    
    input_ids.append(encoded_pro['input_ids'])
    # Add its attention mask 
    attention_masks.append(encoded_pro['attention_mask'])
    
# Now convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(y_test)

# Set the batch size.  
TEST_BATCH_SIZE = 64  

# Create the DataLoader.
pred_data = TensorDataset(input_ids, attention_masks, labels)
pred_sampler = SequentialSampler(pred_data)
pred_dataloader = DataLoader(pred_data, sampler=pred_sampler, batch_size=TEST_BATCH_SIZE)
    

In [35]:
flat_pred_outs = 0
flat_true_labels = 0

In [36]:
# Put model in evaluation mode
model = model.to(device) # moving model to cuda
model.eval()

# Tracking variables 
pred_outs, true_labels = [], []
#i=0
# Predict 
for batch in pred_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
  
    # Unpack the inputs from our dataloader
    b_input_ids, b_attn_mask, b_labels = batch
 
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        pred_out = model(b_input_ids,b_attn_mask)
        pred_out = torch.sigmoid(pred_out)
        # Move predicted output and labels to CPU
        pred_out = pred_out.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        #i+=1
        # Store predictions and true labels
        #print(i)
        #print(outputs)
        #print(logits)
        #print(label_ids)
    pred_outs.append(pred_out)
    true_labels.append(label_ids)


In [37]:
pred_outs[0][0]

array([0.98021233, 0.01940514, 0.02395695, 0.02230301, 0.02977475,
       0.04395498, 0.01976036], dtype=float32)

In [38]:
# Combine the results across all batches. 
flat_pred_outs = np.concatenate(pred_outs, axis=0)

# Combine the correct labels for each batch into a single list.
flat_true_labels = np.concatenate(true_labels, axis=0)

In [39]:
flat_pred_outs.shape , flat_true_labels.shape


((2343, 7), (2343, 7))

In [40]:
#define candidate threshold values
threshold  = np.arange(0.2,0.45,0.01)
threshold

array([0.2 , 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 ,
       0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41,
       0.42, 0.43, 0.44])

In [41]:
# convert probabilities into 0 or 1 based on a threshold value
def classify(pred_prob,thresh):
    y_pred = []

    for tag_label_row in pred_prob:
        temp=[]
        for tag_label in tag_label_row:
            if tag_label >= thresh:
                temp.append(1) # Infer tag value as 1 (present)
            else:
                temp.append(0) # Infer tag value as 0 (absent)
        y_pred.append(temp)

    return y_pred

In [42]:
flat_pred_outs[3]

array([0.02295901, 0.9676558 , 0.02776394, 0.01347516, 0.01975102,
       0.02394114, 0.03220677], dtype=float32)

In [43]:
flat_true_labels[3]


array([0, 1, 0, 0, 0, 0, 0])

In [44]:
from sklearn import metrics
scores=[] # Store the list of f1 scores for prediction on each threshold

#convert labels to 1D array
y_true = flat_true_labels.ravel() 

for thresh in threshold:
    
    #classes for each threshold
    pred_bin_label = classify(flat_pred_outs,thresh) 

    #convert to 1D array
    y_pred = np.array(pred_bin_label).ravel()

    scores.append(metrics.f1_score(y_true,y_pred))

In [45]:
# find the optimal threshold
opt_thresh = threshold[scores.index(max(scores))]
print(f'Optimal Threshold Value = {opt_thresh}')

Optimal Threshold Value = 0.4300000000000002


In [60]:
#y_true = flat_true_labels.ravel() 


In [46]:
#predictions for optimal threshold
y_pred_labels = classify(flat_pred_outs,opt_thresh)
y_pred = np.array(y_pred_labels).ravel() # Flatten
print(metrics.classification_report(y_true,y_pred))


              precision    recall  f1-score   support

           0       0.98      0.98      0.98     13590
           1       0.92      0.89      0.90      2811

    accuracy                           0.97     16401
   macro avg       0.95      0.94      0.94     16401
weighted avg       0.97      0.97      0.97     16401



In [50]:
y_pred = mlb.inverse_transform(np.array(y_pred_labels))
y_act = mlb.inverse_transform(flat_true_labels)

df_result = pd.DataFrame({'Body':x_test,'Actual Tags':y_act,'Predicted Tags':y_pred})

In [51]:
df_result.sample(10)


Unnamed: 0,Body,Actual Tags,Predicted Tags
173,Chrome Third Brake Light Cover,"(Body Parts, Exterior, Lighting)","(Body Parts, Exterior, Lighting)"
1023,NV Series Speedometer Gauges,"(Interior,)","(Interior,)"
599,Elixir HP™ Calcium Sulfonate Synthetic Blend G...,"(Parts,)","(Parts,)"
1472,Aspen Key Chain,"(Body Parts, Exterior)","(Body Parts, Exterior)"
1846,3061/2T-Series 2 t 132 mm to 326 mm 2-Stage Ai...,"(Automotive Tools,)","(Automotive Tools,)"
952,Air Conditioning Expansion Valve,"(Parts,)","(Parts,)"
2149,Remote Vehicle Starter Kit (68186558AB),"(Audio & Electronics,)","(Audio & Electronics,)"
2248,Daytime Running Light Replacement Bulbs,"(Body Parts, Exterior, Lighting)","(Lighting,)"
2117,GM Original Equipment™ Ebony Steering Wheel Au...,"(Interior, Parts)","(Interior, Parts)"
785,Black Mud Flaps,"(Exterior,)","(Exterior,)"


In [60]:
print(flat_true_labels.shape)

(2343, 7)


In [63]:
y_pred_array = np.array(y_pred_labels)

In [64]:
print(y_pred_array)

[[1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 [0 1 0 ... 0 0 0]]


In [97]:
print(y_true.shape[1])

7


In [86]:
num_classes = 7

performance_metrics = {
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1 Score': []
}

# Assuming that there are no divisions by zero in precision and recall calculations
for i in range(num_classes):
    performance_metrics['Accuracy'].append(accuracy_score(flat_true_labels[:, i], y_pred_array[:, i]))
    performance_metrics['Precision'].append(precision_score(flat_true_labels[:, i], y_pred_array[:, i], zero_division=0))
    performance_metrics['Recall'].append(recall_score(flat_true_labels[:, i], y_pred_array[:, i], zero_division=0))
    performance_metrics['F1 Score'].append(f1_score(flat_true_labels[:, i], y_pred_array[:, i], zero_division=0))


In [84]:
labels = ['Audio & Electronics', 'Automotive Tools', 'Body Parts', 'Exterior', 'Interior', 'Lighting', 'Parts']
df_performance = pd.DataFrame(performance_metrics)
df_performance.insert(0, 'Label', labels)

# Save the dataframe to a CSV file
csv_file_path = 'BERT_model_performance.csv'
df_performance.to_csv(csv_file_path, index=False)

# Display the DataFrame to ensure it matches the uploaded screenshot
df_performance


Unnamed: 0,Label,Accuracy,Precision,Recall,F1 Score
0,Audio & Electronics,0.986342,0.949807,0.928302,0.938931
1,Automotive Tools,0.976953,0.933333,0.89172,0.912052
2,Body Parts,0.960734,0.889855,0.850416,0.869688
3,Exterior,0.93726,0.89635,0.889855,0.893091
4,Interior,0.96799,0.923858,0.889976,0.9066
5,Lighting,0.985062,0.924658,0.849057,0.885246
6,Parts,0.95988,0.936134,0.908646,0.922185


In [85]:
#flat_true_labels.shape
#np.array(y_pred_labels).shape
#y_temp = mlb.inverse_transform(flat_true_labels)
#y_temp

### Inference

In [76]:
# load a model along with its weights, biases and hyperparameters
QTmodel = ProductClassifier.load_from_checkpoint(best_model_path)
QTmodel.eval()

ProductClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise

In [77]:
def predict(product, model, device='cpu'):
    # Move model to the chosen device
    model.to(device)

    text_enc = Bert_tokenizer.encode_plus(
        product,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        return_attention_mask=True,
        truncation=True,
        return_tensors='pt'
    )
    input_ids = text_enc['input_ids'].to(device)
    attention_mask = text_enc['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
    
    pred_out = torch.sigmoid(outputs).to('cpu').numpy()
    #print(f'Outputs = {outputs}')
    #print(f'Type = {type(outputs)}')
    #print(f'Pred Outputs = {pred_out}')
    #print(f'Type = {type(pred_out)}')
    preds = np.round(pred_out)
    preds = (pred_out > opt_thresh)
    new_preds = preds.astype(int)
    pred_tags = mlb.inverse_transform(new_preds)

    return pred_tags


### Try the model, give a product name. (Selected Randomly from Amazon website as a benchmark)

In [79]:
product = "Power Stop CRK5377 Coated Brake Rotor & Ceramic Brake Pads- front"

tags = predict(product,model)
if not tags[0]:
    print('This Product can not be associated with any known category - Please review to see if a new category is required ')
else:
    print(f'Following Tags are associated : \n {tags}')

Following Tags are associated : 
 [('Parts',)]


In [80]:
product = "ACANII - For 2006-2014 Honda Ridgeline Headlights Headlamps Replacement 06-14 Driver + Passenger Side"

tags = predict(product,model)
if not tags[0]:
    print('This Product can not be associated with any known category - Please review to see if a new category is required ')
else:
    print(f'Following Tags are associated : \n {tags}')

Following Tags are associated : 
 [('Exterior', 'Lighting')]
