## Import Necessary Libraries

In [None]:
!pip install transformers datasets

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import nltk
from nltk.corpus import stopwords
from sklearn.metrics import f1_score

## Load the datasets

In [None]:
# Locations of the labelled training workbook and the unlabelled test workbook.
# NOTE(review): both file names contain a space before ".xlsx" - confirm this
# matches the uploaded files.
TRAIN_XLSX_PATH = '/content/bodywash-train .xlsx'
TEST_XLSX_PATH = '/content/bodywash-test .xlsx'

train_df = pd.read_excel(TRAIN_XLSX_PATH)
test_df = pd.read_excel(TEST_XLSX_PATH)

In [None]:
train_df

Unnamed: 0,Core Item,Level 1 Factors
0,31069 we've looked every where for your body ...,Accessibility
1,I love all the scents. I buy all three at onc...,Fragrance
2,I see several in this pic I haven't tried! Go...,Accessibility
3,I'm a big fan of everything honestly. I use t...,Brand Value
4,The Best! Hands down,Brand Value
...,...,...
7739,"YUM-It has a wonderful woodsy smell. I know, i...",Fragrance
7740,YUM-My man has tried all these flavors now and...,Companion Approval
7741,YUM-My man has tried all these flavors now and...,Fragrance
7742,"YUMMY!!!-My nam smells great, what else can I ...",Companion Approval


Finding the unique labels

In [None]:
# Number of training rows carrying each distinct 'Level 1 Factors' value.
level_1_factor_column = train_df['Level 1 Factors']
level_1_counts = level_1_factor_column.value_counts()
level_1_counts

Unnamed: 0_level_0,count
Level 1 Factors,Unnamed: 1_level_1
Fragrance,2335
Brand Value,1012
Price,828
Product Texture,573
Cleansing,534
Feel / Finish,435
Companion Approval,390
Product Safety,376
Accessibility,239
Skin Care,211


## Text Preprocessing

In [None]:
# Fetch the NLTK stopword corpus, then keep the English list as a set so that
# membership checks inside preprocess_text are cheap.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def preprocess_text(text):
    """Normalise one review string before tokenization.

    Steps, in order:
      1. strip every run of non-whitespace characters that starts with "http";
      2. replace each character that is not an ASCII letter, digit or
         whitespace with a single space;
      3. lowercase the result;
      4. split on whitespace and drop tokens found in the notebook-level
         ``stop_words`` set (purely numeric tokens are always kept).

    Args:
        text: the raw review text; must be a ``str``.

    Returns:
        The surviving tokens joined by single spaces (may be empty).
    """
    without_links = re.sub(r'http\S+', '', text)
    alnum_only = re.sub(r'[^A-Za-z0-9\s]', ' ', without_links)

    kept_tokens = []
    for token in alnum_only.lower().split():
        # Keep the token when it is a number or when it is not a stopword.
        if token.isdigit() or token not in stop_words:
            kept_tokens.append(token)
    return " ".join(kept_tokens)

# Cast to str first so every cell is a string before preprocess_text sees it,
# then overwrite the column with the cleaned text.
core_item_text = train_df['Core Item'].astype(str)
train_df['Core Item'] = core_item_text.apply(preprocess_text)

In [None]:
train_df

Unnamed: 0,Core Item,Level 1 Factors
0,31069 looked every body wash nobody within 2 h...,Accessibility
1,love scents buy three theydo last house,Fragrance
2,see several pic tried gotten harder find never...,Accessibility
3,big fan everything honestly use lathering shav...,Brand Value
4,best hands,Brand Value
...,...,...
7739,yum wonderful woodsy smell know men woman love,Fragrance
7740,yum man tried flavors loves one second fave ce...,Companion Approval
7741,yum man tried flavors loves one second fave ce...,Fragrance
7742,yummy nam smells great else say,Companion Approval


## Converting the dataset into a Multi-Label Dataset

In [None]:
# Group by 'Core Item' and concatenate 'Level 1 Factors'
def concatenate_factors(df, text_column, factor_column):
    """Collapse rows that share the same text into one row per text.

    Args:
        df: frame holding at least ``text_column`` and ``factor_column``.
        text_column: name of the column to group by.
        factor_column: name of the column whose values are merged per group.

    Returns:
        A new DataFrame with the two columns ``text_column`` and
        ``factor_column``; each factor cell is the group's values converted
        to ``str`` and joined with "," in their original row order.
    """
    def _join_as_csv(values):
        return ','.join(str(value) for value in values)

    grouped = df.groupby(text_column, as_index=False)
    return grouped.agg({factor_column: _join_as_csv})

# One row per distinct cleaned review, with all of its labels comma-joined.
train_df_new = concatenate_factors(
    df=train_df,
    text_column="Core Item",
    factor_column="Level 1 Factors",
)

In [None]:
train_df_new

Unnamed: 0,Core Item,Level 1 Factors
0,,Product Safety
1,1 choice man house buy guy smells soooooo good...,"Accessibility,Brand Value,Companion Approval,F..."
2,1 product order future,Brand Value
3,1 shower gel excellent,Brand Value
4,100 correct axe im big fan son loves cedar woo...,"Brand Value,Cleansing,Product Texture"
...,...,...
3483,yum man tried flavors loves one second fave ce...,"Companion Approval,Fragrance"
3484,yum wonderful woodsy smell know men woman love,Fragrance
3485,yummy nam smells great else say,Companion Approval
3486,zero scent body wash zero scent,Fragrance


In [None]:
# Binarizer that turns each review's list of labels into a 0/1 indicator row.
mlb = MultiLabelBinarizer()

# Split every comma-joined label string into a list and fit the binarizer on those lists.
label_lists = train_df_new['Level 1 Factors'].str.split(',')
multilabel_data = mlb.fit_transform(label_lists)

# One indicator column per label, named after the classes the binarizer found.
multilabel_df = pd.DataFrame(multilabel_data, columns=mlb.classes_)

# Swap the raw label column for the indicator columns (joined side by side on the index).
train_df_new = pd.concat(
    [train_df_new.drop(columns=['Level 1 Factors']), multilabel_df],
    axis=1,
)

train_df_new

Unnamed: 0,Core Item,Accessibility,Brand Accountability,Brand For Me,Brand Value,Cleansing,Companion Approval,Convenience,Efficacy,Feel / Finish,Fragrance,Packaging,Price,Product Safety,Product Texture,Skin Care,Skin Texture Improvement
0,,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,1 choice man house buy guy smells soooooo good...,1,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0
2,1 product order future,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,1 shower gel excellent,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,100 correct axe im big fan son loves cedar woo...,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3483,yum man tried flavors loves one second fave ce...,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
3484,yum wonderful woodsy smell know men woman love,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3485,yummy nam smells great else say,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3486,zero scent body wash zero scent,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


## Train/Validation Split and Model Setup

In [None]:
# train_test_split is already imported in the notebook's import cell, so it is
# not re-imported here.
# Hold out 20% of the deduplicated reviews for validation; the fixed
# random_state keeps the split the same on every run.
train_data, val_data = train_test_split(train_df_new, test_size=0.2, random_state=42)

In [None]:
# Every column after the first ('Core Item') is a label indicator column.
labels = train_df_new.columns[1:].tolist()

# Position <-> name lookups handed to the model configuration.
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = dict(enumerate(labels))

In [None]:
# The tokenizer and the classifier are both loaded from the same pretrained checkpoint.
PRETRAINED_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizerFast.from_pretrained(PRETRAINED_CHECKPOINT)

# One output per label; problem_type marks the task as multi-label classification.
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_CHECKPOINT,
    num_labels=len(labels),
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id,
)

In [None]:
def tokenize_function(examples):
    """Tokenize the 'Core Item' texts of a batch with the notebook-level tokenizer.

    Args:
        examples: a batch mapping that contains a 'Core Item' entry.

    Returns:
        Whatever the tokenizer returns when called with
        ``padding="max_length"`` and ``truncation=True``.
    """
    texts = examples['Core Item']
    return tokenizer(texts, truncation=True, padding="max_length")

def map_labels(examples):
    """Gather a batch's per-label indicator columns into one float vector per row.

    Args:
        examples: a batch mapping with one equally long sequence per name in
            the notebook-level ``labels`` list.

    Returns:
        ``{'labels': rows}`` where ``rows[i]`` lists, in ``labels`` order, the
        float value of every label column at position ``i``.
    """
    # Batch size is taken from the first label column.
    batch_size = len(examples[labels[0]])

    label_rows = []
    for row_idx in range(batch_size):
        label_rows.append([float(examples[name][row_idx]) for name in labels])
    return {'labels': label_rows}

def _to_torch_dataset(frame):
    """Convert a pandas frame into a tokenized, labelled HuggingFace dataset.

    The frame is wrapped in a ``Dataset``, passed through ``tokenize_function``
    and then ``map_labels`` (both batched), stripped of the raw 'Core Item'
    text column and switched to the "torch" output format.
    """
    dataset = Dataset.from_pandas(frame)
    for transform in (tokenize_function, map_labels):
        dataset = dataset.map(transform, batched=True)
    dataset = dataset.remove_columns(['Core Item'])
    dataset.set_format("torch")
    return dataset


# The same pipeline is applied to both halves of the split.
train_dataset = _to_torch_dataset(train_data)
val_dataset = _to_torch_dataset(val_data)

## Model Training

In [None]:
# Tunable training settings.
NUM_EPOCHS = 5
BATCH_SIZE = 16

# Evaluate and checkpoint once per epoch; at the end, reload the checkpoint
# that scored best on "f1" (the key returned by compute_metrics).
# NOTE(review): the recorded run stopped to ask for a wandb API key during
# training - consider setting `report_to` explicitly if this notebook must
# run unattended; confirm against the TrainingArguments documentation.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

def compute_metrics(pred):
    """Compute the weighted F1 score for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (reference indicators) and
            ``predictions`` (raw model outputs).

    Returns:
        ``{"f1": score}`` where ``score`` is ``f1_score`` with
        ``average="weighted"``.
    """
    y_true = pred.label_ids
    # A label counts as predicted when its raw output is positive
    # (for logits this corresponds to a sigmoid probability above 0.5).
    y_pred = (pred.predictions > 0).astype(int)
    return {"f1": f1_score(y_true, y_pred, average="weighted")}

# Wire the model, its training configuration, the metric callback and the two
# datasets into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Run fine-tuning; the returned training summary is shown as the cell output.
trainer.train()



[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,F1
1,No log,0.270719,0.381205
2,No log,0.215019,0.625114
3,0.244500,0.187994,0.684244
4,0.244500,0.174531,0.719317
5,0.244500,0.169786,0.725781


TrainOutput(global_step=875, training_loss=0.19729711478097098, metrics={'train_runtime': 1394.4528, 'train_samples_per_second': 10.004, 'train_steps_per_second': 0.627, 'total_flos': 3670860592742400.0, 'train_loss': 0.19729711478097098, 'epoch': 5.0})

## Predictions on test dataset

In [None]:
# Keep the raw review text for the final report.
test_df_texts = test_df['Core Item'].tolist()

# Preprocess into a NEW frame instead of overwriting test_df in place.
# Overwriting made this cell non-idempotent: on a second run test_df_texts
# would capture the already-cleaned text rather than the original reviews.
test_df_clean = test_df.assign(
    **{'Core Item': test_df['Core Item'].astype(str).apply(preprocess_text)}
)

# Tokenize the cleaned test data exactly as the training data was tokenized.
test_dataset = Dataset.from_pandas(test_df_clean)
test_dataset = test_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.remove_columns(['Core Item'])
test_dataset.set_format("torch")

# Make predictions on the test dataset
predictions = trainer.predict(test_dataset)

predictions

In [None]:
# A label is predicted for a review when its raw model output is positive.
test_logits = predictions.predictions
predicted_labels = np.greater(test_logits, 0).astype(int)
predicted_labels

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Ensure every row of the multi-hot predictions carries at least one label.
def check_and_mark_zero_rows(predicted_labels, logits=None):
    """Give each all-zero prediction row the label with the highest raw score.

    Args:
        predicted_labels: 2-D NumPy array of 0/1 indicators
            (one row per example, one column per label). Updated in place.
        logits: optional array of raw model outputs with the same shape as
            ``predicted_labels``. When omitted, the notebook-level
            ``predictions.predictions`` is used, which is what this function
            always read before the parameter existed.

    Returns:
        The same ``predicted_labels`` array, with a 1 written at the arg-max
        score position of every row that was entirely zero.

    Raises:
        ValueError: if ``logits`` and ``predicted_labels`` differ in shape.
        NameError: if ``logits`` is omitted and no global ``predictions`` exists.
    """
    if logits is None:
        # Backward-compatible fallback to the notebook global.
        logits = predictions.predictions
    logits = np.asarray(logits)

    # Mismatched shapes would silently mark the wrong rows or columns.
    if logits.shape != np.shape(predicted_labels):
        raise ValueError(
            f"logits shape {logits.shape} does not match "
            f"predicted_labels shape {np.shape(predicted_labels)}"
        )

    zero_rows_indices = np.where(np.all(predicted_labels == 0, axis=1))[0]
    if len(zero_rows_indices) > 0:
        print("Zero rows detected")
        for i in zero_rows_indices:
            max_pred_index_in_row = np.argmax(logits[i])
            predicted_labels[i][max_pred_index_in_row] = 1
            print(f"Row {i} marked with max probability at index {max_pred_index_in_row}")
    else:
        print("No zero rows detected")

    return predicted_labels


# Fill in a label for any review that received none; the function writes into
# the array it is given and returns that same array.
predicted_labels = check_and_mark_zero_rows(predicted_labels)

Zero rows detected
Row 86 marked with max probability at index 9
Row 105 marked with max probability at index 12
Row 109 marked with max probability at index 12
Row 125 marked with max probability at index 14


In [None]:
# Map each multi-hot row back to its label names using the fitted binarizer.
predicted_labels_strings = mlb.inverse_transform(predicted_labels)

# Pair every raw (un-preprocessed) test review with its predicted labels.
result_columns = {
    'Core Item': test_df_texts,
    'Predicted Level 1 Factors': predicted_labels_strings,
}
results_df = pd.DataFrame(result_columns)

# Show the assembled predictions.
results_df

Unnamed: 0,Core Item,Predicted Level 1 Factors
0,"""All of the body washes are excellent and they...","(Fragrance, Skin Care)"
1,"""Cremo is by far the best!""","(Brand Value,)"
2,"""I use the Nivea's during the spring and summe...","(Brand Value,)"
3,"""Nivea and Dove. Both are great on my skin. No...","(Cleansing, Feel / Finish, Product Safety, Ski..."
4,"""OG, The one thing that would hold me back fro...","(Brand Value,)"
...,...,...
122,What's that smell honey?-That's usually the li...,"(Companion Approval, Fragrance)"
123,WOmanLY-Smells like a womans body wash. Way to...,"(Companion Approval, Fragrance)"
124,"Wonderful stuff, but what's with this price???...","(Fragrance, Price, Product Texture)"
125,Works better than advertised.-This product wor...,"(Skin Care,)"


## Output Postprocessing

In [None]:
# Flatten each collection of predicted labels into one comma-separated string.
predicted_factor_column = results_df['Predicted Level 1 Factors']
results_df['Predicted Level 1 Factors'] = predicted_factor_column.apply(','.join)

In [None]:
results_df

Unnamed: 0,Core Item,Predicted Level 1 Factors
0,"""All of the body washes are excellent and they...","Fragrance,Skin Care"
1,"""Cremo is by far the best!""",Brand Value
2,"""I use the Nivea's during the spring and summe...",Brand Value
3,"""Nivea and Dove. Both are great on my skin. No...","Cleansing,Feel / Finish,Product Safety,Skin Ca..."
4,"""OG, The one thing that would hold me back fro...",Brand Value
...,...,...
122,What's that smell honey?-That's usually the li...,"Companion Approval,Fragrance"
123,WOmanLY-Smells like a womans body wash. Way to...,"Companion Approval,Fragrance"
124,"Wonderful stuff, but what's with this price???...","Fragrance,Price,Product Texture"
125,Works better than advertised.-This product wor...,Skin Care


In [None]:
# Write the predictions next to the notebook, without the row index.
OUTPUT_CSV_PATH = 'bodywash-predicted.csv'
results_df.to_csv(OUTPUT_CSV_PATH, index=False)