In [19]:
!pip install datasets
!pip install -U accelerate



In [2]:
!pip install transformers
!pip install rouge_score
!pip install scikit-learn



In [1]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
from sklearn.model_selection import train_test_split


In [2]:
df = pd.read_csv('nlp-3 training data.csv', encoding="utf-8")
#input_csv = 'nlp-3 training data.csv'
dataset = Dataset.from_pandas(df)

In [3]:
df.columns = [col.lower() for col in df.columns]
df = df.applymap(lambda s: s.lower() if type(s) == str else s)

In [4]:
#convert_columns_to_lowercase(input_csv)
df


Unnamed: 0,questions,labels
0,where can i find the crème de la mer document?,moisturizer/la mer
1,show me the file for crème de la mer.,moisturizer/la mer
2,i need the data on crème de la mer.,moisturizer/la mer
3,locate the crème de la mer information.,moisturizer/la mer
4,find the report on crème de la mer.,moisturizer/la mer
...,...,...
751,how can i locate the moisture bound sleeping r...,face mask/amorepacific
752,where can i find the moisture bound sleeping r...,face mask/amorepacific
753,what is the directory for the moisture bound s...,face mask/amorepacific
754,how do i access the moisture bound sleeping re...,face mask/amorepacific


In [5]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
unique_labels = df['labels'].unique()
label_encoder.fit(unique_labels)
df['label_encoded'] = label_encoder.transform(df['labels'])
encoded_labels = label_encoder.fit_transform(unique_labels)

In [6]:
def tokenize_function(examples):
    return tokenizer(examples['questions'], padding='max_length', truncation=True)


In [7]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenized_data = df['questions'].apply(lambda x: tokenizer(x, padding='max_length', truncation=True, max_length=128))
df['input_ids'] = tokenized_data.apply(lambda x: x['input_ids'])
df['attention_mask'] = tokenized_data.apply(lambda x: x['attention_mask'])
df.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Unnamed: 0,questions,labels,label_encoded,input_ids,attention_mask
0,where can i find the crème de la mer document?,moisturizer/la mer,28,"[101, 2073, 2064, 1045, 2424, 1996, 13675, 213...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ..."
1,show me the file for crème de la mer.,moisturizer/la mer,28,"[101, 2265, 2033, 1996, 5371, 2005, 13675, 213...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ..."
2,i need the data on crème de la mer.,moisturizer/la mer,28,"[101, 1045, 2342, 1996, 2951, 2006, 13675, 213...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ..."
3,locate the crème de la mer information.,moisturizer/la mer,28,"[101, 12453, 1996, 13675, 21382, 2139, 2474, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ..."
4,find the report on crème de la mer.,moisturizer/la mer,28,"[101, 2424, 1996, 3189, 2006, 13675, 21382, 21...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ..."


In [8]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(set(df['labels'])))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
import torch
from torch.utils.data import Dataset

class FAQDataset(Dataset):
    def __init__(self, df):
        self.input_ids = list(df['input_ids'])
        self.attention_mask = list(df['attention_mask'])
        self.labels = list(df['label_encoded'])

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_mask[idx], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }
dataset = FAQDataset(df)

In [10]:
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = FAQDataset(train_df)
val_dataset = FAQDataset(val_df)

In [11]:

from torch.utils.data import DataLoader
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Original Model

In [28]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_metric
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    accuracy = accuracy_score(p.label_ids, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    rouge = load_metric("rouge")
    rouge_score = rouge.compute(predictions=preds, references=p.label_ids)
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'rouge1': rouge_score['rouge1'].mid.fmeasure,
        'rouge2': rouge_score['rouge2'].mid.fmeasure,
        'rougeL': rouge_score['rougeL'].mid.fmeasure
    }
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_encoder.classes_))
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    logging_strategy='steps',
    logging_steps=10,
    learning_rate=2e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.005,
    report_to="none",
    save_strategy="epoch"
)

# Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Rouge1,Rouge2,Rougel
1,3.1496,2.674202,0.348684,0.188281,0.348684,0.229033,0.348684,0.0,0.348684
2,1.9912,1.704301,0.618421,0.536574,0.618421,0.547949,0.618421,0.0,0.618421
3,1.4288,1.307843,0.717105,0.623362,0.717105,0.649409,0.717105,0.0,0.717105


  _warn_prf(average, modifier, msg_start, len(result))
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
  _warn_prf(average, modifier, msg_start, len(result))
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
  _warn_prf(average, modifier, msg_start, len(result))
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


TrainOutput(global_step=114, training_loss=2.311977921870717, metrics={'train_runtime': 61.4677, 'train_samples_per_second': 29.479, 'train_steps_per_second': 1.855, 'total_flos': 60051607852032.0, 'train_loss': 2.311977921870717, 'epoch': 3.0})

In [13]:
import pickle
model_file_path = 'distilbert_model.pkl'
with open(model_file_path, 'wb') as f:
    pickle.dump((model.state_dict(), model.config), f)

print(f"Model saved to {model_file_path}")

# Load the model using pickle
with open(model_file_path, 'rb') as f:
    state_dict, config = pickle.load(f)

# Recreate the model using the loaded state_dict and config
loaded_model = AutoModelForSequenceClassification.from_config(config)
loaded_model.load_state_dict(state_dict)

Model saved to distilbert_model.pkl


<All keys matched successfully>

In [14]:
results = trainer.evaluate()
print(results)

  _warn_prf(average, modifier, msg_start, len(result))
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 1.397429347038269, 'eval_accuracy': 0.6973684210526315, 'eval_precision': 0.6245863237639554, 'eval_recall': 0.6973684210526315, 'eval_f1': 0.632279034175319, 'eval_rouge1': 0.6973684210526315, 'eval_rouge2': 0.0, 'eval_rougeL': 0.6973684210526315, 'eval_runtime': 1.0488, 'eval_samples_per_second': 144.921, 'eval_steps_per_second': 9.534, 'epoch': 3.0}


In [18]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
def predict_top_3(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
    top_3_indices = torch.topk(probabilities, 3, dim=1).indices[0].cpu().numpy()
    top_3_labels = label_encoder.inverse_transform(top_3_indices)
    top_3_probs = probabilities[0, top_3_indices].cpu().numpy()
    return list(zip(top_3_labels, top_3_probs))

def print_folders_and_subfolders(predictions):
    for label, prob in predictions:
        folder, sub_folder = label.split('/')
        print(f"Folder: {folder}, Sub-folder: {sub_folder}, Probability: {prob:.4f}")

example_text = "where is the vitamin glowing face mask located?"
predictions = predict_top_3(example_text)
print_folders_and_subfolders(predictions)


Folder: face mask, Sub-folder: fresh, Probability: 0.8884
Folder: eye cream, Sub-folder: fresh, Probability: 0.0170
Folder: face mask, Sub-folder: bobbi brown, Probability: 0.0138


## Abalation Study
Testing the model parameters for higher learning rate

In [22]:
training_args_higher_lr = TrainingArguments(
    output_dir='./results_higher_lr',
    learning_rate=1e-3,
)

trainer_higher_lr = Trainer(
    model=model,
    args=training_args_higher_lr,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer_higher_lr.train()
results_higher_lr = trainer_higher_lr.evaluate()
print("Higher Learning Rate Results:", results_higher_lr)


Step,Training Loss


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Lower Learning Rate Results: {'eval_loss': 0.16461966931819916, 'eval_accuracy': 0.9802631578947368, 'eval_precision': 0.982865057380293, 'eval_recall': 0.9802631578947368, 'eval_f1': 0.9790386829860515, 'eval_rouge1': 0.9802631578947368, 'eval_rouge2': 0.0, 'eval_rougeL': 0.9802631578947368, 'eval_runtime': 1.4498, 'eval_samples_per_second': 104.844, 'eval_steps_per_second': 13.105, 'epoch': 3.0}


Step,Training Loss


  _warn_prf(average, modifier, msg_start, len(result))
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Higher Learning Rate Results: {'eval_loss': 3.1831648349761963, 'eval_accuracy': 0.18421052631578946, 'eval_precision': 0.03420209176788124, 'eval_recall': 0.18421052631578946, 'eval_f1': 0.057629942578322574, 'eval_rouge1': 0.18421052631578946, 'eval_rouge2': 0.0, 'eval_rougeL': 0.18421052631578946, 'eval_runtime': 1.0176, 'eval_samples_per_second': 149.366, 'eval_steps_per_second': 18.671, 'epoch': 3.0}


In [23]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
def predict_top_3(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
    top_3_indices = torch.topk(probabilities, 3, dim=1).indices[0].cpu().numpy()
    top_3_labels = label_encoder.inverse_transform(top_3_indices)
    top_3_probs = probabilities[0, top_3_indices].cpu().numpy()
    return list(zip(top_3_labels, top_3_probs))

def print_folders_and_subfolders(predictions):
    for label, prob in predictions:
        folder, sub_folder = label.split('/')
        print(f"Folder: {folder}, Sub-folder: {sub_folder}, Probability: {prob:.4f}")

example_text = "where is the vitamin glowing face mask located?"
predictions = predict_top_3(example_text)
print_folders_and_subfolders(predictions)

Folder: sun protect, Sub-folder: supergoop!, Probability: 0.0906
Folder: face mask, Sub-folder: fresh, Probability: 0.0821
Folder: eye cream, Sub-folder: fresh, Probability: 0.0527


## Analysis of Model Outputs
### Original Model Output
  Before the ablation study, let’s assume the original model provided the following outputs for a given input:

Folder: face mask, Sub-folder: fresh, Probability: 0.7897\
Folder: eye cream, Sub-folder: fresh, Probability: 0.0319\
Folder: moisturizer, Sub-folder: fresh, Probability: 0.0210

###New Model Output After Ablation Study
After conducting the ablation study and removing weight decay, the model provided the following outputs:

Folder: sun protect, Sub-folder: supergoop!, Probability: 0.0906\
Folder: face mask, Sub-folder: fresh, Probability: 0.0821\
Folder: eye cream, Sub-folder: fresh, Probability: 0.0527\

### Detailed Analysis
#### Impact on Top Prediction

Original Top Prediction: face mask/fresh with a probability of 0.7897\
Ablation Top Prediction: sun protect/supergoop! with a probability of 0.0906\
\
**Analysis**: The removal of weight decay led to a significant change in the top prediction. The original model was highly confident in predicting face mask/fresh, while the new model is less confident and predicts sun protect/supergoop! as the top category. This indicates that the regularization previously helped in making more confident and potentially accurate predictions.
Probability Distribution

**Original Model**: The top prediction had a much higher probability (0.7897) compared to the rest.
**Ablation Model**: The probabilities are more evenly distributed (0.0906, 0.0821, and 0.0527).
**Analysis**: Removing weight decay resulted in a less confident model, as indicated by the more even distribution of probabilities. This could mean that without regularization, the model's decision boundaries have become less clear, leading to more uncertainty in predictions.