Leveraging a pre-trained model from Hugging Face and fine-tune it with data  to help classify interactions depending on the risk for churn.

In [8]:
import pandas as pd

# Create dummy dataframes for demonstration
train_data = pd.DataFrame({
    "interaction": ["I'm really upset with the delays on delivering this item. Where is it?", "The support I've had on this issue has been terrible and really unhelpful. Why will no one help me?", 'I have a question about how to use this product. Can you help me?', 'This product is listed as out of stock. When will it be available again?'],
    "label": ['high risk', 'high risk', 'low risk', 'low risk'] # Added labels to match the number of interactions
})

test_data = pd.DataFrame({
    "interaction": ['You charged me twice for the one item. I need a refund.'], # Added another test interaction
    "label": ['high risk'] # Added a label to match the number of test interactions
})

In [9]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize the data
tokenized_training_data = tokenizer(train_data["interaction"].tolist(), return_tensors="pt", padding=True,truncation=True, max_length=20)

tokenized_test_data = tokenizer(test_data["interaction"].tolist(), return_tensors="pt", padding=True, truncation=True, max_length=20)

print(tokenized_training_data)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'input_ids': tensor([[  101,  1045,  1005,  1049,  2428,  6314,  2007,  1996, 14350,  2006,
         12771,  2023,  8875,  1012,  2073,  2003,  2009,  1029,   102,     0],
        [  101,  1996,  2490,  1045,  1005,  2310,  2018,  2006,  2023,  3277,
          2038,  2042,  6659,  1998,  2428,  4895, 16001, 14376,  5313,   102],
        [  101,  1045,  2031,  1037,  3160,  2055,  2129,  2000,  2224,  2023,
          4031,  1012,  2064,  2017,  2393,  2033,  1029,   102,     0,     0],
        [  101,  2023,  4031,  2003,  3205,  2004,  2041,  1997,  4518,  1012,
          2043,  2097,  2009,  2022,  2800,  2153,  1029,   102,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])}


The text has been encoded into numeric tensor tokens.

tokenizing the data in rows or batches

In [10]:
def tokenize_function(data):
    return tokenizer(data["interaction"],
                     return_tensors='pt',
                     padding=True,
                     truncation=True,
                     max_length=64)

tokenized_in_batches = train_data.map(tokenize_function, batched=True)

TypeError: tokenize_function() got an unexpected keyword argument 'batched'

In [11]:
def tokenize_function(data):
    return tokenizer(data["interaction"].tolist(), # Convert Series to list
                     return_tensors='pt',
                     padding=True,
                     truncation=True,
                     max_length=64)

batch_size = 16  # Define your desired batch size
tokenized_batches = []

for i in range(0, len(train_data), batch_size):
    batch_data = train_data[i : i + batch_size]
    tokenized_batch = tokenize_function(batch_data)
    tokenized_batches.append(tokenized_batch)

# You can now work with the tokenized_batches list
print(tokenized_batches)

[{'input_ids': tensor([[  101,  1045,  1005,  1049,  2428,  6314,  2007,  1996, 14350,  2006,
         12771,  2023,  8875,  1012,  2073,  2003,  2009,  1029,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  1996,  2490,  1045,  1005,  2310,  2018,  2006,  2023,  3277,
          2038,  2042,  6659,  1998,  2428,  4895, 16001, 14376,  5313,  1012,
          2339,  2097,  2053,  2028,  2393,  2033,  1029,   102],
        [  101,  1045,  2031,  1037,  3160,  2055,  2129,  2000,  2224,  2023,
          4031,  1012,  2064,  2017,  2393,  2033,  1029,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2023,  4031,  2003,  3205,  2004,  2041,  1997,  4518,  1012,
          2043,  2097,  2009,  2022,  2800,  2153,  1029,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 

 We have created dataset objects that we  can use during the training process for fine-tuning

In [14]:
from transformers import TrainingArguments

# Set up an instance of TrainingArguments
training_args = TrainingArguments(
  output_dir="./finetuned",
  # Set the evaluation strategy
  eval_strategy='epoch',
  # Specify the number of epochs
  num_train_epochs=3,
  learning_rate=2e-5,
  # Set the batch sizes
  per_device_train_batch_size=3,
  per_device_eval_batch_size=3,
  weight_decay=0.01
)

In [18]:
from transformers import Trainer, DistilBertForSequenceClassification
from datasets import Dataset
import torch

# Map labels to integers
label_map = {'low risk': 0, 'high risk': 1}
train_data['label'] = train_data['label'].map(label_map)
test_data['label'] = test_data['label'].map(label_map)

# Convert pandas DataFrames to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_data)
eval_dataset = Dataset.from_pandas(test_data)

# Tokenize the datasets
train_dataset = train_dataset.map(lambda examples: tokenizer(examples["interaction"], padding="max_length", truncation=True), batched=True)
eval_dataset = eval_dataset.map(lambda examples: tokenizer(examples["interaction"], padding="max_length", truncation=True), batched=True)

# Set the format for PyTorch tensors
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


# Set up the trainer object
trainer = Trainer(
    model=DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_map)),
    # Assign the training arguments and tokenizer
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.726608
2,No log,0.720246
3,No log,0.72003




TrainOutput(global_step=6, training_loss=0.700273354848226, metrics={'train_runtime': 54.1676, 'train_samples_per_second': 0.222, 'train_steps_per_second': 0.111, 'total_flos': 1589608783872.0, 'train_loss': 0.700273354848226, 'epoch': 3.0})

In [21]:
input_text = ["I'd just like to say, I didnt like the product! Thank you!"]

# Tokenize the new data
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

# Pass the tokenized inputs through the model
with torch.no_grad():
    outputs = model(**inputs)

# Extract the new predictions
predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()

label_map = {0: "Low risk", 1: "High risk"}
for i, predicted_label in enumerate(predicted_labels):
    churn_label = label_map[predicted_label]
    print(f"\n Input Text {i + 1}: {input_text[i]}")
    print(f"Predicted Label: {predicted_label}")


 Input Text 1: I'd just like to say, I didnt like the product! Thank you!
Predicted Label: 1


Transfer learning with one-shot learning

In [23]:
# Include an example in the input ext
input_text = """
Text: "The dinner we had was great and the service too."
Classify the sentiment of this sentence as either positive or negative.
Example:
Text: "The food was delicious"
Sentiment: Positive
Text: "The dinner we had was great and the service too."
Sentiment:
"""

# Tokenize the new data
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=100)

# Apply the example to the model
with torch.no_grad():
    outputs = model(**inputs)

# Extract the new predictions
predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()


sentiment_map = {0: "Negative", 1: "Positive"}

# Since the input text is a single example, we take the first prediction
predicted_sentiment = sentiment_map[predicted_labels[0]]

print(f"Predicted Sentiment: {predicted_sentiment}")

Predicted Sentiment: Positive
