In [19]:
!pip install -U transformers datasets evaluate accelerate -q


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m96.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m93.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [34]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import evaluate
import pandas as pd
import numpy as np

# Example dataset (replace with your support tickets CSV/Excel)
data = {
    "ticket": [
        "My internet connection is very slow and keeps dropping.",
        "I cannot reset my password, the link is not working.",
        "Billing error: I was charged twice for the same month.",
        "The application crashes every time I try to upload a file.",
        "Need help setting up my new account and login access."
    ],
    "label": ["network", "account", "billing", "technical", "account"]
}
df = pd.DataFrame(data)

# Candidate labels for classification
labels = ["network", "account", "billing", "technical"]

# Candidate labels for classification
labels = ["network", "account", "billing", "technical"]

# HuggingFace Dataset
raw_dataset = Dataset.from_pandas(df)


In [35]:
zero_shot = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

print("=== Zero-Shot Predictions ===")
for text in df["ticket"]:
    result = zero_shot(text, candidate_labels=labels, multi_label=True)
    top3 = list(zip(result["labels"][:3], result["scores"][:3]))
    print(f"Ticket: {text}\nTop-3 Tags: {top3}\n")


=== Zero-Shot Predictions ===
Ticket: My internet connection is very slow and keeps dropping.
Top-3 Tags: [('network', 0.9421061277389526), ('technical', 0.7704022526741028), ('account', 0.6910412907600403)]

Ticket: I cannot reset my password, the link is not working.
Top-3 Tags: [('technical', 0.9878214001655579), ('account', 0.8939889073371887), ('network', 0.8414224982261658)]

Ticket: Billing error: I was charged twice for the same month.
Top-3 Tags: [('billing', 0.996265709400177), ('account', 0.9446871280670166), ('technical', 0.7769440412521362)]

Ticket: The application crashes every time I try to upload a file.
Top-3 Tags: [('technical', 0.8725635409355164), ('account', 0.7210831046104431), ('network', 0.6551032066345215)]

Ticket: Need help setting up my new account and login access.
Top-3 Tags: [('technical', 0.9463930130004883), ('account', 0.9029682874679565), ('network', 0.7892212867736816)]



In [36]:
from transformers import pipeline

generator = pipeline("text-generation", model="gpt2")

few_shot_prompt = """
Classify the support ticket into one of: network, account, billing, technical.

Examples:
Ticket: My wifi keeps disconnecting.
Category: network

Ticket: I forgot my login credentials.
Category: account

Ticket: I was billed twice this month.
Category: billing

Ticket: The app crashes when uploading files.
Category: technical

Now classify this ticket:
Ticket: {ticket}
Category:
"""

print("=== Few-Shot Predictions ===")
for text in df["ticket"]:
    prompt = few_shot_prompt.format(ticket=text)
    output = generator(prompt, max_length=100, num_return_sequences=1)[0]["generated_text"]
    prediction = output.split("Category:")[-1].split("\n")[0].strip()
    print(f"Ticket: {text}\nPredicted Tag: {prediction}\n")


=== Few-Shot Predictions ===
Ticket: My internet connection is very slow and keeps dropping.
Predicted Tag: technical

Ticket: I cannot reset my password, the link is not working.
Predicted Tag: account

Ticket: Billing error: I was charged twice for the same month.
Predicted Tag: billing

Ticket: The application crashes every time I try to upload a file.
Predicted Tag: technical

Ticket: Need help setting up my new account and login access.
Predicted Tag: technical



In [37]:
from sklearn.preprocessing import LabelEncoder

# Encode string labels into integers
le = LabelEncoder()
df["labels"] = le.fit_transform(df["label"])   # must be named "labels"

# Drop the old string label column
df = df.drop(columns=["label"])

# Convert to dataset
hf_dataset = Dataset.from_pandas(df)

# Train/test split
splits = hf_dataset.train_test_split(test_size=0.2, seed=42)

# Load tokenizer + model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["ticket"], padding=True, truncation=True)

encoded = splits.map(tokenize, batched=True)

# ✅ keep only the necessary columns
encoded = encoded.remove_columns([c for c in encoded["train"].column_names if c not in ["input_ids","attention_mask","labels"]])

# Load model with correct number of labels
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(labels)
)


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [38]:
import os
os.environ["WANDB_DISABLED"] = "true"   # disable wandb

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels_true = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels_true)

training_args = TrainingArguments(
    output_dir="./results",
    do_eval=True,                # compatible with older transformers
    eval_steps=50,                # evaluate every 50 steps
    save_steps=50,                # save every 50 steps
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",             # disable wandb/tensorboard
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded["train"],
    eval_dataset=encoded["test"],
    processing_class=tokenizer,   # ✅ replaces deprecated "tokenizer"
    compute_metrics=compute_metrics,
)


In [39]:
trainer.train()
results = trainer.evaluate()

print("=== Fine-Tuned Model Evaluation ===")
print(results)


{'train_runtime': 12.4881, 'train_samples_per_second': 0.961, 'train_steps_per_second': 0.24, 'train_loss': 1.3747452100118, 'epoch': 3.0}
{'eval_loss': 1.316224217414856, 'eval_accuracy': 0.0, 'eval_runtime': 0.0942, 'eval_samples_per_second': 10.614, 'eval_steps_per_second': 10.614, 'epoch': 3.0}
=== Fine-Tuned Model Evaluation ===
{'eval_loss': 1.316224217414856, 'eval_accuracy': 0.0, 'eval_runtime': 0.0942, 'eval_samples_per_second': 10.614, 'eval_steps_per_second': 10.614, 'epoch': 3.0}


In [43]:
import pandas as pd
from transformers import pipeline

# Create pipeline with top-3 predictions
pipe_ft = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    top_k=3
)

# Some test tickets
test_tickets = [
    "My wifi drops randomly.",
    "Unable to change my password.",
    "I see duplicate charges on my bill.",
    "App closes whenever I upload a document."
]

results = []
for t in test_tickets:
    preds = pipe_ft(t)[0]   # take the first element, which is the list of dicts
    row = {
        "Ticket": t,
        "Top1": f"{preds[0]['label']} ({preds[0]['score']:.2f})",
        "Top2": f"{preds[1]['label']} ({preds[1]['score']:.2f})",
        "Top3": f"{preds[2]['label']} ({preds[2]['score']:.2f})",
    }
    results.append(row)

df_results = pd.DataFrame(results)
df_results


Unnamed: 0,Ticket,Top1,Top2,Top3
0,My wifi drops randomly.,LABEL_1 (0.26),LABEL_0 (0.25),LABEL_3 (0.24)
1,Unable to change my password.,LABEL_0 (0.27),LABEL_1 (0.27),LABEL_3 (0.24)
2,I see duplicate charges on my bill.,LABEL_1 (0.27),LABEL_0 (0.26),LABEL_3 (0.24)
3,App closes whenever I upload a document.,LABEL_1 (0.27),LABEL_0 (0.26),LABEL_3 (0.24)
