In [2]:
import os
from getpass import getpass

import pandas as pd
import torch
from transformers import pipeline

# Load the structured conversations from the Excel workbook (the
# "structured_conversation" column is what gets sent to the model later on).
file_path = "/content/structured_conversations_10k_cleaned.xlsx"
sample_data = pd.read_excel(file_path)

# SECURITY: never hard-code an access token in a notebook. The Hugging Face
# token is read from the HF_TOKEN environment variable, falling back to an
# interactive prompt when it is not set. The token that was previously
# committed on this line is exposed and must be revoked in the Hugging Face
# account settings.
KEY = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

In [3]:
# System prompt sent as the "system" message by extract_entities().
# It instructs the model to (1) extract per-message entities (product, service,
# issue type, time frame, other) and (2) classify the conversation into one or
# more of ten fixed intents, answering only with the JSON output structure.
# NOTE(review): the "Input Structure" section calls the input a "JSON object"
# while the example shown is a JSON array — confirm against the real
# "structured_conversation" data before rewording the prompt.
system_prompt = """You are a conversational assistant designed to analyze customer support conversations. Your task is to:
1. Extract entities like product, service, issue type, and others relevant to the conversation.
2. Classify the conversation into one or more intents.

---

### **Intents**
Here are the possible intents that you MUST classify:
1. **Complaint**: Expressions of dissatisfaction or frustration with a product, service, or experience.
2. **Request for Assistance**: Requests for help, support, or clarification.
3. **Inquiry**: Questions about products, services, or policies.
4. **Feedback**: Opinions, suggestions, or gratitude (positive, negative, or neutral).
5. **Escalation Request**: Requests to speak to a manager or higher authority for faster resolution.
6. **Account or Billing Issue**: Problems or queries about account management, billing, refunds, or payment.
7. **Technical Support**: Requests for troubleshooting or technical assistance.
8. **Delivery or Fulfillment Issue**: Issues with order delivery, delays, or couriers.
9. **Promotion or Offer Inquiry**: Questions or complaints about promotions or offers.
10. **Uncategorized**: Messages that don't fit into the above categories.

---

### **Entities**
You must extract the following entities wherever applicable:
- **Product**: Name or type of product mentioned in the conversation.
- **Service**: Name or type of service discussed.
- **Issue Type**: Nature of the problem or complaint.
- **Time Frame**: Any time period or duration mentioned.
- **Other**: Any other relevant information.

---

### **Input Structure**
The input is a JSON object with the following format:
[
  {
    "Company_name": "company_name_here"
  },
  {
    "conversation": [
      {
        "role": "Customer",
        "message": "text_here"
      },
      {
        "role": "Company",
        "message": "text_here"
      }
    ]
  }
]

- **Company_name**: The name of the company involved in the conversation.
- **Conversation**: A list of messages exchanged, where:
  - **role**: Indicates who is speaking (`Customer` or `Company`).
  - **message**: The content of the message.

---

### **Output Structure**
The output MUST be a JSON object with the following format:
{
  "Company_name": "sprintcare",
  "entities": [
    {
      "role": "Customer",
      "entities": {
        "product": "customer service",
        "service": null,
        "issue_type": "poor service",
        "time_frame": null,
        "other": null
      }
    },
    {
      "role": "Company",
      "entities": {
        "product": null,
        "service": "account review",
        "issue_type": null,
        "time_frame": null,
        "other": null
      }
    }
  ],
  "classified_intents": ["Complaint", "Request for Assistance"]
}

---

### **What You MUST Do**
1. Extract entities from every message in the conversation.
2. Classify intents using the defined categories only.
3. Output both entities and intents in the specified format.
4. Ensure all roles and messages in the input have corresponding entity entries in the output.
5. YOU HAVE TO USE OUTPUT STRUCTURE AND DO NOT ADD ANY EXTRA TEXT BEFORE OR AFTER THE OUTPUT STRUCTURE.
6. JUST ANSWER WITH OUTPUT STRUCTURE.
7. You MUST select at least one intent.
8. "Company_name", "classified_intents" and "entities" fields in the output structure MUST be filled.

---

### **What You MUST NOT Do**
1. Do not invent new entity or intent categories.
2. Do not output intents or entities not listed in the specifications.
3. Do not omit any part of the input conversation from the output.

---
"""

In [6]:
# Load the Llama pipeline
def load_llama_pipeline(huggingface_key):
    """Log in to the Hugging Face Hub and build the Llama chat pipeline.

    Args:
        huggingface_key: Access token handed to ``huggingface_hub.login``.

    Returns:
        The ``transformers`` "text-generation" pipeline for
        ``meta-llama/Llama-3.2-3B-Instruct``, created with bfloat16 weights
        and ``device_map="auto"``.
    """
    from huggingface_hub import login as hub_login

    # Authenticate first; the pipeline is only built after login succeeds.
    hub_login(huggingface_key)

    pipeline_options = {
        "model": "meta-llama/Llama-3.2-3B-Instruct",
        "torch_dtype": torch.bfloat16,
        "device_map": "auto",
    }
    return pipeline("text-generation", **pipeline_options)

# Define a function for entity extraction
from tqdm import tqdm

def extract_entities(tweet_text, pipe, max_new_tokens=256):
    """Run one conversation through the chat pipeline and return the reply text.

    Builds a two-message chat: the module-level ``system_prompt`` as the system
    message, followed by a user message that embeds ``tweet_text``. The chat is
    passed to ``pipe`` and the content of the last generated message is
    returned.

    Args:
        tweet_text: Conversation text interpolated into the user message.
        pipe: Callable text-generation pipeline, invoked as
            ``pipe(messages, max_new_tokens=...)``.
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 256 (the value that used to be hard-coded); pass a
            larger value if replies are being cut off.

    Returns:
        The ``"content"`` value of the final message in the generated chat, or
        an empty string when that message has no ``"content"`` key.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Identify all relevant entities in this tweet, including product, service, and issue type: '{tweet_text}'"},
    ]
    output = pipe(messages, max_new_tokens=max_new_tokens)
    # The pipeline echoes the whole chat; the reply is the last message.
    reply = output[0]["generated_text"][-1]
    return reply.get("content", "")

def extract_from_dataset(H_KEY, df, start_ind, end_ind):
    """Extract entities/intents for a positional row range and save to Excel.

    Loads the Llama pipeline, runs ``extract_entities`` on the
    ``"structured_conversation"`` value of each row in
    ``df.iloc[start_ind:end_ind]``, stores the raw model replies in a new
    ``"entities_intents"`` column and writes the result to
    ``/content/Intents_{start_ind}-{end_ind}.xlsx``.

    Args:
        H_KEY: Hugging Face access token forwarded to ``load_llama_pipeline``.
        df: DataFrame containing a ``"structured_conversation"`` column.
        start_ind: Positional index of the first row to process (inclusive).
        end_ind: Positional index one past the last row to process (exclusive).

    Returns:
        None. The output is the Excel file written to disk; ``df`` itself is
        not modified.
    """
    pipe = load_llama_pipeline(H_KEY)

    # Take the row slice once, positionally, so the conversations and the rows
    # that receive the results are guaranteed to be the same rows.
    sample_df = df.iloc[start_ind:end_ind]

    entities_intents = [
        extract_entities(conversation, pipe)
        for conversation in tqdm(sample_df["structured_conversation"].values)
    ]

    # .assign returns a new frame, so we never write into a slice of ``df``
    # (the original ``sample_df[...] = ...`` triggered SettingWithCopyWarning).
    sample_df = sample_df.assign(entities_intents=entities_intents)

    sample_df.to_excel(f"/content/Intents_{start_ind}-{end_ind}.xlsx", index=False)


In [7]:
extract_from_dataset(KEY,sample_data,0,10)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 10%|█         | 1/10 [00:11<01:40, 11.12s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 10%|█         | 1/10 [00:14<02:08, 14.28s/it]


KeyboardInterrupt: 