In [44]:
import pandas as pd
import torch
from transformers import pipeline
import time
import ast

In [34]:
# Load the data from the Excel file
file_path = "..\\..\\Data\processed\sample\\twcs_structured_UniqueCount-10_time-20241222-1549.xlsx"
sample_data = pd.read_excel(file_path)

# Display the first few rows of the data to understand its structure
sample_data.head()

Unnamed: 0,user_id,conversations,company_name,cleaned_conversations,structured_conversations
0,115712,Customer: @sprintcare is the worst customer se...,sprintcare,Customer is the worst customer service\nCompan...,"[{'Company_name': 'sprintcare'}, {'conversatio..."
1,115712,Customer: @sprintcare is the worst customer se...,sprintcare,Customer is the worst customer service\nCompan...,"[{'Company_name': 'sprintcare'}, {'conversatio..."
2,115712,Customer: @sprintcare is the worst customer se...,sprintcare,Customer is the worst customer service\nCompan...,"[{'Company_name': 'sprintcare'}, {'conversatio..."
3,115712,Customer: @115714 @sprintcare how can I get in...,sprintcare,Customer how can I get in touch with someone t...,"[{'Company_name': 'sprintcare'}, {'conversatio..."
4,115712,Customer: @115714 @sprintcare how can I get in...,sprintcare,Customer how can I get in touch with someone t...,"[{'Company_name': 'sprintcare'}, {'conversatio..."


In [35]:
# Load the Llama pipeline
model_id = "meta-llama/Llama-3.2-3B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [36]:
system_prompt = """You are a conversational assistant designed to analyze customer support conversations. Your task is to:
1. Extract services from the conversation.
2. Provide the output in the specified format.
3. Services is Name or type of service discussed.
---

### **Input Structure**
The input is a JSON object with the following format:
[
  { "Company_name": "company_name_here" },
  { "conversation": [
      { "role": "Customer", "message": "text_here" },
      { "role": "Company", "message": "text_here" }
    ]
  }
]

- **Company_name**: The name of the company involved in the conversation.
- **Conversation**: A list of messages exchanged, where:
  - **role**: Indicates who is speaking (`Customer` or `Company`).
  - **message**: The content of the message.

---

### **Output Structure**
The output MUST be a JSON object with the following format:
{
  "Company_name": "company_name_here",
  "service": [
    "service_name_here", 
  ]
}

### END OF FORMAT
---

### **What You MUST Do**
1. Use exact JSON format—no extra text.
2. If no service is mentioned, use "service": null.
3. List all services if multiple are mentioned.
---

### **What You MUST NOT Do**
1. Do not omit any part of the input conversation from the output.
---

"""

In [45]:
# Define a function for entity extraction
def extract_entities(conversation_text):
    """
    Extract services from structured conversation text.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Identify all services in this conversation: '{conversation_text}'"},
    ]
    output = pipe(messages, max_new_tokens=256)


    # Convert string to Python data structure
    data = ast.literal_eval(conversation_text)  # Safely converts the string into a list

    # Extract company name
    company_name = data[0]['Company_name']

    # Print company name
    print(f"Company: {company_name}\n")

    # Print conversation in a readable format
    print("Conversation:")
    for entry in data[1]['conversation']:
        role = entry['role']
        message = entry['message']
        print(f"{role}: {message}")
    print("*" * 50)
    print(output[0]["generated_text"][-1].get("content", ""))
    print("-" * 50)

    # Extract and return the generated content
    return output[0]["generated_text"][-1].get("content", "") if output else ""

In [46]:
# Add intents to the dataset
def add_intents_to_data(data, text_column="structured_conversations", intent_column="intents", subset=None):
    """
    Add a new column with extracted intents to the dataset.
    Optionally process only a subset of the data.
    """
    # If subset is provided, slice the data
    if subset:
        data = data.iloc[subset[0]:subset[1]]
    
    # Extract intents for each conversation
    data[intent_column] = data[text_column].apply(lambda text: extract_entities(text))
    
    return data

In [47]:
# Define subset range (Optional)
subset_range = [0,5]  # Example: Only process rows 1441 to 1442

# Add intents to the dataset
updated_data = add_intents_to_data(sample_data, subset=subset_range) if subset_range else add_intents_to_data(sample_data)

# # Save the updated dataset to an Excel file
# unique_user_count = len(sample_data["user_id"].unique())
# output_file_path = f'..\\..\\Data\\processed\\Intents\\ExtractedIntents_UniqueCount-{unique_user_count}_time-{time.strftime("%Y%m%d-%H%M")}.xlsx'
# updated_data.to_excel(output_file_path, index=False)

# print(f"Updated dataset with intents saved to {output_file_path}")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Company: sprintcare

Conversation:
Customer: is the worst customer service
Company: I would love the chance to review the account and provide assistance
**************************************************
{
  "Company_name": "sprintcare",
  "service": [
    "Customer Service"
  ]
}
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Company: sprintcare

Conversation:
Customer: is the worst customer service
Company: Can you please send us a private message so that I can gain further details about your account
Customer: I did
Company: Please send us a Private Message so that we can further assist you Just click Message at the top of your profile
Customer: I have sent several private messages and no one is responding as usual
Company: I understand I would like to assist you We would need to get you into a private secured link to further assistCustomer and how do you propose we do thatCustomer the only way I can get a response is to tweet apparently
**************************************************
{
  "Company_name": "sprintcare",
  "service": [
    "Customer Service"
  ]
}
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Company: sprintcare

Conversation:
Customer: is the worst customer service
Company: Hello We never like our customers to feel like they are not valued
**************************************************
{
  "Company_name": "sprintcare",
  "service": [
    "Customer Service"
  ]
}
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Company: sprintcare

Conversation:
Customer: how can I get in touch with someone that matters at Spring Corporate
Company: Just click Message at the top of your profile Were always here and eager to assist
Customer: Ive sent several private messages and no one responds
Company: Do you have a good contact number where you can be reachedCustomer I sent it twice now to apparently noone
**************************************************
{
  "Company_name": "sprintcare",
  "service": [
    "customer support",
    "message service",
    "contact number"
  ]
}
--------------------------------------------------
Company: sprintcare

Conversation:
Customer: how can I get in touch with someone that matters at Spring Corporate
Company: Do you have a good contact number where you can be reached Please send through the private link
**************************************************
{
  "Company_name": "sprintcare",
  "service": [
    "customer service",
    "corporate support"
  ]
}
----------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[intent_column] = data[text_column].apply(lambda text: extract_entities(text))
