# 01-03 : Intent Cleanup

The intents extracted using the Mistral LLM might not exactly match the original category definitions. This notebook is used to clean up the intents as much as possible to match the intents in the dataset.

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to them are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

sys.path.append(os.path.abspath("../../src"))

In [3]:
import glob
import pandas as pd
from typing import List, Tuple

import llm.classification as llm_classification

In [4]:
# Root data folder, relative to the location of this notebook
data_path = '../../data'

# Input: folder and file extension of the classification files to load
class_data_ext = 'parquet.gz'
class_data_path = data_path + '/intent_extraction'

# Output: folder and file the cleaned intents are written to
output_path = data_path + '/multiclass_model'
output_file = output_path + '/01-03_intents.parquet.gz'

## 1. Load Data

Load the classifications generated with the Large Language Model (LLM).

In [5]:
# Collect every classification file in the input folder and read each one
class_files = glob.glob(f'{class_data_path}/*.{class_data_ext}')
class_frames = [pd.read_parquet(class_file) for class_file in class_files]

# Stack the per-file frames and order the rows by document id, then category
df_class = pd.concat(class_frames).sort_values(['id', 'category'])

print(df_class.shape)
# Lift the column-width limit so the full `reason` text is visible
with pd.option_context('display.max_colwidth', None):
    display(df_class.head(6))

(10327, 5)


Unnamed: 0,category,reason,relevance,sentiment,id
33,Cancellation,The text mentions 'cancelled the contract' and 'they didn't cancel my contract'.,1.0,negative,3950516
34,Fraud,The text uses the term 'scam' and 'fraudsters' to describe Vodacom.,1.0,negative,3950516
32,Response,The text describes the customer's frustration with not receiving any feedback or response from Vodacom's legal department despite following up multiple times.,1.0,negative,3950535
29,Billing,The text mentions that an amount was debited from the account twice.,1.0,negative,3950575
30,Cancellation,The text expresses the intent to cancel all contracts with Vodacom due to the billing issue.,1.0,negative,3950575
31,Customer's Feeling,The text contains a negative sentiment towards Vodacom.,0.5,negative,3950575


Load the category definitions from the JSONL configuration file.

In [6]:
# read the category definitions and put them in a DataFrame
category_def = pd.DataFrame(llm_classification.load_category_definitions())

# distinct category names, sorted in place, used as the list of valid categories
categories = category_def['name'].unique()
categories.sort()

print(f'Number of Categories: {len(categories)}\n')
print(categories)


Number of Categories: 19

['Abuse' 'Account Management' 'Billing' 'Brand' 'Call Center'
 'Cancellation' "Customer's Feeling" 'Devices' 'Network Coverage' 'Other'
 'Policy' 'Price Plans' 'Products' 'Resolution' 'Response' 'SIM' 'Service'
 'Services' 'Staff Level']


## 2. Intent Analysis

### 2.1 New Categories

Find the categories that were extracted but are not present in the original category definitions.

In [7]:
def get_new_categories(df:pd.DataFrame, categories:List) -> List:
    """Return the distinct `category` values of `df` that are not valid categories.

    Args:
        df: frame with a `category` column.
        categories: collection of valid category names.

    Returns:
        Sorted list of the category values in `df` that are absent from `categories`.
    """
    return sorted(
        label for label in df['category'].unique()
        if label not in categories
    )

In [8]:
# extracted labels that do not appear in the category definitions
new_categories = get_new_categories(df=df_class, categories=categories)
print(f'Number of New Categories: {len(new_categories)}\n')

Number of New Categories: 417



In [9]:
# preview the first 20 entries of the sorted list of new categories
print(new_categories[0:20])

['Abuse (Fraud)', 'Abuse (Harassment)', 'Abuse - Unprofessional behavior', 'Account Management (Activation/Deactivation)', 'Account Management (Billing)', 'Account Management (Cancellation Enquiry or Lost/Stolen Device)', 'Account Management (Cancellation Enquiry)', 'Account Management (Cancellation)', 'Account Management (Change Account Details)', 'Account Management (Change of MSISDN)', 'Account Management (Contract Enquiry, Communication)', 'Account Management (Contract)', 'Account Management (Customer Service Response Time)', 'Account Management (Fraud)', 'Account Management (MSISDN change or activation)', 'Account Management (MSISDN change)', 'Account Management (Payment Issue)', 'Account Management (Relocations, Installation, Cancellation)', 'Account Management (Resolution)', 'Account Management (Transfer of Ownership)']


### 2.2 Prefix Matching
If an extracted category is not in the predefined list but starts with the name of a predefined category (compared case-insensitively), replace it with that predefined category. For example, `Account Management (Billing)` becomes `Account Management`.

In [10]:
def replace_categories(original_category, valid_categories):
    """Map an extracted category onto the valid category it starts with.

    The comparison is case-insensitive. When several valid categories are a
    prefix of the extracted one (e.g. 'Service' and 'Services'), the longest
    is chosen, so 'Services (Online)' maps to 'Services' rather than 'Service'.

    Args:
        original_category: category label as extracted.
        valid_categories: collection of valid category names.

    Returns:
        The matching valid category, or `original_category` unchanged when it
        is already valid, is not a string, or no valid category is a prefix of it.
    """
    # Already a valid category: nothing to clean up
    if original_category in valid_categories:
        return original_category

    # Non-text labels (e.g. missing values) cannot be prefix-matched
    if not isinstance(original_category, str):
        return original_category

    lowered = original_category.lower()
    prefix_matches = [
        valid_category for valid_category in valid_categories
        if lowered.startswith(valid_category.lower())
    ]

    # Prefer the most specific (longest) prefix; returning the first hit would
    # let a shorter name shadow a longer one that shares the same start
    if prefix_matches:
        return max(prefix_matches, key=len)

    # If no match is found, return the original category
    return original_category

# Build df_mapped as a copy of df_class whose 'category' column has been
# passed through the prefix matcher; df_class itself is left unchanged
df_mapped = df_class.assign(
    category=df_class['category'].apply(
        lambda label: replace_categories(label, categories)
    )
)

In [11]:
# labels still outside the category definitions after prefix matching
new_categories = get_new_categories(df=df_mapped, categories=categories)
print(f'Number of New Categories: {len(new_categories)}\n')

Number of New Categories: 115



### 2.3 Qualifier Matching

If an extracted category is still not in the predefined list but contains the name of a predefined category in round brackets (compared case-insensitively), replace it with that predefined category. For example, a label containing `(Billing)` becomes `Billing`.

In [12]:
def replace_qualifiers(original_category, valid_categories):
    """Map an extracted category onto a valid category named in round brackets.

    The comparison is case-insensitive. Valid categories are tried in the
    order given and the first one found as '(name)' inside the label wins.

    Args:
        original_category: category label as extracted.
        valid_categories: collection of valid category names.

    Returns:
        The matching valid category, or `original_category` unchanged when it
        is already valid or contains no bracketed valid category.
    """
    # Already a valid category: keep as is
    if original_category in valid_categories:
        return original_category

    # Valid categories whose lowercase name appears in brackets in the label
    bracketed = (
        candidate for candidate in valid_categories
        if '(' + candidate.lower() + ')' in original_category.lower()
    )

    # First hit wins; fall back to the untouched label
    return next(bracketed, original_category)

# Run the bracketed-qualifier matcher over the 'category' column
df_mapped['category'] = df_mapped['category'].apply(
    lambda label: replace_qualifiers(label, categories)
)

In [13]:
# labels still outside the category definitions after qualifier matching
new_categories = get_new_categories(df=df_mapped, categories=categories)
print(f'Number of New Categories: {len(new_categories)}\n')

Number of New Categories: 107



### 2.4 Manual Category Mapping

Manually map the extracted categories to the original categories.

In [14]:
def show_new_category_counts(df:pd.DataFrame, new_categories:List, top:int=20) -> None:
    """Print how often each new category occurs in `df`, most frequent first.

    Args:
        df: frame with a `category` column.
        new_categories: category labels to count.
        top: maximum number of categories to print.
    """
    # Build the mask from the frame that was passed in, not from a notebook
    # global, so the function works for any frame and on a fresh kernel
    new_category_rows = df[df['category'].isin(new_categories)]
    print(new_category_rows['category'].value_counts().head(top))

In [15]:
# most frequent labels that still need a manual mapping
show_new_category_counts(df=df_mapped, new_categories=new_categories)

category
Customer Feeling                121
Customer Service                100
Delivery Enquiry                 60
Fraud                            33
Insurance                        21
Complaint                        20
Contract                         15
Communication                    13
Sales Enquiry                    11
Loadshedding                      8
Customer Feeling (Negative)       6
Data Bundle Enquiry               6
Customer Feeling (Not Happy)      6
Charge Dispute                    4
Payment                           4
Upgrade Enquiry                   3
Vodacom Shop                      2
Customer Service/Call Center      2
Breach of Contract                2
Handset Enquiry                   2
Name: count, dtype: int64


In [16]:
# Manual mapping from extracted labels (keys) to valid categories (values).
# Keys must match the extracted label exactly, so near-duplicate spellings,
# including labels with an unbalanced bracket, are listed separately.
# Every key appears once: a repeated key in a dict literal silently keeps
# only the last value.
category_map = {
    "Customer Feeling": "Customer's Feeling",
    "Customer Feeling (Negative)": "Customer's Feeling",
    "Customer Feeling (Not Happy)": "Customer's Feeling",
    "Customer Feeling (Frustration)": "Customer's Feeling",
    "Customer Feeling (optional)": "Customer's Feeling",
    "Customer Feeling (Dissatisfaction)": "Customer's Feeling",
    "Customer Feeling (Disappointing Service": "Customer's Feeling",
    "Customer Feeling (Positive)": "Customer's Feeling",
    "Customer Feeling (Disappointing Service)": "Customer's Feeling",
    "Customer Service": "Call Center",
    "Vodacom Service (or Customer Service)": "Call Center",
    "Customer Care enquiry": "Call Center",
    "Customer Service enquiry": "Call Center",
    "Customer Service > Call Center Interaction": "Call Center",
    "Customer Service - Positive Feedback": "Call Center",
    "Customer Service - Call Center": "Call Center",
    "Customer Service (Call Center or Shop)": "Call Center",
    "Customer Service (poor)": "Call Center",
    "Customer Service / Resolution": "Call Center",
    "Customer Service > Products > Data Bundle Enquiry > Issue with purchased data > Incorrect information provided by Vodacom": "Call Center",
    "Customer Service (Call Center or Customer Care enquiry)": "Call Center",
    "Customer Service Enquiry": "Call Center",
    "Customer Service (or Response)": "Response",
    "Customer Service or Response": "Response",
    "Customer Experience > Complaint > Phone Repair": "Call Center",
    "Customer Experience (specifically, Renewal or Upgrade process)": "Call Center",
    "Device": "Devices",
    "Delivery Enquiry": "Devices", # require new category
    "Delivery or Logistics": "Devices",
    "Delivery Enquiry or Logistics/Delivery": "Devices",
    "Delivery Enquiry (under Devices category": "Devices",
    "Delivery Enquiry or Logistics": "Devices",
    "Delivery Enquiry (under Devices category)": "Devices",
    "Fraud": "Policy",
    "Insurance": "Devices",
    "Complaint": "Other",
    "Contract": "Products",
    "Contracts": "Products",
    "Communication": "Other",
    "Sales Enquiry": "Staff Level",
    "Sales Enquiry or Product Purchase": "Staff Level",
    "Loadshedding": "Network Coverage",
    "Load Shedding": "Network Coverage",
    "Data Bundle Enquiry": "Products",
    "Charge Dispute": "Billing",
    "Unauthorized Charges": "Billing",
    "Payment": "Billing",
    "Payment Arrangement": "Billing",
    "Refund": "Billing",
    "Upgrade Enquiry": "Devices",
    "Vodacom Shop": "Staff Level",
    "Shop/Store": "Staff Level",
    "Shop Experience": "Staff Level",
    "Staff": "Staff Level",
    "Customer Service/Call Center": "Call Center",
    "Breach of Contract": "Policy",
    "Handset Enquiry": "Devices",
    "Marketing": "Staff Level",
    "Sales Enquiry (or Unwanted Calls)": "Staff Level",
    "Repair - Complaint": "Devices",
    "Efficiency": "Staff Level",
    "Data Usage": "Products",
    "Promotions or Offers": "Products",
    "Marketing/Promotions": "Products",
    "Promotions or Campaigns": "Products",
    "Activation": "SIM",
    "Email Communication": "Other",
    "Security or Account Management": "Billing",
    "Refund Enquiry": "Billing",
    "Promotions": "Products",
    "Insurance: Vodacom Insurance": "Devices",
    "Returns": "Devices",
    "Crisis": "Other",
    "Security": "Policy",
    "Security or Privacy": "Policy",
    "Privacy": "Policy",
    "Security/Privacy": "Policy",
    "Ethics": "Policy",
    "Urgent Requests": "Other",
    "Complaint / Resolution": "Resolution",
    "Customer Service > Call Center > Unresolved Support Tickets": "Resolution",
    "Unwanted Services or Products": "Products",
    "Vouchers/Rewards": "Products",
    "Rewards Program": "Products",
    "Customer Service - Resolution": "Resolution",
    # This key used to be listed twice ("Call Center", then "Staff Level");
    # the later value always won, so only that entry is kept.
    "Communication or Assistance & Helpfulness": "Staff Level",
    "Customer Satisfaction": "Staff Level",
    "Poor Customer Service": "Staff Level",
    "Online Service": "Services",
    "Online Services": "Services",
    "Online Services > Upgrade Enquiry": "Services",
    "Online Services (Website)": "Services",
    "Vodacom Online": "Services",
    "User Experience (Website)": "Services",
    "Vodacom Repair - Complaint": "Devices",
    "Installation and Repairs": "Devices",
    "Advertising and Marketing - Misrepresentation": "Products",
    "Application Process": "Services",
    "Reliability": "Network Coverage",
    "Internet": "Network Coverage",
    "Messaging (SMS)": "Products",
    "SMS": "Products",
    "Online Sales": "Services",
    "Fibre": "Devices",
    "Repair": "Devices",
    "Unwanted": "Other",
    "Business": "Other",
    "Misinformation": "Other",
    "Email Service": "Other",
}

In [17]:
# # sort the category_map by the values
# category_map = dict(sorted(category_map.items(), key=lambda item: item[1]))

# import json
# category_map_json = json.dumps(category_map, indent=4)
# print(category_map_json)


In [18]:
def map_categories(df:pd.DataFrame, category_map:dict) -> pd.DataFrame:
    """Return a copy of `df` whose `category` values are replaced via `category_map`.

    Args:
        df: frame with a `category` column; it is not modified.
        category_map: mapping from an existing category value to its replacement.

    Returns:
        A new frame with the same columns, where category values found as keys
        in `category_map` are replaced and all other values are kept.
    """
    return df.assign(category=df['category'].replace(category_map))


In [19]:
# apply the manual mapping on top of the prefix and qualifier clean-up
df_mapped = map_categories(df=df_mapped, category_map=category_map)

# labels still outside the category definitions, and how many there are
new_categories = get_new_categories(df=df_mapped, categories=categories)
print(f'Number of New Categories: {len(new_categories)}\n')

# list up to 50 of the most frequent remaining labels
show_new_category_counts(df=df_mapped, new_categories=new_categories, top=50)

Number of New Categories: 0

Series([], Name: count, dtype: int64)


## 3. Save Data

In [20]:
# Create the output folder if needed so the write does not fail on a fresh checkout
os.makedirs(output_path, exist_ok=True)

# Remove rows that are identical in every column, then write the result.
# Rebinding instead of inplace=True keeps the cell safe to re-run.
df_mapped = df_mapped.drop_duplicates()
df_mapped.to_parquet(output_file, compression='gzip')