In [1]:
from pathlib import Path

import numpy as np
import pandas as pd


In [3]:
# Load the raw customer-support dataset from the project's data folder.
raw_csv_path = "../data/raw/customer_support_dataset.csv"
df = pd.read_csv(raw_csv_path)


In [4]:
# Preview ten randomly chosen rows; no random_state is passed, so the
# selection changes on every run.
df.sample(10)

Unnamed: 0,flags,instruction,category,intent,response
24131,BEQ,i do not know what i need to do to switch to t...,ACCOUNT,switch_account,Not a problem! I'm here to provide you with cl...
23576,BL,trouble setting the shipping address up,SHIPPING,set_up_shipping_address,I'm sorry to hear that you're having trouble s...
11252,BILPQZ,would it be possible to close a {{Account Cate...,ACCOUNT,delete_account,Thank you for your message to us about closing...
5242,BCL,"I have to list the payment modalities, help me",PAYMENT,check_payment_methods,For sure! I'd be delighted to assist you in li...
10822,BKLZ,openng {{Account Category}} account,ACCOUNT,create_account,Absolutely! I'm thrilled to assist you in open...
9497,BIL,can I contact somebody?,CONTACT,contact_human_agent,We're listening! I'm keyed into the fact that ...
20447,BL,I don't know how I can recover the password of...,ACCOUNT,recover_password,"Oh, I understand that you're unsure about the ..."
5362,BIL,what do I have to do to see what payment optio...,PAYMENT,check_payment_methods,Thank you for contacting and expressing your i...
13375,BELZ,I do not knoe how I can see when will my packa...,DELIVERY,delivery_period,We understand that you're unsure of how to tra...
23489,BL,assistance to set up a secondary shipping address,SHIPPING,set_up_shipping_address,I acknowledge that you require guidance in set...


In [5]:
# Discard the "response" and "flags" columns; the remaining columns are kept as-is.
df = df.drop(["response", "flags"], axis="columns")

In [6]:
# Print column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26872 entries, 0 to 26871
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   instruction  26872 non-null  object
 1   category     26872 non-null  object
 2   intent       26872 non-null  object
dtypes: object(3)
memory usage: 629.9+ KB


### Data Cleaning and Transformation

In [7]:
# Show the first ten values of the "instruction" column.
df["instruction"].head(10)

0     question about cancelling order {{Order Number}}
1    i have a question about cancelling oorder {{Or...
2      i need help cancelling puchase {{Order Number}}
3           I need to cancel purchase {{Order Number}}
4    I cannot afford this order, cancel purchase {{...
5       can you help me cancel order {{Order Number}}?
6    I can no longer afford order {{Order Number}},...
7      I am trying to cancel purchase {{Order Number}}
8       I have got to cancel purchase {{Order Number}}
9      i need help canceling purchase {{Order Number}}
Name: instruction, dtype: object

In [8]:
import re

def clean_instruction(text):
    """Normalise one raw instruction string.

    Missing values (anything ``pd.isna`` reports as missing) are returned
    unchanged. Every other value is converted to ``str``, lowercased, and
    has the following replaced by a single space, in this order:

    1. ``{{...}}`` template placeholders,
    2. hash-number tokens such as ``#12345``,
    3. every remaining run of digits (including digits inside a word,
       so ``"2nd"`` becomes ``"nd"``).

    Runs of whitespace are then collapsed to one space and the result is
    stripped at both ends.
    """
    if pd.isna(text):
        return text

    # Order matters: "#123" must be removed as a unit before the bare
    # digit pattern runs, otherwise the "#" would be left behind.
    noise_patterns = (
        r"\{\{.*?\}\}",
        r"#\d+",
        r"\d+",
    )

    cleaned = str(text).lower()
    for pattern in noise_patterns:
        cleaned = re.sub(pattern, " ", cleaned)

    # Collapse the gaps left by the substitutions above.
    return re.sub(r"\s+", " ", cleaned).strip()


# Run clean_instruction over every value of the instruction column
df["instruction"] = df["instruction"].map(clean_instruction)


In [9]:
# Normalise both label columns in one step:
#   category -> lowercase, surrounding whitespace removed
#   intent   -> lowercase, underscores turned into spaces, surrounding whitespace removed
df = df.assign(
    category=lambda frame: frame["category"].astype(str).str.lower().str.strip(),
    intent=lambda frame: (
        frame["intent"]
        .astype(str)
        .str.lower()
        .str.replace("_", " ", regex=False)
        .str.strip()
    ),
)


In [10]:
# Switch to human-readable column labels.
df = df.rename(
    {"instruction": "User Query", "category": "Assigned Team", "intent": "User Intent"},
    axis="columns",
)


In [11]:
# Keep every row whose intent is not "check cancellation fee".
df = df.loc[df["User Intent"] != "check cancellation fee"].copy()

In [12]:
# Number of rows per "Assigned Team" label.
df["Assigned Team"].value_counts()

Assigned Team
account         5986
order           3988
refund          2992
contact         1999
invoice         1999
payment         1998
feedback        1997
delivery        1994
shipping        1970
subscription     999
Name: count, dtype: int64

In [13]:
# Distinct "Assigned Team" labels, in order of first appearance.
df["Assigned Team"].unique()

array(['order', 'shipping', 'invoice', 'payment', 'refund', 'feedback',
       'contact', 'account', 'delivery', 'subscription'], dtype=object)

In [14]:
# Distinct "User Intent" labels, in order of first appearance.
df["User Intent"].unique()

array(['cancel order', 'change order', 'change shipping address',
       'check invoice', 'check payment methods', 'check refund policy',
       'complaint', 'contact customer service', 'contact human agent',
       'create account', 'delete account', 'delivery options',
       'delivery period', 'edit account', 'get invoice', 'get refund',
       'newsletter subscription', 'payment issue', 'place order',
       'recover password', 'registration problems', 'review',
       'set up shipping address', 'switch account', 'track order',
       'track refund'], dtype=object)

In [15]:
# Confirm the column labels after renaming.
df.columns

Index(['User Query', 'Assigned Team', 'User Intent'], dtype='object')

In [16]:
# Each team and the intents it handles; inverted below into the
# intent -> assigned team lookup.
team_to_intents = {
    "order team": [
        "place order",
        "change order",
        "cancel order",
    ],
    "delivery team": [
        "track order",
        "change shipping address",
        "set up shipping address",
        "delivery options",
        "delivery period",
    ],
    "payment team": [
        "payment issue",
        "check payment methods",
        "get invoice",
        "check invoice",
    ],
    "refund team": [
        "get refund",
        "check refund policy",
        "track refund",
    ],
    "account team": [
        "create account",
        "edit account",
        "delete account",
        "switch account",
        "recover password",
        "registration problems",
    ],
    "support team": [
        "complaint",
        "review",
        "contact customer service",
        "contact human agent",
    ],
    "subscription team": [
        "newsletter subscription",
    ],
}

# intent -> assigned team mapping
intent_to_team = {
    intent: team
    for team, intents in team_to_intents.items()
    for intent in intents
}

# Normalize User Intent text so it lines up with the lookup keys
df["User Intent"] = df["User Intent"].astype(str).str.lower().str.strip()

# Derive Assigned Team FROM User Intent; intents absent from the lookup become "unknown"
df["Assigned Team"] = df["User Intent"].map(intent_to_team).fillna("unknown")


In [17]:
# Spot-check ten random rows; no random_state is passed, so the
# selection changes on every run.
df.sample(10)

Unnamed: 0,User Query,Assigned Team,User Intent
5654,can you help me seeing the payment options?,payment team,check payment methods
23808,i cannot submit my shipping address,delivery team,set up shipping address
19228,"i have got to acquire some items, i need help",order team,place order
13707,can you help me to check how soon can i expect...,delivery team,delivery period
24286,using account,account team,switch account
20359,can uhelp me resetting the access key of my us...,account team,recover password
8506,i am trying to get in touch with customer support,support team,contact customer service
8123,i have to see at what time i can reach custome...,support team,contact customer service
11927,is it possible to ordet from,delivery team,delivery options
23078,i need to set the secondary shipping address up,delivery team,set up shipping address


In [18]:
# (rows, columns) of the frame at this point.
df.shape

(25922, 3)

In [19]:
# Write the cleaned frame to disk without the index column.
# to_csv does not create missing parent folders, so make sure the output
# directory exists first; otherwise a fresh checkout fails on this cell.
output_path = Path("../data/processed/cleaned_dataset.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)