In [None]:
from pathlib import Path

import numpy as np
import pandas as pd


In [2]:
# Load the raw dataset from the notebook's working directory.
df = pd.read_csv("dataset.csv")

In [3]:
# Preview 10 randomly drawn rows; no random_state is passed, so the rows differ between runs.
df.sample(n=10)

Unnamed: 0,flags,instruction,category,intent,response
7217,BELZ,I do not know how I can make a customerr claim,FEEDBACK,complaint,I'm sorry to hear that you're unsure of how to...
19094,BLM,order several of your article,ORDER,place_order,We appreciate your interest in ordering severa...
18437,BLM,report an error with payments,PAYMENT,payment_issue,I appreciate you bringing this error in paymen...
18982,BLM,help me earn several of your item,ORDER,place_order,Thank you for expressing your interest in earn...
14593,BKL,modify information on {{Account Type}} account,ACCOUNT,edit_account,Your commitment to modifying and enhancing the...
5913,BL,I don't know how I can list the payment options,PAYMENT,check_payment_methods,"Not to worry, I'm here to help! To list the pa..."
18908,BCILM,"I have to order some of your item, can I get s...",ORDER,place_order,"Thank you for reaching out, and I appreciate y..."
23255,BLM,I have troubles setting another delivery addre...,SHIPPING,set_up_shipping_address,I'm attuned to the idea that you're experienci...
6731,BZ,I don't know what I need to do to check your ...,REFUND,check_refund_policy,I get it your uncertainty about how to access ...
2701,BILQ,can ya help me to modify the delivery address,SHIPPING,change_shipping_address,I'll make it happen! I can assist you with upd...


In [4]:
# Discard the `response` and `flags` columns; only instruction, category and intent are kept.
df = df.drop(["response", "flags"], axis="columns")

In [5]:
# Print column names, non-null counts and dtypes of the remaining columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26872 entries, 0 to 26871
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   instruction  26872 non-null  object
 1   category     26872 non-null  object
 2   intent       26872 non-null  object
dtypes: object(3)
memory usage: 629.9+ KB


### Data cleaning and Transformation

In [6]:
# Look at the first 10 instruction strings before any cleaning is applied.
df["instruction"].head(n=10)

0     question about cancelling order {{Order Number}}
1    i have a question about cancelling oorder {{Or...
2      i need help cancelling puchase {{Order Number}}
3           I need to cancel purchase {{Order Number}}
4    I cannot afford this order, cancel purchase {{...
5       can you help me cancel order {{Order Number}}?
6    I can no longer afford order {{Order Number}},...
7      I am trying to cancel purchase {{Order Number}}
8       I have got to cancel purchase {{Order Number}}
9      i need help canceling purchase {{Order Number}}
Name: instruction, dtype: object

In [7]:
import re

def clean_instruction(text):
    """Normalise one instruction string for downstream use.

    Missing values (anything ``pd.isna`` reports as missing) are returned
    unchanged. Every other value is converted to ``str``, lower-cased, and
    then has the following replaced by a single space, in this order:

    1. ``{{placeholder}}`` spans,
    2. ``#`` immediately followed by digits (e.g. ``#12345``),
    3. every remaining run of digits (including digits embedded in a word),
    4. every run of whitespace.

    Leading and trailing whitespace is stripped from the result.
    """
    # Leave missing values untouched.
    if pd.isna(text):
        return text

    cleaned = str(text).lower()

    # Order matters: placeholders and "#<digits>" go first so their digits and
    # braces disappear as a unit; whitespace is collapsed last.
    for pattern in (r"\{\{.*?\}\}", r"#\d+", r"\d+", r"\s+"):
        cleaned = re.sub(pattern, " ", cleaned)

    return cleaned.strip()


# Run every instruction through clean_instruction, element by element.
df["instruction"] = df["instruction"].map(clean_instruction)


In [8]:
# Normalise both label columns in one step: each value is cast to str,
# lower-cased and trimmed; `intent` additionally has underscores turned into spaces.
df = df.assign(
    category=lambda frame: frame["category"].astype(str).str.lower().str.strip(),
    intent=lambda frame: (
        frame["intent"]
        .astype(str)
        .str.lower()
        .str.replace("_", " ", regex=False)
        .str.strip()
    ),
)


In [9]:
# Give the three remaining columns their presentation names.
df = df.rename(
    {
        "instruction": "User Query",
        "category": "Assigned Team",
        "intent": "User Intent",
    },
    axis="columns",
)


In [10]:
# Remove every row whose intent is "check cancellation fee".
# NOTE(review): the reason for excluding this intent is not stated here - confirm.
df = df.drop(index=df.index[df["User Intent"] == "check cancellation fee"])

In [11]:
# Row count per Assigned Team; at this point the column still holds the lower-cased original category.
df["Assigned Team"].value_counts()

Assigned Team
account         5986
order           3988
refund          2992
contact         1999
invoice         1999
payment         1998
feedback        1997
delivery        1994
shipping        1970
subscription     999
Name: count, dtype: int64

In [12]:
# Distinct Assigned Team labels currently present.
df["Assigned Team"].unique()

array(['order', 'shipping', 'invoice', 'payment', 'refund', 'feedback',
       'contact', 'account', 'delivery', 'subscription'], dtype=object)

In [13]:
# Distinct User Intent values; each of these needs an entry in the intent-to-team mapping below.
df["User Intent"].unique()

array(['cancel order', 'change order', 'change shipping address',
       'check invoice', 'check payment methods', 'check refund policy',
       'complaint', 'contact customer service', 'contact human agent',
       'create account', 'delete account', 'delivery options',
       'delivery period', 'edit account', 'get invoice', 'get refund',
       'newsletter subscription', 'payment issue', 'place order',
       'recover password', 'registration problems', 'review',
       'set up shipping address', 'switch account', 'track order',
       'track refund'], dtype=object)

In [14]:
# Confirm the column labels after the rename.
df.columns

Index(['User Query', 'Assigned Team', 'User Intent'], dtype='object')

In [15]:
# Re-apply lower-casing and trimming so the lookups below match the keys exactly.
df["User Intent"] = df["User Intent"].astype(str).str.lower().str.strip()

# Routing table grouped by the team that handles each intent.
team_to_intents = {
    "order team": [
        "place order",
        "change order",
        "cancel order",
    ],
    "delivery team": [
        "track order",
        "change shipping address",
        "set up shipping address",
        "delivery options",
        "delivery period",
    ],
    "payment team": [
        "payment issue",
        "check payment methods",
        "get invoice",
        "check invoice",
    ],
    "refund team": [
        "get refund",
        "check refund policy",
        "track refund",
    ],
    "account team": [
        "create account",
        "edit account",
        "delete account",
        "switch account",
        "recover password",
        "registration problems",
    ],
    "support team": [
        "complaint",
        "review",
        "contact customer service",
        "contact human agent",
    ],
    "subscription team": [
        "newsletter subscription",
    ],
}

# Flatten the table into the intent -> assigned team lookup.
intent_to_team = {
    intent: team
    for team, intents in team_to_intents.items()
    for intent in intents
}

# Overwrite Assigned Team from User Intent; any intent missing from the
# lookup is labelled "unknown" instead of being left as NaN.
df["Assigned Team"] = df["User Intent"].map(intent_to_team).fillna("unknown")


In [16]:
# Spot-check 10 random rows after Assigned Team was rebuilt from User Intent (unseeded draw).
df.sample(n=10)

Unnamed: 0,User Query,Assigned Team,User Intent
5043,i don't know how to list the payment options,payment team,check payment methods
8734,what time can i get in touch with customer ass...,support team,contact customer service
8330,what do i have to do to talk with bloody custo...,support team,contact customer service
18969,i want assistance earning some of your product,order team,place order
11268,i want help closing a freemium account,account team,delete account
2838,"my address has changed, how could i change it?",delivery team,change shipping address
13257,can you help me see when my product is going t...,delivery team,delivery period
22005,where to leave a comment for your services?,support team,review
14183,edit information on gold account,account team,edit account
12527,give me information about the options for ship...,delivery team,delivery options


In [17]:
# (rows, columns) of the cleaned frame.
df.shape

(25922, 3)

In [None]:
# Write the cleaned frame to disk without the index column.
# DataFrame.to_csv does not create missing parent directories, so the target
# folder is created first; otherwise a fresh checkout without
# ../data/processed/ would fail here with an OSError.
output_path = Path("../data/processed/cleaned_dataset.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)