# Data Analysis

In [11]:
import pandas as pd
import numpy as np

In [158]:
df = pd.read_csv('../csv/hf_combined.csv')
df.sample(3)

Unnamed: 0,label,processed_text,priority,urgency,type,queue
2731,1.0,Mobile Menu Fails to Expand on First Click,medium,not_urgent,,
1408,1.0,Disrupted Access to Digital Archives,medium,not_urgent,,
4259,1.0,Automated Tests Failing to Catch Regression Bugs,medium,not_urgent,,


In [159]:
# number of urgent and non-urgent tickets
df['urgency'].value_counts()

urgency
not_urgent    5676
urgent        2973
Name: count, dtype: int64

In [160]:
# Clip the values of not urgent tickets to 3000
df['processed_text'].duplicated().sum()
# view the duplicated rows
df.loc[df['processed_text'].duplicated(keep=False), :]

Unnamed: 0,label,processed_text,priority,urgency,type,queue
7103,,AWS Service Update Request. Please provide upd...,low,not_urgent,Request,Technical Support
7162,,Intermittent connectivity on ISR4331. Conflict...,low,not_urgent,Problem,IT Support
7364,,Printer Setup Assistance. Cannot connect to wifi.,medium,not_urgent,Request,Customer Service
7543,,Intermittent connectivity on ISR4331. Conflict...,low,not_urgent,Problem,IT Support
7781,,"Problems with Cisco ISR4331 router. Hi, I am e...",high,urgent,Problem,Technical Support
7805,,Printer Setup Assistance. Cannot connect to wifi.,medium,not_urgent,Request,Customer Service
8010,,"Problems with Cisco ISR4331 router. Hi, I am e...",high,urgent,Problem,Technical Support
8083,,AWS Service Update Request. Please provide upd...,low,not_urgent,Request,Technical Support
8169,,AWS Service Update Request. Please provide upd...,low,not_urgent,Request,Technical Support
8253,,AWS Service Update Request. Please provide upd...,low,not_urgent,Request,Technical Support


In [161]:
# drop the duplicated rows
df.drop_duplicates(subset='processed_text', keep='first', inplace=True)
df['processed_text'].duplicated().sum()

np.int64(0)

In [162]:
df['urgency'].value_counts()

urgency
not_urgent    5669
urgent        2972
Name: count, dtype: int64

### **Conclusion** - Model is biased towards non-urgent class
Two approaches - 
- Clip off non-urgent records to match number of urgent records 
- Synthesize mor urgent records to mathc number of non-urgent records 

### Approach I

In [55]:
from imblearn.under_sampling import RandomUnderSampler
X = df[['processed_text', 'priority']]
y = df['urgency']

X_resampled, y_resampled = RandomUnderSampler(random_state=42).fit_resample(X, y)

print("Class distribution after undersampling:")
print(pd.Series(y_resampled).value_counts())


Class distribution after undersampling:
urgency
not_urgent    2972
urgent        2972
Name: count, dtype: int64




In [56]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Combine back into a DataFrame
df_undersampled = pd.DataFrame({
    "processed_text": X_resampled["processed_text"],
    "priority": X_resampled["priority"],
    "urgency": y_resampled
})

print("Class distribution after undersampling:")
print(df_undersampled["urgency"].value_counts())


Class distribution after undersampling:
urgency
not_urgent    2972
urgent        2972
Name: count, dtype: int64




In [57]:
df_undersampled.sample(3)
df_undersampled.to_csv('../csv/undersampled_data.csv', index=False)

### Approach II

In [61]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Combine back into a DataFrame
df_oversampled = pd.DataFrame({
    "processed_text": X_resampled["processed_text"],
    "priority": X_resampled["priority"],
    "urgency": y_resampled
})

print("Class distribution after oversampling:")
print(df_oversampled["urgency"].value_counts())


Class distribution after oversampling:
urgency
urgent        5669
not_urgent    5669
Name: count, dtype: int64




In [64]:
df_oversampled[df_oversampled['urgency'] == 'urgent'].sample(10)

Unnamed: 0,processed_text,priority,urgency
6170,Server Outage Affecting Global Tax Filing Access,high,urgent
3723,Inconsistent performance of capacitive sensors...,high,urgent
9410,Critical fix needed for delayed input response...,high,urgent
426,We need an immediate response to address this ...,high,urgent
5808,Severe Performance Bottleneck in High Traffic ...,high,urgent
4775,Auto-recovery process fails to restart critica...,high,urgent
9309,Recurring Excel crash problem after update. De...,high,urgent
4994,Compilation failure on ARM architecture,high,urgent
9835,Critical Payroll Processing Halt During Fiscal...,high,urgent
10467,Blockchain explorer does not show updated tran...,high,urgent


In [67]:
# duplicate texts 
df_oversampled[df_oversampled['processed_text'].duplicated(keep=False)].sort_values('processed_text')

Unnamed: 0,processed_text,priority,urgency
3216,404 error on page 2 of paginated article list,high,urgent
10884,404 error on page 2 of paginated article list,high,urgent
342,A prompt and decisive response is required to ...,high,urgent
9179,A prompt and decisive response is required to ...,high,urgent
10212,A rapid response is crucial to address this ma...,high,urgent
...,...,...,...
11079,real-time messaging issues. Dear Tech Online S...,high,urgent
8625,real-time messaging issues. Dear Tech Online S...,high,urgent
10383,request for configuration changes. Dear IT Sup...,high,urgent
10680,request for configuration changes. Dear IT Sup...,high,urgent


In [163]:
import pandas as pd

df_hf = pd.read_csv("hf://datasets/rtweera/customer_care_emails/dataset.csv")

In [164]:
# Add high, medium and low label for 2,1,0 respectively
df_hf['priority'] = df_hf['email_criticality']
df_hf['urgency'] = df_hf['priority'].apply(lambda x: 'urgent' if x == 'high' else 'not_urgent')
df_hf['processed_text'] = df_hf['subject'] + '. ' + df_hf['message_body'].replace('\n', ' ', regex=True)

df_hf.head()

Unnamed: 0,subject,sender,receiver,timestamp,message_body,thread_id,email_types,email_status,email_criticality,product_types,agent_effectivity,agent_efficiency,customer_satisfaction,priority,urgency,processed_text
0,Mercury Language Documentation Issue,john.doe@example.com,support@aetheros.com,2023-10-26 10:02:34+00:00,"Hi Aetheros Support,\n\nI'm having trouble fin...",aa001-8e561ac9-8523-421f-9d05-e4c2a80a26d7,['inquiry'],ongoing,low,['Mercury Language'],high,very low,0.9043,low,not_urgent,Mercury Language Documentation Issue. Hi Aethe...
1,Re: Mercury Language Documentation Issue,support@aetheros.com,john.doe@example.com,2023-10-27 14:15:22+00:00,"Dear John,\n\nThank you for reaching out to Ae...",aa001-8e561ac9-8523-421f-9d05-e4c2a80a26d7,['inquiry'],ongoing,low,['Mercury Language'],high,very low,0.9043,low,not_urgent,Re: Mercury Language Documentation Issue. Dear...
2,Re: Mercury Language Documentation Issue,john.doe@example.com,support@aetheros.com,2023-10-28 09:38:01+00:00,"Hi,\n\nI'm using Mercury version 2.3.5.\n\nTha...",aa001-8e561ac9-8523-421f-9d05-e4c2a80a26d7,['inquiry'],ongoing,low,['Mercury Language'],high,very low,0.9043,low,not_urgent,"Re: Mercury Language Documentation Issue. Hi, ..."
3,Re: Mercury Language Documentation Issue,john.doe@example.com,support@aetheros.com,2023-10-30 15:12:54+00:00,"Hi there,\n\nJust checking in on this. Have yo...",aa001-8e561ac9-8523-421f-9d05-e4c2a80a26d7,['inquiry'],ongoing,low,['Mercury Language'],high,very low,0.9043,low,not_urgent,Re: Mercury Language Documentation Issue. Hi t...
4,URGENT: User Permission Errors in Production API,kenneth77@davis-becker.com,support@aetheros.com,2023-10-26 09:12:34+00:00,We are experiencing critical issues with user ...,aa001-550e8400-e29b-41d4-a716-446655440000,['issue'],completed,high,"['API Development', 'IAM service']",medium,very low,0.6918,high,urgent,URGENT: User Permission Errors in Production A...


In [165]:
df_hf = df_hf[['processed_text', 'priority', 'urgency']]
df_hf.urgency.value_counts()


urgency
not_urgent    1325
urgent         934
Name: count, dtype: int64

In [166]:
# get last 2700 urgent tickets
df_hf_urgent = df_hf[df_hf['urgency'] == 'urgent'].tail(2700)

In [167]:
df_hf_urgent.shape
df_hf_urgent.head()

Unnamed: 0,processed_text,priority,urgency
4,URGENT: User Permission Errors in Production A...,high,urgent
5,Re: URGENT: User Permission Errors in Producti...,high,urgent
6,Re: URGENT: User Permission Errors in Producti...,high,urgent
7,Re: URGENT: User Permission Errors in Producti...,high,urgent
8,Re: URGENT: User Permission Errors in Producti...,high,urgent


In [168]:
# combine the two datasets
df_combined = pd.concat([df, df_hf_urgent], axis=0)
df_combined.urgency.value_counts()

urgency
not_urgent    5669
urgent        3906
Name: count, dtype: int64

In [169]:
df_combined.duplicated(subset=['processed_text']).sum()

np.int64(0)

In [170]:
df_chatgpt = pd.read_csv("../csv/urgent_messages_chatgpt.csv")
df_chatgpt.head()

Unnamed: 0,Subject,Body,Priority
0,Important: At Risk,This is a at risk matter. Please handle urgent...,High
1,Alert: Top Priority,This is a escalated matter. Please resolve imm...,High
2,Important: Blocking Issue,This is a showstopper matter. Please act now. ...,High
3,Alert: Requires Action,This is a critical path matter. Please do not ...,High
4,Critical: Breakpoint,This is a high priority matter. Please act now...,High


In [171]:
df_chatgpt['urgency'] = df_chatgpt.Priority.apply(lambda x: 'urgent' if x == 'High' else 'not_urgent')
df_chatgpt['processed_text'] = df_chatgpt['Subject'] + '. ' + df_chatgpt['Body'].replace('\n', ' ', regex=True)

df_chatgpt = df_chatgpt[['processed_text', 'Priority', 'urgency']]
# rename Priority to priority
df_chatgpt.rename(columns={'Priority': 'priority'}, inplace=True)
df_chatgpt.urgency.value_counts()

urgency
urgent    1500
Name: count, dtype: int64

In [172]:
df_chatgpt.duplicated(subset=['processed_text']).sum()
df_chatgpt.drop_duplicates(subset=['processed_text'], inplace=True)

In [173]:
df_chatgpt.sample(3)

Unnamed: 0,processed_text,priority,urgency
45,Critical: Life-or-Death. This is a escalated m...,High,urgent
1407,Alert: High Risk. This is a top priority matte...,High,urgent
1342,Critical: Handle Urgently. This is a act now m...,High,urgent


In [174]:
df_combined = pd.concat([df_combined, df_chatgpt], axis=0)
df_combined.urgency.value_counts()

urgency
not_urgent    5669
urgent        5399
Name: count, dtype: int64

In [175]:
df_combined.duplicated(subset=['processed_text']).sum()


np.int64(0)

In [176]:
df_combined.sample(3)

Unnamed: 0,label,processed_text,priority,urgency,type,queue
5711,0.0,Scheduled report generation missing last hour'...,low,not_urgent,,
2630,0.0,Search in employee records not case insensitive,low,not_urgent,,
6713,0.0,Non-urgent updates to non-critical system comp...,low,not_urgent,,


In [177]:
df_combined.to_csv('../csv/oversampled_data.csv', index=False)