In [6]:
import random
from nltk.corpus import wordnet
import spacy
import pandas as pd



In [7]:
# Load the ticket data; the sample output below shows the columns
# processed_text, type, queue and priority.
df = pd.read_csv('csv/processed_tickets_en.csv')

In [8]:
# Collapse the priority levels into a binary label:
# 'high' becomes 'urgent', anything else (medium, low, ...) becomes 'not_urgent'.
def priority_to_urgency(priority):
    """Return 'urgent' for a 'high' priority and 'not_urgent' for any other value."""
    return 'urgent' if priority == 'high' else 'not_urgent'


df['urgency'] = df['priority'].apply(priority_to_urgency)
df.sample(5)

Unnamed: 0,processed_text,type,queue,priority,urgency
931,Concerns about the battery performance of the ...,Request,Customer Service,medium,not_urgent
445,"connectivity problems. Dear Support Team, I a...",Problem,IT Support,medium,not_urgent
1494,Product Exchange Request due to Connectivity I...,Problem,Returns and Exchanges,low,not_urgent
1555,Request for assistance in scaling AWS infrastr...,Request,IT Support,high,urgent
1309,MacBook Air M1 Power Issue. My MacBook Air M1 ...,Incident,Product Support,medium,not_urgent


In [9]:
# (rows, columns) of the tickets labelled 'urgent'.
df.loc[df['urgency'] == 'urgent'].shape

(787, 5)

In [10]:
# (rows, columns) of the tickets labelled 'not_urgent'.
df.loc[df['urgency'] == 'not_urgent'].shape

(777, 5)

In [11]:
# Write the frame (now carrying the derived 'urgency' column) back to the same
# path it was read from, overwriting that file; the index is not written.
df.to_csv('csv/processed_tickets_en.csv', index=False)

In [12]:
# Load the SpaCy "en_core_web_sm" pipeline (intended for lemmatization).
# NOTE(review): `nlp` is not referenced by the augmentation code visible in this
# section — confirm it is used elsewhere before keeping this load.
nlp = spacy.load("en_core_web_sm")

# Function for synonym replacement
def synonym_replacement(text, n=1):
    """Replace randomly chosen words of ``text`` with a WordNet synonym.

    On each of the ``n`` rounds one word is drawn at random from the original
    text. If WordNet offers a lemma that differs from that word, every
    occurrence of the word is replaced by it.

    Args:
        text: Input string; words are split on whitespace.
        n: Number of replacement rounds (a round is a no-op when the drawn
            word has no usable synonym).

    Returns:
        The augmented text joined with single spaces. Text without any words
        (empty or whitespace-only) is returned unchanged.
    """
    words = text.split()
    if not words:
        # random.choice raises IndexError on an empty sequence.
        return text
    new_words = words.copy()
    for _ in range(n):
        word_to_replace = random.choice(words)
        synonym = None
        # The first lemma of the first synset is frequently the word itself,
        # so scan for the first lemma that actually differs from it.
        for synset in wordnet.synsets(word_to_replace):
            for lemma in synset.lemmas():
                # WordNet joins multi-word lemma names with underscores.
                candidate = lemma.name().replace("_", " ")
                if candidate.lower() != word_to_replace.lower():
                    synonym = candidate
                    break
            if synonym is not None:
                break
        if synonym is not None:
            new_words = [synonym if word == word_to_replace else word for word in new_words]
    return " ".join(new_words)

# Function for random insertion
def random_insertion(text, n=1):
    """Insert WordNet synonyms of randomly chosen words at random positions.

    On each of the ``n`` rounds one word is drawn at random from the current
    word list (which includes earlier insertions). If WordNet offers a lemma
    that differs from that word, it is inserted at a random position.

    Args:
        text: Input string; words are split on whitespace.
        n: Number of insertion rounds (a round is a no-op when the drawn word
            has no usable synonym).

    Returns:
        The augmented text joined with single spaces. Text without any words
        (empty or whitespace-only) is returned unchanged.
    """
    words = text.split()
    if not words:
        # random.choice raises IndexError on an empty sequence.
        return text
    for _ in range(n):
        random_word = random.choice(words)
        synonym = None
        # Skip lemmas equal to the drawn word: inserting them would only
        # duplicate the word rather than add a synonym.
        for synset in wordnet.synsets(random_word):
            for lemma in synset.lemmas():
                # WordNet joins multi-word lemma names with underscores.
                candidate = lemma.name().replace("_", " ")
                if candidate.lower() != random_word.lower():
                    synonym = candidate
                    break
            if synonym is not None:
                break
        if synonym is not None:
            # randint is inclusive on both ends, so appending at the end is possible.
            insert_position = random.randint(0, len(words))
            words.insert(insert_position, synonym)
    return " ".join(words)

# Function for random deletion
def random_deletion(text, p=0.1):
    """Drop each word of ``text`` independently with probability ``p``.

    Args:
        text: Input string; words are split on whitespace.
        p: Per-word deletion probability.

    Returns:
        The surviving words joined with single spaces. Text with zero or one
        word is returned unchanged; if every word happens to be deleted, one
        randomly chosen original word is returned instead.
    """
    words = text.split()
    if len(words) <= 1:
        # Also covers empty text, where the random.choice fallback below
        # would raise IndexError.
        return text
    new_words = [word for word in words if random.random() > p]
    return " ".join(new_words) if new_words else random.choice(words)

# Function for shuffling
def shuffle_words(text):
    """Return ``text`` with its whitespace-separated words in random order.

    The words are re-joined with single spaces, so original spacing is not kept.
    """
    tokens = text.split()
    # In-place shuffle driven by the module-level ``random`` state.
    random.shuffle(tokens)
    shuffled_text = " ".join(tokens)
    return shuffled_text

# Apply text augmentation
# Augmentation suffix -> function, listed in the order they are applied to each row.
augmenters = {
    "synonym_replacement": synonym_replacement,
    "random_insertion": random_insertion,
    "random_deletion": random_deletion,
    "shuffling": shuffle_words,
}
# Columns copied unchanged from the source row onto every variant.
label_columns = ['type', 'queue', 'priority', 'urgency']

augmented_data_map = {}

for index, row in df.iterrows():
    labels = {column: row[column] for column in label_columns}

    # Original row, keyed by its DataFrame index.
    augmented_data_map[index] = {'processed_text': row['processed_text'], **labels}

    # One augmented variant per augmenter, keyed "<index>_<suffix>".
    for suffix, augment in augmenters.items():
        augmented_data_map[f"{index}_{suffix}"] = {
            'processed_text': augment(row['processed_text']),
            **labels,
        }

# Create a new DataFrame with augmented data: the outer keys become the row
# index after the transpose, the inner keys become the columns.
augmented_data_df = pd.DataFrame(augmented_data_map).T
    

In [13]:
# Spot-check a random sample of rows; the index shows which augmentation
# (if any) produced each row.
augmented_data_df.sample(10)


Unnamed: 0,processed_text,type,queue,priority,urgency
1004_random_deletion,Urgent Request for Jira Ticket Modification. D...,Change,Technical Support,high,urgent
431_synonym_replacement,Critical: Significant AWS Service Disruption. ...,Incident,Service Outages and Maintenance,high,urgent
123_shuffling,it. further need I I This and issues. me exper...,Problem,Customer Service,high,urgent
1214_synonym_replacement,Immediate Attention Required. Dear Support Tea...,Incident,Technical Support,high,urgent
219,Required Modifications for IT Ticket System Pi...,Change,Customer Service,high,urgent
832_shuffling,you Consulting and operations. get I about any...,Incident,General Inquiry,medium,not_urgent
490_random_deletion,Server Configuration Change Request. Dear Supp...,Change,IT Support,medium,not_urgent
1230_shuffling,appreciate and regarding in usage can It am wi...,Problem,Billing and Payments,high,urgent
816_random_insertion,"MacBook Air screen problem. Hello, I am experi...",Incident,Customer Service,medium,not_urgent
1382_random_insertion,Dell XPS 13 problem when turning on. Dear Tech...,Incident,Technical Support,high,urgent


In [14]:
# (rows, columns) after augmentation: each source row contributes the original
# plus four augmented variants.
augmented_data_df.shape

(7820, 5)

### Synthesizing manually

In [15]:
import pandas as pd
import random

# Parameters for the dataset
num_records = 3000  # number of synthetic tickets to generate
# Drawn uniformly per record to choose the urgent / non-urgent branch; the label
# actually stored on each record is 'urgent' / 'not_urgent' (see the generation loop).
urgency_levels = ["Urgent", "Not-Urgent"]

# Sample data for workplace/helpdesk environment
# Subject lines used for records labelled urgent.
urgent_subjects = [
    "System Outage: Immediate Action Required",
    "Critical Alert: Unauthorized Access Detected",
    "Data Breach Confirmed! Urgent Investigation Needed",
    "Production Server Failure: Resolve ASAP",
    "High-Priority: Payment Gateway Down",
    "Security Incident: Firewall Breach",
    "Network Crash Impacting All Users",
    "Critical Bug in Production Environment",
    "Emergency: Hardware Malfunction",
    "Urgent: Stakeholder Meeting Postponed"
]

# Body texts used for records labelled urgent; a body is drawn independently of
# the subject, so the two are not guaranteed to describe the same incident.
urgent_bodies = [
    "The entire system is unresponsive and needs immediate resolution.",
    "Sensitive data has been exposed; we need to secure the system urgently.",
    "All transactions are failing due to a payment gateway issue.",
    "A critical bug has been detected that is halting operations.",
    "Unauthorized access detected; escalate to the security team immediately.",
    "Production servers are overheating; shutdown required to prevent damage.",
    "The main network is down, affecting all departments.",
    "Firewall settings need to be reviewed to address the security breach.",
    "Please address this issue before the end of the day to avoid escalation.",
    "The scheduled meeting with stakeholders has been delayed; inform all parties."
]

# Subject lines used for records labelled not urgent.
non_urgent_subjects = [
    "Request for Profile Update",
    "Reminder: Upcoming Maintenance",
    "Feedback Needed on Website Design",
    "Policy Update: Work from Home Guidelines",
    "Monthly Performance Review Meeting",
    "Suggestion: Enhance FAQ Section",
    "Query Regarding Leave Balances",
    "Confirmation of Office Directory Update",
    "Routine Check: System Logs",
    "Follow-Up: Employee Satisfaction Survey"
]

# Body texts used for records labelled not urgent (also drawn independently of
# the subject). This pool has two more entries than the other three lists.
non_urgent_bodies = [
    "Please update the user profile picture at your earliest convenience.",
    "This is a reminder about the scheduled maintenance this weekend.",
    "We would like your input on the new website design.",
    "Review the updated work-from-home guidelines attached herewith.",
    "Performance reviews for all employees will be conducted next week.",
    "Consider adding more details to the FAQ section for better clarity.",
    "Can you confirm the leave balance for the current year?",
    "The office directory update is pending; kindly confirm the changes.",
    "Routine log checks are required as part of the maintenance schedule.",
    "Please provide feedback on the recent satisfaction survey results.",
    "This is not critical and can be resolved later.",
    "The issue is not urgent and can wait for regular working hours."
]

# Generate dataset
# (subject pool, body pool, stored label) for each branch.
urgent_pool = (urgent_subjects, urgent_bodies, "urgent")
non_urgent_pool = (non_urgent_subjects, non_urgent_bodies, "not_urgent")

data = []
for _ in range(num_records):
    urgency = random.choice(urgency_levels)
    subjects, bodies, label = urgent_pool if urgency == "Urgent" else non_urgent_pool
    # Subject is drawn first, then body; the two draws are independent.
    subject = random.choice(subjects)
    body = random.choice(bodies)
    data.append({"subject": subject, "body": body, "urgency": label})




In [16]:
# Build the frame from the generated records and append a combined text column
# of the form "<subject>. <body>".
df_synthesized = pd.DataFrame(data).assign(
    processed_text=lambda frame: frame['subject'] + '. ' + frame['body']
)
df_synthesized.sample(5)

Unnamed: 0,subject,body,urgency,processed_text
2448,Feedback Needed on Website Design,Please provide feedback on the recent satisfac...,not_urgent,Feedback Needed on Website Design. Please prov...
1826,Reminder: Upcoming Maintenance,Please provide feedback on the recent satisfac...,not_urgent,Reminder: Upcoming Maintenance. Please provide...
2962,Critical Bug in Production Environment,"The main network is down, affecting all depart...",urgent,Critical Bug in Production Environment. The ma...
40,Production Server Failure: Resolve ASAP,Firewall settings need to be reviewed to addre...,urgent,Production Server Failure: Resolve ASAP. Firew...
2500,Request for Profile Update,We would like your input on the new website de...,not_urgent,Request for Profile Update. We would like your...


In [17]:
# Reorder columns so the combined text comes first, then its parts, then the label.
column_order = ['processed_text', 'subject', 'body', 'urgency']
df_synthesized = df_synthesized[column_order]
df_synthesized.sample(10)

Unnamed: 0,processed_text,subject,body,urgency
1109,Production Server Failure: Resolve ASAP. All t...,Production Server Failure: Resolve ASAP,All transactions are failing due to a payment ...,urgent
325,Reminder: Upcoming Maintenance. Please provide...,Reminder: Upcoming Maintenance,Please provide feedback on the recent satisfac...,not_urgent
2105,Feedback Needed on Website Design. The issue i...,Feedback Needed on Website Design,The issue is not urgent and can wait for regul...,not_urgent
1249,Critical Bug in Production Environment. Please...,Critical Bug in Production Environment,Please address this issue before the end of th...,urgent
1618,Critical Alert: Unauthorized Access Detected. ...,Critical Alert: Unauthorized Access Detected,Please address this issue before the end of th...,urgent
2933,Data Breach Confirmed! Urgent Investigation Ne...,Data Breach Confirmed! Urgent Investigation Ne...,Production servers are overheating; shutdown r...,urgent
2546,Emergency: Hardware Malfunction. Unauthorized ...,Emergency: Hardware Malfunction,Unauthorized access detected; escalate to the ...,urgent
3,System Outage: Immediate Action Required. Prod...,System Outage: Immediate Action Required,Production servers are overheating; shutdown r...,urgent
1065,Routine Check: System Logs. Performance review...,Routine Check: System Logs,Performance reviews for all employees will be ...,not_urgent
1778,Feedback Needed on Website Design. Consider ad...,Feedback Needed on Website Design,Consider adding more details to the FAQ sectio...,not_urgent


In [18]:
# Save the synthesized DataFrame (built in the cells above) as CSV, without the index.

df_synthesized.to_csv("csv/synthesized_tickets.csv", index=False)

# Note: the message omits the "csv/" directory that the file is written into.
print("Dataset generated and saved as 'synthesized_tickets.csv'")

Dataset generated and saved as 'synthesized_tickets.csv'
