In [1]:
import random
from nltk.corpus import wordnet
import spacy
import pandas as pd

In [2]:
# Load the ticket table from the CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='csv/processed_tickets_en.csv')

In [4]:
# Collapse the priority column into a binary label:
# 'high' -> 'urgent'; every other value (e.g. medium, low) -> 'not_urgent'.
df['urgency'] = df['priority'].eq('high').map({True: 'urgent', False: 'not_urgent'})
df.sample(5)

Unnamed: 0,processed_text,type,queue,priority,urgency
3,request server administration assistance dear ...,Request,Product Support,medium,not_urgent
202,inquiry macbook air m1 performance dear tech o...,Request,Customer Service,low,not_urgent
225,immediate assistance require touchscreen respo...,Incident,Product Support,high,urgent
238,request improvement aw infrastructure setting ...,Change,IT Support,high,urgent
336,request support appreciate help set canon pixm...,Request,Customer Service,low,not_urgent


In [5]:
# (rows, columns) of the subset labelled 'urgent'.
df.loc[df['urgency'].eq('urgent')].shape

(175, 5)

In [6]:
# (rows, columns) of the subset labelled 'not_urgent'.
df.loc[df['urgency'].eq('not_urgent')].shape

(163, 5)

In [7]:
# Write the frame (including the derived 'urgency' column) back to the same CSV it was
# read from, overwriting that file; index=False keeps the row index out of the output.
df.to_csv(path_or_buf='csv/processed_tickets_en.csv', index=False)

In [None]:
# Load spaCy's "en_core_web_sm" pipeline (intended for lemmatization).
# NOTE(review): `nlp` is not referenced by the augmentation helpers defined in this
# cell — confirm that a later cell actually uses it.
nlp = spacy.load(name="en_core_web_sm")

# Function for synonym replacement
def synonym_replacement(text, n=1):
    """Replace randomly chosen words of ``text`` with a WordNet synonym.

    Runs ``n`` rounds. Each round picks one word of the *original* text at
    random and looks it up in WordNet; the first lemma (across all synsets)
    that differs from the word itself is used as the synonym, and every
    occurrence of the word in the working copy is replaced. Words without
    such a lemma are left untouched for that round.

    Args:
        text: Whitespace-separated text to augment.
        n: Number of replacement rounds.

    Returns:
        The augmented text, words joined by single spaces. Text that contains
        no words yields an empty string.
    """
    words = text.split()
    if not words:
        # random.choice() raises IndexError on an empty sequence.
        return ""
    new_words = words.copy()
    for _ in range(n):
        word_to_replace = random.choice(words)
        # WordNet lemma names use "_" for multi-word expressions, so turn them
        # back into spaces before they are written into the text.
        candidates = (
            lemma.name().replace("_", " ")
            for synset in wordnet.synsets(word_to_replace)
            for lemma in synset.lemmas()
        )
        # The first lemma is frequently the looked-up word itself; using it
        # would make the "replacement" a no-op, so skip identical names.
        synonym = next(
            (name for name in candidates if name.lower() != word_to_replace.lower()),
            None,
        )
        if synonym is not None:
            new_words = [synonym if word == word_to_replace else word for word in new_words]
    return " ".join(new_words)

# Function for random insertion
def random_insertion(text, n=1):
    """Insert WordNet synonyms of randomly chosen words at random positions.

    Runs ``n`` rounds. Each round picks a random word from the current word
    list, looks it up in WordNet and takes the first lemma (across all
    synsets) that differs from the word itself. That synonym is inserted at a
    uniformly random position (including both ends). Rounds whose word has no
    such lemma insert nothing.

    Args:
        text: Whitespace-separated text to augment.
        n: Number of insertion rounds.

    Returns:
        The augmented text, words joined by single spaces. Text that contains
        no words yields an empty string.
    """
    words = text.split()
    if not words:
        # random.choice() raises IndexError on an empty sequence.
        return ""
    for _ in range(n):
        random_word = random.choice(words)
        # WordNet lemma names use "_" for multi-word expressions, so turn them
        # back into spaces before they are written into the text.
        candidates = (
            lemma.name().replace("_", " ")
            for synset in wordnet.synsets(random_word)
            for lemma in synset.lemmas()
        )
        # Skip lemmas identical to the looked-up word: inserting them would
        # only duplicate an existing word rather than add a synonym.
        synonym = next(
            (name for name in candidates if name.lower() != random_word.lower()),
            None,
        )
        if synonym is not None:
            insert_position = random.randint(0, len(words))
            # Slice assignment keeps a multi-word synonym as separate tokens.
            words[insert_position:insert_position] = synonym.split()
    return " ".join(words)

# Function for random deletion
def random_deletion(text, p=0.1):
    """Randomly drop words from ``text``.

    Each word is kept only when a fresh ``random.random()`` draw is greater
    than ``p``, so ``p`` acts as the per-word deletion probability.

    Args:
        text: Whitespace-separated text to augment.
        p: Per-word deletion probability.

    Returns:
        The surviving words joined by single spaces. If every word was
        dropped, one randomly chosen original word is returned instead. Text
        with zero or one word is returned unchanged.
    """
    words = text.split()
    # Nothing can be deleted from empty or single-word text. Without covering
    # the empty case, the fallback below would call random.choice([]) and
    # raise IndexError.
    if len(words) <= 1:
        return text
    new_words = [word for word in words if random.random() > p]
    # Never hand back an empty string for multi-word input.
    return " ".join(new_words) if new_words else random.choice(words)

# Function for shuffling
def shuffle_words(text):
    """Return ``text`` with its whitespace-separated words in random order.

    The permutation is produced by ``random.shuffle`` on the module-level RNG;
    the words are re-joined with single spaces.
    """
    tokens = text.split()
    random.shuffle(tokens)  # reorders the list in place
    shuffled_text = " ".join(tokens)
    return shuffled_text

# Apply text augmentation
# Seed the module-level RNG so that re-running this cell reproduces the same
# augmented dataset (all four helpers draw from `random`).
AUGMENTATION_SEED = 42
random.seed(AUGMENTATION_SEED)

# Key suffix -> augmentation function, applied in this order to every row.
augmenters = {
    'synonym_replacement': synonym_replacement,
    'random_insertion': random_insertion,
    'random_deletion': random_deletion,
    'shuffling': shuffle_words,
}

# Columns copied unchanged from the source row onto every augmented variant.
label_columns = ['type', 'queue', 'priority', 'urgency']

augmented_data_map = {}

for index, row in df.iterrows():
    labels = {column: row[column] for column in label_columns}

    # original data, keyed by the row's own index label
    augmented_data_map[index] = {'processed_text': row['processed_text'], **labels}

    # one augmented variant per technique, keyed "<index>_<technique>"
    for suffix, augment in augmenters.items():
        augmented_data_map[f"{index}_{suffix}"] = {
            'processed_text': augment(row['processed_text']),
            **labels,
        }

# Create a new DataFrame with augmented data (one row per key of the map)
augmented_data_df = pd.DataFrame(augmented_data_map).T
    

Unnamed: 0,processed_text,type,queue,priority,urgency
5_random_deletion,urgent assistance need hello support team reac...,Request,IT Support,high,urgent
1_shuffling,> perform documentation driver persist 13 inte...,Incident,Product Support,low,not_urgent
315_shuffling,< operational profoundly kindly follow sustain...,Incident,IT Support,high,urgent


In [11]:
# Spot-check ten random rows of the augmented frame.
augmented_data_df.sample(n=10)


Unnamed: 0,processed_text,type,queue,priority,urgency
60_random_deletion,request detail payment option assistance dear ...,Request,Billing and Payments,medium,not_urgent
81_random_deletion,jira software efficiency issue dear consult fi...,Problem,Technical Support,medium,not_urgent
263_random_insertion,request assistance office 365 installation err...,Problem,Technical Support,medium,not_urgent
11_synonym_replacement,assistance need canon setup dear tech online s...,Request,Customer Service,low,not_urgent
285,request support router cause network issue dea...,Incident,IT Support,high,urgent
209_shuffling,diagnose potential physical urgency additional...,Incident,Technical Support,high,urgent
321_shuffling,tel_num regard require d issue direct customer...,Problem,Technical Support,high,urgent
270_random_insertion,dell xps 13 overheat issue hello tech online s...,Problem,Product Support,medium,not_urgent
37_random_deletion,miss ticket notification impact jira hello cus...,Problem,Product Support,low,not_urgent
214_random_deletion,urgent immediate assistance need dear consult ...,Incident,Technical Support,high,urgent


In [12]:
# Expected row count: one original plus four augmented copies per input row,
# i.e. 5 * len(df); the columns are processed_text, type, queue, priority, urgency.
augmented_data_df.shape

(1690, 5)