In [1]:
import random
from nltk.corpus import wordnet
import spacy
import pandas as pd

In [2]:
# Load the processed ticket CSV into `df`. The path is relative, so it is
# resolved against the kernel's current working directory.
df = pd.read_csv('csv/processed_tickets_en.csv')

In [4]:
# Derive a binary `urgency` label from `priority`:
# 'high' becomes 'urgent'; every other value becomes 'not_urgent'.

df['urgency'] = [
    'urgent' if priority == 'high' else 'not_urgent'
    for priority in df['priority']
]
df.sample(5)

Unnamed: 0,processed_text,type,queue,priority,urgency
3,request server administration assistance dear ...,Request,Product Support,medium,not_urgent
202,inquiry macbook air m1 performance dear tech o...,Request,Customer Service,low,not_urgent
225,immediate assistance require touchscreen respo...,Incident,Product Support,high,urgent
238,request improvement aw infrastructure setting ...,Change,IT Support,high,urgent
336,request support appreciate help set canon pixm...,Request,Customer Service,low,not_urgent


In [5]:
# (rows, columns) of the subset labelled 'urgent'.
df.loc[df['urgency'] == 'urgent'].shape

(175, 5)

In [6]:
# (rows, columns) of the subset labelled 'not_urgent'.
df.loc[df['urgency'] == 'not_urgent'].shape

(163, 5)

In [None]:

# Load the spaCy "en_core_web_sm" pipeline once at module level;
# preprocess_text() calls it to tokenise and lemmatise text.
nlp = spacy.load("en_core_web_sm")

# Function to preprocess text
def preprocess_text(text):
    """Lemmatise ``text``, dropping stop words and punctuation.

    Runs the module-level ``nlp`` pipeline over ``text`` and returns the
    lower-cased lemmas of the surviving tokens joined by single spaces.
    """
    kept_lemmas = []
    for tok in nlp(text):
        # Skip tokens that carry no content for downstream models.
        if tok.is_stop or tok.is_punct:
            continue
        kept_lemmas.append(tok.lemma_.lower())
    return " ".join(kept_lemmas)

# Shared WordNet lookup used by synonym_replacement() and random_insertion()
def _first_synonym(word):
    """Return the first WordNet lemma for ``word`` that is not ``word`` itself.

    Synsets and their lemmas are scanned in the order WordNet returns them, so
    the result is deterministic for a given word. Multi-word lemma names use
    underscores in WordNet ("ask_for"); they are converted to spaces so the
    result can be spliced into plain text. Returns ``None`` when WordNet has
    no lemma that differs from ``word`` (case-insensitively).
    """
    lowered = word.lower()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            candidate = lemma.name().replace("_", " ")
            if candidate.lower() != lowered:
                return candidate
    return None


# Function for synonym replacement
def synonym_replacement(text, n=1):
    """Replace up to ``n`` randomly chosen words of ``text`` with a synonym.

    Args:
        text: Whitespace-separated input string.
        n: Number of replacement attempts. Each attempt picks one word from
            the original text; every occurrence of that word is replaced.
            Attempts on words without a usable synonym are no-ops.

    Returns:
        The augmented text with words joined by single spaces. Empty or
        whitespace-only input yields ``""`` instead of raising ``IndexError``.
    """
    words = text.split()
    if not words:
        # random.choice() cannot draw from an empty sequence.
        return ""
    new_words = words.copy()
    for _ in range(n):
        word_to_replace = random.choice(words)
        synonym = _first_synonym(word_to_replace)
        if synonym is not None:
            new_words = [synonym if word == word_to_replace else word for word in new_words]
    return " ".join(new_words)

# Function for random insertion
def random_insertion(text, n=1):
    """Insert synonyms of up to ``n`` randomly chosen words at random positions.

    Args:
        text: Whitespace-separated input string.
        n: Number of insertion attempts. Each attempt picks a word from the
            current word list (which includes earlier insertions) and, if it
            has a usable synonym, inserts that synonym at a random position.

    Returns:
        The augmented text with words joined by single spaces. Empty or
        whitespace-only input yields ``""`` instead of raising ``IndexError``.
    """
    words = text.split()
    if not words:
        # random.choice() cannot draw from an empty sequence.
        return ""
    for _ in range(n):
        random_word = random.choice(words)
        synonym = _first_synonym(random_word)
        if synonym is not None:
            # randint is inclusive on both ends, so len(words) appends at the end.
            insert_position = random.randint(0, len(words))
            words.insert(insert_position, synonym)
    return " ".join(words)

# Function for random deletion
def random_deletion(text, p=0.1):
    """Drop each word of ``text`` independently with probability ``p``.

    Args:
        text: Whitespace-separated input string.
        p: Per-word deletion probability.

    Returns:
        The surviving words joined by single spaces. Text with zero or one
        word is returned unchanged (empty input previously raised
        ``IndexError``). If every word is deleted, one randomly chosen
        original word is returned so the result is never empty.
    """
    words = text.split()
    if len(words) <= 1:
        # Nothing to delete; also keeps random.choice() off an empty list.
        return text
    new_words = [word for word in words if random.random() > p]
    return " ".join(new_words) if new_words else random.choice(words)

# Function for shuffling
def shuffle_words(text):
    """Return ``text`` with its whitespace-separated words in random order.

    The words are shuffled in place with ``random.shuffle`` and re-joined
    with single spaces.
    """
    tokens = text.split()
    random.shuffle(tokens)
    shuffled_text = " ".join(tokens)
    return shuffled_text

# Apply text augmentation
#
# Each source row produces five records: the untouched original (keyed by the
# row index) and one per augmenter (keyed '<index>_<suffix>').

# Seed the stdlib RNG so re-running the notebook yields the same augmented set.
random.seed(42)

# Candidate text columns. The sample output earlier in this notebook shows a
# single `processed_text` column and no `subject`/`body`, so hard-coding the
# latter raises KeyError; augment whichever of these the frame actually has.
TEXT_COLUMN_CANDIDATES = ('subject', 'body', 'processed_text')
# Columns copied unchanged onto every augmented record.
LABEL_COLUMNS = ('priority', 'type', 'queue')

# Key suffix -> augmentation function, in the order they are applied per row.
AUGMENTERS = {
    'synonym': synonym_replacement,
    'insertion': random_insertion,
    'deletion': random_deletion,
    'shuffle': shuffle_words,
}

text_columns = [column for column in TEXT_COLUMN_CANDIDATES if column in df.columns]
if not text_columns:
    raise KeyError(
        f"df has none of the expected text columns {TEXT_COLUMN_CANDIDATES}; "
        f"available columns: {list(df.columns)}"
    )


def _augment_value(augment, value):
    """Apply ``augment`` to string cells; pass anything else (e.g. NaN) through."""
    return augment(value) if isinstance(value, str) else value


augmented_data_map = {}

for index, row in df.iterrows():
    labels = {column: row[column] for column in LABEL_COLUMNS}

    # original data
    augmented_data_map[index] = {
        **{column: row[column] for column in text_columns},
        **labels,
    }

    # one augmented copy per augmenter
    for suffix, augment in AUGMENTERS.items():
        augmented_data_map[f'{index}_{suffix}'] = {
            **{column: _augment_value(augment, row[column]) for column in text_columns},
            **labels,
        }

# Create a new DataFrame with augmented data (one row per record, keyed as above)
augmented_data_df = pd.DataFrame(
    list(augmented_data_map.values()),
    index=list(augmented_data_map.keys()),
)

augmented_data_df.sample(3)