# Test Text Augmentation Libraries

In [None]:
!pip install transformers textaugment augly -q
!sudo apt-get install python3-magic -q

[K     |████████████████████████████████| 2.6 MB 5.0 MB/s 
[K     |████████████████████████████████| 40.1 MB 16 kB/s 
[K     |████████████████████████████████| 895 kB 46.4 MB/s 
[K     |████████████████████████████████| 3.3 MB 23.0 MB/s 
[K     |████████████████████████████████| 636 kB 44.8 MB/s 
[K     |████████████████████████████████| 3.0 MB 37.1 MB/s 
[K     |████████████████████████████████| 721 kB 45.4 MB/s 
[K     |████████████████████████████████| 394 kB 44.6 MB/s 
[K     |████████████████████████████████| 55 kB 2.9 MB/s 
[K     |████████████████████████████████| 1.2 MB 30.4 MB/s 
[K     |████████████████████████████████| 42 kB 1.0 MB/s 
[K     |████████████████████████████████| 53 kB 1.9 MB/s 
[K     |████████████████████████████████| 65 kB 3.3 MB/s 
[?25h  Building wheel for googletrans (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of th

In [None]:
import augly.text as txtaugs
from textaugment import Wordnet, Translate
import nltk
# Download the NLTK data packages used by this notebook: a tokenizer (punkt),
# the WordNet corpus and a part-of-speech tagger.
# Presumably these are what textaugment's Wordnet augmenter relies on — confirm against its docs.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Smoke-test AugLy: typo simulation on a single string, word splitting on a list of strings.
texts = ["hello world", "bye planet"]
a1 = txtaugs.simulate_typos(
    texts[0],
    aug_char_p=0.01,
    aug_word_p=0.01,
)
a2 = txtaugs.split_words(texts, aug_word_p=0.01)
print(a1, a2)

hello worls ['hello wor ld', 'bye pla net']


In [None]:
# Try textaugment's Wordnet augmenter on one sample question.
t = Wordnet(runs=5)
wordnet_sample = 'where can i find guides'
t.augment(wordnet_sample)

'where can i happen guides'

In [None]:
# Try textaugment's Translate augmenter (source "en", target "fr") on one sample question.
t = Translate(src="en", to="fr")
translate_sample = 'how can i get the kerberos id of callista'
t.augment(translate_sample)

'how can i get kerberos id from callista'

# Data Preparation Begins Here

In [None]:
import pandas as pd
# Load the command dataset and keep only its first two columns
# (Question and Command in the saved output below).
df = pd.read_csv("commands.csv").iloc[:, :2]
# Fixed random_state so the preview shows the same five rows on every run.
df.sample(5, random_state=0)

Unnamed: 0,Question,Command
49,Require assistance on rebooting NDS,help rebooting NDS
27,how can I request a vendor review?,techrisk
30,How do I draft new email,mail
63,how can I telephone Ernest,call Ernest
45,What is helpdesk?,help


In [None]:
# The base command is the first whitespace-separated token of the full command string.
df['BaseCommand'] = [command.split()[0] for command in df['Command']]

# Number of questions available per base command.
value_counts = df['BaseCommand'].value_counts()
print("Number of unique commands:", len(value_counts))
value_counts

Number of unique commands: 10


call        14
techrisk    10
teutr       10
im          10
help         7
engHub       7
Orbit        6
kerb         5
mail         5
time         5
Name: BaseCommand, dtype: int64

In [None]:
# Map each base command to an integer id, assigned in sorted order of the command names.
# The ids are derived from 'BaseCommand' alone, because the 'CommandID' column is only
# created in a later cell; reading it here would raise a KeyError on a fresh kernel
# (Restart & Run All).
# NOTE(review): sorted-order ids are expected to match what LabelEncoder assigns to these
# string labels (the saved mapper output is consistent with that) — re-check if the
# encoder is ever changed.
base_commands = sorted(df['BaseCommand'].unique())
mapper = {command: command_id for command_id, command in enumerate(base_commands)}
# (BaseCommand, CommandID) pairs, kept under this name for the cell that builds reverse_mapper.
values = list(mapper.items())
print(mapper)

{'call': 1, 'teutr': 8, 'techrisk': 7, 'im': 4, 'help': 3, 'engHub': 2, 'Orbit': 0, 'time': 9, 'mail': 6, 'kerb': 5}


In [None]:
# Inverse lookup: CommandID -> BaseCommand.
reverse_mapper = {
    command_id: base_command
    for base_command, command_id in values
}

In [None]:
from sklearn.preprocessing import LabelEncoder
import string

def transform_question_column(text):
    """Return `text` with all ASCII punctuation removed and lowercased.

    Args:
        text: The question string to normalise.

    Returns:
        The lowercased string with every character of `string.punctuation` stripped.
    """
    # A translation table with only a "delete" set removes all punctuation in one pass.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return text.translate(strip_punctuation).lower()

# Encode each base command as an integer id.
label_encoder = LabelEncoder()
df['CommandID'] = label_encoder.fit_transform(df['BaseCommand'].values)

# Normalise the question text (punctuation stripped, lowercased).
df['Question'] = df['Question'].map(transform_question_column)

In [None]:
# Same seed as the earlier preview, so the same five rows are shown after cleaning.
df.sample(n=5, random_state=0)

Unnamed: 0,Question,Command,BaseCommand,CommandID
49,require assistance on rebooting nds,help rebooting NDS,help,3
27,how can i request a vendor review,techrisk,techrisk,7
30,how do i draft new email,mail,mail,6
63,how can i telephone ernest,call Ernest,call,1
45,what is helpdesk,help,help,3


In [None]:
# Display the cleaned frame (the saved output elides the middle rows).
df

Unnamed: 0,Question,Command,BaseCommand,CommandID
0,how do i access email on my phone,Orbit,Orbit,0
1,how do i share documents with colleagues,Orbit,Orbit,0
2,access work from mobile device,Orbit,Orbit,0
3,how do i get orbit suite,Orbit,Orbit,0
4,android work applications,Orbit,Orbit,0
...,...,...,...,...
74,report a security incident,techrisk,techrisk,7
75,how do i request a site unblock,techrisk,techrisk,7
76,where do i request a control override,techrisk,techrisk,7
77,request a security review,techrisk,techrisk,7


In [None]:
import random

# Each base command is topped up to this many distinct questions.
TARGET_QUESTIONS_PER_COMMAND = 25
# Upper bound on augmentation attempts per command, so the loop cannot spin forever
# if the augmenters keep returning text that is already present.
MAX_ATTEMPTS_PER_COMMAND = 1000

# Seed Python's RNG so the branch choices and sampled probabilities repeat between runs.
# NOTE(review): AugLy / textaugment may draw from other RNGs as well, so the generated
# text is not guaranteed to be identical between runs — confirm if that matters.
random.seed(0)

# Original rows first; one frame of newly generated rows per command follows.
frames = [df]

for command in df['BaseCommand'].unique():
  print(command + "...")
  questions = df.loc[df['BaseCommand'] == command, 'Question'].tolist()
  num_questions = len(questions)
  num_needed = TARGET_QUESTIONS_PER_COMMAND - num_questions

  # The originals are already in `df`, so only new questions are collected here
  # (re-adding them would duplicate every original row in the result).
  new_questions = []
  attempts = 0
  while len(new_questions) < num_needed and attempts < MAX_ATTEMPTS_PER_COMMAND:
    # Cycle through the source questions, one per attempt.
    q = questions[attempts % num_questions]
    attempts += 1
    candidates = []

    # ~20% of attempts: simulated typos with randomly drawn character/word rates.
    if random.random() <= 0.2:
      candidates.append(
          txtaugs.simulate_typos(q, aug_char_p=random.uniform(0, 0.25), aug_word_p=random.uniform(0, 0.25))
      )

    # ~10% of attempts: split words.
    if random.random() <= 0.1:
      # NOTE(review): q is rebound, so the WordNet step below augments the split
      # text — confirm this chaining is intended.
      q = txtaugs.split_words(q, aug_word_p=0.1)
      candidates.append(q)

    # Every attempt: WordNet-based augmentation with randomly drawn settings.
    wordnet_augmenter = Wordnet(runs=random.randint(1, 2), p=random.uniform(0, 0.4))
    candidates.append(wordnet_augmenter.augment(q))

    for candidate in candidates:
      # List membership (not a set) so an unhashable augmenter output cannot raise here.
      is_new = candidate not in questions and candidate not in new_questions
      # The length check stops a multi-candidate attempt from overshooting the target.
      if is_new and len(new_questions) < num_needed:
        new_questions.append(candidate)

  if len(new_questions) < num_needed:
    print("  only generated", len(new_questions), "of", num_needed, "new questions for", command)

  if new_questions:
    frames.append(pd.DataFrame({
        'Question': new_questions,
        'BaseCommand': [command] * len(new_questions),
        'CommandID': [mapper[command]] * len(new_questions),
    }))

# Single concat instead of DataFrame.append (removed in pandas 2.0); ignore_index gives
# every row a unique index label. Generated rows have no 'Command' value.
augmented_df = pd.concat(frames, ignore_index=True)

Orbit...
teutr...
engHub...
techrisk...
mail...
kerb...
time...
help...
im...
call...


In [None]:
# errors='ignore' keeps this cell re-runnable: once 'Command' is gone, a second run
# is a no-op instead of raising KeyError.
augmented_df = augmented_df.drop(columns=['Command'], errors='ignore')

In [None]:
def lmao(x):
  """Return the CommandType label for a base command name.

  The lookup is case-insensitive, so it gives the same answer whether it is applied
  before or after the 'BaseCommand' column has been lowercased (e.g. 'engHub' and
  'enghub' both map to 'none').

  Args:
    x: Base command name, e.g. 'kerb' or 'Orbit'.

  Returns:
    One of 'name', 'command,none', 'place,none' or 'none'; None when `x` is not a
    string or is not a known base command.
  """
  command_types = {
      'kerb': 'name',
      'im': 'name',
      'mail': 'name',
      'call': 'name',
      'help': 'command,none',
      'teutr': 'command,none',
      'time': 'place,none',
      'enghub': 'none',
      'orbit': 'none',
      'techrisk': 'none',
  }
  # Non-string values (e.g. NaN) are treated as unknown commands.
  if not isinstance(x, str):
    return None
  return command_types.get(x.lower())
# Label every row with the CommandType of its base command, then preview a few rows.
augmented_df['CommandType'] = augmented_df['BaseCommand'].map(lmao)
augmented_df.sample(n=5)

Unnamed: 0,Question,BaseCommand,CommandID,CommandType
22,engeneer onboard,engHub,2,none
9,wha t is helpdesk,help,3,"command,none"
10,how do i take time panel,time,9,"place,none"
6,where to get selfservice tech support,teutr,8,"command,none"
6,require assistance on rebooting nds,help,3,"command,none"


In [None]:
# Lowercase the base command names (this runs after CommandType has been assigned).
augmented_df['BaseCommand'] = augmented_df['BaseCommand'].map(str.lower)
augmented_df.sample(n=5)

Unnamed: 0,Question,BaseCommand,CommandID,CommandType
4,send message to binitha,im,4,name
4,how to access helpdesk tickets,help,3,"command,none"
5,iphone work applications,orbit,0,none
19,how manage i retrieve kerberos of binitha,kerb,5,name
67,speak to callista,call,1,name


In [None]:
# Persist the augmented dataset; the index is not written to the file.
augmented_df.to_csv(path_or_buf='augmented_commands.csv', index=False)