# Wordnet Synonyms

In [None]:
from random import choice

# NLTK Import
try:
  from nltk.corpus import wordnet
  # Run a throwaway lookup so that missing WordNet data is detected right here.
  wordnet.synsets('hello')
except LookupError:
  # The lookup failed: fetch the 'wordnet' package through the NLTK downloader.
  import nltk
  nltk.download('wordnet')

def synonyms(word):
  """
    Return a list of synonyms of the input word.

    Every lemma name of every WordNet synset found for `word` is collected
    into a set, so the result has no duplicates and no guaranteed order.
    The output list may contain the original word, and it is empty when no
    synset is found for `word`.
  """
  # A set comprehension de-duplicates lemma names shared by several synsets.
  results = {
    lemma.name()
    for syn in wordnet.synsets(word)
    for lemma in syn.lemmas()
  }

  return list(results)

def synonym_or_self(word):
  """
    Return a randomly chosen synonym of the input word.

    It handles the cases when the synonyms for a word are unavailable:
    the original word is returned in such cases.
  """
  # An empty synonym list is falsy, so `or` falls back to the word itself.
  return choice(synonyms(word) or [word])

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


# Transformations

In [None]:
from keras.preprocessing.text import text_to_word_sequence

# Sample sentence that the transformation examples operate on.
original = 'We enjoyed our short vacation in Mexico'
words = text_to_word_sequence(original) # Tokenize the sentence to a list of words.

## Synonym Transformation

In [None]:
candidates = ['vacation'] # These are the words that can be replaced with their synonyms

def syn_transformation(words, candidates):
  """
    Build a new word list in which every word found in `candidates` is
    replaced by the result of `synonym_or_self`; all other words are kept
    unchanged. The input sequence itself is not modified.
  """
  return [
    synonym_or_self(token) if token in candidates else token
    for token in words
  ]

syn_transformation(words, candidates)

['we', 'enjoyed', 'our', 'short', 'holiday', 'in', 'mexico']

## Random Insertion

In [None]:
from random import randint

candidates = ['vacation'] # These are the words whose synonyms will be inserted.

def ins_transformation(words, candidates):
  """
    Insert a synonym for every candidate word at a random position in the
    word sequence.

    `words` is modified in place and is also returned, so pass a copy when
    the original sequence must be preserved.
  """
  for candidate in candidates:
    # randint is inclusive on both ends. Using len(words) as the upper bound
    # lets the synonym land after the last word and keeps an empty list
    # from raising ValueError (randint(0, -1) is an empty range).
    pos = randint(0, len(words))
    syn_word = synonym_or_self(candidate) # Get a random synonym
    words.insert(pos, syn_word)

  return words

ins_transformation(words.copy(), candidates)

['we', 'enjoyed', 'our', 'short', 'vacation', 'in', 'vacation', 'mexico']

## Random Deletion

In [None]:
from random import randint

def del_transformation(words):
  """
    Delete the word at a random position in the word sequence.

    `words` is modified in place and is also returned, so pass a copy when
    the original sequence must be preserved. An empty sequence is returned
    unchanged.
  """
  if not words:
    # Nothing to delete; randint(0, -1) would raise ValueError.
    return words

  pos = randint(0, len(words) - 1) # Random deletion position
  words.pop(pos)

  return words

del_transformation(words.copy())

['we', 'enjoyed', 'our', 'short', 'in', 'mexico']

## Random Swap

In [None]:
from random import randint

def swap_transformation(words):
  """
    Swap the words at two distinct random positions in the word sequence.

    `words` is modified in place and is also returned, so pass a copy when
    the original sequence must be preserved. A sequence with fewer than two
    words has nothing to swap and is returned unchanged.
  """
  # Local import: the surrounding cell only imports `randint`.
  from random import sample

  if len(words) < 2:
    return words

  # Sampling without replacement guarantees two different positions, so the
  # swap is never a silent no-op caused by drawing the same index twice.
  pos1, pos2 = sample(range(len(words)), 2)
  words[pos1], words[pos2] = words[pos2], words[pos1]

  return words

swap_transformation(words.copy())

['we', 'enjoyed', 'our', 'short', 'vacation', 'in', 'mexico']


['we', 'enjoyed', 'our', 'short', 'mexico', 'in', 'vacation']

## Random Shuffle

In [None]:
from random import shuffle

# NLTK Import
try:
  from nltk.tokenize import sent_tokenize
  # Placeholder tokenization to ensure the sentence tokenizer data is available.
  sent_tokenize('hello')
except LookupError:
  # The tokenizer data is missing: fetch the 'punkt' package.
  # NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of
  # 'punkt' - confirm against the installed NLTK version.
  import nltk
  nltk.download('punkt')

# Input paragraph
paragraph = "In some ways, \"telegram style\" was the precursor to the modern language abbreviations employed in \"texting\" or the use of short message standard (SMS) services such as Twitter. For telegrams, space was at a premium—economically speaking—and abbreviations were used as necessity. This motivation was revived for compressing information into the 160-character limit of a costly SMS before the advent of multi-message capabilities. Length constraints, and the initial handicap of having to enter each individual letter using multiple keypresses on a numeric pad, drove readoption of telegraphic style, and continued space limits and high per-message cost meant the practice persisted for some time after the introduction of built-in predictive text assistance despite it then needing more effort to write (and read)."

# Sentence tokenization: split the paragraph into a list of sentences
sentences = list(sent_tokenize(paragraph))

# Sentence shuffle: reorder the list in place, in random order
shuffle(sentences)

# Paragraph recomposition: join the shuffled sentences with single spaces
shuffled_paragraph = ' '.join(sentences)
print(shuffled_paragraph)


This motivation was revived for compressing information into the 160-character limit of a costly SMS before the advent of multi-message capabilities. For telegrams, space was at a premium—economically speaking—and abbreviations were used as necessity. In some ways, "telegram style" was the precursor to the modern language abbreviations employed in "texting" or the use of short message standard (SMS) services such as Twitter. Length constraints, and the initial handicap of having to enter each individual letter using multiple keypresses on a numeric pad, drove readoption of telegraphic style, and continued space limits and high per-message cost meant the practice persisted for some time after the introduction of built-in predictive text assistance despite it then needing more effort to write (and read).
