## Imports + Setup

In [1]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload complete

In [2]:
from src import lang_permute

In [3]:
# Shared objects for the rest of the notebook:
#   syntax_model - passed as the `model` argument to the POS-extraction helpers below
#   fix_spelling - callable applied to a raw sentence in the spellchecking demo
# NOTE(review): neither helper is imported by name in the visible cells;
# presumably they live in src.lang_permute - confirm this survives Restart & Run All.
syntax_model = get_spacy_model()
fix_spelling = load_spellcheck()

spacy backend: en_core_web_trf


### Spellchecking

In [4]:
fix_spelling("The cts are playin with the tys and the dgs are barking at the brds.")

[{'generated_text': 'The cats are playing with the toys and the dogs are barking at the birds.'}]

## POS Extraction

In [5]:
get_nltk_pos_descriptions()

ADJ: Adjective
ADP: Adposition (Pre- & Post-)
ADV: Adverb
AUX: Auxiliary Verb
CONJ: Conjunction
CCONJ: Coordinating Conjunction
DET: Determiner
INTJ: Interjection
NOUN: Noun
NUM: Numeral
PART: Particle
PRON: Pronoun
PROPN: Proper Noun
PUNCT: Punctuation
SCONJ: Subordinating Conjunction
SYM: Symbol
VERB: Verb
X: Other


In [6]:
# Small fixture reused by every extraction and perturbation demo below.
sentences = ["The cats are playing with strings.",
             "The dogs are barking at the birds.",
             "The catfish are jumping in the river."]

In [7]:
extract_pos(sentences[0], ['NOUN'], model=syntax_model)

['cats', 'strings']

In [8]:
batch_extract_pos(sentences, ['NOUN'], syntax_model)

POS Extraction (over Sentences):   0%|          | 0/3 [00:00<?, ?it/s]

[['cats', 'strings'], ['dogs', 'birds'], ['catfish', 'river']]

In [9]:
batch_extract_pos(sentences, ['VERB'], syntax_model)

POS Extraction (over Sentences):   0%|          | 0/3 [00:00<?, ?it/s]

[['playing'], ['barking'], ['jumping']]

In [10]:
batch_extract_pos(sentences, ['VERB'], syntax_model, lemmatize=True)

POS Extraction (over Sentences):   0%|          | 0/3 [00:00<?, ?it/s]

[['play'], ['bark'], ['jump']]

In [11]:
batch_extract_pos(sentences, ['VERB'], syntax_model, exclude=True)

POS Extraction (over Sentences):   0%|          | 0/3 [00:00<?, ?it/s]

[['the', 'cats', 'are', 'with', 'strings', '.'],
 ['the', 'dogs', 'are', 'at', 'the', 'birds', '.'],
 ['the', 'catfish', 'are', 'in', 'the', 'river', '.']]

In [12]:
batch_extract_pos(sentences, ['PUNCT'], syntax_model, exclude=True)

POS Extraction (over Sentences):   0%|          | 0/3 [00:00<?, ?it/s]

[['the', 'cats', 'are', 'playing', 'with', 'strings'],
 ['the', 'dogs', 'are', 'barking', 'at', 'the', 'birds'],
 ['the', 'catfish', 'are', 'jumping', 'in', 'the', 'river']]

## Perturbations

### Word Shuffling

In [22]:
from random import seed

In [34]:
from random import seed as random_seed

def shuffle_words(sentence, seed=0):
    """Return `sentence` with its space-separated words in a seeded random order.

    Parameters
    ----------
    sentence : str
        Text to permute. It is split on single spaces, so punctuation stays
        attached to the word it follows.
    seed : int, default 0
        Value used to seed the global `random` generator before shuffling.
        The same seed and sentence always give the same permutation.

    Returns
    -------
    str
        The shuffled words joined back together with single spaces.

    Notes
    -----
    Reseeds the module-level `random` generator as a side effect.
    """
    # Imported locally so the function does not rely on a `shuffle` name
    # leaking in from kernel state (no visible cell imports it).
    import random

    random.seed(seed)  # honour the caller's seed (previously hard-coded to 0)
    words = sentence.split(' ')
    random.shuffle(words)
    return ' '.join(words)

In [35]:
# Every call uses the default seed, so sentences with the same number of
# tokens are permuted in exactly the same pattern.
[shuffle_words(sentence) for sentence in sentences]

['with are cats The strings. playing',
 'at are dogs The the barking birds.',
 'in are catfish The the jumping river.']

In [36]:
def nouns_only(sentences, model, lemmatize=False, 
               shuffle=False, rejoin=True):
    """Reduce each sentence to the tokens tagged as NOUN.

    Parameters
    ----------
    sentences : str or list of str
        One sentence or a list of sentences; a non-list value is wrapped
        in a single-element list.
    model
        Passed through to `batch_extract_pos` as its model argument.
    lemmatize : bool, default False
        Forwarded to `batch_extract_pos`.
    shuffle : bool, default False
        If True, randomly reorder the extracted words of each sentence
        in place. Uses the global `random` state; seed it beforehand for
        reproducible output.
    rejoin : bool, default True
        If True, return one space-joined string per sentence; otherwise
        return the per-sentence word lists.

    Returns
    -------
    list of str, or list of list of str when `rejoin` is False.
    """
    # The boolean `shuffle` parameter shadows any outer `shuffle` function,
    # so the shuffler is imported under a distinct local name. Previously
    # `shuffle=True` tried to call the bool and raised TypeError.
    from random import shuffle as shuffle_in_place

    if not isinstance(sentences, list):
        sentences = [sentences]

    extracts = batch_extract_pos(sentences, ['NOUN'], model,
                                 lemmatize=lemmatize)

    if shuffle:
        for words in extracts:
            shuffle_in_place(words)

    if not rejoin:
        return extracts

    return [' '.join(extract) for extract in extracts]

In [37]:
nouns_only(sentences, syntax_model)

POS Extraction (over Sentences):   0%|          | 0/3 [00:00<?, ?it/s]

['cats strings', 'dogs birds', 'catfish river']

In [38]:
def everything_but(sentences, model, pos_tags,
                   shuffle=False, rejoin=True):
    """Strip the tokens carrying any of `pos_tags` from each sentence.

    Parameters
    ----------
    sentences : str or list of str
        One sentence or a list of sentences; a non-list value is wrapped
        in a single-element list.
    model
        Passed through to `batch_extract_pos` as its model argument.
    pos_tags : list of str
        POS tags to remove (e.g. ``['NOUN', 'DET', 'PUNCT']``); handed to
        `batch_extract_pos` with ``exclude=True``.
    shuffle : bool, default False
        If True, randomly reorder the remaining words of each sentence
        in place. Uses the global `random` state; seed it beforehand for
        reproducible output.
    rejoin : bool, default True
        If True, return one space-joined string per sentence; otherwise
        return the per-sentence word lists.

    Returns
    -------
    list of str, or list of list of str when `rejoin` is False.
    """
    # The boolean `shuffle` parameter shadows any outer `shuffle` function,
    # so the shuffler is imported under a distinct local name. Previously
    # `shuffle=True` tried to call the bool and raised TypeError.
    from random import shuffle as shuffle_in_place

    if not isinstance(sentences, list):
        sentences = [sentences]

    extracts = batch_extract_pos(sentences, pos_tags, model,
                                 exclude=True)

    if shuffle:
        for words in extracts:
            shuffle_in_place(words)

    if not rejoin:
        return extracts

    return [' '.join(extract) for extract in extracts]

In [39]:
# Drop nouns, determiners and punctuation; whatever tokens remain are kept
# in their original order.
exclude_pos_tags = ['NOUN','DET','PUNCT']
everything_but(sentences, syntax_model, exclude_pos_tags)

POS Extraction (over Sentences):   0%|          | 0/3 [00:00<?, ?it/s]

['are playing with', 'are barking at', 'are jumping in']