## Imports + Setup

In [1]:
%load_ext autoreload
%autoreload complete

In [23]:
from src import lang_permute
import pandas as pd
from random import seed as random_seed
import numpy as np

In [3]:
syntax_model = lang_permute.get_spacy_model()
fix_spelling = lang_permute.load_spellcheck()

spacy backend: en_core_web_trf


In [4]:
def shuffle_words(sentence, seed=0):
    random_seed(0) # set seed
    words = sentence.split(' ')
    lang_permute.shuffle(words)
    return ' '.join(words)

In [35]:
def pos_extraction(sentences, model, pos_tags, 
             lemmatize=False, shuffle=False,
                   exclude=False):
        
    if not isinstance(sentences, list):
        sentences = [sentences]
    
    extracts = lang_permute.batch_extract_pos(sentences, pos_tags, model,
                                              lemmatize=lemmatize, exclude=exclude)
    extracts = [' '.join(extract) for extract in extracts] 
    
    if shuffle:
        return [shuffle_words(words) for words in extracts]
    
    return extracts

In [14]:
cap_cols = [f'caption{str(i+1).zfill(2)}'for i in range(5)]
caption_df = pd.read_csv('../data/interim/CaptionData/captions.csv').set_index('video_name')
captions = list(caption_df[cap_cols].to_numpy().flatten())
corrected_captions_dict = fix_spelling(captions)
corrected_captions = [cap['generated_text'] for cap in corrected_captions_dict]
corrected_captions = pos_extraction(corrected_captions, syntax_model, ['PUNCT'], exclude=True)

POS Extraction (over Sentences): 100%|██████████| 1250/1250 [00:04<00:00, 258.86it/s]


In [37]:
out_conditions = {'lemmas_shuffled': [['PUNC'], True, True],
                  'lemmas_ordered': [['PUNC'], False, True],
                  'excnv_shuffled': [['NOUN','VERB'], True, True],
                  'excnv_ordered': [['NOUN','VERB'], False, True],
                  'nv_shuffled': [['NOUN','VERB'], True, False],
                  'nv_ordered': [['NOUN','VERB'], False, False],
                  'verb_shuffled': [['VERB'], True, False],
                  'verb_ordered': [['VERB'], False, False],
                  'noun_shuffled': [['NOUN'], True, False],
                  'noun_ordered': [['NOUN'], False, False],
                  }

In [39]:
for key, val in out_conditions.items(): 
    print(key, val)
    out_name = f'../data/interim/CaptionData/{key}.txt'
    extracted_captions = pos_extraction(corrected_captions, syntax_model,
                                        val[0], lemmatize=True, 
                                        shuffle=val[1], exclude=val[2])
    print(extracted_captions[:5])
    print()
    file = open(out_name,'w')
    for item in extracted_captions:
        file.write(item+"\n")
    file.close()

lemmas_shuffled [['PUNC'], True, True]


POS Extraction (over Sentences): 100%|██████████| 1250/1250 [00:04<00:00, 251.60it/s]


['hold two game man while sit play a with his tv video a on small baby', 'lap play man on a baby with a wii his', 'a a in man to in short standard with chair next sit a a smile lamp', 'man baby the wii chuckle play on be make the a dog while which the', 'a child and father show enjoy']

lemmas_ordered [['PUNC'], False, True]


POS Extraction (over Sentences): 100%|██████████| 1250/1250 [00:04<00:00, 259.21it/s]


['a man sit play video game on his tv while hold a baby with two small', 'a man with a baby on his lap play wii', 'a man in short sit in a chair next to a standard lamp with a smile', 'a man play on the wii which be make the baby chuckle while the dog', 'father and child enjoy a show']

excnv_shuffled [['NOUN', 'VERB'], True, True]


POS Extraction (over Sentences): 100%|██████████| 1250/1250 [00:04<00:00, 261.04it/s]


['a on with his a while small two', 'his a with a wii on', 'standard with in to a next in a a a', 'be on the the a which the while', 'and a']

excnv_ordered [['NOUN', 'VERB'], False, True]


POS Extraction (over Sentences): 100%|██████████| 1250/1250 [00:04<00:00, 261.33it/s]

['a on his while a with two small', 'a with a on his wii', 'a in in a next to a standard with a', 'a on the which be the while the', 'and a']




