## Data Formatting

Data is formatted and annotated for use with various models later in the project. Reformatting includes annotating lines with the syllable count of each word, breaking lines up into word sequences, etc. 

#### Imports and Function Definition

In [1]:
import pandas as pd
import numpy as np 
import cmudict as cmu
import re 

In [2]:
# Pronunciation lookup from the `cmudict` package; `syllable_count` indexes it by word.
phones = cmu.dict()

def syllable_count(word):
    """Return the syllable count of a token, or ``None`` when it is unknown.

    Parameters
    ----------
    word : str or tuple/list of str
        A single lowercase token, or an n-gram given as a sequence of tokens.

    Returns
    -------
    int or None
        * ``0`` for punctuation marks and the ``"\\n"`` line-boundary marker.
        * ``None`` for a single space and for any word that has no entry in
          the ``phones`` pronunciation lookup.
        * For an n-gram, the sum of its tokens' counts, or ``None`` if any
          token is unknown.
    """
    # N-grams are stored as tuples of tokens; count them token by token
    # instead of looking up the tuple's string representation.
    if isinstance(word, (tuple, list)):
        counts = [syllable_count(part) for part in word]
        if any(count is None for count in counts):
            return None
        return sum(counts)

    # Every punctuation mark the tokenizer can emit, plus the newline marker,
    # carries no syllables. (":" and "?" were previously missing.)
    if word in (",", ".", "!", "?", ":", ";", "-", "/", "\n"):
        return 0

    # A bare space has never been given a count; keep reporting it as unknown.
    if word == " ":
        return None

    try:
        # Use the first listed pronunciation of the word.
        first_pronunciation = phones[f"{word}"][0]
    except (KeyError, IndexError):
        # KeyError: plain-dict lookup miss. IndexError: the lookup returned an
        # empty pronunciation list. Either way the word is unknown.
        return None

    # CMUdict tags each vowel phone with a stress digit, so the number of
    # digit characters equals the number of syllables.
    return sum(char.isdigit() for phone in first_pronunciation for char in phone)
    
# thanks to this Stack Exchange post for the inspiration 
# https://datascience.stackexchange.com/questions/23376/how-to-get-the-number-of-syllables-in-a-word


def tuplize(word):
    """Pair ``word`` with its syllable count as a ``(word, count)`` tuple."""
    count = syllable_count(word)
    return word, count

In [3]:
# Sanity check: a word the pronunciation lookup cannot resolve is annotated with None.
tuplize("self-substantiatlize")

('self-substantiatlize', None)

In [4]:
# Source table whose "couplet" text column is tokenized in the cells below.
couplets = pd.read_csv("../data/couplets.csv")

# Source table whose "line" text column is tokenized in the cells below.
lines = pd.read_csv("../data/lines.csv")

#### Formatting Couplets

In [5]:
# Tokenize every couplet into lowercase tokens, keeping the punctuation.
# A token is either a run of word characters/apostrophes or a single
# punctuation mark from the listed set.

itemized = [
    tuple(token.lower() for token in re.findall(r"[\w']+|[-:.,!?;\/]", text))
    for text in couplets["couplet"]
]

couplets["itemized"] = itemized

In [6]:
# Tokenize every couplet into lowercase words with the punctuation removed.
# The separator class covers the same punctuation marks the punctuated
# tokenizer recognises (including "!", "?" and ";") plus whitespace.
# Adjacent separators such as ", " make re.split emit empty strings; those
# are dropped so every token is a real word.

itemized_sans = [
    tuple(token.lower() for token in re.split(r"[-:.,!?;\/\s]", text) if token)
    for text in couplets["couplet"]
]

couplets["itemized_no_punc"] = itemized_sans


In [7]:
# Annotate each tokenized couplet: every token becomes a (token, syllable count) pair.

# with punctuation
ann_itemized = [
    tuple(tuplize(token.lower()) for token in tokens)
    for tokens in couplets["itemized"]
]
couplets["annotated"] = ann_itemized

# without punctuation
ann_itemized_sans = [
    tuple(tuplize(token.lower()) for token in tokens)
    for tokens in couplets["itemized_no_punc"]
]
couplets["annotated_no_punc"] = ann_itemized_sans

In [20]:
# Persist the formatted couplets without the index column.
# NOTE: the tuple-valued columns come back as plain strings when this CSV is re-read.
couplets.to_csv("../data/couplets_formatted.csv", index=False)

#### Formatting Lines

In [9]:
# Tokenize every line into lowercase tokens, keeping the punctuation.
# A token is either a run of word characters/apostrophes or a single
# punctuation mark from the listed set.

itemized = [
    tuple(token.lower() for token in re.findall(r"[\w']+|[-:.,!?;\/]", text))
    for text in lines["line"]
]

lines["itemized"] = itemized

In [10]:
# Tokenize every line into lowercase words with the punctuation removed.
# The separator class covers the same punctuation marks the punctuated
# tokenizer recognises (including "!", "?" and ";") plus whitespace.
# Adjacent separators such as ", " make re.split emit empty strings; those
# are dropped so every token is a real word.

itemized_sans = [
    tuple(token.lower() for token in re.split(r"[-:.,!?;\/\s]", text) if token)
    for text in lines["line"]
]

lines["itemized_no_punc"] = itemized_sans

In [11]:
# Annotate each tokenized line: every token becomes a (token, syllable count) pair.

# with punctuation
ann_itemized = [
    tuple(tuplize(token.lower()) for token in tokens)
    for tokens in lines["itemized"]
]
lines["annotated"] = ann_itemized

# without punctuation
ann_itemized_sans = [
    tuple(tuplize(token.lower()) for token in tokens)
    for tokens in lines["itemized_no_punc"]
]
lines["annotated_no_punc"] = ann_itemized_sans

In [18]:
# Persist the formatted lines without the index column.
# NOTE: the tuple-valued columns come back as plain strings when this CSV is re-read.
lines.to_csv("../data/lines_formatted.csv", index=False)

#### Creating NGram and Individual Word Sequence Dataframe

In [12]:
# Accumulators for the punctuated token stream. Each list receives one entry
# per token position: the n-grams starting there and the neighbouring tokens.
one_gram, two_gram, three_gram = [], [], []
previous, next_one, next_two, next_three = [], [], [], []

# The same accumulators for the punctuation-free token stream.
one_gram_sans, two_gram_sans, three_gram_sans = [], [], []
previous_sans, next_one_sans, next_two_sans, next_three_sans = [], [], [], []


In [13]:
def _append_word_windows(tokens, previous_out, one_out, next_one_out,
                         two_out, next_two_out, three_out, next_three_out):
    """Append one record per token of ``tokens`` to the parallel output lists.

    For the token at position ``w`` the lists receive:

    * ``one_out``        - the token itself
    * ``previous_out``   - the token before it, or ``"\\n"`` at the line start
    * ``next_*_out``     - the tokens 1, 2 and 3 positions ahead, or ``"\\n"``
                           once that position runs past the end of the line
    * ``two_out``        - the 2-token slice starting at ``w``, or ``"\\n"``
                           when fewer than 2 tokens remain
    * ``three_out``      - the 3-token slice starting at ``w``, or ``"\\n"``
                           when fewer than 3 tokens remain

    Every list grows by exactly ``len(tokens)`` entries, so they stay aligned.
    """
    n_tokens = len(tokens)
    for w, token in enumerate(tokens):
        one_out.append(token)
        previous_out.append(tokens[w - 1] if w > 0 else "\n")

        next_one_out.append(tokens[w + 1] if w + 1 < n_tokens else "\n")
        next_two_out.append(tokens[w + 2] if w + 2 < n_tokens else "\n")
        next_three_out.append(tokens[w + 3] if w + 3 < n_tokens else "\n")

        two_out.append(tokens[w : w + 2] if w + 1 < n_tokens else "\n")
        three_out.append(tokens[w : w + 3] if w + 2 < n_tokens else "\n")


# Each token sequence is walked over its OWN length. Previously the
# punctuation-free tokens were indexed with the punctuated sequence's positions,
# and a bare `except: pass` hid the resulting IndexError: punctuation-free tokens
# beyond the punctuated length were silently dropped.
# NOTE: this cell appends to the accumulator lists; re-run the cell that creates
# them before re-running this one, otherwise every record is duplicated.
for tokens, tokens_sans in zip(lines["itemized"], lines["itemized_no_punc"]):

    # with punctuation
    _append_word_windows(
        tokens,
        previous, one_gram, next_one,
        two_gram, next_two, three_gram, next_three,
    )

    # without punctuation
    _append_word_windows(
        tokens_sans,
        previous_sans, one_gram_sans, next_one_sans,
        two_gram_sans, next_two_sans, three_gram_sans, next_three_sans,
    )
        
        

In [14]:
# Column layout for the punctuated word-sequence table.
word_sequence = {
    "previous_word": previous,
    "one_word": one_gram,
    "one_word_next": next_one,
    "two_word": two_gram,
    "two_word_next": next_two,
    "three_word": three_gram,
    "three_word_next": next_three,
}

# Column layout for the punctuation-free ("np") word-sequence table.
word_sequence_np = {
    "np_previous_word": previous_sans,
    "np_one_word": one_gram_sans,
    "np_one_word_next": next_one_sans,
    "np_two_word": two_gram_sans,
    "np_two_word_next": next_two_sans,
    "np_three_word": three_gram_sans,
    "np_three_word_next": next_three_sans,
}

# Annotated variants: every entry is passed through tuplize as-is, including
# the tuple-valued n-gram entries.
# NOTE(review): the key "three_gram" below breaks the "three_word" naming used
# by the other three tables; it is kept because it is an output column name.
# Confirm with downstream consumers before renaming.
ann_word_seq = {
    "previous_word": [tuplize(entry) for entry in previous],
    "one_word": [tuplize(entry) for entry in one_gram],
    "one_word_next": [tuplize(entry) for entry in next_one],
    "two_word": [tuplize(entry) for entry in two_gram],
    "two_word_next": [tuplize(entry) for entry in next_two],
    "three_gram": [tuplize(entry) for entry in three_gram],
    "three_word_next": [tuplize(entry) for entry in next_three],
}

# The punctuation-free annotated table uses exactly the same keys and column
# order as word_sequence_np, so it is derived from that mapping.
ann_word_seq_np = {
    column: [tuplize(entry) for entry in entries]
    for column, entries in word_sequence_np.items()
}


In [15]:
# Turn each column mapping into a DataFrame, in the same order as the mappings.
(
    word_sequence_df,
    word_sequence_np_df,
    ann_word_seq_df,
    ann_word_seq_np_df,
) = (
    pd.DataFrame(columns)
    for columns in (word_sequence, word_sequence_np, ann_word_seq, ann_word_seq_np)
)

In [16]:
# Write every word-sequence table to ../data, leaving out the index column.
for frame, csv_path in (
    (word_sequence_df, "../data/word_sequences.csv"),
    (word_sequence_np_df, "../data/word_sequences_np.csv"),
    (ann_word_seq_df, "../data/word_sequences_ann.csv"),
    (ann_word_seq_np_df, "../data/word_sequences_ann_np.csv"),
):
    frame.to_csv(csv_path, index=False)