In [1]:
import nltk
import string
import numpy as np
import pandas as pd

from nltk.corpus import words
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = []
with open("DanceScripts.txt", "r") as danceFile:
    danceLines = danceFile.readlines()
for i in range(0, len(danceLines), 2):
    data.append(danceLines[i])
    
print(len(data))
print(data[:5])

500
["Ladies and gentlemen, prepare to be mesmerized by the dance artistry of Isabella Rodriguez! Her moves are poetry in motion, and tonight, she graces our stage with a performance that promises to be nothing short of extraordinary. Let's welcome Isabella Rodriguez to the spotlight!\n", 'Hold your applause for the spellbinding duo, Aiden and Lily! Their synchronized dance routines and seamless connection are set to transport us into a world of rhythmic harmony. Brace yourselves for a mesmerizing performance by Aiden and Lily!\n', 'In the spotlight tonight is the dynamic performer, Elijah Turner! With a fusion of contemporary and street dance, Elijah is about to break all the rules and redefine the dance narrative. Get ready for a performance that pushes the boundaries with Elijah Turner!\n', "Make way for the sensational Maya Patel! Her dance style is a symphony of elegance and strength, creating a visual masterpiece with every movement. Let's give a warm welcome to the enchanting Ma

In [3]:
danceData = pd.DataFrame()
danceData["raw"] = data
danceData["raw"].head()

0    Ladies and gentlemen, prepare to be mesmerized...
1    Hold your applause for the spellbinding duo, A...
2    In the spotlight tonight is the dynamic perfor...
3    Make way for the sensational Maya Patel! Her d...
4    Hold onto your seats as we welcome the charism...
Name: raw, dtype: object

In [4]:
def lowerCase(sentence):
    sentence = sentence.lower()
    return sentence

danceData["lowerCase"] = danceData["raw"].apply(lowerCase)
danceData["lowerCase"].head()

0    ladies and gentlemen, prepare to be mesmerized...
1    hold your applause for the spellbinding duo, a...
2    in the spotlight tonight is the dynamic perfor...
3    make way for the sensational maya patel! her d...
4    hold onto your seats as we welcome the charism...
Name: lowerCase, dtype: object

In [5]:
def tokenize(sentence):
    tokenized = sentence.split(" ")
    return tokenized

danceData["tokenize"] = danceData["lowerCase"].apply(tokenize)
danceData["tokenize"].head()

0    [ladies, and, gentlemen,, prepare, to, be, mes...
1    [hold, your, applause, for, the, spellbinding,...
2    [in, the, spotlight, tonight, is, the, dynamic...
3    [make, way, for, the, sensational, maya, patel...
4    [hold, onto, your, seats, as, we, welcome, the...
Name: tokenize, dtype: object

In [6]:
danceData.head()

Unnamed: 0,raw,lowerCase,tokenize
0,"Ladies and gentlemen, prepare to be mesmerized...","ladies and gentlemen, prepare to be mesmerized...","[ladies, and, gentlemen,, prepare, to, be, mes..."
1,"Hold your applause for the spellbinding duo, A...","hold your applause for the spellbinding duo, a...","[hold, your, applause, for, the, spellbinding,..."
2,In the spotlight tonight is the dynamic perfor...,in the spotlight tonight is the dynamic perfor...,"[in, the, spotlight, tonight, is, the, dynamic..."
3,Make way for the sensational Maya Patel! Her d...,make way for the sensational maya patel! her d...,"[make, way, for, the, sensational, maya, patel..."
4,Hold onto your seats as we welcome the charism...,hold onto your seats as we welcome the charism...,"[hold, onto, your, seats, as, we, welcome, the..."


In [7]:
danceData.to_csv("preProcessedDanceData.csv", index=False)

In [8]:
modifiedTag = []
words = words.words()
for tokens in danceData["tokenize"]:
    tag = nltk.pos_tag(tokens)
    temp = []
    for element in tag:
        modifiedElement = list(element)
        word = modifiedElement[0]
        posTag = modifiedElement[1]
        if posTag == "NN":
            if word not in words:
                modifiedElement[0] = "name"
        temp.append(modifiedElement)
    modifiedTag.append(temp)
print(len(modifiedTag))
modifiedTag[0]

500


[['ladies', 'NNS'],
 ['and', 'CC'],
 ['gentlemen,', 'NNS'],
 ['prepare', 'VBP'],
 ['to', 'TO'],
 ['be', 'VB'],
 ['mesmerized', 'VBN'],
 ['by', 'IN'],
 ['the', 'DT'],
 ['dance', 'NN'],
 ['artistry', 'NN'],
 ['of', 'IN'],
 ['name', 'NN'],
 ['name', 'NN'],
 ['her', 'PRP$'],
 ['moves', 'NNS'],
 ['are', 'VBP'],
 ['poetry', 'NN'],
 ['in', 'IN'],
 ['name', 'NN'],
 ['and', 'CC'],
 ['tonight,', 'VB'],
 ['she', 'PRP'],
 ['graces', 'VBZ'],
 ['our', 'PRP$'],
 ['stage', 'NN'],
 ['with', 'IN'],
 ['a', 'DT'],
 ['performance', 'NN'],
 ['that', 'WDT'],
 ['promises', 'VBZ'],
 ['to', 'TO'],
 ['be', 'VB'],
 ['nothing', 'NN'],
 ['short', 'JJ'],
 ['of', 'IN'],
 ['extraordinary.', 'JJ'],
 ["let's", 'JJ'],
 ['welcome', 'JJ'],
 ['name', 'NN'],
 ['name', 'NN'],
 ['to', 'TO'],
 ['the', 'DT'],
 ['name', 'NN']]

In [9]:
modifiedDataLines = []
for tag in modifiedTag:
    wordList = []
    for element in tag:
        word_ = element[0]
        wordList.append(word_)
    sentence = " ".join(wordList)
    modifiedDataLines.append(sentence)
print(modifiedDataLines)

["ladies and gentlemen, prepare to be mesmerized by the dance artistry of name name her moves are poetry in name and tonight, she graces our stage with a performance that promises to be nothing short of extraordinary. let's welcome name name to the name", 'hold your applause for the spellbinding name name and lily! their synchronized dance routines and seamless connection are set to transport us into a world of rhythmic name brace yourselves for a name performance by aiden and lily!\n', 'in the spotlight tonight is the dynamic name name name with a fusion of contemporary and street dance, elijah is about to break all the rules and redefine the dance name get ready for a performance that pushes the boundaries with elijah name', "make way for the sensational maya patel! her dance style is a symphony of elegance and name creating a visual masterpiece with every name let's give a warm welcome to the enchanting maya patel as she graces our name", 'hold onto your seats as we welcome the char

In [10]:
len(modifiedDataLines)

500

In [11]:
tokenizer = BertTokenizer.from_pretrained('dkleczek/bert-base-polish-uncased-v1', from_tf=True)

In [12]:
maxlen = 0
for line in modifiedDataLines:
    length = len(line)
    if length > maxlen:
        maxlen = length
print(maxlen)

269


In [13]:
tokenList = []
for line in modifiedDataLines:
    tokens = tokenizer(line.strip(), padding="max_length", truncation=True, max_length=maxlen)
    tokenList.append(tokens["input_ids"])

In [14]:
tokenList = np.array(tokenList)
tokenList.shape

(500, 269)