In [2]:
# TF-IDF, Word2Vec, FastText

import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Load the raw script file; every element of `lines` keeps its trailing newline.
# NOTE(review): no encoding is passed, so the platform default is used —
# consider an explicit encoding= once the file's encoding is confirmed.
with open("DanceScripts.txt", "r") as f:
    lines = list(f)

In [4]:
# Keep only lines longer than 10 characters (the count includes the trailing
# newline), which drops blank and very short lines.
# A comprehension is used so that no loop variable is left in the notebook
# namespace: binding `_` at top level shadows IPython's "last output" variable.
cleaned_lines = [line for line in lines if len(line) > 10]

In [5]:
# Preview the first few kept lines instead of dumping the whole list into the output.
cleaned_lines[:5]

["Ladies and gentlemen, prepare to be mesmerized by the dance artistry of Isabella Rodriguez! Her moves are poetry in motion, and tonight, she graces our stage with a performance that promises to be nothing short of extraordinary. Let's welcome Isabella Rodriguez to the spotlight!\n",
 'Hold your applause for the spellbinding duo, Aiden and Lily! Their synchronized dance routines and seamless connection are set to transport us into a world of rhythmic harmony. Brace yourselves for a mesmerizing performance by Aiden and Lily!\n',
 'In the spotlight tonight is the dynamic performer, Elijah Turner! With a fusion of contemporary and street dance, Elijah is about to break all the rules and redefine the dance narrative. Get ready for a performance that pushes the boundaries with Elijah Turner!\n',
 "Make way for the sensational Maya Patel! Her dance style is a symphony of elegance and strength, creating a visual masterpiece with every movement. Let's give a warm welcome to the enchanting May

In [6]:
import pandas as pd
# Build the working frame in one step: a single "RAW" column holding the kept lines.
df = pd.DataFrame({"RAW": cleaned_lines})

## Preprocessing

### Punctuation

In [7]:
import string

punc = string.punctuation

# Deletion table built once from `punc`: every punctuation character maps to None.
_PUNC_TABLE = str.maketrans("", "", punc)

def clean_punc(string):
    """Return ``string`` with every character listed in ``punc`` removed.

    Args:
        string: Text to clean. (The parameter name shadows the ``string``
            module inside this function; only ``str`` methods are used here.)

    Returns:
        A new ``str`` with the same characters in the same order, minus the
        punctuation. Whitespace, including newlines, is preserved.
    """
    # A single translate() pass replaces the per-character concatenation loop.
    return string.translate(_PUNC_TABLE)
# Strip punctuation from every raw line and store the result in its own column.
df["no_punc"] = df["RAW"].map(clean_punc)

### Tokenization

In [8]:
def tokenize(string):
    """Lower-case ``string`` and split it on runs of whitespace.

    Returns a list of tokens; an empty or whitespace-only input gives ``[]``.
    """
    return string.lower().split()

In [9]:
# Tokenize the punctuation-free text, then display the frame.
df["Tokens"] = df["no_punc"].map(tokenize)
df

Unnamed: 0,RAW,no_punc,Tokens
0,"Ladies and gentlemen, prepare to be mesmerized...",Ladies and gentlemen prepare to be mesmerized ...,"[ladies, and, gentlemen, prepare, to, be, mesm..."
1,"Hold your applause for the spellbinding duo, A...",Hold your applause for the spellbinding duo Ai...,"[hold, your, applause, for, the, spellbinding,..."
2,In the spotlight tonight is the dynamic perfor...,In the spotlight tonight is the dynamic perfor...,"[in, the, spotlight, tonight, is, the, dynamic..."
3,Make way for the sensational Maya Patel! Her d...,Make way for the sensational Maya Patel Her da...,"[make, way, for, the, sensational, maya, patel..."
4,Hold onto your seats as we welcome the charism...,Hold onto your seats as we welcome the charism...,"[hold, onto, your, seats, as, we, welcome, the..."
...,...,...,...
115,Join me in welcoming the charismatic performer...,Join me in welcoming the charismatic performer...,"[join, me, in, welcoming, the, charismatic, pe..."
116,Hold onto your seats as we present the enchant...,Hold onto your seats as we present the enchant...,"[hold, onto, your, seats, as, we, present, the..."
117,Get ready for the soulful performance of Olivi...,Get ready for the soulful performance of Olivi...,"[get, ready, for, the, soulful, performance, o..."
118,In the spotlight tonight is the sensational ta...,In the spotlight tonight is the sensational ta...,"[in, the, spotlight, tonight, is, the, sensati..."


### Stopwords

In [10]:
# English stop-word list from NLTK. The corpus is fetched on demand so the
# notebook also runs on a machine where it has not been downloaded yet.
try:
    st = stopwords.words("english")
except LookupError:
    nltk.download("stopwords")
    st = stopwords.words("english")

In [11]:
def remove_stop_words(tokens, stop_words=None):
    """Return the tokens that are not stop words, in their original order.

    Args:
        tokens: Iterable of token strings.
        stop_words: Optional collection of words to drop. When omitted, the
            notebook-level ``st`` list is used.

    Returns:
        A new list containing only the tokens that are not stop words.
    """
    # Hash the stop words once per call; testing membership against the raw
    # list would be a linear scan for every token.
    blocked = frozenset(st if stop_words is None else stop_words)
    return [tk for tk in tokens if tk not in blocked]

# Filter the stop words out of each token list.
df["removed_stop"] = df["Tokens"].map(remove_stop_words)

In [12]:
# Display the frame, which now includes the "removed_stop" column.
df

Unnamed: 0,RAW,no_punc,Tokens,removed_stop
0,"Ladies and gentlemen, prepare to be mesmerized...",Ladies and gentlemen prepare to be mesmerized ...,"[ladies, and, gentlemen, prepare, to, be, mesm...","[ladies, gentlemen, prepare, mesmerized, dance..."
1,"Hold your applause for the spellbinding duo, A...",Hold your applause for the spellbinding duo Ai...,"[hold, your, applause, for, the, spellbinding,...","[hold, applause, spellbinding, duo, aiden, lil..."
2,In the spotlight tonight is the dynamic perfor...,In the spotlight tonight is the dynamic perfor...,"[in, the, spotlight, tonight, is, the, dynamic...","[spotlight, tonight, dynamic, performer, elija..."
3,Make way for the sensational Maya Patel! Her d...,Make way for the sensational Maya Patel Her da...,"[make, way, for, the, sensational, maya, patel...","[make, way, sensational, maya, patel, dance, s..."
4,Hold onto your seats as we welcome the charism...,Hold onto your seats as we welcome the charism...,"[hold, onto, your, seats, as, we, welcome, the...","[hold, onto, seats, welcome, charismatic, duo,..."
...,...,...,...,...
115,Join me in welcoming the charismatic performer...,Join me in welcoming the charismatic performer...,"[join, me, in, welcoming, the, charismatic, pe...","[join, welcoming, charismatic, performer, ava,..."
116,Hold onto your seats as we present the enchant...,Hold onto your seats as we present the enchant...,"[hold, onto, your, seats, as, we, present, the...","[hold, onto, seats, present, enchanting, chore..."
117,Get ready for the soulful performance of Olivi...,Get ready for the soulful performance of Olivi...,"[get, ready, for, the, soulful, performance, o...","[get, ready, soulful, performance, olivia, fos..."
118,In the spotlight tonight is the sensational ta...,In the spotlight tonight is the sensational ta...,"[in, the, spotlight, tonight, is, the, sensati...","[spotlight, tonight, sensational, talent, etha..."


In [13]:
# The resources are downloaded next to the notebook, so that directory must also
# be on NLTK's search path; otherwise lookups only succeed if the data happens
# to be installed in one of NLTK's default locations as well.
if "./" not in nltk.data.path:
    nltk.data.path.append("./")

nltk.download('wordnet', download_dir="./")
nltk.download('averaged_perceptron_tagger', download_dir="./")

from nltk.stem import WordNetLemmatizer

# One shared lemmatizer instance, reused for every token.
lemma = WordNetLemmatizer()

def lemmatize(tokens):
    """Return a list with each token passed through ``lemma.lemmatize``.

    No part-of-speech hint is supplied, so the lemmatizer's default is used.
    """
    return list(map(lemma.lemmatize, tokens))

[nltk_data] Downloading package wordnet to ./...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to ./...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [14]:
# Lemmatize the stop-word-free tokens, then display the frame.
df["lems"] = df["removed_stop"].map(lemmatize)
df

Unnamed: 0,RAW,no_punc,Tokens,removed_stop,lems
0,"Ladies and gentlemen, prepare to be mesmerized...",Ladies and gentlemen prepare to be mesmerized ...,"[ladies, and, gentlemen, prepare, to, be, mesm...","[ladies, gentlemen, prepare, mesmerized, dance...","[lady, gentleman, prepare, mesmerized, dance, ..."
1,"Hold your applause for the spellbinding duo, A...",Hold your applause for the spellbinding duo Ai...,"[hold, your, applause, for, the, spellbinding,...","[hold, applause, spellbinding, duo, aiden, lil...","[hold, applause, spellbinding, duo, aiden, lil..."
2,In the spotlight tonight is the dynamic perfor...,In the spotlight tonight is the dynamic perfor...,"[in, the, spotlight, tonight, is, the, dynamic...","[spotlight, tonight, dynamic, performer, elija...","[spotlight, tonight, dynamic, performer, elija..."
3,Make way for the sensational Maya Patel! Her d...,Make way for the sensational Maya Patel Her da...,"[make, way, for, the, sensational, maya, patel...","[make, way, sensational, maya, patel, dance, s...","[make, way, sensational, maya, patel, dance, s..."
4,Hold onto your seats as we welcome the charism...,Hold onto your seats as we welcome the charism...,"[hold, onto, your, seats, as, we, welcome, the...","[hold, onto, seats, welcome, charismatic, duo,...","[hold, onto, seat, welcome, charismatic, duo, ..."
...,...,...,...,...,...
115,Join me in welcoming the charismatic performer...,Join me in welcoming the charismatic performer...,"[join, me, in, welcoming, the, charismatic, pe...","[join, welcoming, charismatic, performer, ava,...","[join, welcoming, charismatic, performer, ava,..."
116,Hold onto your seats as we present the enchant...,Hold onto your seats as we present the enchant...,"[hold, onto, your, seats, as, we, present, the...","[hold, onto, seats, present, enchanting, chore...","[hold, onto, seat, present, enchanting, choreo..."
117,Get ready for the soulful performance of Olivi...,Get ready for the soulful performance of Olivi...,"[get, ready, for, the, soulful, performance, o...","[get, ready, soulful, performance, olivia, fos...","[get, ready, soulful, performance, olivia, fos..."
118,In the spotlight tonight is the sensational ta...,In the spotlight tonight is the sensational ta...,"[in, the, spotlight, tonight, is, the, sensati...","[spotlight, tonight, sensational, talent, etha...","[spotlight, tonight, sensational, talent, etha..."


In [15]:
# Persist every column to CSV without the row index.
# NOTE(review): the token columns hold Python lists, which CSV stores as their
# string representation — they need parsing (e.g. ast.literal_eval) when read back.
df.to_csv("ProcessedDanceScripts.csv", index=False)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Toy four-sentence corpus used to illustrate TF-IDF on a small example.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer()
# Fit the vectorizer on the corpus and vectorize it in the same call.
X = vectorizer.fit_transform(corpus)
# Feature names learned from the corpus (nine distinct lower-cased words here).
words = vectorizer.get_feature_names_out()
print(words)

['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


In [17]:
# Show the TF-IDF weights as a dense array: one row per document in `corpus`,
# one column per entry of `words`.
X.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

In [18]:
# Show the feature-name array itself (the same values that were printed earlier).
words

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)