In [11]:
# We use the Natural Language Toolkit Library (NLTK) to look at individual words and
# sentences in a text, and to clean unnecessary features from the text data 
# to prepare for sentiment analysis. TEXT ANALYSIS

# The NLTK library was built to separate punctuation from words
# when tokenizing (splitting into parts).

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# this is sample data
from nltk.corpus import names

from string import punctuation

In [12]:
# import pandas as pd

# load the data from 12dancingprincesses.txt

# filepath = r'12dancingprincesses.txt'
# df = pd.read_csv(filepath, encoding = "cp125")  # got this encoding from Dawit!

# df.head()

In [13]:
import pandas as pd

filename = '12dancingprincesses.txt'

with open(filename, encoding= "latin-1") as f_obj:
    contents = f_obj.read()

In [19]:
word_tokenize(contents)

['the',
 'twelve',
 'dancing',
 'princesses',
 'there',
 'was',
 'a',
 'king',
 'who',
 'had',
 'twelve',
 'beautiful',
 'daughters',
 '.',
 'they',
 'slept',
 'in',
 'twelve',
 'beds',
 'all',
 'in',
 'one',
 'room',
 ';',
 'and',
 'when',
 'they',
 'went',
 'to',
 'bed',
 ',',
 'the',
 'doors',
 'were',
 'shut',
 'and',
 'locked',
 'up',
 ';',
 'but',
 'every',
 'morning',
 'their',
 'shoes',
 'were',
 'found',
 'to',
 'be',
 'quite',
 'worn',
 'through',
 'as',
 'if',
 'they',
 'had',
 'been',
 'danced',
 'in',
 'all',
 'night',
 ';',
 'and',
 'yet',
 'nobody',
 'could',
 'find',
 'out',
 'how',
 'it',
 'happened',
 ',',
 'or',
 'where',
 'they',
 'had',
 'been',
 '.',
 'then',
 'the',
 'king',
 'made',
 'it',
 'known',
 'to',
 'all',
 'the',
 'land',
 ',',
 'that',
 'if',
 'any',
 'person',
 'could',
 'discover',
 'the',
 'secret',
 ',',
 'and',
 'find',
 'out',
 'where',
 'it',
 'was',
 'that',
 'the',
 'princesses',
 'danced',
 'in',
 'the',
 'night',
 ',',
 'he',
 'should',
 'ha

In [15]:
# Let's clean it up using NLTK and do a basic analysis.

# first, change all the words to lowercase
contents = contents.lower()

# then tokenize each part of the text
tknz_wct = word_tokenize(contents)

In [16]:
# length includes words + punctuation

len(tknz_wct)

1803

In [17]:
# look at the first 11 tokens in the list. A list slice...

tknz_wct[:11]

['the',
 'twelve',
 'dancing',
 'princesses',
 'there',
 'was',
 'a',
 'king',
 'who',
 'had',
 'twelve']

In [20]:
# The NLTK FreqDist gives a count for how often each part of the text occurs.
# A word is the key, and the value is the count. It's as dictionary of Frequency Distribution

fd_wct = FreqDist(tknz_wct)
fd_wct

FreqDist({'the': 139, ',': 102, 'and': 78, 'to': 42, '.': 35, ';': 35, 'he': 33, 'they': 32, 'of': 28, 'in': 25, ...})

In [21]:
# .most common() shows the top words in the text. It's from the NLTK Library
# Not very useful because it stilll has lots of filler...

fd_wct.most_common(11)

[('the', 139),
 (',', 102),
 ('and', 78),
 ('to', 42),
 ('.', 35),
 (';', 35),
 ('he', 33),
 ('they', 32),
 ('of', 28),
 ('in', 25),
 ('was', 24)]

In [22]:
# number of tokens in list before punctuation removal
len(tknz_wct)

1803

In [23]:
# usually only standard punctuation

punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [24]:
# Let's now remove the punctuation tokens from the list

for token in tknz_wct:
    if token in punctuation:
        tknz_wct.remove(token)

In [25]:
# Now, what's the number of tokens in our list AFTER punctuation removal

len(tknz_wct)

1617

In [26]:
# list of English stopwords. Stopwords are ALSO filler words

eng_stopwords = stopwords.words('english')
eng_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [27]:
rm_count = 0
new_words = []  # empty list to hold new words

for token in tknz_wct:
    if token not in eng_stopwords:
        new_words.append(token)
    else: rm_count += 1

In [28]:
rm_count

918

In [29]:
len(new_words)

699

In [30]:
# Now, let's see the NEW TOP 10 WORDS in this text.

fd_nw = FreqDist(new_words)
fd_nw.most_common(11)

[('soldier', 19),
 ('princesses', 17),
 ('said', 16),
 ('king', 12),
 ('twelve', 11),
 ('went', 11),
 ('came', 10),
 ('eldest', 10),
 ('one', 7),
 ('night', 7),
 ('happened', 7)]