In [1]:
# request the raw text of The Great Gatsby
# you will need to leverage the requests package
import requests
r = requests.get(r'https://www.gutenberg.org/cache/epub/64317/pg64317.txt')
great_gatsby = r.text

# first, remove unwanted new line and tab characters from the text
for char in ["\n", "\r", "\d", "\t"]:
    great_gatsby = great_gatsby.replace(char, " ")
    
# you can also subset for the book text
# (removing the project gutenburg introduction/footnotes)
great_gatsby = great_gatsby[1433:277912]
print(great_gatsby)



In [2]:
# print out some information about the text

# what's the data type of your text
print(f"the type of your data: {type(great_gatsby)}")

# how long is your text (in characters)?
print(f"length = {len(great_gatsby)} characters")

# Which of your favorite characters is most mentioned?

# create an empty dict to keep track of mentions by character
reference_dict = {}
# create a list of characters
characters = ["daisy", "jay", "nick", "tom", "myrtle"]
# loop through each character to count their mentions
for character in characters:
    reference_dict[character] = great_gatsby.lower().count(character)
# turn your dictionary into a pandas dataframe and print it
import pandas as pd
df = pd.DataFrame(list(reference_dict.items()),
                 columns = ["character", "mentions"])
df = df.set_index("character")
df = df.sort_values(by = "mentions",
                   ascending = False)
print("\n")
print(df)

the type of your data: <class 'str'>
length = 276479 characters


           mentions
character          
tom             219
daisy           186
nick             27
myrtle           23
jay              12


In [3]:
# if you don't have nltk installed..
# pip install nltk

import nltk
import re

def tokenize_text(text: str):
    
    # lowercase the text
    text = text.lower()
    
    # remove punctuation from text
    text = re.sub(r"[^\w\s]", "", text)
    
    # tokenize the text
    tokens = nltk.word_tokenize(text)
    
    # remove stopwords from txt_tokens and word_tokens
    from nltk.corpus import stopwords
    english_stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in english_stop_words]
    
    # return your tokens
    return tokens

tokens = tokenize_text(text = great_gatsby)
print(tokens)



In [4]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

def lemmatize_tokens(tokens):
    
    # initiate lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    # lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # return your lemmatized tokens
    return lemmatized_tokens

lemmatized_tokens = lemmatize_tokens(tokens = tokens)
print(lemmatized_tokens)

[nltk_data] Downloading package wordnet to /home/camilabs/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!




In [None]:
# return the most common tokens
def return_top_tokens(tokens,
                      top_N = 10):

    # first, count the frequency of every unique token
    word_token_distribution = nltk.FreqDist(tokens)
    
    # next, filter for only the most common top_N tokens
    # also, put this in a dataframe
    top_tokens = pd.DataFrame(word_token_distribution.most_common(top_N),
                              columns=['Word', 'Frequency'])
    
    # return the top_tokens dataframe
    return top_tokens

# run the return_top_tokens and print the results
top_tokens = return_top_tokens(tokens = lemmatized_tokens,
                               top_N = 10)
print(top_tokens)

In [None]:
# return the most common bi-grams
from nltk.collocations import BigramCollocationFinder

def return_top_bigrams(tokens,
                       top_N = 10):
    
    # collect bigrams
    bcf = BigramCollocationFinder.from_words(tokens)
    
    # put bigrams into a dataframe
    bigram_df = pd.DataFrame(data = bcf.ngram_fd.items(),
                             columns = ['Bigram', 'Frequency'])
    
    # sort the dataframe by frequency
    bigram_df = bigram_df.sort_values(by=['Frequency'],ascending = False).reset_index(drop=True)
    
    # filter for only top bigrams
    bigram_df = bigram_df[0:top_N]
    
    # return the bigram dataframe
    return bigram_df

# run the return_top_bigrams function and print the results
bigram_df = return_top_bigrams(tokens = lemmatized_tokens,
                               top_N = 10)
print(bigram_df)

In [None]:
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer

def return_sentiment_df(tokens):

    # initialize sentiment analyzer
    sia = SentimentIntensityAnalyzer()
    
    # create some counters for sentiment of each token
    positive_tokens = 0
    negative_tokens = 0
    neutral_tokens = 0
    compound_scores = []
        
    # loop through each token
    for token in tokens:
        
        if sia.polarity_scores(token)["compound"] > 0:
            
            positive_tokens += 1
            compound_scores.append(sia.polarity_scores(token)["compound"])
            
        elif sia.polarity_scores(token)["compound"] < 0:
            
            negative_tokens += 1
            compound_scores.append(sia.polarity_scores(token)["compound"])
              
        elif sia.polarity_scores(token)["compound"] == 0:
            
            neutral_tokens += 1
            compound_scores.append(sia.polarity_scores(token)["compound"])
            
    # put sentiment results into a dataframe
    compound_score_numbers = [num for num in compound_scores if num != 0]
    sentiment_df = pd.DataFrame(data = {"total_tokens" : len(tokens),
                                        "positive_tokens" : positive_tokens,
                                        "negative_tokens" : negative_tokens,
                                        "neutral_tokens" : neutral_tokens,
                                        "compound_sentiment_score" : sum(compound_score_numbers) / len(compound_score_numbers)},
                                index = [0])

    # return sentiment_df
    return sentiment_df

sentiment_df = return_sentiment_df(tokens = lemmatized_tokens)
print(sentiment_df)

In [None]:
# what if your text is stored in a dataframe column?

# (the simplest) way is to pull the column a single string
def tokenize_dataframe_columns(dataframe,
                               column: str):
    
    # create string from columnremarks text
    string_from_column = dataframe[column].cat(sep=' ')
    
    # tokenize the new string using the function we already defined above
    tokens = tokenize_text(text = string_from_column)
    
    # return tokens
    return tokens