In [1]:
#ATLA sentiment analysis project
#Author: Justin Marotta

#Objectives:
#properly implement sentiment analysis using a transformer architecture
#perform tokenization and preprocessing from scratch
#finetune existing transformer architecture on processed transcripts from Avatar: The Last Airbender
#showcase how each character's sentiment/emotion changes over the course of the show (Aang, Katara, Sokka, Toph, Zuko, Iroh)
#showcase how to use the model to predict sentiment on unseen data
#gain insights into the show's storytelling from a psychological perspective

#Steps:
#Data collection
    #-gather transcripts of the show
#Preprocessing
    #-tokenization
    #-data cleaning (lower case, remove special characters, remove stop words, lemmatization)
#Modeling
    #-finetune existing transformer architecture
    #-train model, evaluate model, save model

#Analysis
    #-show how each character's sentiment changes over the course of the show
    #-Classify dialogues into emotions like joy, anger, sadness, surprise, etc. Maybe NRC Emotion Lexicon for this purpose
    #-show how to use the model to predict sentiment on unseen data
    
#Separate task:
#-predict which character spoke on unseen data

In [2]:
#sentiment analysis notebook

In [34]:
#impost packages

import pandas as pd
import numpy as np
import re

#for web scraping
from bs4 import BeautifulSoup
import requests

#for tokenization
from nltk import sent_tokenize, word_tokenize
import spacy
from spacy.pipeline import Sentencizer
from spacy.lang.en import English

#for sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# For visualization.
import plotly.express as px

#for bag of words model vectorization
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#to evaluate which document most aligns with a given vector/token
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
atla_df = pd.read_csv('/Users/justin/Documents/Personal/projects/atla_nlp/data/processed/atla_df.csv', keep_default_na=False, index_col=0)
#keep_default_na=False prevent empty strings from being read as NaN

In [5]:
atla_df

Unnamed: 0,book_num,book_name,episode_num,episode_name,link,Aang,Iroh,Katara,Sokka,Toph,Zuko
0,1.0,Book One: Water,1,The Boy in the Iceberg,https://avatar.fandom.com/wiki/Transcript:The_...,[In a weak voice.] I need to ask you something...,[He is playing some sort of card game. He answ...,[Narrating.] Water. Earth. Fire. Air. My grand...,It's not getting away from me this time. [Clos...,,Finally! [He turns around to face another pers...
1,1.0,Book One: Water,2,The Avatar Returns,https://avatar.fandom.com/wiki/Transcript:The_...,"[Sheepishly, as Katara glares at Sokka.] Yeah....",[Turns to one of Zuko's men and gives him the ...,"Water. Earth. Fire. Air. Long ago, the four na...",[Angrily.] I knew it! [Accusingly points a fin...,,[Angrily.] Where are you hiding him?[Shaking K...
2,1.0,Book One: Water,3,The Southern Air Temple,https://avatar.fandom.com/wiki/Transcript:The_...,"[Excitedly.] Wait 'til you see it, Katara! The...",[Disinterested.] You mean the Avatar?[Bows bac...,"[Cautiously.] Aang, I know you're excited, [Sh...",[Grunting sleepily.] Uggh! Sleep now. Temple l...,,"Uncle, I want the repairs made as quickly as p..."
3,1.0,Book One: Water,4,The Warriors of Kyoshi,https://avatar.fandom.com/wiki/Transcript:The_...,Well ... [Happily.] I know it's near water.Mom...,[Pushes open the door completely and enters th...,"[Nonchalantly, still focusing on her task at h...",[To Aang.] You have no idea where you're going...,,[Calmly.] The only reason you should be interr...
4,1.0,Book One: Water,5,The King of Omashu,https://avatar.fandom.com/wiki/Transcript:The_...,The Earth Kingdom city of Omashu! [Camera pans...,,[Close-up; impressed.] Wow. We don't have buil...,[Close-up; overwhelmed.] They have buildings h...,,
...,...,...,...,...,...,...,...,...,...,...,...
56,3.0,Book Three: Fire,57,The Southern Raiders,https://avatar.fandom.com/wiki/Transcript:The_...,[To Zuko.] What are you doing?[In the backgrou...,,"[Angrily.] What are you doing?Okay, I'm not cr...",Come on! We've gotta get out of here!We need t...,[Pointing to the exit.] Come on! We can get ou...,[As he runs and pushes her out of harm's way.]...
57,3.0,Book Three: Fire,58,The Ember Island Players,https://avatar.fandom.com/wiki/Transcript:The_...,[To Zuko; points to his seat and tries to act ...,"[Looking toward Actor Zuko, at ease.] Prince Z...",Doesn't it seem kind of weird that we're hidin...,You guys are not gonna believe this! There's a...,Why are we sitting in the nosebleed section? M...,"I told you, [Cut to Zuko sitting on a dry foun..."
58,3.0,Book Three: Fire,59,"Sozin's Comet, Part 1",https://avatar.fandom.com/wiki/Transcript:Sozi...,[Stops firebending and turns to Zuko while gro...,,[Holding up two watermelons.] Who wants a nice...,Maybe Zuko's right. Sitting around the house h...,"Not bad, baldy, [Aang jumps off the sculpture....",More ferocious! Imagine striking through your ...
59,3.0,Book Three: Fire,60,"Sozin's Comet, Part 2",https://avatar.fandom.com/wiki/Transcript:Sozi...,"Where are we, Momo? [Cut to view of the sky th...",[Close-up.] I was never angry with you. I was ...,[Angrily.] I'm not his girlfriend![Pulls out A...,"[To Zuko.] Hey, I remember her! [Cut to fronta...",[Frontal view.] We know he's gone. That's why ...,Yup. [Walks forward.] Back in the good old day...


In [6]:
def get_sentiment(sentences, tokenizer):
    
    # 1) Calculate the sentiment of each sentence.
    # 2) Take the average of these sentiment scores.
    # 3) Use threshold values to determine if the text is negative (-1), positive (1), or neutral (0).
    
    sentiment_per_sentence = []
    
    if tokenizer == spacy:
        doc = tokenizer(sentences) # set tokenizer of choice
        tokenize_sentences = [sent.text.strip() for sent in doc.sents]
    
    else:
        tokenize_sentences = tokenizer(sentences) # set tokenizer of choice
    
    analyzer = SentimentIntensityAnalyzer()
    for sentence in tokenize_sentences:
        sentiment = analyzer.polarity_scores(sentence)
        sentiment_per_sentence.append(sentiment['compound'])
        
    avg_sentiment = np.mean(sentiment_per_sentence)
      
    # Typical threshold values suggested in VADER's README.
    if avg_sentiment >= 0.05:
        return 1
    elif avg_sentiment <= -0.05:
        return -1
    elif np.isnan(avg_sentiment): #if character has no lines in episode return np.nan
        return np.nan
    else:
        return 0   

In [7]:
##sentiment analysis with vader
atla_sentiment_df = atla_df.copy()
atla_sentiment_df.drop(['link'], axis = 1, inplace = True)

# For each character, replace lines spoken in each episode with sentiment score.
characters = ['Katara', 'Sokka', 'Aang', 'Toph', 'Zuko', 'Iroh']

for character in characters:  
    character_sentiment_per_episode = []
       
    for episode_lines in atla_df[character]:
        character_sentiment_per_episode.append(get_sentiment(episode_lines, sent_tokenize)) #uses nltk PunktSentenceTokenizer (currently recommended)
  
        
    atla_sentiment_df[character] = character_sentiment_per_episode

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [8]:
#alternatively, use spacy tokenizer
spacy = English()
spacy.add_pipe("sentencizer")

atla_sentiment_df_spacy_tokenizer = atla_df.copy()
atla_sentiment_df_spacy_tokenizer.drop(['link'], axis = 1, inplace = True)

# For each character, replace lines spoken in each episode with sentiment score.
characters = ['Katara', 'Sokka', 'Aang', 'Toph', 'Zuko', 'Iroh']

for character in characters:  
    character_sentiment_per_episode = []
       
    for episode_lines in atla_df[character]:
            character_sentiment_per_episode.append(get_sentiment(episode_lines, spacy)) #uses custom spaCy tokenizer
 
        
    atla_sentiment_df_spacy_tokenizer[character] = character_sentiment_per_episode

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [9]:
atla_sentiment_df

Unnamed: 0,book_num,book_name,episode_num,episode_name,Aang,Iroh,Katara,Sokka,Toph,Zuko
0,1.0,Book One: Water,1,The Boy in the Iceberg,0.0,0.0,0.0,0.0,,-1.0
1,1.0,Book One: Water,2,The Avatar Returns,1.0,1.0,1.0,0.0,,-1.0
2,1.0,Book One: Water,3,The Southern Air Temple,1.0,1.0,0.0,0.0,,0.0
3,1.0,Book One: Water,4,The Warriors of Kyoshi,1.0,0.0,0.0,0.0,,1.0
4,1.0,Book One: Water,5,The King of Omashu,1.0,,0.0,1.0,,
...,...,...,...,...,...,...,...,...,...,...
56,3.0,Book Three: Fire,57,The Southern Raiders,0.0,,-1.0,0.0,1.0,0.0
57,3.0,Book Three: Fire,58,The Ember Island Players,1.0,0.0,0.0,0.0,1.0,0.0
58,3.0,Book Three: Fire,59,"Sozin's Comet, Part 1",0.0,,0.0,0.0,0.0,0.0
59,3.0,Book Three: Fire,60,"Sozin's Comet, Part 2",0.0,0.0,1.0,0.0,0.0,0.0


In [10]:
atla_sentiment_df_spacy_tokenizer

Unnamed: 0,book_num,book_name,episode_num,episode_name,Aang,Iroh,Katara,Sokka,Toph,Zuko
0,1.0,Book One: Water,1,The Boy in the Iceberg,0.0,0.0,0.0,0.0,,-1.0
1,1.0,Book One: Water,2,The Avatar Returns,1.0,1.0,1.0,0.0,,-1.0
2,1.0,Book One: Water,3,The Southern Air Temple,1.0,1.0,0.0,0.0,,0.0
3,1.0,Book One: Water,4,The Warriors of Kyoshi,1.0,-1.0,0.0,0.0,,1.0
4,1.0,Book One: Water,5,The King of Omashu,1.0,,0.0,1.0,,
...,...,...,...,...,...,...,...,...,...,...
56,3.0,Book Three: Fire,57,The Southern Raiders,0.0,,0.0,0.0,1.0,0.0
57,3.0,Book Three: Fire,58,The Ember Island Players,1.0,1.0,0.0,0.0,1.0,0.0
58,3.0,Book Three: Fire,59,"Sozin's Comet, Part 1",0.0,,0.0,0.0,1.0,0.0
59,3.0,Book Three: Fire,60,"Sozin's Comet, Part 2",0.0,0.0,1.0,0.0,0.0,0.0


In [11]:
# Change the data frame format from wide to long.
# Treat character names as variables and sentiment scores as values.
atla_sentiment_df_long = atla_sentiment_df.melt(id_vars = ['book_num', 'book_name', 'episode_num', 'episode_name'], 
                                           var_name = 'character', value_name = 'sentiment')

In [13]:
atla_sentiment_df_long.to_csv('/Users/justin/Documents/Personal/projects/atla_nlp/data/interim/atla_sentiment_df_long.csv')

In [14]:
atla_sentiment_df_long

Unnamed: 0,book_num,book_name,episode_num,episode_name,character,sentiment
0,1.0,Book One: Water,1,The Boy in the Iceberg,Aang,0.0
1,1.0,Book One: Water,2,The Avatar Returns,Aang,1.0
2,1.0,Book One: Water,3,The Southern Air Temple,Aang,1.0
3,1.0,Book One: Water,4,The Warriors of Kyoshi,Aang,1.0
4,1.0,Book One: Water,5,The King of Omashu,Aang,1.0
...,...,...,...,...,...,...
361,3.0,Book Three: Fire,57,The Southern Raiders,Zuko,0.0
362,3.0,Book Three: Fire,58,The Ember Island Players,Zuko,0.0
363,3.0,Book Three: Fire,59,"Sozin's Comet, Part 1",Zuko,0.0
364,3.0,Book Three: Fire,60,"Sozin's Comet, Part 2",Zuko,0.0


In [15]:
#count number of each sentiments per character
atla_sentiment_df_long.groupby(['character', 'sentiment']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,book_num,book_name,episode_num,episode_name
character,sentiment,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Aang,-1.0,3,3,3,3
Aang,0.0,35,35,35,35
Aang,1.0,22,22,22,22
Iroh,-1.0,5,5,5,5
Iroh,0.0,13,13,13,13
Iroh,1.0,18,18,18,18
Katara,-1.0,3,3,3,3
Katara,0.0,38,38,38,38
Katara,1.0,17,17,17,17
Sokka,-1.0,8,8,8,8


In [16]:
# Sort values by character (in the specified order below) and episode number.
atla_sentiment_df_long['character'] = pd.Categorical(atla_sentiment_df_long['character'],
                                                categories = ['Aang', 'Katara', 'Sokka', 'Toph', 'Iroh', 'Zuko'],
                                                ordered = True)

atla_sentiment_df_long = atla_sentiment_df_long.sort_values(['character', 'episode_num'])

In [17]:
atla_total_sentiment_df = atla_sentiment_df_long.copy()

# For each character, calculate running total of sentiment.
atla_total_sentiment_df['sentiment'] = (atla_total_sentiment_df.groupby(['character'])['sentiment']
                                          .apply(lambda x: x.cumsum()))

# Rename column.
atla_total_sentiment_df.rename(columns = {'sentiment': 'total_sentiment'}, inplace = True)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda x: x.cumsum()))


In [18]:
atla_total_sentiment_df

Unnamed: 0,book_num,book_name,episode_num,episode_name,character,total_sentiment
0,1.0,Book One: Water,1,The Boy in the Iceberg,Aang,0.0
1,1.0,Book One: Water,2,The Avatar Returns,Aang,1.0
2,1.0,Book One: Water,3,The Southern Air Temple,Aang,2.0
3,1.0,Book One: Water,4,The Warriors of Kyoshi,Aang,3.0
4,1.0,Book One: Water,5,The King of Omashu,Aang,4.0
...,...,...,...,...,...,...
361,3.0,Book Three: Fire,57,The Southern Raiders,Zuko,-8.0
362,3.0,Book Three: Fire,58,The Ember Island Players,Zuko,-8.0
363,3.0,Book Three: Fire,59,"Sozin's Comet, Part 1",Zuko,-8.0
364,3.0,Book Three: Fire,60,"Sozin's Comet, Part 2",Zuko,-8.0


In [19]:
atla_sentiment_df_long[atla_sentiment_df_long.character == 'Aang'].sentiment.value_counts()

 0.0    35
 1.0    22
-1.0     3
Name: sentiment, dtype: int64

In [20]:
atla_sentiment_df_long[atla_sentiment_df_long.character == 'Toph'].sentiment

244    NaN
245    NaN
246    NaN
247    NaN
248    NaN
      ... 
300    1.0
301    1.0
302    0.0
303    0.0
304   -1.0
Name: sentiment, Length: 61, dtype: float64

In [21]:
atla_sentiment_df_long[atla_sentiment_df_long.character == 'Sokka'].sentiment

183    0.0
184    0.0
185    0.0
186    0.0
187    1.0
      ... 
239    0.0
240    0.0
241    0.0
242    0.0
243    1.0
Name: sentiment, Length: 61, dtype: float64

In [22]:
# Plot sentiment per episode, per character.
legend_colors = ['#FF8F00', '#40C4FF', '#1E88E5', '#4CAF50', '#B71C1C', '#F44336']
                 
sentiment_fig = px.scatter(atla_sentiment_df_long, x = 'episode_num', y = 'sentiment',
                           color = 'character',
                           color_discrete_sequence = legend_colors,
                           hover_data = ['episode_name'],
                           trendline = 'lowess',
                           width = 950, height = 750)

# Customize titles.
sentiment_fig.update_layout(title = {'text': "Sentiments per episode",
                                     'x': 0.5,
                                     'xanchor': 'center'},
                            xaxis_title = "Episode number",
                            yaxis_title = "Sentiment",
                            legend_title = "Characters")

# Correct position of x points.
for scatter, trendline in zip(sentiment_fig.data[::2], sentiment_fig.data[1::2]):
    trendline['x'] = scatter['x'][np.logical_not(np.isnan(scatter['y']))]
    
sentiment_fig.show()

In [23]:
# Plot running total of sentiment per character.
total_sentiment_fig = px.scatter(atla_total_sentiment_df, x = 'episode_num', y = 'total_sentiment', 
                                 color = 'character',
                                 color_discrete_sequence = legend_colors, 
                                 hover_data = ['episode_name'],
                                 trendline = 'lowess', 
                                 width = 950, height = 750)

# Customize titles.
total_sentiment_fig.update_layout(title = {'text': "Running total of sentiments",
                                           'x': 0.5,
                                           'xanchor': 'center'},
                                  xaxis_title = "Episode number",
                                  yaxis_title = "Total sentiment",
                                  legend_title = "Characters")
    
# Correct position of x points.
for scatter, trendline in zip(total_sentiment_fig.data[::2], total_sentiment_fig.data[1::2]):
    trendline['x'] = scatter['x'][np.logical_not(np.isnan(scatter['y']))]

total_sentiment_fig.show()

In [24]:
#spacy visualizations

# Change the data frame format from wide to long.
# Treat character names as variables and sentiment scores as values.
atla_sentiment_df_spacy_tokenizer_long = atla_sentiment_df_spacy_tokenizer.melt(id_vars = ['book_num', 'book_name', 'episode_num', 'episode_name'], 
                                           var_name = 'character', value_name = 'sentiment')

# Sort values by character (in the specified order below) and episode number.
atla_sentiment_df_spacy_tokenizer_long['character'] = pd.Categorical(atla_sentiment_df_spacy_tokenizer_long['character'],
                                                categories = ['Aang', 'Katara', 'Sokka', 'Toph', 'Iroh', 'Zuko'],
                                                ordered = True)

atla_sentiment_df_spacy_tokenizer_long = atla_sentiment_df_spacy_tokenizer_long.sort_values(['character', 'episode_num'])


atla_total_sentiment_df_spacy_tokenizer = atla_sentiment_df_spacy_tokenizer_long.copy()

# For each character, calculate running total of sentiment.
atla_total_sentiment_df_spacy_tokenizer['sentiment'] = (atla_total_sentiment_df_spacy_tokenizer.groupby(['character'])['sentiment']
                                          .apply(lambda x: x.cumsum()))

# Rename column.
atla_total_sentiment_df_spacy_tokenizer.rename(columns = {'sentiment': 'total_sentiment'}, inplace = True)

atla_total_sentiment_df_spacy_tokenizer


Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object.
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)



Unnamed: 0,book_num,book_name,episode_num,episode_name,character,total_sentiment
0,1.0,Book One: Water,1,The Boy in the Iceberg,Aang,0.0
1,1.0,Book One: Water,2,The Avatar Returns,Aang,1.0
2,1.0,Book One: Water,3,The Southern Air Temple,Aang,2.0
3,1.0,Book One: Water,4,The Warriors of Kyoshi,Aang,3.0
4,1.0,Book One: Water,5,The King of Omashu,Aang,4.0
...,...,...,...,...,...,...
361,3.0,Book Three: Fire,57,The Southern Raiders,Zuko,-5.0
362,3.0,Book Three: Fire,58,The Ember Island Players,Zuko,-5.0
363,3.0,Book Three: Fire,59,"Sozin's Comet, Part 1",Zuko,-5.0
364,3.0,Book Three: Fire,60,"Sozin's Comet, Part 2",Zuko,-5.0


In [25]:
# Plot sentiment per episode, per character.
legend_colors = ['#FF8F00', '#40C4FF', '#1E88E5', '#4CAF50', '#B71C1C', '#F44336']
                 
sentiment_fig = px.scatter(atla_sentiment_df_spacy_tokenizer_long, x = 'episode_num', y = 'sentiment',
                           color = 'character',
                           color_discrete_sequence = legend_colors,
                           hover_data = ['episode_name'],
                           trendline = 'lowess',
                           width = 950, height = 750)

# Customize titles.
sentiment_fig.update_layout(title = {'text': "Sentiments per episode",
                                     'x': 0.5,
                                     'xanchor': 'center'},
                            xaxis_title = "Episode number",
                            yaxis_title = "Sentiment",
                            legend_title = "Characters")

# Correct position of x points.
for scatter, trendline in zip(sentiment_fig.data[::2], sentiment_fig.data[1::2]):
    trendline['x'] = scatter['x'][np.logical_not(np.isnan(scatter['y']))]
    
sentiment_fig.show()

In [26]:
# Plot running total of sentiment per character.
total_sentiment_fig = px.scatter(atla_total_sentiment_df_spacy_tokenizer, x = 'episode_num', y = 'total_sentiment', 
                                 color = 'character',
                                 color_discrete_sequence = legend_colors, 
                                 hover_data = ['episode_name'],
                                 trendline = 'lowess', 
                                 width = 950, height = 750)

# Customize titles.
total_sentiment_fig.update_layout(title = {'text': "Running total of sentiments",
                                           'x': 0.5,
                                           'xanchor': 'center'},
                                  xaxis_title = "Episode number",
                                  yaxis_title = "Total sentiment",
                                  legend_title = "Characters")
    
# Correct position of x points.
for scatter, trendline in zip(total_sentiment_fig.data[::2], total_sentiment_fig.data[1::2]):
    trendline['x'] = scatter['x'][np.logical_not(np.isnan(scatter['y']))]

total_sentiment_fig.show()

In [27]:
atla_lines_df = pd.read_csv('/Users/justin/Documents/Personal/projects/atla_nlp/data/raw/atla_lines.csv', keep_default_na=False, index_col=0)
atla_lines_df

Unnamed: 0,episode_num,character_name,character_lines
0,1,Katara,[Narrating.] Water. Earth. Fire. Air. My grand...
1,1,Sokka,It's not getting away from me this time. [Clos...
2,1,Aang,[In a weak voice.] I need to ask you something...
3,1,Toph,
4,1,Zuko,Finally! [He turns around to face another pers...
...,...,...,...
361,61,Sokka,We're too late! The fleet's already taking off...
362,61,Aang,"Momo, time for you to go.Please listen to me. ..."
363,61,Toph,Too bad the Fire Lord's about to use it to des...
364,61,Zuko,It's not her I'm worried about. I'm worried ab...


In [28]:
atla_lines_df.character_lines

0      [Narrating.] Water. Earth. Fire. Air. My grand...
1      It's not getting away from me this time. [Clos...
2      [In a weak voice.] I need to ask you something...
3                                                       
4      Finally! [He turns around to face another pers...
                             ...                        
361    We're too late! The fleet's already taking off...
362    Momo, time for you to go.Please listen to me. ...
363    Too bad the Fire Lord's about to use it to des...
364    It's not her I'm worried about. I'm worried ab...
365    [Cut to a shot of Iroh.] Only once every hundr...
Name: character_lines, Length: 366, dtype: object

In [29]:
#binary bag of words model

vectorizer = CountVectorizer(lowercase=False, binary=True, ngram_range=(1,2)) #ngram_range=(1,2) means unigrams and bigrams for more context
bow = vectorizer.fit_transform(atla_lines_df.character_lines)

In [30]:
# View features (tokens).
print(vectorizer.get_feature_names_out())

# View vocabulary dictionary.
print('vocab size: ', len(vectorizer.vocabulary_))

vectorizer.vocabulary_

['112' '112 years' '24' ... 'zooms to' '子白' '子白 on']
vocab size:  63278


{'Narrating': 6739,
 'Water': 11924,
 'Earth': 2766,
 'Fire': 3222,
 'Air': 625,
 'My': 6682,
 'grandmother': 29614,
 'used': 58697,
 'to': 56167,
 'tell': 52850,
 'me': 38697,
 'stories': 51657,
 'about': 13380,
 'the': 53458,
 'old': 42376,
 'days': 22905,
 'time': 55994,
 'of': 41725,
 'peace': 43962,
 'when': 60732,
 'Avatar': 1259,
 'kept': 35877,
 'balance': 17487,
 'between': 18656,
 'Tribes': 11367,
 'Kingdom': 5678,
 'Nation': 6745,
 'and': 14703,
 'Nomads': 7074,
 'But': 1838,
 'that': 53064,
 'all': 14160,
 'changed': 20892,
 'attacked': 16852,
 'Only': 7458,
 'mastered': 38612,
 'four': 27979,
 'elements': 25023,
 'only': 42794,
 'he': 31021,
 'could': 22087,
 'stop': 51573,
 'ruthless': 47253,
 'firebenders': 26985,
 'world': 61885,
 'needed': 40685,
 'him': 32230,
 'most': 39849,
 'vanished': 58767,
 'hundred': 33364,
 'years': 62270,
 'have': 30786,
 'passed': 43843,
 'is': 34444,
 'nearing': 40579,
 'victory': 58924,
 'in': 33750,
 'war': 59464,
 'Two': 11434,
 'ago': 1

In [31]:
#first element represents the characters lines in a episode (document), and the second element represents a token ID, It's then followed by a count of that token in the document.
print(bow)

  (0, 6739)	1
  (0, 11924)	1
  (0, 2766)	1
  (0, 3222)	1
  (0, 625)	1
  (0, 6682)	1
  (0, 29614)	1
  (0, 58697)	1
  (0, 56167)	1
  (0, 52850)	1
  (0, 38697)	1
  (0, 51657)	1
  (0, 13380)	1
  (0, 53458)	1
  (0, 42376)	1
  (0, 22905)	1
  (0, 55994)	1
  (0, 41725)	1
  (0, 43962)	1
  (0, 60732)	1
  (0, 1259)	1
  (0, 35877)	1
  (0, 17487)	1
  (0, 18656)	1
  (0, 11367)	1
  :	:
  (365, 20300)	1
  (365, 35988)	1
  (365, 55389)	1
  (365, 33370)	1
  (365, 2364)	1
  (365, 35991)	1
  (365, 42630)	1
  (365, 26973)	1
  (365, 4797)	1
  (365, 41744)	1
  (365, 48814)	1
  (365, 56684)	1
  (365, 48841)	1
  (365, 44876)	1
  (365, 25776)	1
  (365, 42033)	1
  (365, 4804)	1
  (365, 7461)	1
  (365, 42640)	1
  (365, 25490)	1
  (365, 62289)	1
  (365, 20377)	1
  (365, 26978)	1
  (365, 25781)	1
  (365, 55568)	1


In [32]:
sent_tokenize(atla_lines_df.character_lines[0])

['[Narrating.]',
 'Water.',
 'Earth.',
 'Fire.',
 'Air.',
 'My grandmother used to tell me stories about the old days: a time of peace when the Avatar kept balance between the Water Tribes, Earth Kingdom, Fire Nation and Air Nomads.',
 'But that all changed when the Fire Nation attacked.',
 'Only the Avatar mastered all four elements; only he could stop the ruthless firebenders.',
 'But when the world needed him most, he vanished.',
 'A hundred years have passed, and the Fire Nation is nearing victory in the war.',
 'Two years ago, my father and the men of my tribe journeyed to the Earth Kingdom to help fight against the Fire Nation, leaving me and my brother to look after our tribe.',
 "Some people believe that the Avatar was never reborn into the Air Nomads and that the cycle is broken, but I haven't lost hope.",
 'I still believe that, somehow, the Avatar will return to save the world.',
 '[Happily surprised.]',
 'Sokka, look!',
 '[Struggling with the water that passes right in fron

In [35]:
nlp = spacy.load('en_core_web_sm')
unwanted_pipes = ['ner', 'parser']

# Further remove stop words and take the lemma instead of token text.
def spacy_tokenizer(doc):
  with nlp.disable_pipes(*unwanted_pipes):
    return [t.lemma_ for t in nlp(doc) if \
            not t.is_punct and \
            not t.is_space and \
            not t.is_stop and \
            t.is_alpha]

In [36]:
#TD-IDF model

vectorizer = TfidfVectorizer(tokenizer=spacy_tokenizer) #sent_tokenize for sentences or use custom spacy word tokenizer
tf_idf_features = vectorizer.fit_transform(atla_lines_df.character_lines)

In [37]:
#see terms in vocabulary
vectorizer.get_feature_names_out()

array(['aaaaah', 'aaahh', 'aaahhh', ..., 'zoom', 'zuko', '子白'],
      dtype=object)

In [38]:
# The number of unique tokens.
print(len(vectorizer.get_feature_names_out()))

4244


In [39]:
# The dimensions of our feature matrix. X rows (documents) by Y columns (tokens).
print(tf_idf_features.shape)

(366, 4244)


In [40]:
# What the encoding of the first document looks like in sparse format.
print(tf_idf_features[0])

  (0, 2688)	0.0257439863805513
  (0, 2490)	0.04222189767976029
  (0, 4060)	0.05746706187726526
  (0, 455)	0.04908169617940475
  (0, 2240)	0.02615355919813395
  (0, 3422)	0.026015142993892192
  (0, 1262)	0.03536209918011584
  (0, 567)	0.05461998826241606
  (0, 3742)	0.017571043608646954
  (0, 4085)	0.044608581434554255
  (0, 3011)	0.035722015108385845
  (0, 3219)	0.04374746487797635
  (0, 518)	0.025350692352254033
  (0, 1537)	0.03287494149353665
  (0, 2137)	0.024029185935620895
  (0, 1687)	0.05461998826241606
  (0, 3049)	0.07072419836023168
  (0, 1139)	0.05241162640011434
  (0, 405)	0.05241162640011434
  (0, 107)	0.04908169617940475
  (0, 4183)	0.06147978676206052
  (0, 158)	0.032339859978741815
  (0, 316)	0.030236224990824907
  (0, 2263)	0.05461998826241606
  (0, 252)	0.02942789041150373
  :	:
  (0, 3072)	0.05461998826241606
  (0, 3543)	0.07142034339477141
  (0, 1160)	0.04153910301567463
  (0, 2229)	0.032082791050736255
  (0, 198)	0.034354796844769
  (0, 580)	0.06269874859064116
  (0, 

In [41]:
# Transform the query into a TF-IDF vector.
query = ["aaaaah zuko"]
query_tfidf = vectorizer.transform(query)
print(query_tfidf)

  (0, 4242)	0.358227457777385
  (0, 0)	0.9336343441060594


In [42]:
# Calculate the cosine similarities between the query and each document.
# We're calling flatten() here becaue cosine_similarity returns a list
# of lists and we just want a single list.
cosine_similarities = cosine_similarity(tf_idf_features, query_tfidf).flatten()

In [43]:
len(cosine_similarities)

366

In [44]:
# numpy's argsort() method returns a list of *indices* that
# would sort an array:
# https://numpy.org/doc/stable/reference/generated/numpy.argsort.html
#
# The sort is ascending, but we want the largest k cosine_similarites
# at the bottom of the sort. So we negate k, and get the last k
# entries of the indices list in reverse order. There are faster
# ways to do this using things like argpartition but this is
# more succinct.
def top_k(arr, k):
  kth_largest = (k + 1) * -1
  return np.argsort(arr)[:kth_largest:-1]

In [45]:
# So for our query above, these are the top five documents.
top_related_indices = top_k(cosine_similarities, 5)
print(top_related_indices)

[320  71  17 360 322]


In [46]:
# Let's take a look at their respective cosine similarities.
print(cosine_similarities[top_related_indices])

[0.19374747 0.18696627 0.18511697 0.17611141 0.15428672]


In [47]:
# Top match.
print(atla_lines_df.iloc[top_related_indices[0],:])

episode_num                                                       54
character_name                                                  Aang
character_lines    [Cut to Aang, as he sighs heavily.] Okay. Not ...
Name: 320, dtype: object


In [48]:
#with bag of words model
#try using N-grams in conjunction with unigrams to improve semantics of tokens
#filter out stop words, find N-grams that occur frequently, or with certain POS structure like nouns, or anything flagged by NER tagger
#-or use pointwise mutual information (probability of words cooccurring vs occurring individually across corpus)filter out stop words, find N-grams that occur frequently, or with certain POS structure like nouns, or anything flagged by NER tagger
#-or use pointwise mutual information (probability of words cooccurring vs occurring individually across corpus)


In [49]:
#interesting, but there are a lot of zero results
# I want to try and get a more accurate sentiment analysis reading per episode
#look at link for compound Vader scores
#Use feature extraction techniques (like Bag of Words, TF-IDF, or embeddings) to create a numerical representation of the text.