In [1]:
#ATLA sentiment analysis project
#Author: Justin Marotta

#Objectives:
#properly implement sentiment analysis using a transformer architecture
#perform tokenization and preprocessing from scratch
#finetune existing transformer architecture on processed transcripts from Avatar: The Last Airbender
#showcase how each character's sentiment/emotion changes over the course of the show (Aang, Katara, Sokka, Toph, Zuko, Iroh)
#showcase how to use the model to predict sentiment on unseen data
#gain insights into the show's storytelling from a psychological perspective

#Steps:
#Data collection
    #-gather transcripts of the show
#Preprocessing
    #-tokenization
    #-data cleaning (lower case, remove special characters, remove stop words, lemmatization)
#Modeling
    #-finetune existing transformer architecture
    #-train model, evaluate model, save model

#Analysis
    #-show how each character's sentiment changes over the course of the show
    #-Classify dialogues into emotions like joy, anger, sadness, surprise, etc. Maybe NRC Emotion Lexicon for this purpose
    #-show how to use the model to predict sentiment on unseen data
    
#Separate task:
#-predict which character spoke on unseen data


In [2]:
#web scraping notebook

In [3]:
#impost packages

import pandas as pd
import numpy as np
import re

#for web scraping
from bs4 import BeautifulSoup
import requests

#for tokenization
from nltk import sent_tokenize, word_tokenize
import spacy
from spacy.pipeline import Sentencizer
from spacy.lang.en import English

#for sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# For visualization.
import plotly.express as px

#for bag of words model vectorization
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer



In [4]:
#scrape series transcripts from atla wiki
url = "https://avatar.fandom.com/wiki/Avatar_Wiki:Transcripts#Avatar:_The_Last_Airbender_episodes"

#send GET request
response = requests.get(url)
print("status: ", response.status_code)

#parse html
soup = BeautifulSoup(response.content, "html.parser")

#makes the HTML more readable
print(soup.prettify())

base_url = 'https://avatar.fandom.com'

status:  200
<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Avatar Wiki:Transcripts | Avatar Wiki | Fandom
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"c8d01269df5b7a1f44b90cff7bb7c46c","wgCSPNonce":false,"wgCanonicalNamespace":"Project","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":4,"wgPageName":"Avatar_Wiki:Transcripts","wgTitle":"Transcripts","wgCurRevisionId":2937029,"wgRevisionId":2937029,"wgArticleId":177051,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Transcripts"],"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevantPageName":"

In [5]:
#locate transcript links
transcript_urls = []
episode_names = []
book_name = []
tables = soup.find_all('table', class_="wikitable")

#only want the links for ATLA S1-3

book_counter = 1
for table in tables[1:8]:
    for link in table.find_all('a', href=True):
        #print("Found the URL:", link['href'])
        if "(commentary)" not in link['href']: #do not include commentary episodes
            transcript_urls.append(base_url + link['href'])
            episode_names.append(link['title'].split(":")[1])
            
            if table == tables[1] or table == tables[2]:
                book_name.append('Book One: Water')
            if table == tables[3] or table == tables[4]:
                book_name.append('Book Two: Earth')
            if table == tables[5]:
                book_name.append('Escape from the Spirit World')
            if table == tables[6] or table == tables[7]:
                book_name.append('Book Three: Fire')

In [6]:
table == tables[6] or table == tables[5]

False

In [7]:
len(episode_names)

62

In [8]:
tables[5]

<table class="wikitable" style="width:100%;">
<tbody><tr>
<th style="width:10%;">0
</th>
<td><i><a href="/wiki/Transcript:Escape_from_the_Spirit_World" title="Transcript:Escape from the Spirit World">Escape from the Spirit World</a></i>
</td></tr></tbody></table>

In [9]:
#create dataframe and save to csv
episode_numbers = list(range(1,62))

atla_links_df = pd.DataFrame(list(zip(book_name, episode_numbers, episode_names, transcript_urls)),
                       columns =['book_name', 'episode_num', 'episode_name', 'link'])

#link book numbers by name
book_names_unique=['Book One: Water', 'Book Two: Earth', 'Escape from the Spirit World', 'Book Three: Fire']
book_num_list = ['1', '2', '2.5', '3']
book_number = atla_links_df['book_name'].copy()
book_number.replace(book_names_unique,book_num_list, inplace=True)

#insert book_num column
atla_links_df.insert(0, column = 'book_num', value = book_number)

atla_links_df.to_csv('/Users/justin/Documents/Personal/projects/atla_nlp/data/raw/atla_links.csv')

In [10]:
atla_links_df

Unnamed: 0,book_num,book_name,episode_num,episode_name,link
0,1,Book One: Water,1,The Boy in the Iceberg,https://avatar.fandom.com/wiki/Transcript:The_...
1,1,Book One: Water,2,The Avatar Returns,https://avatar.fandom.com/wiki/Transcript:The_...
2,1,Book One: Water,3,The Southern Air Temple,https://avatar.fandom.com/wiki/Transcript:The_...
3,1,Book One: Water,4,The Warriors of Kyoshi,https://avatar.fandom.com/wiki/Transcript:The_...
4,1,Book One: Water,5,The King of Omashu,https://avatar.fandom.com/wiki/Transcript:The_...
...,...,...,...,...,...
56,3,Book Three: Fire,57,The Southern Raiders,https://avatar.fandom.com/wiki/Transcript:The_...
57,3,Book Three: Fire,58,The Ember Island Players,https://avatar.fandom.com/wiki/Transcript:The_...
58,3,Book Three: Fire,59,"Sozin's Comet, Part 1",https://avatar.fandom.com/wiki/Transcript:Sozi...
59,3,Book Three: Fire,60,"Sozin's Comet, Part 2",https://avatar.fandom.com/wiki/Transcript:Sozi...


In [11]:
#first attempt at extracting character diaglogue
"""
kataras_lines = []
sokkas_lines = []
aangs_lines = []
tophs_lines = []
zukos_lines = []
irohs_lines = []

for line in sample_ep.find_all('th', string=[re.compile("Katara"), re.compile("Sokka"), re.compile("Aang"), re.compile("Toph"), re.compile("Zuko"), re.compile("Iroh")]): #use re.compile for substring, not the exact string
    for sib in line.next_siblings:
        #print(line.text)
        #print(sib.text)
        cleaned_line = sib.text.strip()
        if len(cleaned_line) != 0:
            if line.text.strip() == 'Katara': #add to katara's lines if string match is 'Katara'
                kataras_lines.append(cleaned_line) #remove leading and trailing whitespace
            if line.text.strip() == 'Sokka':
                sokkas_lines.append(cleaned_line)
            if line.text.strip() == 'Aang':
                aangs_lines.append(cleaned_line)
            if line.text.strip() == 'Toph':
                tophs_lines.append(cleaned_line)
            if line.text.strip() == 'Zuko':
                zukos_lines.append(cleaned_line)
            if line.text.strip() == 'Iroh':
                irohs_lines.append(cleaned_line)
"""

'\nkataras_lines = []\nsokkas_lines = []\naangs_lines = []\ntophs_lines = []\nzukos_lines = []\nirohs_lines = []\n\nfor line in sample_ep.find_all(\'th\', string=[re.compile("Katara"), re.compile("Sokka"), re.compile("Aang"), re.compile("Toph"), re.compile("Zuko"), re.compile("Iroh")]): #use re.compile for substring, not the exact string\n    for sib in line.next_siblings:\n        #print(line.text)\n        #print(sib.text)\n        cleaned_line = sib.text.strip()\n        if len(cleaned_line) != 0:\n            if line.text.strip() == \'Katara\': #add to katara\'s lines if string match is \'Katara\'\n                kataras_lines.append(cleaned_line) #remove leading and trailing whitespace\n            if line.text.strip() == \'Sokka\':\n                sokkas_lines.append(cleaned_line)\n            if line.text.strip() == \'Aang\':\n                aangs_lines.append(cleaned_line)\n            if line.text.strip() == \'Toph\':\n                tophs_lines.append(cleaned_line)\n     

In [12]:
def character_lines_ep(character_name, episode_soup): #extracts character lines from a single episode
    character_lines = []

    char_dialogue = episode_soup.find_all('th', string=re.compile(f"{character_name}")) #use re.compile for substring, not the exact string
    
    for line in char_dialogue:
        
        for sib in line.next_siblings:

            cleaned_line = sib.text.strip() #remove leading and trailing whitespace
            
            if len(cleaned_line) != 0: #only append if line is not empty
                
                character_lines.append(cleaned_line)
                
    #joins all of the character’s lines so that the function returns one string instead of a list of strings
    character_lines = ''.join(character_lines) 
    
    return character_lines

In [13]:
sample_ep = BeautifulSoup(requests.get(atla_links_df.iloc[0, 4]).content, "html.parser")

sample_str1 = character_lines_ep("Katara", sample_ep)

In [14]:
#Create df of all character lines per episode
#columns are episode number, character name, character lines
characters = ['Katara', 'Sokka', 'Aang', 'Toph', 'Zuko', 'Iroh']

character_col = []
ep_numbers_col = []
lines_col = []

#iterate through episodes
for i in range(0, len(atla_links_df)):
    #get episode html
    sample_ep = BeautifulSoup(requests.get(atla_links_df.iloc[i, 4]).content, "html.parser")
    #get episode number
    ep_num = atla_links_df.iloc[i,2]
    
    #iterate through characters
    for character in characters:
    
        #extract character lines
        lines = character_lines_ep(character, sample_ep)

        #add lines, character name, and episode number to respective columns
        lines_col.append(lines)
        character_col.append(character)
        ep_numbers_col.append(ep_num)

#make dataframe
atla_lines_df = pd.DataFrame(list(zip(ep_numbers_col, character_col, lines_col)),
                       columns =['episode_num', 'character_name', 'character_lines'])

#save to csv
atla_lines_df.to_csv('/Users/justin/Documents/Personal/projects/atla_nlp/data/raw/atla_lines.csv')


In [15]:
atla_lines_df

Unnamed: 0,episode_num,character_name,character_lines
0,1,Katara,[Narrating.] Water. Earth. Fire. Air. My grand...
1,1,Sokka,It's not getting away from me this time. [Clos...
2,1,Aang,[In a weak voice.] I need to ask you something...
3,1,Toph,
4,1,Zuko,Finally! [He turns around to face another pers...
...,...,...,...
361,61,Sokka,We're too late! The fleet's already taking off...
362,61,Aang,"Momo, time for you to go.Please listen to me. ..."
363,61,Toph,Too bad the Fire Lord's about to use it to des...
364,61,Zuko,It's not her I'm worried about. I'm worried ab...


In [16]:
#create one column per character
atla_lines_per_char_df = atla_lines_df.pivot(index='episode_num', columns='character_name', values='character_lines')

In [17]:
atla_lines_per_char_df

character_name,Aang,Iroh,Katara,Sokka,Toph,Zuko
episode_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,[In a weak voice.] I need to ask you something...,[He is playing some sort of card game. He answ...,[Narrating.] Water. Earth. Fire. Air. My grand...,It's not getting away from me this time. [Clos...,,Finally! [He turns around to face another pers...
2,"[Sheepishly, as Katara glares at Sokka.] Yeah....",[Turns to one of Zuko's men and gives him the ...,"Water. Earth. Fire. Air. Long ago, the four na...",[Angrily.] I knew it! [Accusingly points a fin...,,[Angrily.] Where are you hiding him?[Shaking K...
3,"[Excitedly.] Wait 'til you see it, Katara! The...",[Disinterested.] You mean the Avatar?[Bows bac...,"[Cautiously.] Aang, I know you're excited, [Sh...",[Grunting sleepily.] Uggh! Sleep now. Temple l...,,"Uncle, I want the repairs made as quickly as p..."
4,Well ... [Happily.] I know it's near water.Mom...,[Pushes open the door completely and enters th...,"[Nonchalantly, still focusing on her task at h...",[To Aang.] You have no idea where you're going...,,[Calmly.] The only reason you should be interr...
5,The Earth Kingdom city of Omashu! [Camera pans...,,[Close-up; impressed.] Wow. We don't have buil...,[Close-up; overwhelmed.] They have buildings h...,,
...,...,...,...,...,...,...
57,[To Zuko.] What are you doing?[In the backgrou...,,"[Angrily.] What are you doing?Okay, I'm not cr...",Come on! We've gotta get out of here!We need t...,[Pointing to the exit.] Come on! We can get ou...,[As he runs and pushes her out of harm's way.]...
58,[To Zuko; points to his seat and tries to act ...,"[Looking toward Actor Zuko, at ease.] Prince Z...",Doesn't it seem kind of weird that we're hidin...,You guys are not gonna believe this! There's a...,Why are we sitting in the nosebleed section? M...,"I told you, [Cut to Zuko sitting on a dry foun..."
59,[Stops firebending and turns to Zuko while gro...,,[Holding up two watermelons.] Who wants a nice...,Maybe Zuko's right. Sitting around the house h...,"Not bad, baldy, [Aang jumps off the sculpture....",More ferocious! Imagine striking through your ...
60,"Where are we, Momo? [Cut to view of the sky th...",[Close-up.] I was never angry with you. I was ...,[Angrily.] I'm not his girlfriend![Pulls out A...,"[To Zuko.] Hey, I remember her! [Cut to fronta...",[Frontal view.] We know he's gone. That's why ...,Yup. [Walks forward.] Back in the good old day...


In [18]:
#merge with atla_links_df
atla_df = pd.merge(atla_links_df, atla_lines_per_char_df, on='episode_num')

#save to csv
atla_df.to_csv('/Users/justin/Documents/Personal/projects/atla_nlp/data/processed/atla_df.csv')

In [19]:
atla_df

Unnamed: 0,book_num,book_name,episode_num,episode_name,link,Aang,Iroh,Katara,Sokka,Toph,Zuko
0,1,Book One: Water,1,The Boy in the Iceberg,https://avatar.fandom.com/wiki/Transcript:The_...,[In a weak voice.] I need to ask you something...,[He is playing some sort of card game. He answ...,[Narrating.] Water. Earth. Fire. Air. My grand...,It's not getting away from me this time. [Clos...,,Finally! [He turns around to face another pers...
1,1,Book One: Water,2,The Avatar Returns,https://avatar.fandom.com/wiki/Transcript:The_...,"[Sheepishly, as Katara glares at Sokka.] Yeah....",[Turns to one of Zuko's men and gives him the ...,"Water. Earth. Fire. Air. Long ago, the four na...",[Angrily.] I knew it! [Accusingly points a fin...,,[Angrily.] Where are you hiding him?[Shaking K...
2,1,Book One: Water,3,The Southern Air Temple,https://avatar.fandom.com/wiki/Transcript:The_...,"[Excitedly.] Wait 'til you see it, Katara! The...",[Disinterested.] You mean the Avatar?[Bows bac...,"[Cautiously.] Aang, I know you're excited, [Sh...",[Grunting sleepily.] Uggh! Sleep now. Temple l...,,"Uncle, I want the repairs made as quickly as p..."
3,1,Book One: Water,4,The Warriors of Kyoshi,https://avatar.fandom.com/wiki/Transcript:The_...,Well ... [Happily.] I know it's near water.Mom...,[Pushes open the door completely and enters th...,"[Nonchalantly, still focusing on her task at h...",[To Aang.] You have no idea where you're going...,,[Calmly.] The only reason you should be interr...
4,1,Book One: Water,5,The King of Omashu,https://avatar.fandom.com/wiki/Transcript:The_...,The Earth Kingdom city of Omashu! [Camera pans...,,[Close-up; impressed.] Wow. We don't have buil...,[Close-up; overwhelmed.] They have buildings h...,,
...,...,...,...,...,...,...,...,...,...,...,...
56,3,Book Three: Fire,57,The Southern Raiders,https://avatar.fandom.com/wiki/Transcript:The_...,[To Zuko.] What are you doing?[In the backgrou...,,"[Angrily.] What are you doing?Okay, I'm not cr...",Come on! We've gotta get out of here!We need t...,[Pointing to the exit.] Come on! We can get ou...,[As he runs and pushes her out of harm's way.]...
57,3,Book Three: Fire,58,The Ember Island Players,https://avatar.fandom.com/wiki/Transcript:The_...,[To Zuko; points to his seat and tries to act ...,"[Looking toward Actor Zuko, at ease.] Prince Z...",Doesn't it seem kind of weird that we're hidin...,You guys are not gonna believe this! There's a...,Why are we sitting in the nosebleed section? M...,"I told you, [Cut to Zuko sitting on a dry foun..."
58,3,Book Three: Fire,59,"Sozin's Comet, Part 1",https://avatar.fandom.com/wiki/Transcript:Sozi...,[Stops firebending and turns to Zuko while gro...,,[Holding up two watermelons.] Who wants a nice...,Maybe Zuko's right. Sitting around the house h...,"Not bad, baldy, [Aang jumps off the sculpture....",More ferocious! Imagine striking through your ...
59,3,Book Three: Fire,60,"Sozin's Comet, Part 2",https://avatar.fandom.com/wiki/Transcript:Sozi...,"Where are we, Momo? [Cut to view of the sky th...",[Close-up.] I was never angry with you. I was ...,[Angrily.] I'm not his girlfriend![Pulls out A...,"[To Zuko.] Hey, I remember her! [Cut to fronta...",[Frontal view.] We know he's gone. That's why ...,Yup. [Walks forward.] Back in the good old day...
