Import Packages

In [94]:
import re
import nltk
import os
import pandas as pd
import numpy as np
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px

Sentiment analysis should be conducted by chapters (possibly even by the chapters paragrpahs, will try chapters for now). I need to split the text file by chapters

**NO NEED TO RUN AGAIN**: The following code will extract and separate chapters from the Shinning text file and save each chapter as an individual txt file so sentiment analysis can be conducted on each chapter individually (can split by paragraphs later if needed)

In [76]:
#read in "The Shinning" text file
text = "The Shining-Stephen King-1977.txt"
#folder for the chapter txt files
chapter_text_files = 'chapters'
os.makedirs(chapter_text_files, exist_ok=True)
# Read the entire book text
with open(text, 'r', encoding='utf-8') as file:
    book_text = file.read()
#Capture each chapter, each is denoted "CHAPTER ####"
chapter_pattern = re.compile(r'(CHAPTER [A-Z]+)')
# Split the text into chapters
chapters = chapter_pattern.split(book_text)
# Combine headers with their respective content
chapter_data = [f"{header}\n{text.strip()}" for header, text in zip(chapters[1::2], chapters[2::2])]

# Save each chapter to a separate text file
for idx, content in enumerate(chapter_data, start=1):
    chapter_file = os.path.join(chapter_text_files, f'chapter_{idx}.txt')
    with open(chapter_file, 'w', encoding='utf-8') as file:
        file.write(content)
#making sure everything ran fine
print(f"Chapters have been successfully saved to the '{chapter_text_files}' folder!")

Chapters have been successfully saved to the 'chapters' folder!


After running the code above. I had to go into each chapter and separate each paragraph by cross referencing the original text. Indentations were lost in the online conversion process (this would of made it easy to automate the separation in each chapter). There is a large amount of conversational text, which makes it more difficult to split the text into paragraphs. Others could split the text in different spots because of this.

Setting Up Vader Package

In [95]:
#nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentiment = SentimentIntensityAnalyzer()

Sentiment analysis on *The Shinning*'s chapters

Setting up working directory to extract all chapters from the chapters folder

In [96]:
current_directory = os.getcwd()#Gets the current working directory
folder_path = os.path.join(current_directory, 'chapters') #Define the path to the "chapters" folder

#List all the chapters in the folder, ignoring hidden files like .DS_Store (i guess this happens on Mac)
chapters = [file for file in os.listdir(folder_path) if not file.startswith('.')]
def get_chapter_number(filename):
    try:
        return int(filename.split('_')[1].split('.')[0])
    except (IndexError, ValueError):
        return float('inf')
    
chapters.sort(key=get_chapter_number) #Sort the chapters based on the chapter number
#print(chapters)

Defining sentiment thresholds. 
Neutral doesn't offer much value, so I need tighten the threshold so there are fewer neutral values and add more groupings. 

standard cutoff is: positive >0.5, 0.5<= neutral >= - 0.5, negative < -0.5

Further analysis shows that the min compound score is: -0.12598069306930695 and the max is: 0.15897197802197804

In [97]:
strong_pos = 0.05 #x > 0.05
weak_pos = 0.01  #0.05>= x > 0.01
neutral_upper = 0.01 #0.01>= x >=-0.01
neutral_lower = -0.01 #0.01>= x >=-0.01
weak_neg = -0.05 #-0.01 > x >= -0.05
strong_neg = -0.05 #-0.05 > x

Loop through each chapter in the chapter file -> clean the text -> extract sentiment analysis scores for each sentence in the chapter -> calculate avg. compound score for each chapter -> store information about each chapter in a data frame

In [98]:
#List to store chapter data for the table
chapter_data = []

#Loop through each chapter text file and read its content
for chapter in chapters:
    file_path = os.path.join(folder_path, chapter)  # Get full file path
    with open(file_path, 'r') as file:
        content = file.read()

    #Getting rid of quotations, commas, paretheses, etc., and converting everything to lower case
    cleaned_text = (content.replace('"', '').replace('…', '').replace(',', '').replace('“', '').replace('”', '').replace('(', '').replace(')', '').lower())

    #Page splits introduce multiple spaces -> replace multiple spaces with a single space
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)

    #Split the text into sentences
    sentences = nltk.tokenize.sent_tokenize(cleaned_text)

    #Place to store each sentence's compound score to later get the avg. compound score for the chapter
    compound_scores = []

    #Sentiment analysis
    for sentence in sentences:
        score = sentiment.polarity_scores(sentence)
        compound = score['compound']
        compound_scores.append(compound)  # Sequentially adds compound scores to list

    #Calculate average compound score for the chapter
    average_compound = sum(compound_scores) / len(compound_scores)

    #Classify overall chapter sentiment based on the average compound score
    if average_compound > strong_pos:
        chapter_sentiment = "Positive"
    elif average_compound > weak_pos:
        chapter_sentiment = "Weak Positive"
    elif neutral_lower <= average_compound <= neutral_upper:
        chapter_sentiment = "Neutral"
    elif average_compound >= weak_neg:
        chapter_sentiment = "Weak Negative"
    else:
        chapter_sentiment = "Negative"
    #Store the chapter number, sentiment, average compound score, and full chapter text in the list
    chapter_number = int(chapter.split('_')[1].split('.')[0])  # Extract chapter number
    chapter_data.append([chapter_number, chapter_sentiment, average_compound, cleaned_text])

#Creating a dataframe for easy analysis
df = pd.DataFrame(chapter_data, columns=['Chapter Number', 'Sentiment', 'Average Compound Score', 'Full Text'])


Finding the min/max avg. compound scores to adjust thresholds

In [99]:
print(df['Average Compound Score'].min())
print(df['Average Compound Score'].max())

-0.12598069306930695
0.15897197802197804


Saving the df to a csv

In [None]:
#df.to_csv('allchapters.csv', index=False)

I think it would be cool to find the main topics of each chapter

**Topic Modeling**

In [79]:
import gensim
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
import pyLDAvis.gensim

#nltk.download('wordnet')
#nltk.download('omw-1.4')
from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer

import warnings
warnings.simplefilter('ignore')
from itertools import chain

A little bit more cleaning for LDA

In [80]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

In [81]:
def clean(text):
    remove_stopwords = ' '. join([word for word in text.lower().split() if word not in stop])
    remove_punc = ''.join(pun for pun in remove_stopwords if pun not in exclude)
    normalized = ' '. join([lemma.lemmatize(word) for word in remove_punc.split()])
    return normalized.split()

In [82]:
df['chp_words'] = df['Full Text'].apply(clean)

Create a dictionary for all the words in the book

In [83]:
word_dict = corpora.Dictionary(df['chp_words'])
print(word_dict.num_nnz) #Numnber of unqiue words in the book

46144


Create document term matrix

In [84]:
doc_term_matrix = [word_dict.doc2bow(doc) for doc in df['chp_words']]
print(len(doc_term_matrix))

58


In [85]:
lda = gensim.models.ldamodel.LdaModel

In [86]:
np.random.seed(24)
num_topics = 5
%time ldamodel = lda(doc_term_matrix, num_topics = num_topics, id2word = word_dict, passes = 50, random_state=24, minimum_probability =0 )

CPU times: user 9.72 s, sys: 8.86 s, total: 18.6 s
Wall time: 12.2 s


Print the topics identified by LDA model

In [None]:
ldamodel.print_topics(num_topics=num_topics)

In [87]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, word_dict, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

In [88]:
# Assigns the topics to the documents in corpus
lda_corpus = ldamodel[doc_term_matrix]

In [89]:
scores = list(chain(*[[score for topic_id,score in topic] \
                      for topic in [doc for doc in lda_corpus]]))

threshold = sum(scores)/len(scores)
print(threshold)

0.2000000008112442


Assigning the clusters

In [90]:
cluster1 = [j for i,j in zip(lda_corpus,df.index) if i[0][1] > threshold]
cluster2 = [j for i,j in zip(lda_corpus,df.index) if i[1][1] > threshold]
cluster3 = [j for i,j in zip(lda_corpus,df.index) if i[2][1] > threshold]
cluster4 = [j for i,j in zip(lda_corpus,df.index) if i[3][1] > threshold]
cluster5 = [j for i,j in zip(lda_corpus,df.index) if i[4][1] > threshold]

Seeing what chapters are clustered together

In [91]:
df.iloc[cluster1]

Unnamed: 0,Chapter Number,Sentiment,Average Compound Score,Full Text,chp_words
17,18,weak positive,0.012725,chapter eighteen the scrapbook jack found the ...,"[chapter, eighteen, scrapbook, jack, found, sc..."


In [92]:
df.iloc[cluster2]

Unnamed: 0,Chapter Number,Sentiment,Average Compound Score,Full Text,chp_words
1,2,weak negative,-0.019968,chapter two boulder she looked out the kitchen...,"[chapter, two, boulder, looked, kitchen, windo..."
4,5,neutral,0.007354,chapter five phonebooth jack parked the vw in ...,"[chapter, five, phonebooth, jack, parked, vw, ..."
12,13,weak positive,0.029682,chapter thirteen the front porch the torrance ...,"[chapter, thirteen, front, porch, torrance, fa..."
14,15,positive,0.083067,chapter fifteen down in the front yard jack ha...,"[chapter, fifteen, front, yard, jack, found, h..."
27,28,neutral,-0.001825,chapter twenty -eight it was her! jack had sto...,"[chapter, twenty, eight, her, jack, stood, sta..."
32,33,weak negative,-0.012005,chapter thirty -three the snowmobile sometime ...,"[chapter, thirty, three, snowmobile, sometime,..."
35,36,weak negative,-0.032447,chapter thirty -six the elevator jack awoke fr...,"[chapter, thirty, six, elevator, jack, awoke, ..."
39,40,negative,-0.070606,chapter forty in the basement !!! the boiler t...,"[chapter, forty, basement, boiler, goddam, boi..."
44,45,neutral,0.008469,chapter forty -five stapleton airport denver a...,"[chapter, forty, five, stapleton, airport, den..."
47,48,weak negative,-0.018217,chapter forty -eight jack he sat on the floor ...,"[chapter, forty, eight, jack, sat, floor, pant..."


In [None]:
df.iloc[cluster3]

In [None]:
df.iloc[cluster4]

In [None]:
df.iloc[cluster5]


**Sentiment analysis on every paragraph for each chapter**

Breakdown each chapter by paragraph and get the sentiment for each paragraph

In [None]:
#Function to split a text file into paragraphs
def split_into_paragraphs(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        text = file.read()  # Read the contents of the file

        #Getting rid of quotations, commas, paretheses, etc., and converting everything to lower case
        text = (text.replace('"', '').replace('…', '').replace(',', '').replace('“', '').replace('”', '').replace('(', '').replace(')', '').lower())
    #Split the text into paragraphs using blank lines as the delimiter
    paragraphs = [paragraph.strip() for paragraph in text.split('\n\n') if paragraph.strip()]
    return paragraphs

Compound scores will be higher since it is isolated to the chapter level. Will need to rework the classifiers for sentiment taking this into consideration. 

Most are going to be in the range of -1 to 1

In [None]:
#Function to have custom sentiment classifiers based on compound score
def classify_sentiment(compound_score):
    if compound_score > 0.5:
        return 'Strong Positive'
    elif 0.5 >= compound_score > 0.1:
        return 'Weak Positive'
    elif 0.1 >= compound_score >= -0.1:
        return 'Neutral'
    elif -0.1 > compound_score >= -0.5:
        return 'Weak Negative'
    else:  #compound_score < -0.5
        return 'Strong Negative'

In [None]:
#Function to analyze sentiment for each paragraph
def analyze_chapter(filename):
    analyzer = SentimentIntensityAnalyzer()  #Initialize sentiment analyzer
    paragraphs = split_into_paragraphs(filename)  #Split file into paragraphs
    
     #Formating for a data frame
    data = []
    for i, paragraph in enumerate(paragraphs, start=1):
        sentiment = analyzer.polarity_scores(paragraph)  #Perform sentiment analysis
        classification = classify_sentiment(sentiment['compound'])  #Classify sentiment
        data.append({
            'Paragraph Number': i, #extracting the paragraph number
            'Compound Score': sentiment['compound'], #extracting the compound score for each paragraph
            'Classification': classification #extracting the classification for each paragraph
        })
    
    df = pd.DataFrame(data)
    return df

**NO NEED TO RUN AGAIN**

Running function on all chapters in the "chapters" folder as created before and saving them as CSVs

In [None]:
#File path of the folder containing the chapters of The Shinning
folder_path = os.path.join(os.getcwd(), 'chapters')  #Path to the "chapters" folder
output_folder = os.path.join(os.getcwd(), 'chapter_paragraphs')  #Path to the "chapter_paragraphs" folder, made this for ease of access
os.makedirs(output_folder, exist_ok=True)

#List all chapter files in the "chapters" folder
chapters = os.listdir(folder_path)

#Loop through each chapter text file and apply the function above
for chapter in chapters:
    file_path = os.path.join(folder_path, chapter)  #Get full file path
    
    #Only process .txt files
    if chapter.endswith('.txt'):
        df_name = os.path.splitext(chapter)[0] + '_paragraphs'  #Remove '.txt' and add '_paragraphs'
        
        #Analyze and store in a DataFrame
        df = analyze_chapter(file_path)
        
        #Save to CSV in the output folder
        output_path = os.path.join(output_folder, f'{df_name}.csv')
        df.to_csv(output_path, index=False)
        
        print(f"Processed {chapter} and saved to {output_path}")

**Visualizing the data**

General idea: display the sentiment for the book w/ compound scores on y axis and chapters on the x axis. Hover over each point to get specific information about the chapter. Click that point to then pull up a second graph w/ compond scores on y axis and the chapter's paragraphs on the x axis

In [102]:
allchapters = pd.read_csv('allchapters.csv') #Sentiment for The Shinning broken down by chapter sentences

#Sentiment for the individual chapters broken down by paragraphs
folder_path = 'chapter_paragraphs' #Folder path for individual chapters
chapter_dfs = {} #Load all individual chapter DataFrames dynamically
for i in range(1, 59):
    file_name = f'chapter_{i}_paragraphs.csv'
    file_path = os.path.join(folder_path, file_name)
    chapter_dfs[f'chp{i}'] = pd.read_csv(file_path)

In [103]:
#Initialize Dash app
app = dash.Dash(__name__)

#Create the main line plot for 'allchapters'
main_fig = px.line(
    allchapters, 
    x='Chapter Number', 
    y='Average Compound Score', 
    markers=True, 
    title="<i>The Shinning</i><br>Sentiment Breakdown by Chapter"  
)

#Add hover info
main_fig.update_traces(
    hovertemplate='Chapter: %{x}<br>Score: %{y}<br>Sentiment: %{customdata}<extra></extra>',
    customdata=allchapters['Sentiment']
)

#Marker/line colors
main_fig.update_traces(
    line=dict(color='#D36D6D'),  #Set line color to a much lighter red
    marker=dict(
        color='#800020',  #Set marker color to burgundy
    )
)

#Configurating x axis
main_fig.update_layout(
    xaxis=dict(
        tickmode='array',  
        tickvals=allchapters['Chapter Number'].unique(),  #Use unique chapter numbers for ticks
        range=[allchapters['Chapter Number'].min(), allchapters['Chapter Number'].max()],  #Set x-axis range to the min and max chapter numbers
        tickangle=-45,  #Rotate x-axis labels to 45 degrees to the left
        tickfont=dict(size=12, family='Arial', weight='bold'),  #Make x-axis ticks bold
    ),
    # Add horizontal line at y=0 (black)
    shapes=[dict(
        type='line',
        x0=allchapters['Chapter Number'].min(),
        x1=allchapters['Chapter Number'].max(),
        y0=0,
        y1=0,
        line=dict(
            color='black',
            width=1.5,
            dash='solid'
        )
    )],
    #Make plot background gray
    plot_bgcolor='#f5f5f5',  
    width=1200,  
    title=dict(
        x=0.5, #center the title
        xanchor='center',  
        yanchor='top',  
    )
)

#Layout of the app
app.layout = html.Div([
    #Main graph
    dcc.Graph(id='main-graph', figure=main_fig, style={'width': '1200px'}),
    
    #Placeholder for the detailed graph
    dcc.Graph(id='detail-graph', style={'width': '1200px'})
])

#Callback to update the detailed graph when clicking a chapter in the main graph
@app.callback(
    Output('detail-graph', 'figure'),
    Input('main-graph', 'clickData')
)
def update_detail_graph(click_data):
    if click_data:
        #Extract chapter number from the clicked point
        chapter_num = click_data['points'][0]['x']
        
        #Get the corresponding chapter DataFrame
        ch_df = chapter_dfs.get(f'chp{int(chapter_num)}', pd.DataFrame())

        #Get the min and max values for compound score
        detail_min_value = ch_df['Compound Score'].min()
        detail_max_value = ch_df['Compound Score'].max()

        #Create a plot for the selected chapter
        if not ch_df.empty:
            detail_fig = px.line(
                ch_df,
                x='Paragraph Number',
                y='Compound Score',
                markers=True,
                title=f"Sentiment Breakdown for Chapter: {int(chapter_num)}"
            )

            #Add hover info for the detail plot
            detail_fig.update_traces(
                hovertemplate='Paragraph: %{x}<br>Score: %{y}<br>Sentiment: %{customdata}<extra></extra>',
                customdata=ch_df['Classification']  
            )

            detail_fig.update_traces(
                line=dict(color='#D36D6D'),
                marker=dict(
                    color='#800020',
                )
            )

            #Dynamically update the x-axis range for the 'detailed' graph
            detail_fig.update_layout(
                xaxis=dict(
                    tickmode='array',  
                    tickvals=ch_df['Paragraph Number'].unique(), 
                    tickangle=-45,  
                    tickfont=dict(size=12, family='Arial', weight='bold'),  
                    range=[ch_df['Paragraph Number'].min(), ch_df['Paragraph Number'].max()]  
                ),

                shapes=[dict(
                    type='line',
                    x0=ch_df['Paragraph Number'].min(),
                    x1=ch_df['Paragraph Number'].max(),
                    y0=0,
                    y1=0,
                    line=dict(
                        color='black',
                        width=1.5,
                        dash='solid'
                    )
                )],
              
                plot_bgcolor='#f5f5f5',  
                width=1200,  
                height=500,  
                title=dict(
                    x=0.5,  
                    xanchor='center',  
                    yanchor='top', 
                )
            )

            return detail_fig
    return {}

# Run the app
if __name__ == '__main__':
    app.run_server(debug=True)
