In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import unicodedata
import re
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import prep
import acquire

## Sentiment analysis of books of different religious texts

An overview that uses natural language processing and VADER sentiment analysis to measure what percentage of the verses in each book are very positive, positive, neutral, negative, or very negative.

### Bible

In [None]:
# Load the Bible verses through the project-local acquire module.
# NOTE(review): later cells read the columns text, genre, book, book_no, ch and
# id from this frame — confirm that acquire.bible_wrangle() provides them.
df = acquire.bible_wrangle()

In [None]:
# Preview the first two rows of the freshly loaded frame.
df.head(n=2)

In [None]:
# Clean each verse with prep.basic_clean, then strip stopwords with
# prep.remove_stopwords. The text column is overwritten, so the raw verse
# text is no longer available on df after this cell.
# (Per the original note, the cleaning step lower-cases and tokenizes.)
df['text'] = df['text'].apply(prep.basic_clean).apply(prep.remove_stopwords)

In [None]:
# Add a column holding the stemmed form of every cleaned verse.
stemmed_verses = df['text'].apply(prep.stem)
df['stemmed'] = stemmed_verses

In [None]:
# Check the cleaned text and the new stemmed column on the first two rows.
df.head(n=2)

In [None]:
# Sentiment intensity algorithm initialized.
# VADER needs the 'vader_lexicon' NLTK resource; on a machine where it has not
# been downloaded yet the constructor raises LookupError, so fetch it once and
# retry instead of failing on a fresh environment.
try:
    sia = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    sia = SentimentIntensityAnalyzer()

In [None]:
# Demonstration: the dictionary of scores the analyzer returns for a single
# sentence.
blob = "For God so loved the world that he gave his only son to die for us"
sia.polarity_scores(text=blob)

In [None]:
# Look at the frame once more right before scoring the verses.
df.head(n=2)

In [None]:
# Score every verse with VADER and keep only the 'compound' value.
# NOTE(review): df.text has already been cleaned and had stopwords removed in
# an earlier cell; VADER is presumably sensitive to punctuation, capitalization
# and negation words, so scoring the raw verse text may be more faithful —
# confirm against prep.basic_clean / prep.remove_stopwords.
def _compound_score(verse):
    """Return the VADER compound polarity score for one verse."""
    return sia.polarity_scores(verse)['compound']


df['vader_sentiment'] = df['text'].apply(_compound_score)

In [None]:
# Mean VADER compound score per genre, most positive genre first.
# The column is selected with a list (double brackets): selecting groupby
# columns with a bare tuple was deprecated in pandas 1.0 and raises in 2.0.
# The grouping key is left out of the selection because it already forms the
# index of the result.
print("Sentiment of each group of books by genre")
df.groupby('genre')[['vader_sentiment']].mean().sort_values(by='vader_sentiment', ascending=False)

In [None]:
# Distribution of verse-level compound scores for each genre.
ax = sns.boxplot(x="genre", y="vader_sentiment", data=df)
# Overall mean across all verses as a reference line. axhline spans the whole
# axes width, so the line no longer depends on a hard-coded x-range (0..6)
# that was only right for one specific number of genres.
ax.axhline(df['vader_sentiment'].mean(), color='limegreen')
print("Boxplot of mean sentiment of each genre")

In [None]:
# Mean book number and mean compound score per book, in book_no order.
# Columns are selected with a list: the bare-tuple form was deprecated in
# pandas 1.0 and raises in 2.0. The string key 'book' is not selected because
# it cannot be averaged and already becomes the index of the result.
df_book_avg = df.groupby('book')[['book_no', 'vader_sentiment']].mean().sort_values(by='book_no')
print("Average sentiment by book")
df_book_avg

In [None]:
# Join the verses of each book into one complete corpus per book:
# collect the verses into a list per book, then merge each list with
# prep.text_join.
verses_by_book = df.groupby('book')['text'].apply(list)
df_books = pd.DataFrame(verses_by_book.apply(prep.text_join))

In [None]:
# Preview the first five per-book corpora.
df_books.head(n=5)

In [None]:
# Group the verses by (book, chapter), join each chapter into a single text
# with prep.text_join, and score every chapter with VADER's compound value.
# NOTE(review): compound scores on long texts presumably sit close to -1 or +1;
# verify that chapter-level scores are meaningful before relying on them.
verses_by_chapter = df.groupby(['book', 'ch'])['text'].apply(list)
df_chap = pd.DataFrame(verses_by_chapter.apply(prep.text_join))
df_chap['vader_sentiment'] = df_chap['text'].apply(
    lambda chapter: sia.polarity_scores(chapter)['compound']
)

In [None]:
# Example: the chapter-level rows of a single book.
matthew_chapters = df_chap.loc['Matthew']
matthew_chapters

In [None]:
# Strip a few additional stopwords from every book corpus, then count how
# often each remaining word occurs across all books.
extra_stopwords = ['said','thee','ye','thou','thy',"'",'shall','unto']
df_books['text'] = df_books['text'].apply(prep.remove_stopwords, args=(extra_stopwords,))
all_words = " ".join(df_books['text']).split()
raw_count = pd.Series(all_words).value_counts()

In [None]:
# Horizontal bar chart of the ten most frequent words.
top_ten = raw_count.sort_values(ascending=False).iloc[:10]
top_ten.plot.barh(width=0.9)
print('10 most common words')

In [None]:
# Keep only the four Gospels. The explicit .copy() makes df_gospels an
# independent frame, so the column assignment in the following cell modifies
# this frame itself rather than a slice of df_books. Without it, pandas emits
# a SettingWithCopyWarning that the global warnings filter hides.
gospel_books = ['John','Mark','Matthew','Luke']
df_gospels = df_books[df_books.index.isin(gospel_books)].copy()

In [None]:
# Remove the Bible-wide extra stopwords plus two more ('things', 'say') from
# the Gospel corpora.
gospel_stopwords = ['said','thee','ye','thou','thy',"'",'shall','unto','things','say']
df_gospels['text'] = df_gospels['text'].apply(prep.remove_stopwords, args=(gospel_stopwords,))

In [None]:
# Word frequencies across the four Gospels and a bar chart of the ten most
# frequent words. Note that raw_count is rebound here and no longer holds the
# whole-Bible counts.
gospel_words = " ".join(df_gospels['text']).split()
raw_count = pd.Series(gospel_words).value_counts()
top_ten_gospels = raw_count.sort_values(ascending=False).iloc[:10]
top_ten_gospels.plot.barh(width=0.9)
print('10 most common words from Gospels')

In [None]:
# Bucket each verse into one of five sentiment groups based on its compound
# score rounded to one decimal place:
#   >= 0.7 very positive, >= 0.1 positive, == 0 neutral,
#   >= -0.6 negative, anything else very negative.
# The labels are resolved from the innermost (most negative) case outwards so
# that the first matching threshold, read top-down, wins.
score = df['vader_sentiment'].round(1)
df['vader'] = score
negative_side = np.where(score >= -.6, 'negative', 'very negative')
neutral_or_below = np.where(score == 0, 'neutral', negative_side)
positive_or_below = np.where(score >= .1, 'positive', neutral_or_below)
df['sentiment_group'] = np.where(score >= .7, 'very positive', positive_or_below)

In [None]:
# Total number of verses in each book (count of labelled verses per book),
# returned as a two-column frame: book, sentiment_group.
verse_count = df.groupby('book')['sentiment_group'].count().reset_index()
verse_count.head(n=2)

In [None]:
# Number of verses for every (book, sentiment group) combination; the count
# ends up in the 'id' column.
verses_per_group = df.groupby(['book', 'sentiment_group'])['id'].count()
df_likert = verses_per_group.reset_index()

In [None]:
# Preview the per-book, per-group verse counts.
df_likert.head(n=8)

In [None]:
# Attach each book's total verse count to its per-group counts. Both frames
# have a 'sentiment_group' column, so pandas suffixes them with _x (group
# label) and _y (total verses).
df_likert_total = pd.merge(left=df_likert, right=verse_count, on='book')

In [None]:
# Check the merged columns (sentiment_group_x, id, sentiment_group_y).
df_likert_total.head(n=2)

In [None]:
# Share of each sentiment group within its book, rounded to two decimals.
# The value is a fraction between 0 and 1 (group verses / total verses).
group_share = df_likert_total['id'] / df_likert_total['sentiment_group_y']
df_likert_total['percent'] = group_share.round(2)

In [None]:
# Preview the table with the new percent column.
df_likert_total.head(n=5)

In [None]:
# Grab the book number of every book so the final table can be sorted in
# Tableau. Only the numeric book_no column is averaged: the string key 'book'
# cannot be averaged (TypeError on pandas 2.x) and already becomes the index.
# Note: this rebinds df_books, which previously held the per-book corpus.
df_books = df.groupby('book')[['book_no']].mean()
# Displayed in book order only; df_books itself stays indexed by book name.
df_books.sort_values(by='book_no')

In [None]:
# Final table for the Tableau visualization: book name, sentiment group,
# verses in that group, total verses in the book, share of verses, book number.
# The merge matches the 'book' column on the left with the 'book' index level
# of df_books; the columns are then renamed by position.
final_columns = ['book','sentiment','sentiment_verses','total_verses','percent','book_no']
df_likert_final = pd.merge(left=df_likert_total, right=df_books, on='book')
df_likert_final.columns = final_columns

In [None]:
# Preview the final table before exporting it.
df_likert_final.head(n=15)

In [None]:
# Export the final table (without the row index) for use in Tableau.
df_likert_final.to_csv(path_or_buf="bible_by_book_likert.csv", index=False)

In [None]:
# Read the exported file back and show it in book order as a sanity check.
likert_export = pd.read_csv('bible_by_book_likert.csv')
likert_export.sort_values(by='book_no')

<img src="files/old_testament.png">

<img src="files/new_testament.png">

### Book of Mormon

In [2]:
# Load the Book of Mormon verses through the project-local acquire module.
# This rebinds df, replacing the Bible frame used in the section above.
df = acquire.bookofmormon_wrangle()
# Alternative left by the author: read the scriptures CSV directly.
#df = pd.read_csv('lds-scriptures.csv')

In [3]:
# Preview the first two Book of Mormon verses.
df.head(n=2)

Unnamed: 0,id,book_no,book,test,ch,ver,text
31102,3,67,1 Nephi,His Reign and Ministry,1,1,"I, Nephi, having been born of goodly parents, ..."
31103,3,67,1 Nephi,His Reign and Ministry,1,2,"Yea, I make a record in the language of my fat..."
