# Text Analysis on the Novel Sense and Sensibility

### Imports — **Run First**

In [None]:
#Bring in text file with our novel
#A context manager closes the file handle as soon as the text has been read
with open('sense_and_sensibility.txt', 'r', encoding = "utf8") as textfile:
    #The variable keeps its existing name because later cells read `great_expect`
    great_expect = textfile.read()

#Preview only the opening of the text; printing the whole novel floods the notebook output
print(great_expect[:1000])

In [None]:
#Import libraries
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer

from gensim.corpora import Dictionary
from gensim.models import ldamodel
from gensim.models.coherencemodel import CoherenceModel
from wordcloud import WordCloud

import pandas as pd
from PIL import Image
import numpy as np
import random
import re
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
#Prepare the word cloud text: lowercase the novel, then replace every character
#that is not an ASCII letter or digit with a single space (digits are kept)
word_cloud_text = re.sub("[^a-zA-Z0-9]", " ", great_expect.lower())

In [None]:
#Build the word cloud stopword set once; set membership is a constant-time check
#and the NLTK stopword list is no longer re-read for every token
stopwords_wc = set(stopwords.words("english"))
#Tokenize the data to split it into words, then remove stopwords and
#short words less than 3 letters in length
#A list (rather than a generator) lets `tokens` be iterated more than once,
#so cells that consume it still work when they are re-run
tokens = [
    word
    for word in word_tokenize(word_cloud_text)
    if word not in stopwords_wc and len(word) >= 3
]

In [None]:
#Data cleaning to split data into sentences
#Approach: periods that do not end a sentence are rewritten as "<prd>", real
#sentence ends are tagged with "<stop>", then "<prd>" is restored to "." and the
#text is split on "<stop>". The order of the substitutions below matters.
#Patterns and replacements are raw strings so "\s" and "\1" reach the regex
#engine as written instead of relying on Python tolerating an invalid escape
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov|edu|me)"
digits = r"([0-9])"

#Pad the text and flatten line breaks so the space-anchored patterns can match
text = " " + great_expect + "  "
text = text.replace("\n", " ")
#Protect periods in titles, web domains and decimal numbers
text = re.sub(prefixes, r"\1<prd>", text)
text = re.sub(websites, r"<prd>\1", text)
text = re.sub(digits + "[.]" + digits, r"\1<prd>\2", text)
#str.replace leaves the text unchanged when the target is absent, so no guard is needed
text = text.replace("...", "<prd><prd><prd>")
text = text.replace("Ph.D.", "Ph<prd>D<prd>")
#Protect single-letter initials
text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
#An acronym followed by a sentence starter does end the sentence
text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
#Protect periods inside three- and two-letter dotted abbreviations
text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
#A suffix followed by a sentence starter ends the sentence; otherwise protect its period
text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)
#Move closing quotes in front of the terminator so the quote stays with its sentence
text = text.replace(".”", "”.")
text = text.replace(".\"", "\".")
text = text.replace("!\"", "\"!")
text = text.replace("?\"", "\"?")
#Tag every remaining terminator as a sentence boundary
text = text.replace(".", ".<stop>")
text = text.replace("?", "?<stop>")
text = text.replace("!", "!<stop>")
#Restore the protected periods, then split into one stripped sentence per row
text = text.replace("<prd>", ".")
sentences = pd.DataFrame({'sentence': [s.strip() for s in text.split("<stop>")]})

In [None]:
#Skip the first 59 rows of text, which are irrelevant for analysis, and
#renumber the remaining rows from 0
#NOTE: re-running this cell trims another 59 rows; re-run the sentence-splitting cell first
sentences = sentences.iloc[59:].reset_index(drop=True)
sentences.head(10)

### Challenge: Analyze Sense and Sensibility

In [None]:
#Create word frequency distribution
#NOTE(review): `%%%%` looks like a fill-in-the-blank placeholder for this challenge;
#the cell is not valid Python until it is replaced with a real expression
fdist = %%%%

In [None]:
#View the 40 most common words in the text
#NOTE(review): the `%%%%` line below is a placeholder to be replaced with the expression that lists them
%%%%

In [None]:
#Visualization of top 40 most common words in text
plt.figure(figsize=(12,6))
#NOTE(review): `%%%%` stands in for the plot arguments and must be filled in before this cell can run
fdist.plot(%%%%)
plt.show()

In [None]:
#Initialize Vader sentiment analyzer
#NOTE(review): `%%%%` is a placeholder for the analyzer construction; later cells call analyzer.polarity_scores
analyzer = %%%%

In [None]:
# Perform Vader sentiment analysis
#Each line calls analyzer.polarity_scores on every sentence and stores one entry of the result as a new column
#NOTE(review): the '%%%%' keys are placeholders to fill in; presumably each matches its column name - confirm
#NOTE(review): as written, every sentence is scored four times (once per column)
sentences['compound'] = [analyzer.polarity_scores(x)['%%%%'] for x in sentences['sentence']]
sentences['neg'] = [analyzer.polarity_scores(x)['%%%%'] for x in sentences['sentence']]
sentences['neu'] = [analyzer.polarity_scores(x)['%%%%'] for x in sentences['sentence']]
sentences['pos'] = [analyzer.polarity_scores(x)['%%%%'] for x in sentences['sentence']]
sentences.head(10)

In [None]:
#Get number of positive, neutral, and negative sentences
#NOTE(review): in each filter, `%%%%` marks the column name and the threshold value still to be filled in
positive_sentence = sentences.loc[sentences['%%%%'] > %%%%]
negative_sentence = sentences.loc[sentences['%%%%'] < %%%%] 
neutral_sentence = sentences.loc[sentences['%%%%'] == %%%%]

#Print the shape of the full DataFrame, then the row count of each subset
print(sentences.shape)
print(len(positive_sentence))
print(len(negative_sentence))
print(len(neutral_sentence))

In [None]:
#Visualize Vader sentiment results
plt.figure(figsize=(14,6))
#NOTE(review): '%%%%' is a placeholder for the score column to plot; the trailing ';' suppresses the text repr of the return value
plt.hist(sentences['%%%%'], bins=50);