In [1]:
import nltk
from nltk.corpus import stopwords
from nltk import bigrams
from nltk.collocations import *
import re

#### Source of .txt files: https://www.gutenberg.org/ebooks/42671 

## Load and process files for text analysis

In [2]:
#Load .txt files
SS = open("Sense_and_Sensibility.txt", "r")
PP = open("Pride_and_Prejudice.txt", "r")

In [3]:
SS = SS.read()
PP = PP.read()

In [4]:
#Tokenize
SS_tokens = nltk.word_tokenize(SS)
PP_tokens = nltk.word_tokenize(PP)

In [5]:
#View length of tokens
print(len(SS_tokens))
print(len(PP_tokens))

144488
144561


In [6]:
#Viiew sample of tokens
print(SS_tokens[150:200])

['to', 'which', 'we', 'have', 'referred', '.', '_Pride', 'and', 'Prejudice_', 'it', 'is', 'true', ',', 'was', 'written', 'and', 'finished', 'before', '_Sense', 'and', 'Sensibility_', '--', 'its', 'original', 'title', 'for', 'several', 'years', 'being', '_First', 'Impressions_', '.', 'Then', ',', 'in', '1797', ',', 'the', 'author', 'fell', 'to', 'work', 'upon', 'an', 'older', 'essay', 'in', 'letters', '_Ã', 'la_']


In [7]:
#Convert all letters to lowercase to allow for proper cleansing
def lowercase(text):
    token_list = []
    for word in text:
        word_lower = word.lower()
        token_list.append(word_lower)
    return(token_list)

In [8]:
SS_tokens = lowercase(SS_tokens)
PP_tokens = lowercase(PP_tokens)

In [9]:
#Remove stopwords
#Add extra stopwords in addition to the default nltk stopwords list
nltkstopwords = nltk.corpus.stopwords.words('english')
more_words = ["mr.","mrs.", "miss", "sir", "lady", "colonel", "de", "chapter"]
stop_words = nltkstopwords + more_words

In [10]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [11]:
SS_stop = [word for word in SS_tokens if word not in stop_words]
PP_stop = [word for word in PP_tokens if word not in stop_words] 

In [13]:
#View length after removal of stopwords
print(len(SS_stop))
print(len(PP_stop))

76109
75797


In [14]:
#Stemming using Lancaster method
Porter = nltk.PorterStemmer()

In [15]:
SS_stem = [Porter.stem(t) for t in SS_stop]
PP_stem = [Porter.stem(t) for t in PP_stop]

In [18]:
#Remove non-letter characters
SS_clean = [re.sub(r'[^a-zA-Z0-9]+', '', word) for word in SS_stem]
SS_final = list(filter(None, SS_clean))

PP_clean = [re.sub(r'[^a-zA-Z0-9]+', '', word) for word in PP_stem]
PP_final = list(filter(None, PP_clean))

In [19]:
#View length after removal of stopwords
print(len(SS_final))
print(len(PP_final))

54824
54328


## Text Analysis

View top 50 most common words for both <i> Sense and Sensibility </i> and <i> Pride and Prejudice </i>

In [434]:
SS_top_50 = nltk.FreqDist(SS_final)
PP_top_50 = nltk.FreqDist(PP_final)

# Print the 50 most common tokens in The Dead...
print("The 50 most common words in Sense and Sensibility are:\n")
print(SS_top_50.most_common(50))

The 50 most common words in Sense and Sensibility are:

[('elinor', 693), ('s', 660), ('could', 582), ('mariann', 569), ('would', 520), ('said', 399), ('everi', 377), ('one', 346), ('sister', 320), ('dashwood', 294), ('much', 291), ('must', 289), ('know', 273), ('edward', 267), ('time', 263), ('mother', 253), ('think', 238), ('jen', 236), ('well', 224), ('see', 218), ('though', 217), ('might', 217), ('willoughbi', 214), ('say', 213), ('thing', 208), ('never', 191), ('day', 190), ('noth', 189), ('luci', 186), ('may', 184), ('even', 182), ('soon', 181), ('without', 176), ('wish', 172), ('first', 170), ('ever', 170), ('feel', 169), ('make', 169), ('littl', 167), ('look', 166), ('give', 166), ('john', 166), ('happi', 166), ('go', 162), ('two', 157), ('howev', 157), ('good', 157), ('hous', 155), ('great', 154), ('thought', 151)]


In [426]:
# Print the 50 most common tokens in The Dead...
print("The 50 most common words in Pride and Prejudice are:\n")
print(PP_top_50.most_common(50))

The 50 most common words in Pride and Prejudice are:

[('elizabeth', 634), ('s', 596), ('could', 523), ('would', 470), ('darci', 409), ('said', 402), ('bennet', 334), ('much', 328), ('bingley', 312), ('must', 308), ('one', 305), ('jane', 292), ('sister', 292), ('everi', 285), ('know', 273), ('think', 237), ('though', 226), ('time', 222), ('never', 221), ('well', 219), ('soon', 218), ('see', 212), ('say', 210), ('make', 206), ('good', 203), ('might', 200), ('may', 194), ('wickham', 194), ('thing', 191), ('littl', 187), ('wish', 183), ('noth', 178), ('collin', 178), ('look', 174), ('lydia', 171), ('come', 170), ('without', 170), ('hope', 168), ('feel', 167), ('friend', 167), ('day', 165), ('shall', 163), ('go', 161), ('even', 160), ('dear', 158), ('famili', 157), ('like', 157), ('give', 157), ('happi', 155), ('man', 149)]


View top 50 most common bigrams for both <i> Sense and Sensibility </i> and <i> Pride and Prejudice </i> using frequency score

In [427]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
SS_bigrams = BigramCollocationFinder.from_words(SS_final)
SS_bigrams.apply_ngram_filter(lambda w1, w2: len(w2) < 2)

SS_bigrams_top_50 = SS_bigrams.score_ngrams(bigram_measures.raw_freq)

In [428]:
print("The top 50 bigrams in Sense and Sensibility by frequency score are:\n")
for bscore in SS_bigrams_top_50[:50]:
    print (bscore)

The top 50 bigrams in Sense and Sensibility by frequency score are:

(('everi', 'thing'), 0.0014592149423610099)
(('said', 'elinor'), 0.0011856121406683205)
(('john', 'dashwood'), 0.0006931270976214797)
(('dare', 'say'), 0.0006566467240624544)
(('everi', 'bodi'), 0.0006201663505034291)
(('said', 'mariann'), 0.0005472056033853787)
(('thousand', 'pound'), 0.0005472056033853787)
(('elinor', 'could'), 0.0005289654166058661)
(('everi', 'day'), 0.0005107252298263534)
(('repli', 'elinor'), 0.0004924850430468408)
(('young', 'man'), 0.00047424485626732817)
(('cri', 'mariann'), 0.00045600466948781553)
(('great', 'deal'), 0.00043776448270830295)
(('would', 'give'), 0.00034656354881073984)
(('berkeley', 'street'), 0.00031008317525171457)
(('next', 'morn'), 0.00031008317525171457)
(('day', 'two'), 0.00029184298847220193)
(('harley', 'street'), 0.00029184298847220193)
(('next', 'day'), 0.00029184298847220193)
(('said', 'jen'), 0.00029184298847220193)
(('sens', 'sensibility'), 0.00029184298847220193)

In [429]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
PP_bigrams = BigramCollocationFinder.from_words(PP_final)
PP_bigrams.apply_ngram_filter(lambda w1, w2: len(w2) < 2)

PP_bigrams_top_50 = PP_bigrams.score_ngrams(bigram_measures.raw_freq)

In [430]:
print("The top 50 bigrams in Pride and Prejudice by frequency score are:\n")
for bscore in PP_bigrams_top_50[:50]:
    print (bscore)

The top 50 bigrams in Pride and Prejudice by frequency score are:

(('everi', 'thing'), 0.0010859961714033278)
(('said', 'elizabeth'), 0.0008467088793992049)
(('young', 'man'), 0.0006994551612428214)
(('everi', 'bodi'), 0.0006442350169341776)
(('dare', 'say'), 0.0005706081578559859)
(('said', 'bennet'), 0.0004969812987777941)
(('elizabeth', 'could'), 0.0004785745840082462)
(('young', 'ladi'), 0.00046016786923869826)
(('great', 'deal'), 0.00044176115446915035)
(('uncl', 'aunt'), 0.00044176115446915035)
(('cri', 'elizabeth'), 0.00042335443969960244)
(('went', 'away'), 0.00038654101016050656)
(('could', 'help'), 0.0003681342953909586)
(('half', 'hour'), 0.0003681342953909586)
(('repli', 'elizabeth'), 0.0003681342953909586)
(('thousand', 'pound'), 0.0003681342953909586)
(('next', 'morn'), 0.0003497275806214107)
(('good', 'humour'), 0.0003129141510823148)
(('made', 'answer'), 0.0003129141510823148)
(('mr', 'darci'), 0.0003129141510823148)
(('much', 'better'), 0.0003129141510823148)
(('dear'

View top 50 most common bigrams for both <i> Sense and Sensibility </i> and <i> Pride and Prejudice </i> using mutual information and a minimum frequency of 5

In [431]:
SS_bigrams.apply_freq_filter(5)
SS_bigrams = PP_bigrams.score_ngrams(bigram_measures.pmi)
print("The top 50 bigrams in Sense and Sensibility by MI are:\n")
for bscore in SS_bigrams[:50]:
    print (bscore)

The top 50 bigrams in Sense and Sensibility by MI are:

(('15th', 'october'), 15.72940831698402)
(('adept', 'scienc'), 15.72940831698402)
(('although', 'utterli'), 15.72940831698402)
(('anywher', 'else'), 15.72940831698402)
(('blenheim', 'warwick'), 15.72940831698402)
(('bounti', 'benefic'), 15.72940831698402)
(('bowl', 'punch'), 15.72940831698402)
(('briberi', 'corrupt'), 15.72940831698402)
(('broadfac', 'stuffi'), 15.72940831698402)
(('bromley', 'course'), 15.72940831698402)
(('buri', 'parishion'), 15.72940831698402)
(('cap', 'powder'), 15.72940831698402)
(('caper', 'frisk'), 15.72940831698402)
(('cleans', 'impur'), 15.72940831698402)
(('coax', 'threaten'), 15.72940831698402)
(('commonest', 'dullest'), 15.72940831698402)
(('conniv', 'aid'), 15.72940831698402)
(('deem', 'indispens'), 15.72940831698402)
(('default', 'heir'), 15.72940831698402)
(('deter', 'foul'), 15.72940831698402)
(('dishonest', 'dishonesti'), 15.72940831698402)
(('dove', 'dale'), 15.72940831698402)
(('droop', 'apac')

In [432]:
PP_bigrams.apply_freq_filter(5)
PP_bigrams = PP_bigrams.score_ngrams(bigram_measures.pmi)
print("The top 50 bigrams in Pride and Prejudice by MI are:\n")
for bscore in PP_bigrams[:50]:
    print (bscore)

The top 50 bigrams in Pride and Prejudice by MI are:

(('st', 'jame'), 12.922053394926415)
(('humbl', 'abod'), 11.243981489813779)
(('thousand', 'pound'), 10.378911069899887)
(('shook', 'head'), 10.18251385709638)
(('beg', 'pardon'), 9.922053394926413)
(('lift', 'eye'), 9.663319126526247)
(('gone', 'scotland'), 9.360174507318298)
(('ten', 'thousand'), 9.013914253120637)
(('card', 'tabl'), 8.922053394926415)
(('younger', 'son'), 8.871427321856448)
(('insist', 'upon'), 8.863159705872846)
(('drew', 'near'), 8.518006679565548)
(('step', 'forward'), 8.483380336455916)
(('fair', 'cousin'), 8.363086102738201)
(('luca', 'lodg'), 8.257733102591974)
(('put', 'end'), 8.237555220654347)
(('depend', 'upon'), 8.122078003234407)
(('great', 'deal'), 8.056982975012522)
(('make', 'hast'), 8.042907789800802)
(('half', 'hour'), 8.028968598842928)
(('short', 'paus'), 7.948048603459361)
(('georg', 'wickham'), 7.9368503968544974)
(('year', 'ago'), 7.905586183337647)
(('uncl', 'aunt'), 7.775696864622507)
(('w