In [2]:
## CODE IDEAS FOR HMW 2, Exploratory exercise for sentiment analysis
# finding adverb and adjective phrases, and computing basic statistics

# importing required nltk libraries
import nltk
from nltk import sent_tokenize

# loading our corpus (for this example: "Crime and Punishment," by F. Dostoevsky)
f = open('CrimeAndPunishment.txt')
text = f.read()
print(text[:150])

Produced by John Bickers; and Dagny

CRIME AND PUNISHMENT

By Fyodor Dostoevsky


Translated By Constance Garnett


TRANSLATOR'S PREFACE

A few words 


In [3]:
# Preprocessing, as explained in the Labs
# Separate the text into sentences first
textsplit = nltk.sent_tokenize(text)
print(textsplit[:10])

["Produced by John Bickers; and Dagny\n\nCRIME AND PUNISHMENT\n\nBy Fyodor Dostoevsky\n\n\nTranslated By Constance Garnett\n\n\nTRANSLATOR'S PREFACE\n\nA few words about Dostoevsky himself may help the English reader to\nunderstand his work.", 'Dostoevsky was the son of a doctor.', 'His parents were very hard-working\nand deeply religious people, but so poor that they lived with their five\nchildren in only two rooms.', 'The father and mother spent their evenings\nin reading aloud to their children, generally from books of a serious\ncharacter.', 'Though always sickly and delicate Dostoevsky came out third in the\nfinal examination of the Petersburg school of Engineering.', 'There he had\nalready begun his first work, "Poor Folk."', 'This story was published by the poet Nekrassov in his review and\nwas received with acclamations.', 'The shy, unknown youth found himself\ninstantly something of a celebrity.', 'A brilliant and successful career\nseemed to open before him, but those hopes 

In [4]:
# Apply the word tokenizer to each sentence
tokentext = [nltk.word_tokenize(sent) for sent in textsplit]
print(tokentext[:2])
#the output is a list of strings that contains the sentences
type(tokentext)
len(tokentext)

[['Produced', 'by', 'John', 'Bickers', ';', 'and', 'Dagny', 'CRIME', 'AND', 'PUNISHMENT', 'By', 'Fyodor', 'Dostoevsky', 'Translated', 'By', 'Constance', 'Garnett', 'TRANSLATOR', "'S", 'PREFACE', 'A', 'few', 'words', 'about', 'Dostoevsky', 'himself', 'may', 'help', 'the', 'English', 'reader', 'to', 'understand', 'his', 'work', '.'], ['Dostoevsky', 'was', 'the', 'son', 'of', 'a', 'doctor', '.']]


14723

In [5]:
## POS Tagging, to retrieve adjective (JJs) and adverb (RBs) tags

# use the Stanford POS tagger to POS tag tokens of each sentence
# this is the default tagger in nltk
taggedtext = [nltk.pos_tag(tokens) for tokens in tokentext]
print(taggedtext[:2])

[[('Produced', 'VBN'), ('by', 'IN'), ('John', 'NNP'), ('Bickers', 'NNP'), (';', ':'), ('and', 'CC'), ('Dagny', 'NNP'), ('CRIME', 'NNP'), ('AND', 'NNP'), ('PUNISHMENT', 'NNP'), ('By', 'IN'), ('Fyodor', 'NNP'), ('Dostoevsky', 'NNP'), ('Translated', 'NNP'), ('By', 'IN'), ('Constance', 'NNP'), ('Garnett', 'NNP'), ('TRANSLATOR', 'NNP'), ("'S", 'POS'), ('PREFACE', 'NNP'), ('A', 'NNP'), ('few', 'JJ'), ('words', 'NNS'), ('about', 'IN'), ('Dostoevsky', 'NNP'), ('himself', 'PRP'), ('may', 'MD'), ('help', 'VB'), ('the', 'DT'), ('English', 'JJ'), ('reader', 'NN'), ('to', 'TO'), ('understand', 'VB'), ('his', 'PRP$'), ('work', 'NN'), ('.', '.')], [('Dostoevsky', 'NNP'), ('was', 'VBD'), ('the', 'DT'), ('son', 'NN'), ('of', 'IN'), ('a', 'DT'), ('doctor', 'NN'), ('.', '.')]]


In [6]:
# Following our NLTK textbook, chapter on Information Extraction--Chunking (https://www.nltk.org/book/ch07.html)

# Using CHUNKING to parse sentences 
# to look for "adjective phrases", i.e. phrases (or chunks) that have adverbs and adjectives ('RB'+'JJ')
# First step: writing a grammar that defines the POS in the chunk
# we name this grammar "ADJPH" ("ADJective PHrase") using regexes 

import re
grammar_adjph = "ADJPH: {<RB.?>+<JJ.?>}"
# This regex reads as: "find groups ("< >") of RBs (adverbs) together with groups of JJs (adjectives), with groups defineds as
# RBs with any ending (the "." is a placeholder or wildcard for the "R" and the "S" at the end of RBR and RBS, 
# while "?" indicates "optional character" so RB can be found alone as well). Same regex operators apply to JJs.

# Second step: import the nltk parser to process each sentence
chunk_parser_adj = nltk.RegexpParser(grammar_adjph)

adjph_tags = []
for sent in taggedtext:
    if len(sent) > 0:
        tree = chunk_parser_adj.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == 'ADJPH':
                adjph_tags.append(subtree)
                
# Visualizing the actual adjective phrase
adjective_phrases = []
for sent in adjph_tags:
    temp = ''
    for w, t in sent:
        temp += w+ ' '    
    adjective_phrases.append(temp)
    
print('First 10 adjective phrases: ', adjective_phrases[:10])


# Following our NLTK textbook, chapter 1 on Language Processing (https://www.nltk.org/book/ch01.html)

## FREQUENCY DISTRIBUTIONS
# Top 50 adjective phrases
freq_adjph = nltk.FreqDist(adjective_phrases)

print('Top adjective phrases by frequency: ')
for word, freq in freq_adjph.most_common(50):
    print(word, freq)

            
#print the list of our sentences:
print('Length of adjective phrase sentences: ', len(adjph_tags))


First 10 adjective phrases:  ['very hard-working ', 'deeply religious ', 'so poor ', 'most widely read ', 'so much more ', 'exceptionally hot ', 'acutely aware ', 'most .... ', 'too much ', '_that_ serious ']
Top adjective phrases with frequency: 
too much  23
so much  18
very much  15
too ....  13
very good  11
once more  10
very glad  9
quite different  9
most likely  8
very likely  8
not worth  8
very little  7
very important  7
very minute  6
very pale  6
too late  6
as much  6
too great  6
very different  5
not afraid  5
so many  5
most important  5
not so much  5
very anxious  5
very strange  4
very next  4
so little  4
very difficult  4
as clear  4
so strange  4
very young  4
not asleep  4
hardly able  4
Very good  4
not right  4
so stupid  4
not drunk  4
not mad  4
not delirious  4
most interesting  4
very weak  3
just such  3
very clean  3
very bad  3
quite right  3
once ....  3
very poor  3
very great  3
very busy  3
very short  3
Length of adjective phrase sentences:  1636


In [8]:
# Now we look for "adverb phrases" or chunks that have 2 consecutive adverbs ('RB')
# First step: writing a grammar that defines POS rules of the adverb phrase the chunk
# we name this grammar "ADVPH" ("ADVerb PHrase")
grammar_advph = "ADVPH: {<RB>+<RB>}"

# Second step: import the nltk parser to process each sentence
chunk_parser_adv = nltk.RegexpParser(grammar_advph)

advph_tags = []
for sent in taggedtext:
    if len(sent) > 0:
        tree = chunk_parser_adv.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == 'ADVPH':
                advph_tags.append(subtree)
                
# Visualizing the actual adjective phrase
adverb_phrases = []
for sent in advph_tags:
    temp = ''
    for w, t in sent:
        temp += w+ ' '    
    adverb_phrases.append(temp)
    
print('First 10 adverb phrases: ', adverb_phrases[:10])

# top 50 adjective phrases
freq_advph = nltk.FreqDist(adverb_phrases)

print('Top adverb phrases by frequency: ')
for word, freq in freq_advph.most_common(50):
    print(word, freq)

            
#print the list of our sentences:
print('Length of adverb phrase sentences: ', len(advph_tags))

First 10 adverb phrases:  ['always sickly ', 'as soon ', 'still probably ', 'so much ', 'so completely ', 'not only ', 'there now ', 'along not ', 'so badly ', 'not far ']
Top adverb phrases with frequency: 
just now  47
as soon  33
not even  30
not so  19
only just  17
as well  16
very well  14
so long  13
not quite  13
so much  12
not only  12
not yet  12
very much  12
As soon  10
long ago  10
down again  9
even now  9
so far  8
not very  8
as far  7
too soon  7
n't even  7
quite well  6
n't quite  6
very soon  6
Very well  6
not exactly  6
not far  5
so soon  5
not now  5
away somewhere  5
Quite so  5
not merely  5
so completely  4
there now  4
so directly  4
not simply  4
perhaps not  4
too far  4
long before  4
just before  4
perfectly well  4
almost always  4
far away  4
only now  4
far too  4
As far  4
not here  3
down beside  3
sometimes even  3
Length of adverb phrase sentences:  1189


In [7]:
# Top 50 adjective tokens

adjective_tokens = []
for sentence in taggedtext:
    for word, pos in sentence:
        if pos in ['JJ', 'JJR', 'JJS']: # adjective, comparative, superlative
            if len(word)>1:
                adjective_tokens.append(word)
freq_adjective = nltk.FreqDist(adjective_tokens)

for word, freq in freq_adjective.most_common(50):
    print(word,freq)

little 275
last 274
such 206
old 198
first 196
other 188
same 178
more 172
own 168
great 162
good 141
strange 113
young 112
whole 109
much 105
right 97
new 94
long 91
better 88
least 85
.... 75
many 72
certain 72
open 65
flat 64
sure 62
to-day 62
dear 60
next 60
poor 56
possible 56
few 55
sudden 54
clear 54
second 53
full 53
true 52
afraid 50
angry 49
present 49
ready 48
Good 48
terrible 47
late 47
pale 47
different 46
able 45
dead 44
stupid 41
special 40


In [8]:
# Top 50 adverb tokens

adverb_tokens = []
for sentence in taggedtext:
    for word, pos in sentence:
        if pos in ['RB', 'RBR', 'RBS']: # adverb, comparative, superlative
            if len(word)>1:
                adverb_tokens.append(word)
freq_adverb = nltk.FreqDist(adverb_tokens)

for word, freq in freq_adverb.most_common(50):
    print(word,freq)

not 1737
n't 1065
so 666
too 500
now 493
very 436
only 390
again 390
then 348
once 340
even 310
here 289
still 269
just 268
suddenly 262
more 240
away 222
almost 208
there 194
perhaps 162
Well 162
quite 157
never 148
back 141
always 136
well 135
as 134
simply 131
indeed 131
down 131
really 128
up 125
Then 123
soon 102
most 99
already 94
yet 91
long 86
alone 85
rather 82
much 77
far 76
So 76
Here 73
Now 71
certainly 68
together 65
ever 65
sometimes 63
ago 60


In [9]:
## TO DO / YOUR TURN NOW!
## NOUN EXTRACTION
## VERB EXTRACTION
## REMEMBER TO CHECK THE PENN POS TAGS LIST: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
## TO FIND ALL TAGS

In [10]:
# Now we have two lists of POS tags combinations we can compare
# We need to get the sentences back from the tagging exercise and run some stats

# Create a list of original sentences from the ADJECTIVE phrase subset:
adjph_whole_sentences = []

# loop over the sentences in the adjective phrase sentences we created:
for sents in adjph_tags:
    temp=''
    for (word,tag) in sents:
        temp += word+' '
        adjph_whole_sentences.append(temp)
        
print(len(adjph_whole_sentences))

3416


In [11]:
# Create a list of original sentences from the ADVERB phrase subset:
advph_whole_sentences = []

# loop over the sentences in the adjective phrase sentences we created:
for sents in advph_tags:
    temp=''
    for (word,tag) in sents:
        temp += word+' '
        advph_whole_sentences.append(temp)
        
print(len(advph_whole_sentences))

2471


In [12]:
# OPTIONAL STEP: Combine lists together to have a single list of adjective/adverb phrases:
# Useful to know which sentences are heavy in qualifiers

# create a new variable to store all adjective phrase sentences
adv_adj_phrase_sentences = adjph_whole_sentences

# iterate over adverb phrase sentences
for sent in advph_whole_sentences:
    # if a sentence is not in the adjective phrases list imported
    if sent not in adv_adj_phrase_sentences:
        # attach that sentence
        adv_adj_phrase_sentences.append(sent)

# print the lenght of the list (i.e. number of sentences with both adjective and adverb phrases)
print(len(adv_adj_phrase_sentences))


4102


In [13]:
# Following our NLTK textbook, Writing Structural Programs chapter
# section on Procedural vs Declarative style (http://www.nltk.org/book_1ed/ch04.html) 

## CORPUS STATISTICS--SENTENCES LENGTH

# Calculating the average length of sentences in the entire corpus
# from http://www.nltk.org/book_1ed/ch04.html
total_corpus = sum(len(sent) for sent in textsplit) # remember: 'textsplit' is our text split into sentences
print(total_corpus / len(textsplit))

75.85607552808531


In [14]:
# Calculate the average length of an adjective phrase sentence
# We can then compare the average length of the adjective phrases to 
# the average sentences we calculated for all sentences in the corpus
total_adjph_sentences = sum(len(sent) for sent in adjph_whole_sentences) # adjph_whole_sentences stores our adjective phrases
print(total_adjph_sentences / len(adjph_whole_sentences))

10.254266211604095
