# Part 1 & Part 2

In [1]:
import pandas as pd
import re

In [2]:
# Extract the whole document in one string.
with open('hamlet_act1.txt', 'r') as textfile:
  hamlet = textfile.read()

In [3]:
#Regex to find and extract all the eligible lines
all_lines = re.findall("[A-Z][(?<=I) ]?[a-z][A-Za-z,' \n]*[.|?|!]", hamlet)

In [4]:
# Converting all the line to list of lines
clean_text = []
for line in all_lines:
  clean_text.append(line.replace("\n", " "))

In [5]:
print(clean_text)



In [6]:
#Total number of lines extracted
print(len(clean_text))

552


In [7]:
# Prints the list of lines
for line in clean_text:
  print(line)

Enter Barnardo and Francisco, two sentinels.
Who's there?
Nay, answer me.
Stand and unfold yourself.
Long live the King!
Barnardo?
He.
You come most carefully upon your hour.
Tis now struck twelve.
Get thee to bed, Francisco.
For this relief much thanks.
Tis bitter cold, And I am sick at heart.
Have you had quiet guard?
Not a mouse stirring.
Well, good night.
If you do meet Horatio and Marcellus, The rivals of my watch, bid them make haste.
Enter Horatio and Marcellus.
I think I hear them.
Stand ho!
Who is there?
Friends to this ground.
And liegemen to the Dane.
Give you good night.
O farewell, honest soldier.
Who hath relieved you?
Barnardo hath my place.
Give you good night.
Francisco exits.
Holla, Barnardo.
Say, what, is Horatio there?
A piece of him.
Welcome, Horatio.
Welcome, good Marcellus.
What, has this thing appeared again tonight?
I have seen nothing.
Horatio says 'tis but our fantasy And will not let belief take hold of him Touching this dreaded sight twice seen of us.
There

# Part 3

In [8]:
#importing all the  nltk libraries
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [9]:
# Creates main dataframe to concatenate all the df of each line
main_bigram_freq_df = pd.DataFrame(columns=['bigram','freq'])

#for loop to extract each line, generate bigrams and its frequency, convert to dataframe, and add to the main dataframe.
for line in clean_text:
  line = re.sub("[^A-Za-z]", " ", line)
  tokens = nltk.word_tokenize(line.lower())
  bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(tokens)
  bigram_freq = bigramFinder.ngram_fd.items()
  bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['bigram','freq']).sort_values(by='freq', ascending=False)
  main_bigram_freq_df = pd.concat([main_bigram_freq_df, bigramFreqTable])

#gets the frequency of every unique bigram.
main_bigram_freq_df = main_bigram_freq_df.groupby(['bigram']).sum().sort_values(by='freq', ascending=False).reset_index()
print(main_bigram_freq_df.head(30))
print(main_bigram_freq_df.shape)

           bigram  freq
0      (my, lord)    40
1       (in, the)    20
2       (of, the)    13
3        (it, is)    12
4       (to, the)    12
5       (do, not)    10
6      (of, this)    10
7     (the, king)     9
8         (i, am)     9
9       (to, you)     9
10    (speak, to)     9
11        (i, do)     7
12      (i, will)     7
13       (of, my)     7
14     (what, is)     7
15      (i, have)     6
16    (father, s)     6
17     (and, the)     6
18        (is, t)     6
19     (you, are)     6
20       (to, my)     6
21  (of, denmark)     6
22    (think, it)     6
23       (to, me)     5
24   (my, father)     5
25       (if, it)     5
26    (will, not)     5
27    (speak, of)     5
28      (is, not)     5
29   (shall, not)     5
(4040, 2)


In [16]:
#method to extract all the (Noun, Verb) bigrams
def rightTypes(ngram):
    if 's' in ngram:
        return False
    acceptable_types = ('NN', 'NNS', 'NNP', 'NNPS', 'PRP', 'PRP$', 'WP', 'WP$')
    second_type = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in acceptable_types and tags[1][1] in second_type:
        return True
    else:
        return False

# VB	verb, base form	take
# VBD	verb, past tense	took
# VBG	verb, gerund/present participle	taking
# VBN	verb, past participle	taken
# VBP	verb, sing. present, non-3d	take
# VBZ	verb, 3rd person sing. present	takes
# NN	noun, singular 'desk'
# NNS	noun plural	'desks'
# NNP	proper noun, singular	'Harrison'
# NNPS	proper noun, plural	'Americans'

In [17]:
filtered_bi = main_bigram_freq_df[main_bigram_freq_df.bigram.map(lambda x: rightTypes(x))]
print(filtered_bi.shape)

# Top ten most frequent (Noun, Verb) bigrams.
print(filtered_bi.head(10)) 

(299, 2)
          bigram  freq
3       (it, is)    12
8        (i, am)     9
11       (i, do)     7
14    (what, is)     7
15     (i, have)     6
19    (you, are)     6
31    (i, think)     5
43   (you, have)     4
58  (they, exit)     4
86     (it, was)     3


In [18]:
#Converts the bigram df to text file without the header
filtered_bi.to_csv('result.txt', header = None, sep='\t', index=False)