In [101]:
import glob
import os
import string
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize.casual import TweetTokenizer, casual_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from collections import Counter
from nltk.util import ngrams

import nltk

# Fetch the NLTK resources used below: the 'punkt' tokenizer models (needed by
# word_tokenize) and the 'stopwords' corpus.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

import warnings

# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mukes\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mukes\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Corpus 1

In [102]:
# Directory holding the Corpus 1 texts, relative to the current working directory.
corpus1_dir = 'Corpus1'

# Relative path of every .txt file in the corpus directory.
# os.listdir() returns names in arbitrary order, so they are sorted to keep the
# file order (and everything derived from it, e.g. bigrams that span a file
# boundary or the ordering of equal counts) reproducible across platforms.
text_files = [
    os.path.join(corpus1_dir, filename)
    for filename in sorted(os.listdir(corpus1_dir))
    if filename.endswith('.txt')
]

print(text_files)

['Corpus1\\pg11774.txt', 'Corpus1\\pg16320.txt', 'Corpus1\\pg17374.txt', 'Corpus1\\pg1903.txt', 'Corpus1\\pg26841.txt', 'Corpus1\\pg27957.txt', 'Corpus1\\pg28409.txt', 'Corpus1\\pg29364.txt', 'Corpus1\\pg29499.txt', 'Corpus1\\pg33331.txt', 'Corpus1\\pg34463.txt', 'Corpus1\\pg34823.txt', 'Corpus1\\pg4359.txt', 'Corpus1\\pg44052.txt', 'Corpus1\\pg44274.txt', 'Corpus1\\pg46499.txt', 'Corpus1\\pg52460.txt', 'Corpus1\\pg55099.txt', 'Corpus1\\pg59042.txt', 'Corpus1\\pg60029.txt', 'Corpus1\\pg6716.txt', 'Corpus1\\pg68369.txt', 'Corpus1\\pg70377.txt', 'Corpus1\\pg88.txt']


In [103]:
def tokenize_words(file_paths=None):
    """Read text files and return their lower-cased, purely alphanumeric tokens.

    Parameters
    ----------
    file_paths : iterable of str, optional
        Paths of the text files to read (decoded as UTF-8). When omitted, the
        module-level ``text_files`` list is used, which is what a call with no
        arguments has always done. Passing the paths explicitly makes the
        result independent of whichever corpus ``text_files`` currently holds.

    Returns
    -------
    list of str
        Tokens of all files concatenated in the order the paths were given.
        Tokens for which ``str.isalnum()`` is false (punctuation, tokens
        containing apostrophes or hyphens, ...) are dropped; the rest are
        lower-cased.
    """
    if file_paths is None:
        # Backward-compatible default: fall back to the notebook-level list.
        file_paths = text_files

    lst = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        # The file is fully read at this point, so it can be closed before tokenizing.
        lst.extend(word.lower() for word in word_tokenize(text) if word.isalnum())
    return lst

In [104]:
def most_common_words(lst):
    """Tally how often each token occurs in ``lst``.

    Returns a ``collections.Counter`` mapping token -> number of occurrences;
    call ``.most_common(n)`` on it to obtain the n most frequent tokens.
    """
    return Counter(lst)

def most_common_bigrams(lst):
    """Tally how often each pair of adjacent tokens (bigram) occurs in ``lst``.

    Returns a ``collections.Counter`` keyed by 2-tuples ``(first, second)``;
    call ``.most_common(n)`` on it to obtain the n most frequent bigrams.
    """
    # Counter consumes the ngrams() iterable directly.
    return Counter(ngrams(lst, 2))

In [105]:
# Tokenize every file currently listed in text_files (expected to be the
# Corpus 1 paths at this point in the notebook).
tokenized_words_corpus1 = tokenize_words()

# Count all tokens, then keep the 30 most frequent (token, count) pairs.
word_counts = most_common_words(tokenized_words_corpus1)
top_30_words = word_counts.most_common(30)

# One (token, count) tuple per line.
print("30 Most Common Words in Corpus 1:")
for token, count in top_30_words:
    print((token, count))

30 Most Common Words in Corpus 1:
('the', 99996)
('of', 64773)
('and', 38888)
('to', 35795)
('in', 32331)
('a', 27181)
('that', 16991)
('is', 16736)
('it', 13464)
('for', 12513)
('as', 11396)
('be', 10413)
('was', 9556)
('by', 9094)
('this', 8814)
('on', 8691)
('or', 8154)
('with', 8104)
('not', 7811)
('which', 7709)
('are', 7393)
('at', 7275)
('he', 6846)
('have', 6258)
('i', 5677)
('from', 5599)
('money', 5484)
('his', 5436)
('but', 5303)
('all', 5244)


In [106]:
# The 100 most frequent (token, count) pairs of Corpus 1.
word_counts = most_common_words(tokenized_words_corpus1)
top_100_words = word_counts.most_common(100)

# One (token, count) tuple per line.
print("100 Most Common Words in Corpus 1:")
for pair in top_100_words:
    print(pair)

100 Most Common Words in Corpus 1:
('the', 99996)
('of', 64773)
('and', 38888)
('to', 35795)
('in', 32331)
('a', 27181)
('that', 16991)
('is', 16736)
('it', 13464)
('for', 12513)
('as', 11396)
('be', 10413)
('was', 9556)
('by', 9094)
('this', 8814)
('on', 8691)
('or', 8154)
('with', 8104)
('not', 7811)
('which', 7709)
('are', 7393)
('at', 7275)
('he', 6846)
('have', 6258)
('i', 5677)
('from', 5599)
('money', 5484)
('his', 5436)
('but', 5303)
('all', 5244)
('they', 4806)
('you', 4519)
('their', 4256)
('had', 4190)
('would', 4190)
('has', 4166)
('an', 4148)
('we', 4139)
('will', 3891)
('bank', 3803)
('one', 3796)
('if', 3778)
('any', 3712)
('were', 3699)
('its', 3683)
('4', 3674)
('been', 3647)
('other', 3481)
('there', 3314)
('no', 3310)
('so', 3186)
('new', 3071)
('value', 2975)
('may', 2944)
('est', 2881)
('can', 2808)
('than', 2797)
('our', 2789)
('who', 2784)
('these', 2773)
('when', 2747)
('more', 2716)
('gold', 2711)
('stock', 2643)
('them', 2483)
('only', 2357)
('great', 2344)
('

In [107]:
# The 10 most frequent (token, count) pairs of Corpus 1.
top_10_words = most_common_words(tokenized_words_corpus1).most_common(10)

# One (token, count) tuple per line.
print("10 Most Common Words in Corpus 1:")
for token, count in top_10_words:
    print((token, count))

10 Most Common Words in Corpus 1:
('the', 99996)
('of', 64773)
('and', 38888)
('to', 35795)
('in', 32331)
('a', 27181)
('that', 16991)
('is', 16736)
('it', 13464)
('for', 12513)


### Q2 Repeat exercise 1 above but now do it for n-grams with n = 2 (bi-grams).

In [108]:
# The 30 most frequent bigrams (n-grams with n = 2) of Corpus 1.
# Note: despite its name, top_30_words holds ((first, second), count) pairs here.
bigram_counts = most_common_bigrams(tokenized_words_corpus1)
top_30_words = bigram_counts.most_common(30)

# One (bigram, count) tuple per line.
print("30 Most Common bi-grams in Corpus 1:")
for bigram, count in top_30_words:
    print((bigram, count))

30 Most Common bi-grams in Corpus 1:
(('of', 'the'), 17240)
(('in', 'the'), 8638)
(('to', 'the'), 5524)
(('and', 'the'), 3683)
(('on', 'the'), 3462)
(('for', 'the'), 3022)
(('it', 'is'), 2975)
(('that', 'the'), 2877)
(('4', 'est'), 2877)
(('to', 'be'), 2632)
(('by', 'the'), 2620)
(('with', 'the'), 2285)
(('of', 'a'), 2127)
(('at', 'the'), 1924)
(('new', 'york'), 1752)
(('from', 'the'), 1648)
(('of', 'this'), 1599)
(('united', 'states'), 1571)
(('in', 'a'), 1492)
(('the', 'bank'), 1424)
(('of', 'money'), 1394)
(('is', 'a'), 1393)
(('as', 'a'), 1391)
(('the', 'same'), 1383)
(('it', 'was'), 1309)
(('as', 'the'), 1279)
(('the', 'united'), 1275)
(('is', 'the'), 1227)
(('all', 'the'), 1209)
(('have', 'been'), 1148)


In [109]:
# The 100 most frequent bigrams (n-grams with n = 2) of Corpus 1.
# Note: despite its name, top_100_words holds ((first, second), count) pairs here.
bigram_counts = most_common_bigrams(tokenized_words_corpus1)
top_100_words = bigram_counts.most_common(100)

# One (bigram, count) tuple per line.
print("100 Most Common bi-grams in Corpus 1:")
for pair in top_100_words:
    print(pair)

100 Most Common bi-grams in Corpus 1:
(('of', 'the'), 17240)
(('in', 'the'), 8638)
(('to', 'the'), 5524)
(('and', 'the'), 3683)
(('on', 'the'), 3462)
(('for', 'the'), 3022)
(('it', 'is'), 2975)
(('that', 'the'), 2877)
(('4', 'est'), 2877)
(('to', 'be'), 2632)
(('by', 'the'), 2620)
(('with', 'the'), 2285)
(('of', 'a'), 2127)
(('at', 'the'), 1924)
(('new', 'york'), 1752)
(('from', 'the'), 1648)
(('of', 'this'), 1599)
(('united', 'states'), 1571)
(('in', 'a'), 1492)
(('the', 'bank'), 1424)
(('of', 'money'), 1394)
(('is', 'a'), 1393)
(('as', 'a'), 1391)
(('the', 'same'), 1383)
(('it', 'was'), 1309)
(('as', 'the'), 1279)
(('the', 'united'), 1275)
(('is', 'the'), 1227)
(('all', 'the'), 1209)
(('have', 'been'), 1148)
(('value', 'of'), 1141)
(('per', 'cent'), 1084)
(('would', 'be'), 1077)
(('there', 'is'), 1064)
(('the', 'stock'), 1062)
(('has', 'been'), 1045)
(('in', 'this'), 1039)
(('may', 'be'), 1025)
(('will', 'be'), 994)
(('to', 'a'), 899)
(('the', 'country'), 878)
(('is', 'not'), 875)
((

In [110]:
# The 10 most frequent bigrams (n-grams with n = 2) of Corpus 1.
# Note: despite its name, top_10_words holds ((first, second), count) pairs here.
top_10_words = most_common_bigrams(tokenized_words_corpus1).most_common(10)

# One (bigram, count) tuple per line.
print("10 Most Common bi-grams in Corpus 1:")
for bigram, count in top_10_words:
    print((bigram, count))

10 Most Common bi-grams in Corpus 1:
(('of', 'the'), 17240)
(('in', 'the'), 8638)
(('to', 'the'), 5524)
(('and', 'the'), 3683)
(('on', 'the'), 3462)
(('for', 'the'), 3022)
(('it', 'is'), 2975)
(('that', 'the'), 2877)
(('4', 'est'), 2877)
(('to', 'be'), 2632)


### Q3 Find a stop word list. Then remove all words in that stop list and repeat exercise 1 and 2.

In [111]:
# NLTK's English stop-word list, held as a set for fast membership tests.
english_stop_words = nltk.corpus.stopwords.words('english')
stop_words = set(english_stop_words)

# Corpus 1 tokens with every stop word removed (original order preserved).
# NOTE: not_stop_words_list is rebound for Corpus 2 further down the notebook,
# so re-run this cell before re-running the Corpus 1 cells that follow it.
not_stop_words_list = [
    token for token in tokenized_words_corpus1 if token not in stop_words
]

def stop_word_common_words(not_stop_words_list):
    """Tally token frequencies of a token list (intended: stop words already removed).

    Returns a ``collections.Counter`` mapping token -> number of occurrences.
    """
    return Counter(not_stop_words_list)

def stop_word_common_bigrams(not_stop_words_list):
    """Tally adjacent-token pairs of a token list (intended: stop words already removed).

    Pairs are formed between neighbours in the given list, so two tokens that
    were separated only by removed stop words count as a bigram.
    Returns a ``collections.Counter`` keyed by 2-tuples ``(first, second)``.
    """
    return Counter(ngrams(not_stop_words_list, 2))

In [112]:
# Every stop-word occurrence in Corpus 1 (duplicates kept) ...
stop_words_list = [token for token in tokenized_words_corpus1 if token in stop_words]

# ... reduced to the distinct stop words, printed together with their number.
distinct_stop_words = set(stop_words_list)
print("List of all the stopwords in corpus1", distinct_stop_words)
print(len(distinct_stop_words))

List of all the stopwords in corpus1 {'about', 'these', 'while', 'd', 'hasn', 'between', 'by', 'ourselves', 'you', 'own', 'isn', 'was', 'yourselves', 'other', 'their', 'our', 'why', 'for', 'further', 'so', 'during', 'before', 'myself', 'herself', 'which', 'my', 'or', 'had', 've', 'having', 'this', 'be', 'whom', 'each', 'm', 'yourself', 'more', 'have', 'out', 'on', 'haven', 'too', 'over', 'its', 'in', 'any', 'wouldn', 'wasn', 'just', 'been', 'has', 'her', 'y', 'now', 'most', 'itself', 'as', 'once', 'won', 'your', 'a', 'few', 'does', 'above', 'yours', 'they', 'who', 'the', 'both', 'only', 'to', 'should', 'i', 'them', 'until', 'when', 'what', 'some', 'his', 'being', 'couldn', 'll', 'with', 'such', 'all', 'theirs', 'can', 'he', 'is', 'under', 'don', 'after', 'here', 'than', 'did', 'o', 't', 'an', 'from', 'himself', 'doing', 'of', 'themselves', 'am', 'because', 'where', 'nor', 'against', 'him', 'then', 'up', 'will', 'same', 's', 'but', 'do', 'those', 'it', 're', 'are', 'there', 'if', 'very'

In [113]:
# The 30 most frequent (token, count) pairs once stop words are removed.
filtered_counts = stop_word_common_words(not_stop_words_list)
top_30_words = filtered_counts.most_common(30)

# One (token, count) tuple per line.
print("30 Most Common Words without stopwords in Corpus 1:")
for token, count in top_30_words:
    print((token, count))

30 Most Common Words without stopwords in Corpus 1:
('money', 5484)
('would', 4190)
('bank', 3803)
('one', 3796)
('4', 3674)
('new', 3071)
('value', 2975)
('may', 2944)
('est', 2881)
('gold', 2711)
('stock', 2643)
('great', 2344)
('business', 2319)
('time', 2282)
('banks', 2204)
('exchange', 2197)
('project', 2169)
('states', 2127)
('made', 2061)
('country', 1940)
('upon', 1931)
('market', 1916)
('york', 1768)
('work', 1753)
('per', 1729)
('much', 1697)
('credit', 1672)
('united', 1650)
('years', 1560)
('must', 1480)


In [114]:
# The 10 most frequent (token, count) pairs once stop words are removed.
top_10_words = stop_word_common_words(not_stop_words_list).most_common(10)

# One (token, count) tuple per line.
print("10 Most Common Words without stopwords in Corpus 1:")
for pair in top_10_words:
    print(pair)

10 Most Common Words without stopwords in Corpus 1:
('money', 5484)
('would', 4190)
('bank', 3803)
('one', 3796)
('4', 3674)
('new', 3071)
('value', 2975)
('may', 2944)
('est', 2881)
('gold', 2711)


In [115]:
# The 100 most frequent (token, count) pairs once stop words are removed.
filtered_counts = stop_word_common_words(not_stop_words_list)
top_100_words = filtered_counts.most_common(100)

# One (token, count) tuple per line.
print("100 Most Common Words without stopwords in Corpus 1:")
for token, count in top_100_words:
    print((token, count))

100 Most Common Words without stopwords in Corpus 1:
('money', 5484)
('would', 4190)
('bank', 3803)
('one', 3796)
('4', 3674)
('new', 3071)
('value', 2975)
('may', 2944)
('est', 2881)
('gold', 2711)
('stock', 2643)
('great', 2344)
('business', 2319)
('time', 2282)
('banks', 2204)
('exchange', 2197)
('project', 2169)
('states', 2127)
('made', 2061)
('country', 1940)
('upon', 1931)
('market', 1916)
('york', 1768)
('work', 1753)
('per', 1729)
('much', 1697)
('credit', 1672)
('united', 1650)
('years', 1560)
('must', 1480)
('state', 1406)
('many', 1400)
('prices', 1363)
('make', 1349)
('every', 1345)
('company', 1332)
('year', 1328)
('public', 1327)
('large', 1323)
('could', 1311)
('amount', 1305)
('two', 1295)
('general', 1295)
('cent', 1287)
('first', 1271)
('theory', 1271)
('interest', 1263)
('good', 1255)
('capital', 1241)
('silver', 1223)
('government', 1175)
('part', 1155)
('street', 1151)
('reserve', 1144)
('price', 1126)
('men', 1125)
('without', 1104)
('trade', 1103)
('also', 1087)

In [116]:
# The 30 most frequent bigrams of the stop-word-free token list.
# Note: despite its name, top_30_words holds ((first, second), count) pairs here.
filtered_bigram_counts = stop_word_common_bigrams(not_stop_words_list)
top_30_words = filtered_bigram_counts.most_common(30)

# One (bigram, count) tuple per line.
print("30 Most Common bigrams without stopwords in Corpus 1:")
for bigram, count in top_30_words:
    print((bigram, count))

30 Most Common bigrams without stopwords in Corpus 1:
(('4', 'est'), 2877)
(('new', 'york'), 1752)
(('united', 'states'), 1571)
(('per', 'cent'), 1084)
(('stock', 'exchange'), 836)
(('wall', 'street'), 762)
(('project', 'gutenberg'), 754)
(('bank', 'england'), 473)
(('value', 'money'), 464)
(('project', 'electronic'), 432)
(('clearing', 'house'), 392)
(('electronic', 'works'), 384)
(('quantity', 'theory'), 347)
(('federal', 'reserve'), 326)
(('gutenberg', 'literary'), 312)
(('literary', 'archive'), 312)
(('archive', 'foundation'), 300)
(('trust', 'company'), 272)
(('electronic', 'work'), 264)
(('gold', 'silver'), 256)
(('national', 'bank'), 242)
(('set', 'forth'), 236)
(('bank', 'notes'), 223)
(('terms', 'agreement'), 217)
(('san', 'francisco'), 207)
(('money', 'market'), 201)
(('legal', 'tender'), 200)
(('paper', 'money'), 196)
(('project', 'license'), 192)
(('national', 'banks'), 186)


In [117]:
# The 100 most frequent bigrams of the stop-word-free token list.
# Note: despite its name, top_100_words holds ((first, second), count) pairs here.
top_100_words = stop_word_common_bigrams(not_stop_words_list).most_common(100)

# One (bigram, count) tuple per line.
print("100 Most Common bigrams without stopwords in Corpus 1:")
for pair in top_100_words:
    print(pair)

100 Most Common bigrams without stopwords in Corpus 1:
(('4', 'est'), 2877)
(('new', 'york'), 1752)
(('united', 'states'), 1571)
(('per', 'cent'), 1084)
(('stock', 'exchange'), 836)
(('wall', 'street'), 762)
(('project', 'gutenberg'), 754)
(('bank', 'england'), 473)
(('value', 'money'), 464)
(('project', 'electronic'), 432)
(('clearing', 'house'), 392)
(('electronic', 'works'), 384)
(('quantity', 'theory'), 347)
(('federal', 'reserve'), 326)
(('gutenberg', 'literary'), 312)
(('literary', 'archive'), 312)
(('archive', 'foundation'), 300)
(('trust', 'company'), 272)
(('electronic', 'work'), 264)
(('gold', 'silver'), 256)
(('national', 'bank'), 242)
(('set', 'forth'), 236)
(('bank', 'notes'), 223)
(('terms', 'agreement'), 217)
(('san', 'francisco'), 207)
(('money', 'market'), 201)
(('legal', 'tender'), 200)
(('paper', 'money'), 196)
(('project', 'license'), 192)
(('national', 'banks'), 186)
(('reserve', 'bank'), 177)
(('rate', 'interest'), 174)
(('full', 'project'), 168)
(('million', 'dol

In [118]:
# The 10 most frequent bigrams of the stop-word-free token list.
# Note: despite its name, top_10_words holds ((first, second), count) pairs here.
filtered_bigram_counts = stop_word_common_bigrams(not_stop_words_list)
top_10_words = filtered_bigram_counts.most_common(10)

# One (bigram, count) tuple per line.
print("10 Most Common bigrams without stopwords in Corpus 1:")
for bigram, count in top_10_words:
    print((bigram, count))

10 Most Common bigrams without stopwords in Corpus 1:
(('4', 'est'), 2877)
(('new', 'york'), 1752)
(('united', 'states'), 1571)
(('per', 'cent'), 1084)
(('stock', 'exchange'), 836)
(('wall', 'street'), 762)
(('project', 'gutenberg'), 754)
(('bank', 'england'), 473)
(('value', 'money'), 464)
(('project', 'electronic'), 432)


### Q4) You will need to preprocess the documents

In [119]:
# The POS tagger model and pos_tag are fetched/imported here but not used in
# this cell; they are kept in case later cells rely on them.
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag
import re

# English stop words as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Starting point: the Corpus 1 tokens, which tokenize_words() already
# lower-cased and restricted to alphanumeric tokens (punctuation is gone).
# Stop words and tokens of one or two characters are dropped in a single pass.
#
# The earlier re.sub(r'\[.*?\]', '', word) pass was removed: tokens that passed
# str.isalnum() never contain '[' or ']', so the substitution could not match
# anything and the result is unchanged.
#
# NOTE(review): Project Gutenberg licence boilerplate is NOT removed by this
# step (tokens such as 'project' still rank among the most frequent); stripping
# it would have to happen on the raw text before tokenization.
preprocessed_text = [
    word
    for word in tokenized_words_corpus1
    if word not in stop_words and len(word) > 2
]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mukes\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [120]:
# The 30 most frequent (token, count) pairs after preprocessing; the bare
# expression on the last line makes the notebook display the list.
preprocessed_counts = most_common_words(preprocessed_text)
top_30_words = preprocessed_counts.most_common(30)
top_30_words

[('money', 5484),
 ('would', 4190),
 ('bank', 3803),
 ('one', 3796),
 ('new', 3071),
 ('value', 2975),
 ('may', 2944),
 ('est', 2881),
 ('gold', 2711),
 ('stock', 2643),
 ('great', 2344),
 ('business', 2319),
 ('time', 2282),
 ('banks', 2204),
 ('exchange', 2197),
 ('project', 2169),
 ('states', 2127),
 ('made', 2061),
 ('country', 1940),
 ('upon', 1931),
 ('market', 1916),
 ('york', 1768),
 ('work', 1753),
 ('per', 1729),
 ('much', 1697),
 ('credit', 1672),
 ('united', 1650),
 ('years', 1560),
 ('must', 1480),
 ('state', 1406)]

# Corpus 2

In [122]:
# Directory holding the Corpus 2 texts, relative to the current working directory.
# (Previously stored under the misleading name corpus1_dir.)
corpus2_dir = 'Corpus2'

# Relative path of every .txt file in the corpus directory.
# os.listdir() returns names in arbitrary order, so they are sorted to keep the
# file order reproducible across platforms.
# NOTE: this rebinds text_files, so a tokenize_words() call without arguments
# reads Corpus 2 from here on.
text_files = [
    os.path.join(corpus2_dir, filename)
    for filename in sorted(os.listdir(corpus2_dir))
    if filename.endswith('.txt')
]

print(text_files)

['Corpus2\\1322-0.txt', 'Corpus2\\19002-0.txt', 'Corpus2\\2413-0.txt', 'Corpus2\\2638-0.txt', 'Corpus2\\29019-0.txt', 'Corpus2\\35-0.txt', 'Corpus2\\5200-0.txt', 'Corpus2\\pg1399.txt', 'Corpus2\\pg145.txt', 'Corpus2\\pg14838.txt', 'Corpus2\\pg1513.txt', 'Corpus2\\pg164.txt', 'Corpus2\\pg20796.txt', 'Corpus2\\pg24104.txt', 'Corpus2\\pg25344.txt', 'Corpus2\\pg32154.txt', 'Corpus2\\pg345.txt', 'Corpus2\\pg35997.txt', 'Corpus2\\pg42671.txt', 'Corpus2\\pg552.txt', 'Corpus2\\pg67979.txt', 'Corpus2\\pg76.txt', 'Corpus2\\pg768.txt', 'Corpus2\\pg8655.txt']


In [123]:
def tokenize_words(file_paths=None):
    """Read text files and return their lower-cased, purely alphanumeric tokens.

    Parameters
    ----------
    file_paths : iterable of str, optional
        Paths of the text files to read (decoded as UTF-8). When omitted, the
        module-level ``text_files`` list is used, which is what a call with no
        arguments has always done. Passing the paths explicitly makes the
        result independent of whichever corpus ``text_files`` currently holds.

    Returns
    -------
    list of str
        Tokens of all files concatenated in the order the paths were given.
        Tokens for which ``str.isalnum()`` is false are dropped; the rest are
        lower-cased.
    """
    if file_paths is None:
        # Backward-compatible default: fall back to the notebook-level list.
        file_paths = text_files

    tokens = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        # The file is fully read at this point, so it can be closed before tokenizing.
        tokens.extend(word.lower() for word in word_tokenize(text) if word.isalnum())
    return tokens

In [124]:
def most_common_words(lst):
    """Return a ``collections.Counter`` with the frequency of every token in ``lst``.

    Use ``.most_common(n)`` on the result for the n most frequent tokens.
    """
    frequencies = Counter(lst)
    return frequencies

def most_common_bigrams(lst):
    """Return a ``collections.Counter`` with the frequency of every bigram in ``lst``.

    A bigram is a 2-tuple of adjacent tokens; use ``.most_common(n)`` on the
    result for the n most frequent bigrams.
    """
    pair_frequencies = Counter(ngrams(lst, 2))
    return pair_frequencies

In [125]:
# Tokenize every file currently listed in text_files (expected to be the
# Corpus 2 paths at this point in the notebook).
tokenized_words_corpus2 = tokenize_words()

# Count all tokens, then keep the 30 most frequent (token, count) pairs.
word_counts = most_common_words(tokenized_words_corpus2)
top_30_words = word_counts.most_common(30)

# One (token, count) tuple per line.
print("30 Most Common Words in Corpus 2:")
for token, count in top_30_words:
    print((token, count))

30 Most Common Words in Corpus 2:
('the', 126172)
('and', 84471)
('to', 65979)
('of', 64658)
('a', 47553)
('i', 44511)
('in', 37030)
('he', 35860)
('that', 30940)
('was', 29563)
('it', 28776)
('you', 25082)
('his', 24536)
('her', 22643)
('with', 21929)
('not', 19767)
('she', 19523)
('had', 19369)
('for', 18796)
('as', 17429)
('but', 16564)
('at', 16094)
('on', 15108)
('him', 14903)
('is', 14599)
('be', 13389)
('s', 12931)
('my', 12326)
('me', 12304)
('all', 12291)


In [126]:
# The 100 most frequent (token, count) pairs of Corpus 2.
word_counts = most_common_words(tokenized_words_corpus2)
top_100_words = word_counts.most_common(100)

# One (token, count) tuple per line.
print("100 Most Common Words in Corpus 2:")
for pair in top_100_words:
    print(pair)

100 Most Common Words in Corpus 2:
('the', 126172)
('and', 84471)
('to', 65979)
('of', 64658)
('a', 47553)
('i', 44511)
('in', 37030)
('he', 35860)
('that', 30940)
('was', 29563)
('it', 28776)
('you', 25082)
('his', 24536)
('her', 22643)
('with', 21929)
('not', 19767)
('she', 19523)
('had', 19369)
('for', 18796)
('as', 17429)
('but', 16564)
('at', 16094)
('on', 15108)
('him', 14903)
('is', 14599)
('be', 13389)
('s', 12931)
('my', 12326)
('me', 12304)
('all', 12291)
('said', 11859)
('have', 11313)
('this', 11006)
('so', 9907)
('they', 9290)
('by', 9153)
('from', 9115)
('what', 8587)
('or', 8052)
('which', 7989)
('there', 7974)
('we', 7907)
('no', 7724)
('would', 7497)
('were', 7410)
('one', 7392)
('if', 7126)
('when', 6949)
('t', 6857)
('up', 6712)
('out', 6701)
('them', 6341)
('are', 6275)
('an', 6166)
('do', 5940)
('then', 5868)
('could', 5846)
('been', 5801)
('will', 5598)
('who', 5304)
('more', 4941)
('now', 4924)
('your', 4902)
('their', 4750)
('about', 4729)
('can', 4681)
('did', 

In [127]:
# The 10 most frequent (token, count) pairs of Corpus 2.
top_10_words = most_common_words(tokenized_words_corpus2).most_common(10)

# One (token, count) tuple per line.
print("10 Most Common Words in Corpus 2:")
for token, count in top_10_words:
    print((token, count))

10 Most Common Words in Corpus 2:
('the', 126172)
('and', 84471)
('to', 65979)
('of', 64658)
('a', 47553)
('i', 44511)
('in', 37030)
('he', 35860)
('that', 30940)
('was', 29563)


### Q2 Repeat exercise 1 above but now do it for n-grams with n = 2 (bi-grams).

In [128]:
# The 30 most frequent bigrams (n-grams with n = 2) of Corpus 2.
# Note: despite its name, top_30_words holds ((first, second), count) pairs here.
bigram_counts = most_common_bigrams(tokenized_words_corpus2)
top_30_words = bigram_counts.most_common(30)

# One (bigram, count) tuple per line.
print("30 Most Common bi-grams in Corpus 2:")
for bigram, count in top_30_words:
    print((bigram, count))

30 Most Common bi-grams in Corpus 2:
(('of', 'the'), 14411)
(('in', 'the'), 9968)
(('to', 'the'), 6803)
(('and', 'the'), 4995)
(('on', 'the'), 4725)
(('it', 'was'), 4239)
(('to', 'be'), 4083)
(('at', 'the'), 3926)
(('he', 'had'), 3820)
(('he', 'was'), 3615)
(('with', 'the'), 3101)
(('and', 'i'), 2990)
(('in', 'a'), 2984)
(('of', 'his'), 2857)
(('of', 'a'), 2779)
(('that', 'he'), 2760)
(('for', 'the'), 2742)
(('it', 'is'), 2655)
(('with', 'a'), 2626)
(('i', 'am'), 2585)
(('from', 'the'), 2571)
(('had', 'been'), 2457)
(('did', 'not'), 2346)
(('i', 'have'), 2318)
(('by', 'the'), 2243)
(('all', 'the'), 2177)
(('was', 'a'), 2164)
(('don', 't'), 2157)
(('and', 'he'), 2122)
(('she', 'was'), 2108)


In [129]:
# The 100 most frequent bigrams (n-grams with n = 2) of Corpus 2.
# Note: despite its name, top_100_words holds ((first, second), count) pairs here.
bigram_counts = most_common_bigrams(tokenized_words_corpus2)
top_100_words = bigram_counts.most_common(100)

# One (bigram, count) tuple per line.
print("100 Most Common bi-grams in Corpus 2:")
for pair in top_100_words:
    print(pair)

100 Most Common bi-grams in Corpus 2:
(('of', 'the'), 14411)
(('in', 'the'), 9968)
(('to', 'the'), 6803)
(('and', 'the'), 4995)
(('on', 'the'), 4725)
(('it', 'was'), 4239)
(('to', 'be'), 4083)
(('at', 'the'), 3926)
(('he', 'had'), 3820)
(('he', 'was'), 3615)
(('with', 'the'), 3101)
(('and', 'i'), 2990)
(('in', 'a'), 2984)
(('of', 'his'), 2857)
(('of', 'a'), 2779)
(('that', 'he'), 2760)
(('for', 'the'), 2742)
(('it', 'is'), 2655)
(('with', 'a'), 2626)
(('i', 'am'), 2585)
(('from', 'the'), 2571)
(('had', 'been'), 2457)
(('did', 'not'), 2346)
(('i', 'have'), 2318)
(('by', 'the'), 2243)
(('all', 'the'), 2177)
(('was', 'a'), 2164)
(('don', 't'), 2157)
(('and', 'he'), 2122)
(('she', 'was'), 2108)
(('she', 'had'), 2099)
(('in', 'his'), 2085)
(('that', 'i'), 2008)
(('of', 'her'), 1983)
(('to', 'her'), 1959)
(('out', 'of'), 1912)
(('there', 'was'), 1888)
(('i', 'was'), 1887)
(('that', 'the'), 1805)
(('he', 'said'), 1776)
(('to', 'him'), 1768)
(('into', 'the'), 1718)
(('could', 'not'), 1707)
(('

In [130]:
# The 10 most frequent bigrams (n-grams with n = 2) of Corpus 2.
# Note: despite its name, top_10_words holds ((first, second), count) pairs here.
top_10_words = most_common_bigrams(tokenized_words_corpus2).most_common(10)

# One (bigram, count) tuple per line.
print("10 Most Common bi-grams in Corpus 2:")
for bigram, count in top_10_words:
    print((bigram, count))

10 Most Common bi-grams in Corpus 2:
(('of', 'the'), 14411)
(('in', 'the'), 9968)
(('to', 'the'), 6803)
(('and', 'the'), 4995)
(('on', 'the'), 4725)
(('it', 'was'), 4239)
(('to', 'be'), 4083)
(('at', 'the'), 3926)
(('he', 'had'), 3820)
(('he', 'was'), 3615)


### Q3 Find a stop word list. Then remove all words in that stop list and repeat exercise 1 and 2.

In [131]:
# NLTK's English stop-word list, held as a set for fast membership tests.
english_stop_words = nltk.corpus.stopwords.words('english')
stop_words = set(english_stop_words)

# Corpus 2 tokens with every stop word removed (original order preserved).
# NOTE: this rebinds not_stop_words_list, which held the Corpus 1 tokens before.
not_stop_words_list = [
    token for token in tokenized_words_corpus2 if token not in stop_words
]

def stop_word_common_words(not_stop_words_list):
    """Return a ``collections.Counter`` of token frequencies for the given list.

    The list is intended to have had its stop words removed already; the
    function itself performs no filtering.
    """
    frequencies = Counter(not_stop_words_list)
    return frequencies

def stop_word_common_bigrams(not_stop_words_list):
    """Return a ``collections.Counter`` of bigram frequencies for the given list.

    The list is intended to have had its stop words removed already, so tokens
    that were separated only by stop words are paired as neighbours. Keys are
    2-tuples ``(first, second)``.
    """
    pair_frequencies = Counter(ngrams(not_stop_words_list, 2))
    return pair_frequencies

In [132]:
# Every stop-word occurrence in Corpus 2 (duplicates kept) ...
stop_words_list = [token for token in tokenized_words_corpus2 if token in stop_words]

# ... reduced to the distinct stop words, printed together with their number.
distinct_stop_words = set(stop_words_list)
print('List of all stop words in corpus2: ')
print(distinct_stop_words)
print(len(distinct_stop_words))

List of all stop words in corpus2: 
{'these', 'about', 'ain', 'd', 'while', 'hasn', 'between', 'by', 'ourselves', 'you', 'own', 'isn', 'ma', 'was', 'yourselves', 'hers', 'other', 'their', 'our', 'why', 'for', 'further', 'so', 'during', 'before', 'myself', 'herself', 'which', 'my', 'or', 'had', 've', 'having', 'this', 'be', 'weren', 'whom', 'mightn', 'each', 'm', 'yourself', 'more', 'have', 'out', 'on', 'haven', 'too', 'over', 'its', 'in', 'any', 'wouldn', 'wasn', 'just', 'been', 'has', 'her', 'y', 'now', 'most', 'itself', 'as', 'once', 'won', 'your', 'a', 'few', 'does', 'above', 'yours', 'they', 'who', 'the', 'both', 'only', 'to', 'should', 'i', 'them', 'until', 'when', 'what', 'some', 'his', 'being', 'couldn', 'll', 'with', 'such', 'all', 'theirs', 'can', 'he', 'is', 'under', 'aren', 'don', 'after', 'here', 'than', 'did', 'o', 't', 'an', 'needn', 'shan', 'from', 'himself', 'doing', 'of', 'themselves', 'am', 'because', 'where', 'nor', 'him', 'against', 'then', 'up', 'hadn', 'will', 'sa

In [141]:
# The 30 most frequent (token, count) pairs once stop words are removed.
filtered_counts = stop_word_common_words(not_stop_words_list)
top_30_words = filtered_counts.most_common(30)

# One (token, count) tuple per line.
print("30 Most Common Words without stopwords in Corpus 2:")
for token, count in top_30_words:
    print((token, count))

30 Most Common Words without stopwords in Corpus 2:
('said', 11859)
('would', 7497)
('one', 7392)
('could', 5846)
('see', 4312)
('like', 4148)
('know', 4118)
('man', 3998)
('time', 3867)
('little', 3786)
('come', 3727)
('must', 3504)
('well', 3451)
('go', 3396)
('never', 3131)
('went', 3125)
('came', 2875)
('good', 2870)
('us', 2846)
('old', 2818)
('made', 2813)
('much', 2712)
('say', 2711)
('back', 2578)
('thought', 2575)
('shall', 2544)
('away', 2511)
('may', 2507)
('eyes', 2467)
('think', 2463)


In [134]:
# The 10 most frequent (token, count) pairs once stop words are removed.
top_10_words = stop_word_common_words(not_stop_words_list).most_common(10)

# One (token, count) tuple per line.
print("10 Most Common Words without stopwords in Corpus 2:")
for pair in top_10_words:
    print(pair)

10 Most Common Words without stopwords in Corpus 2:
('said', 11859)
('would', 7497)
('one', 7392)
('could', 5846)
('see', 4312)
('like', 4148)
('know', 4118)
('man', 3998)
('time', 3867)
('little', 3786)


In [135]:
# The 100 most frequent (token, count) pairs once stop words are removed.
filtered_counts = stop_word_common_words(not_stop_words_list)
top_100_words = filtered_counts.most_common(100)

# One (token, count) tuple per line.
print("100 Most Common Words without stopwords in Corpus 2:")
for token, count in top_100_words:
    print((token, count))



100 Most Common Words without stopwords in Corpus 2:
('said', 11859)
('would', 7497)
('one', 7392)
('could', 5846)
('see', 4312)
('like', 4148)
('know', 4118)
('man', 3998)
('time', 3867)
('little', 3786)
('come', 3727)
('must', 3504)
('well', 3451)
('go', 3396)
('never', 3131)
('went', 3125)
('came', 2875)
('good', 2870)
('us', 2846)
('old', 2818)
('made', 2813)
('much', 2712)
('say', 2711)
('back', 2578)
('thought', 2575)
('shall', 2544)
('away', 2511)
('may', 2507)
('eyes', 2467)
('think', 2463)
('nothing', 2455)
('long', 2441)
('without', 2404)
('two', 2358)
('day', 2354)
('even', 2339)
('might', 2304)
('way', 2265)
('life', 2231)
('work', 2209)
('make', 2190)
('first', 2147)
('though', 2144)
('project', 2143)
('prince', 2136)
('saw', 2109)
('face', 2103)
('hand', 2085)
('upon', 2037)
('great', 1993)
('still', 1969)
('last', 1968)
('tell', 1936)
('seemed', 1924)
('look', 1908)
('thou', 1892)
('looked', 1892)
('take', 1883)
('took', 1881)
('let', 1874)
('get', 1873)
('love', 1855)
(

In [136]:
# The 30 most frequent bigrams of the stop-word-free token list.
# Note: despite its name, top_30_words holds ((first, second), count) pairs here.
filtered_bigram_counts = stop_word_common_bigrams(not_stop_words_list)
top_30_words = filtered_bigram_counts.most_common(30)

# One (bigram, count) tuple per line.
print("30 Most Common bigrams without stopwords in Corpus 2:")
for bigram, count in top_30_words:
    print((bigram, count))

30 Most Common bigrams without stopwords in Corpus 2:
(('project', 'gutenberg'), 743)
(('alexey', 'alexandrovitch'), 570)
(('stepan', 'arkadyevitch'), 547)
(('project', 'electronic'), 432)
(('electronic', 'works'), 384)
(('captain', 'nemo'), 383)
(('old', 'man'), 373)
(('could', 'see'), 360)
(('united', 'states'), 359)
(('van', 'helsing'), 315)
(('gutenberg', 'literary'), 312)
(('literary', 'archive'), 312)
(('archive', 'foundation'), 302)
(('sergey', 'ivanovitch'), 290)
(('young', 'man'), 275)
(('electronic', 'work'), 264)
(('let', 'us'), 258)
(('nastasia', 'philipovna'), 239)
(('sir', 'james'), 239)
(('one', 'day'), 230)
(('said', 'dorothea'), 227)
(('first', 'time'), 220)
(('next', 'day'), 218)
(('terms', 'agreement'), 216)
(('darya', 'alexandrovna'), 204)
(('set', 'forth'), 203)
(('one', 'another'), 198)
(('said', 'prince'), 196)
(('ned', 'land'), 196)
(('project', 'license'), 192)


In [137]:
# The 100 most frequent bigrams of the stop-word-free token list.
# Note: despite its name, top_100_words holds ((first, second), count) pairs here.
top_100_words = stop_word_common_bigrams(not_stop_words_list).most_common(100)

# One (bigram, count) tuple per line.
print("100 Most Common bigrams without stopwords in Corpus 2:")
for pair in top_100_words:
    print(pair)

100 Most Common bigrams without stopwords in Corpus 2:
(('project', 'gutenberg'), 743)
(('alexey', 'alexandrovitch'), 570)
(('stepan', 'arkadyevitch'), 547)
(('project', 'electronic'), 432)
(('electronic', 'works'), 384)
(('captain', 'nemo'), 383)
(('old', 'man'), 373)
(('could', 'see'), 360)
(('united', 'states'), 359)
(('van', 'helsing'), 315)
(('gutenberg', 'literary'), 312)
(('literary', 'archive'), 312)
(('archive', 'foundation'), 302)
(('sergey', 'ivanovitch'), 290)
(('young', 'man'), 275)
(('electronic', 'work'), 264)
(('let', 'us'), 258)
(('nastasia', 'philipovna'), 239)
(('sir', 'james'), 239)
(('one', 'day'), 230)
(('said', 'dorothea'), 227)
(('first', 'time'), 220)
(('next', 'day'), 218)
(('terms', 'agreement'), 216)
(('darya', 'alexandrovna'), 204)
(('set', 'forth'), 203)
(('one', 'another'), 198)
(('said', 'prince'), 196)
(('ned', 'land'), 196)
(('project', 'license'), 192)
(('come', 'back'), 185)
(('great', 'deal'), 182)
(('said', 'levin'), 176)
(('let', 'go'), 169)
(('fu

In [138]:
# The 10 most frequent bigrams of the stop-word-free token list.
# Note: despite its name, top_10_words holds ((first, second), count) pairs here.
filtered_bigram_counts = stop_word_common_bigrams(not_stop_words_list)
top_10_words = filtered_bigram_counts.most_common(10)

# One (bigram, count) tuple per line.
print("10 Most Common bigrams without stopwords in Corpus 2:")
for bigram, count in top_10_words:
    print((bigram, count))

10 Most Common bigrams without stopwords in Corpus 2:
(('project', 'gutenberg'), 743)
(('alexey', 'alexandrovitch'), 570)
(('stepan', 'arkadyevitch'), 547)
(('project', 'electronic'), 432)
(('electronic', 'works'), 384)
(('captain', 'nemo'), 383)
(('old', 'man'), 373)
(('could', 'see'), 360)
(('united', 'states'), 359)
(('van', 'helsing'), 315)


### Q4) You will need to preprocess the documents

In [139]:
# The POS tagger model and pos_tag are fetched/imported here but not used in
# this cell; they are kept in case later cells rely on them.
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag
import re

# English stop words as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Starting point: the Corpus 2 tokens, which tokenize_words() already
# lower-cased and restricted to alphanumeric tokens (punctuation is gone).
# Stop words and tokens of one or two characters are dropped in a single pass.
#
# The earlier re.sub(r'\[.*?\]', '', word) pass was removed: tokens that passed
# str.isalnum() never contain '[' or ']', so the substitution could not match
# anything and the result is unchanged.
#
# NOTE(review): Project Gutenberg licence boilerplate is NOT removed by this
# step; stripping it would have to happen on the raw text before tokenization.
preprocessed_text = [
    word
    for word in tokenized_words_corpus2
    if word not in stop_words and len(word) > 2
]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mukes\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [140]:
# The 30 most frequent (token, count) pairs after preprocessing; the bare
# expression on the last line makes the notebook display the list.
preprocessed_counts = most_common_words(preprocessed_text)
top_30_words = preprocessed_counts.most_common(30)
top_30_words

[('said', 11859),
 ('would', 7497),
 ('one', 7392),
 ('could', 5846),
 ('see', 4312),
 ('like', 4148),
 ('know', 4118),
 ('man', 3998),
 ('time', 3867),
 ('little', 3786),
 ('come', 3727),
 ('must', 3504),
 ('well', 3451),
 ('never', 3131),
 ('went', 3125),
 ('came', 2875),
 ('good', 2870),
 ('old', 2818),
 ('made', 2813),
 ('much', 2712),
 ('say', 2711),
 ('back', 2578),
 ('thought', 2575),
 ('shall', 2544),
 ('away', 2511),
 ('may', 2507),
 ('eyes', 2467),
 ('think', 2463),
 ('nothing', 2455),
 ('long', 2441)]