In [106]:
import nltk
from nltk.book import *
import spacy
nlp = spacy.load("en_core_web_sm")
from nltk.util import bigrams



# Task 1

### Reading Shakepeare's Work:

In [107]:
import string
from nltk.corpus import gutenberg

# List available texts in the Gutenberg corpus
#print(gutenberg.fileids())

# Access Shakespeare's Macbeth
macbeth_text = gutenberg.raw('shakespeare-macbeth.txt')
#print(macbeth_text[:500])  #Display the first 500 characters

#Accessing Hamlet from Guttenberg Libray 
hamlet_text = gutenberg.raw('shakespeare-hamlet.txt')

#Cleaning Text
hamlet_text = hamlet_text.lower()  #Make it lower case 
hamlet_text = hamlet_text.translate(str.maketrans('', '', string.punctuation))  #Remove Punctuation


#Display 
print(hamlet_text[:200])  # Display the first 100 characters


the tragedie of hamlet by william shakespeare 1599


actus primus scoena prima

enter barnardo and francisco two centinels

  barnardo whos there
  fran nay answer me stand  vnfold
your selfe

   bar 


In [108]:
#Further Clenaing
hamlet_text = hamlet_text.replace('\n', ' ')  #replace newlines with a space
hamlet_text = hamlet_text.replace('\r', '')   #Remove carriage returns if present

In [109]:
#Tokenize
hamlet_tokenized = nlp(hamlet_text)

#Display tokenized words, excluding spaces

tokens = [token.text for token in hamlet_tokenized if token.text.strip() != '']


#Print tokenized words
print(tokens)  

# # Tokenize sentences
# doc = nlp(hamlet_text)
# sentences = [sent.text for sent in doc.sents]
# print(sentences)



In [110]:
#Creating bigrams
hamlet_bigrams = list(bigrams(tokens))
#Print bigrams
print(hamlet_bigrams)



### Dictionary of Bigram Counts

In [111]:
from collections import defaultdict

#initialize dictionary with default dictionaries inside
from_bigram_to_next_token_counts = defaultdict(lambda: defaultdict(int))

#loop through the bigram list
for i in range(len(hamlet_bigrams) - 1):  #stop at len(bigram_list) - 1 to access the next token
    bigram = hamlet_bigrams[i]  #crrent bigram
    next_token = hamlet_bigrams[i + 1][0]  # get the first token of the next bigram
    from_bigram_to_next_token_counts[bigram][next_token] += 1  #increment count

#print it nicely
for bigram, next_tokens in from_bigram_to_next_token_counts.items():
    print(f"{bigram} -> {dict(next_tokens)}")


('the', 'tragedie') -> {'tragedie': 2}
('tragedie', 'of') -> {'of': 2}
('of', 'hamlet') -> {'hamlet': 5}
('hamlet', 'by') -> {'by': 1}
('by', 'william') -> {'william': 1}
('william', 'shakespeare') -> {'shakespeare': 1}
('shakespeare', '1599') -> {'1599': 1}
('1599', 'actus') -> {'actus': 1}
('actus', 'primus') -> {'primus': 1}
('primus', 'scoena') -> {'scoena': 1}
('scoena', 'prima') -> {'prima': 1}
('prima', 'enter') -> {'enter': 1}
('enter', 'barnardo') -> {'barnardo': 1}
('barnardo', 'and') -> {'and': 2}
('and', 'francisco') -> {'francisco': 1}
('francisco', 'two') -> {'two': 1}
('two', 'centinels') -> {'centinels': 1}
('centinels', 'barnardo') -> {'barnardo': 1}
('barnardo', 'who') -> {'who': 1}
('who', 's') -> {'s': 2}
('s', 'there') -> {'there': 2}
('there', 'fran') -> {'fran': 1}
('fran', 'nay') -> {'nay': 1}
('nay', 'answer') -> {'answer': 1}
('answer', 'me') -> {'me': 2}
('me', 'stand') -> {'stand': 1}
('stand', 'vnfold') -> {'vnfold': 1}
('vnfold', 'your') -> {'your': 1}
('y

# Task 2

In [112]:
#initialize the dictionary to store probabilities
from_bigram_to_next_token_probs = defaultdict(lambda: defaultdict(float))

# Loop through the bigram counts to calculate the probabilities
for bigram, next_tokens in from_bigram_to_next_token_counts.items():
    total_count = sum(next_tokens.values())  #total count this bigram happens
    for next_token, count in next_tokens.items():
        #calculate the probability for each next_token given the bigram
        from_bigram_to_next_token_probs[bigram][next_token] = count / total_count

#print the probabilities
for bigram, next_tokens in from_bigram_to_next_token_probs.items():
    print(f"{bigram} -> {dict(next_tokens)}")

('the', 'tragedie') -> {'tragedie': 1.0}
('tragedie', 'of') -> {'of': 1.0}
('of', 'hamlet') -> {'hamlet': 1.0}
('hamlet', 'by') -> {'by': 1.0}
('by', 'william') -> {'william': 1.0}
('william', 'shakespeare') -> {'shakespeare': 1.0}
('shakespeare', '1599') -> {'1599': 1.0}
('1599', 'actus') -> {'actus': 1.0}
('actus', 'primus') -> {'primus': 1.0}
('primus', 'scoena') -> {'scoena': 1.0}
('scoena', 'prima') -> {'prima': 1.0}
('prima', 'enter') -> {'enter': 1.0}
('enter', 'barnardo') -> {'barnardo': 1.0}
('barnardo', 'and') -> {'and': 1.0}
('and', 'francisco') -> {'francisco': 1.0}
('francisco', 'two') -> {'two': 1.0}
('two', 'centinels') -> {'centinels': 1.0}
('centinels', 'barnardo') -> {'barnardo': 1.0}
('barnardo', 'who') -> {'who': 1.0}
('who', 's') -> {'s': 1.0}
('s', 'there') -> {'there': 1.0}
('there', 'fran') -> {'fran': 1.0}
('fran', 'nay') -> {'nay': 1.0}
('nay', 'answer') -> {'answer': 1.0}
('answer', 'me') -> {'me': 1.0}
('me', 'stand') -> {'stand': 1.0}
('stand', 'vnfold') ->