In [17]:
import numpy as np 
import re
from collections import Counter

In [21]:
def extract_words_from_file(file_path):   
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    words = re.findall(r'\b\w+\b', text)
    return words


file_path= "shakespeare.txt"  
words_only= extract_words_from_file(file_path)

print(words_only[:50])
print(len(words_only))


['THE', 'SONNETS', 'ALL', 'S', 'WELL', 'THAT', 'ENDS', 'WELL', 'THE', 'TRAGEDY', 'OF', 'ANTONY', 'AND', 'CLEOPATRA', 'AS', 'YOU', 'LIKE', 'IT', 'THE', 'COMEDY', 'OF', 'ERRORS', 'THE', 'TRAGEDY', 'OF', 'CORIOLANUS', 'CYMBELINE', 'THE', 'TRAGEDY', 'OF', 'HAMLET', 'PRINCE', 'OF', 'DENMARK', 'THE', 'FIRST', 'PART', 'OF', 'KING', 'HENRY', 'THE', 'FOURTH', 'THE', 'SECOND', 'PART', 'OF', 'KING', 'HENRY', 'THE', 'FOURTH']
930499


In [11]:
def decapitalizer(words):
    noncapitalized = [word.lower() for word in words]
    return noncapitalized

In [16]:
lowrwords= decapitalizer(words=words_only)

In [18]:
def popWords(words,k=1000):
    counts= Counter(words)
    return counts.most_common(k)

In [29]:
top_5000= popWords(words=lowrwords,k=5000)

In [30]:
top_5000

[('the', 28345),
 ('and', 26805),
 ('i', 23122),
 ('to', 19327),
 ('of', 17723),
 ('a', 15493),
 ('you', 14096),
 ('my', 12469),
 ('in', 11720),
 ('that', 11433),
 ('is', 9381),
 ('not', 8504),
 ('s', 8095),
 ('me', 7900),
 ('with', 7887),
 ('for', 7769),
 ('it', 7757),
 ('be', 6937),
 ('his', 6904),
 ('he', 6806),
 ('your', 6740),
 ('this', 6707),
 ('but', 6307),
 ('have', 5938),
 ('as', 5765),
 ('thou', 5603),
 ('him', 5271),
 ('will', 5080),
 ('so', 5049),
 ('what', 4886),
 ('d', 4603),
 ('thy', 4072),
 ('all', 4063),
 ('her', 4022),
 ('do', 3902),
 ('no', 3824),
 ('by', 3809),
 ('we', 3749),
 ('_', 3737),
 ('shall', 3664),
 ('if', 3583),
 ('are', 3512),
 ('thee', 3245),
 ('on', 3230),
 ('our', 3184),
 ('king', 3130),
 ('o', 3077),
 ('lord', 3056),
 ('sir', 2880),
 ('good', 2851),
 ('now', 2840),
 ('from', 2673),
 ('at', 2579),
 ('come', 2545),
 ('they', 2528),
 ('ll', 2498),
 ('she', 2481),
 ('or', 2444),
 ('enter', 2443),
 ('let', 2396),
 ('here', 2378),
 ('more', 2345),
 ('would'

In [41]:
def replace_rare_words(lower_words, top_5000, min_freq=5):
    word_counts = Counter(lower_words)
    rare_words = {word for word, count in word_counts.items() if count < min_freq and word not in top_5000}
        
    processed_corpus = ["<OOV>" if word in rare_words else word for word in lower_words]
    return processed_corpus

processed_corpora = replace_rare_words(lowrwords, top_5000, min_freq=5)


In [53]:
nboov= processed_corpora.count("<OOV>")
tot= len(processed_corpora)
pecnt= format((nboov/tot)*100,'.2f')
print(f"percentage of OOV is: {pecnt}%")

percentage of OOV is: 2.92%


In [None]:
def tokenize_corpus(corpus, vocab, oov_index):
    tokenized = []
    for sentence in corpus:
        tokenized.append([vocab.get(word, oov_index) for word in sentence.split()])
    return tokenized

# Assign unique indices to words in top_5000
vocab = {word: idx + 1 for idx, word in enumerate(top_5000)}  # Start from 1
oov_index = len(top_5000) + 1  # Assign OOV a unique index

# Convert corpus to tokenized form
tokenized_corpus = tokenize_corpus(processed_corpora, vocab, oov_index)

# Display results
import ace_tools as tools
import pandas as pd

df = pd.DataFrame({"Sentence": processed_corpora, "Tokenized": tokenized_corpus})
tools.display_dataframe_to_user(name="Tokenized Corpus", dataframe=df)


[4, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 4, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 4, 0, 0, 0, 1, 0, 4, 0, 0, 4, 0, 4, 0, 4, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0, 1, 4, 0, 4, 0, 0, 0, 0, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0, 0, 0, 4, 5, 0, 0, 0, 4, 0, 1, 0, 0, 0, 