In [16]:
import numpy as np
import nltk
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize, sent_tokenize
import string
from collections import Counter, defaultdict
import math
import re

In [17]:
def preprocess_text(text):
    """Lowercase ``text`` and delete every ASCII punctuation character.

    Characters from ``string.punctuation`` are removed outright rather than
    replaced by a space, so ``"don't"`` becomes ``"dont"``.
    """
    lowered = text.lower()
    # A translation table whose third argument lists the characters to drop.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return lowered.translate(strip_punctuation)

def tokenize_text(text):
    """Return the NLTK word tokens of ``text`` after ``preprocess_text`` has normalised it."""
    normalized = preprocess_text(text)
    tokens = word_tokenize(normalized)
    return tokens

def get_vocabulary(corpus):
    """Collect the distinct tokens that occur anywhere in ``corpus``.

    Args:
        corpus: Iterable of raw document strings; each one is tokenised
            with ``tokenize_text``.

    Returns:
        list: The unique tokens in ascending lexicographic order.
    """
    vocabulary = set()
    for doc in corpus:
        vocabulary.update(tokenize_text(doc))
    # Sort before returning: the iteration order of a set of strings can
    # differ from one interpreter run to the next (string hashes are
    # randomised per process), which would make any index built from this
    # list irreproducible after a kernel restart.
    return sorted(vocabulary)

### Task 0:
Take an arbitrary text from NLTK corpora (e.g. text3) and implement a Bag-of-Words tagger for it.


In [18]:
def create_bow_vectors(corpus):
    """Build raw-count Bag-of-Words vectors for every document in ``corpus``.

    Args:
        corpus: Iterable of raw document strings. It is consumed exactly
            once, so a generator works as well as a list.

    Returns:
        tuple: ``(bow_vectors, vocabulary, vocab_to_idx)`` where
            ``bow_vectors`` is a list holding one 1-D float array per
            document (entry ``j`` is how often ``vocabulary[j]`` occurs in
            that document), ``vocabulary`` is the sorted list of distinct
            tokens, and ``vocab_to_idx`` maps each token to its column index.
    """
    # Tokenise each document a single time and reuse the tokens for both the
    # vocabulary and the counts; tokenisation is the expensive step.
    tokenized_docs = [tokenize_text(doc) for doc in corpus]

    # Same token set that get_vocabulary(corpus) collects. Sorting pins the
    # column order so the vectors are reproducible across kernel restarts.
    vocabulary = sorted({word for words in tokenized_docs for word in words})
    vocab_to_idx = {word: i for i, word in enumerate(vocabulary)}

    bow_vectors = []
    for words in tokenized_docs:
        vector = np.zeros(len(vocabulary))
        # Every token is in the vocabulary by construction, so no membership
        # check is needed before indexing.
        for word, count in Counter(words).items():
            vector[vocab_to_idx[word]] = count
        bow_vectors.append(vector)

    return bow_vectors, vocabulary, vocab_to_idx

### Task 1: 
Enhance the tagger so that it uses N-grams instead of single words.

In [19]:
def create_ngram_bow_vectors(corpus, n=2):
    """Build raw-count Bag-of-N-grams vectors for every document in ``corpus``.

    An n-gram is ``n`` consecutive tokens of a document joined by a single
    space. Documents with fewer than ``n`` tokens contribute no n-grams and
    get an all-zero vector.

    Args:
        corpus: Iterable of raw document strings. It is consumed exactly
            once, so a generator works as well as a list.
        n: Number of tokens per n-gram (default 2, i.e. bigrams).

    Returns:
        tuple: ``(ngram_bow_vectors, ngrams, ngram_to_idx)`` where
            ``ngram_bow_vectors`` is a list holding one 1-D float array per
            document, ``ngrams`` is the sorted list of distinct n-grams, and
            ``ngram_to_idx`` maps each n-gram to its column index.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # n == 0 would silently yield empty-string "n-grams"; negative n
        # yields meaningless slices. Fail loudly instead.
        raise ValueError(f"n must be a positive integer, got {n!r}")

    # Extract each document's n-grams once and reuse them for both the
    # n-gram vocabulary and the per-document counts.
    ngrams_per_doc = []
    for doc in corpus:
        words = tokenize_text(doc)
        ngrams_per_doc.append(
            [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]
        )

    # Sorting pins the column order so results are reproducible across
    # kernel restarts (set iteration order of strings is not).
    ngram_vocabulary = sorted(
        {ngram for doc_ngrams in ngrams_per_doc for ngram in doc_ngrams}
    )
    ngram_to_idx = {ngram: i for i, ngram in enumerate(ngram_vocabulary)}

    ngram_bow_vectors = []
    for doc_ngrams in ngrams_per_doc:
        vector = np.zeros(len(ngram_vocabulary))
        for ngram, count in Counter(doc_ngrams).items():
            vector[ngram_to_idx[ngram]] = count
        ngram_bow_vectors.append(vector)

    return ngram_bow_vectors, ngram_vocabulary, ngram_to_idx

### Task 2:
Implement PPMI weighting with co-occurrence based on the presence within the same paragraph.
 

In [8]:
# NOTE(review): `text` is not assigned in any cell visible in this notebook;
# it must already hold the raw corpus string when this cell runs. Confirm
# where it is loaded so the notebook survives Restart & Run All.
#
# Split on blank lines. The pattern also accepts separator lines containing
# only whitespace (including "\r\n" line endings), and the filter drops empty
# chunks so that runs of blank lines do not inflate len(paragraphs), which
# the PPMI cell below uses as its denominator.
paragraphs = [
    chunk
    for chunk in re.split(r'\n\s*\n', text.lower())
    if chunk.strip()
]

In [9]:
# Paragraph-level statistics feeding the PPMI computation:
#   word_freq[w]  -> number of paragraphs that contain token w
#   cooccur[w][c] -> number of paragraphs that contain both w and c (w != c)
# (defaultdict and Counter come from the notebook's import cell.)
cooccur = defaultdict(lambda: defaultdict(int))
word_freq = Counter()

for para in paragraphs:
    # A set, because only presence within the paragraph matters, not repeats.
    para_tokens = set(word_tokenize(para))
    word_freq.update(para_tokens)
    for target in para_tokens:
        for context in para_tokens:
            if target != context:
                cooccur[target][context] += 1

In [10]:
# Paragraph-level PPMI:
#   ppmi[w][c] = max(0, log2(P(w, c) / (P(w) * P(c))))
# where each probability is the fraction of paragraphs containing the
# token (or both tokens).
total_paragraphs = len(paragraphs)
ppmi = defaultdict(dict)

for target, contexts in cooccur.items():
    p_target = word_freq[target] / total_paragraphs
    for context, joint_count in contexts.items():
        p_joint = joint_count / total_paragraphs
        p_context = word_freq[context] / total_paragraphs
        score = math.log2(p_joint / (p_target * p_context))
        # Clip negative associations to zero (the "positive" in PPMI).
        ppmi[target][context] = max(0, score)

### Task 3:
Implement PPMI weighting with co-occurrence based on a sliding window of neighboring words. Pick a window size between 2 and 10.

In [15]:
# Sliding-window PPMI: two tokens co-occur when they are at most
# `window_size` positions apart in the token stream.
window_size = 4  # tokens considered on each side of the centre word

# `filtered_words` was never assigned anywhere in the notebook, so this cell
# failed with a NameError. Build it with the notebook's own tokenizer
# (lowercased, punctuation stripped).
# NOTE(review): assumes `text` holds the raw corpus string, as the Task 2
# cell does; confirm whether stop words were meant to be removed as well.
filtered_words = tokenize_text(text)

# Plain Counter instead of nltk's FreqDist: only per-token lookups are needed
# here, and Counter is already imported in the notebook's import cell.
word_count = Counter(filtered_words)
cooccurrence = defaultdict(int)

for i, center_word in enumerate(filtered_words):
    window_start = max(0, i - window_size)
    window_end = min(len(filtered_words), i + window_size + 1)
    for j in range(window_start, window_end):
        if j == i:
            continue
        # Keys are order-independent, so (a, b) and (b, a) observations are
        # pooled under one sorted tuple.
        pair = tuple(sorted((center_word, filtered_words[j])))
        cooccurrence[pair] += 1

# Total number of (centre, context) observations.
total_pairs = sum(cooccurrence.values())
total_words = len(filtered_words)

# NOTE: this rebinds `ppmi`, replacing the paragraph-level table from Task 2.
ppmi = {}
for pair, count in cooccurrence.items():
    first, second = pair
    p_w1_w2 = count / total_pairs
    if first != second:
        # The pooled count holds both the (first, second) and the
        # (second, first) observations, while p_w1 * p_w2 below is the
        # expectation for ONE ordered pair. Halve it so distinct-word pairs
        # are not inflated by exactly one bit relative to same-word pairs.
        p_w1_w2 /= 2
    p_w1 = word_count[first] / total_words
    p_w2 = word_count[second] / total_words
    # count >= 1 for every stored pair, so the ratio is always positive.
    pmi = math.log2(p_w1_w2 / (p_w1 * p_w2))
    ppmi[pair] = max(pmi, 0)

print(f"Top 10 PPMI pairs (window size = {window_size}):")
for pair, score in sorted(ppmi.items(), key=lambda x: x[1], reverse=True)[:10]:
    print(f"{pair}: {score:.4f}")