In [1]:
import os
import re
from collections import defaultdict
from itertools import combinations
import nltk
from nltk.stem import PorterStemmer

In [None]:
# Index the collection (without stemming)
def build_inverted_index(folder_path):
    """Build an inverted index over the regular files directly inside a folder.

    Each file is read as UTF-8 (undecodable bytes are dropped), lower-cased,
    and split into tokens made of runs of the ASCII letters a-z.

    Document IDs are consecutive integers starting at 0, assigned in sorted
    filename order so that the same folder always yields the same IDs
    (``os.listdir`` alone returns entries in arbitrary order). Entries that
    are not regular files, such as sub-directories, are skipped and do not
    consume an ID.

    Args:
        folder_path: Path of the directory holding the text documents.

    Returns:
        A ``defaultdict(set)`` mapping each token to the set of IDs of the
        documents that contain it. Because it is a ``defaultdict``, looking
        up an unknown token with ``index[token]`` inserts an empty set.

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be read.
    """
    index = defaultdict(set)
    doc_id = 0

    # Sort the listing: os.listdir() makes no ordering guarantee, and the
    # document IDs must be reproducible across runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        filepath = os.path.join(folder_path, filename)
        if not os.path.isfile(filepath):
            continue

        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read().lower()

        # Tokenize after the file is closed; de-duplicate so each distinct
        # token is added to the posting set once per document.
        for token in set(re.findall(r"[a-z]+", text)):
            index[token].add(doc_id)

        doc_id += 1

    return index

In [3]:
def get_first_1000_words(index, n=1000):
    """Return the first ``n`` vocabulary words in sorted order.

    Args:
        index: Mapping whose keys are the vocabulary words.
        n: Maximum number of words to return. Defaults to 1000, matching the
            function name; pass another value to sample a different size.

    Returns:
        A list with at most ``n`` keys of ``index``, sorted ascending. If the
        vocabulary has fewer than ``n`` words, all of them are returned.

    Raises:
        ValueError: If ``n`` is negative (a negative slice bound would
            silently drop words from the end instead of limiting the count).
    """
    if n < 0:
        raise ValueError("n must be non-negative")
    return sorted(index)[:n]

In [4]:
def create_stem_classes(words, stemmer=None):
    """Group words into classes that share the same stem.

    Args:
        words: Iterable of words to group.
        stemmer: Optional object exposing a ``stem(word)`` method. When
            omitted, a new ``PorterStemmer`` is used, which is the original
            behavior. Passing a different stemmer lets the same grouping be
            compared across stemming algorithms.

    Returns:
        A ``defaultdict(list)`` mapping each stem to the words that produced
        it, in the order they appear in ``words``. Repeated words are kept.
    """
    if stemmer is None:
        stemmer = PorterStemmer()

    stem_classes = defaultdict(list)
    for word in words:
        stem_classes[stemmer.stem(word)].append(word)
    return stem_classes

In [5]:
def dice_coefficient(index, word1, word2):
    """Compute the Dice coefficient of two words from their document sets.

    The score is ``2 * |D1 & D2| / (|D1| + |D2|)``, where ``D1`` and ``D2``
    are the sets of document IDs stored in ``index`` for each word.

    Args:
        index: Mapping from word to a set of document IDs.
        word1: First word.
        word2: Second word.

    Returns:
        A float in ``[0.0, 1.0]``. Returns ``0.0`` when either word is
        missing from ``index`` or has an empty document set.
    """
    # Use .get() rather than index[word]: subscripting a defaultdict inserts
    # an empty entry for every unknown word (silently growing the
    # vocabulary), and subscripting a plain dict raises KeyError.
    docs1 = index.get(word1)
    docs2 = index.get(word2)
    if not docs1 or not docs2:
        return 0.0

    shared = len(docs1 & docs2)
    return (2 * shared) / (len(docs1) + len(docs2))


def compute_dice_for_stem_classes(index, stem_classes):
    """Score every pair of words inside each multi-word stem class.

    Args:
        index: Word-to-document-set mapping handed to ``dice_coefficient``.
        stem_classes: Mapping from stem to the list of words in its class.

    Returns:
        A dict keyed by stem. Each value is a dict mapping a ``(word1, word2)``
        tuple to the pair's Dice score, with pairs in the order produced by
        ``itertools.combinations``. Classes holding fewer than two words are
        left out.
    """
    scores_by_stem = {}
    for stem, members in stem_classes.items():
        # A class with a single word has no pairs to score.
        if len(members) <= 1:
            continue
        scores_by_stem[stem] = {
            (first, second): dice_coefficient(index, first, second)
            for first, second in combinations(members, 2)
        }
    return scores_by_stem

In [None]:
# Path to text documents. Set the DATASET_DIR environment variable to run the
# notebook on another machine; the author's original location is the default.
folder_path = os.environ.get(
    "DATASET_DIR",
    "C:/Users/HanDong/Documents/Study/Semester 5/TMG301/dataset",
)

index = build_inverted_index(folder_path)
first_1000 = get_first_1000_words(index)
stem_classes = create_stem_classes(first_1000)
dice_results = compute_dice_for_stem_classes(index, stem_classes)

# Example: print first few results
for stem, pairs in list(dice_results.items())[:5]:
    print(f"\nStem: {stem}")
    for (w1, w2), score in pairs.items():
        # Three decimals, matching the output recorded below this cell.
        print(f" {w1} - {w2}: {score:.3f}")



Stem: algorithm
 algorithm - algorithms: 1.000

Stem: analyz
 analyze - analyzed: 1.000
 analyze - analyzing: 1.000
 analyzed - analyzing: 1.000

Stem: appear
 appear - appeared: 1.000

Stem: approach
 approach - approaches: 1.000

Stem: articl
 article - articles: 1.000
