In [None]:
# !pip install nltk

In [30]:
import nltk
import os
import math

In [18]:
def load_data(directory):
    """Count alphabetic tokens in every file of ``directory``.

    Each file is read in full and split with ``nltk.word_tokenize``. Tokens
    for which ``str.isalpha()`` is false (numbers, punctuation, mixed
    tokens) are dropped; the rest are lower-cased and counted.

    Args:
        directory: Path of the folder whose entries are read.

    Returns:
        dict mapping each filename (as returned by ``os.listdir``) to a
        dict of ``{word: occurrence count}`` for that file.
    """
    files = {}
    for filename in os.listdir(directory):
        path = os.path.join(directory, filename)
        with open(path) as handle:
            text = handle.read()

        counts = {}
        for token in nltk.word_tokenize(text):
            # The alphabetic check is applied to the raw token, before lower-casing.
            if not token.isalpha():
                continue
            key = token.lower()
            counts[key] = counts.get(key, 0) + 1

        files[filename] = counts

    return files

In [19]:
# Build the {filename: {word: count}} table for every file in ../data/holmes.
corpus = load_data("../data/holmes")

In [23]:
def extract_words(corpus: dict) -> set:
    """Return the vocabulary of the corpus.

    Args:
        corpus: Mapping of filename -> {word: count}.

    Returns:
        A set holding every word that is a key in at least one file's
        count table. An empty corpus gives an empty set.
    """
    # Iterating a dict yields its keys, so the union collects the words only.
    return set().union(*corpus.values())

In [24]:
def calculate_tf(corpus: dict) -> dict:
    """Turn per-file count tables into lists of ``(word, count)`` pairs.

    The term frequency used here is the raw occurrence count; no
    normalisation is applied.

    Args:
        corpus: Mapping of filename -> {word: count}.

    Returns:
        dict mapping each filename to a list of ``(word, count)`` tuples,
        in the iteration order of that file's count table.
    """
    return {
        name: list(counts.items())
        for name, counts in corpus.items()
    }

In [25]:
def get_top_terms(tfidfs: dict, top_n: int = 5):
    """Print the highest-scoring terms of every file in ``tfidfs``.

    For each file the filename is printed, followed by one indented
    ``term: score`` line per term, scores formatted to four decimals and
    ordered from highest to lowest.

    Args:
        tfidfs: Mapping of filename -> list of ``(term, score)`` tuples.
        top_n: Number of terms printed per file. Defaults to 5.

    Returns:
        None. The result is written to standard output.

    Notes:
        The files are taken from ``tfidfs`` itself, not from a notebook-level
        variable, so the function works for any score table passed in.
        The lists inside ``tfidfs`` are not sorted or truncated; the caller
        keeps the full score table and the cell can be re-run safely.
    """
    for filename, scores in tfidfs.items():
        # sorted() is stable, so equally scored terms keep their input order.
        ranked = sorted(scores, key=lambda pair: pair[1], reverse=True)[:top_n]
        print(filename)
        for term, score in ranked:
            print(f"    {term}: {score:.4f}")

In [26]:
# Print the five most frequent terms (raw counts) for each file.
get_top_terms(calculate_tf(corpus))

speckled.txt
    the: 600.0000
    and: 281.0000
    of: 276.0000
    a: 252.0000
    i: 233.0000
face.txt
    the: 326.0000
    i: 298.0000
    and: 226.0000
    to: 185.0000
    a: 173.0000
twisted.txt
    the: 493.0000
    a: 275.0000
    and: 270.0000
    i: 238.0000
    of: 234.0000
squires.txt
    the: 508.0000
    of: 206.0000
    and: 169.0000
    to: 168.0000
    a: 152.0000
coronet.txt
    the: 466.0000
    i: 356.0000
    to: 270.0000
    and: 238.0000
    a: 213.0000
carbuncle.txt
    the: 463.0000
    of: 233.0000
    a: 208.0000
    and: 199.0000
    i: 188.0000
treaty.txt
    the: 688.0000
    i: 348.0000
    of: 319.0000
    and: 318.0000
    to: 316.0000
bachelor.txt
    the: 401.0000
    i: 236.0000
    and: 234.0000
    to: 233.0000
    a: 211.0000
patient.txt
    the: 346.0000
    i: 187.0000
    to: 184.0000
    and: 172.0000
    of: 171.0000
bohemia.txt
    the: 443.0000
    i: 261.0000
    and: 254.0000
    to: 245.0000
    of: 237.0000
problem.txt
    the: 427.0

In [27]:
def load_data(directory, function_words_path="../data/function-words/function_words.txt"):
    """Count alphabetic, non-function-word tokens in every file of ``directory``.

    Each file is read in full and split with ``nltk.word_tokenize``. Tokens
    for which ``str.isalpha()`` is false are dropped, the rest are
    lower-cased, and any word found in the function-word list is skipped.

    Args:
        directory: Path of the folder whose entries are read.
        function_words_path: Text file with one function word per line.
            Defaults to the list this notebook has always used, so existing
            ``load_data(directory)`` calls behave as before.

    Returns:
        dict mapping each filename (as returned by ``os.listdir``) to a
        dict of ``{word: occurrence count}`` for that file.
    """
    with open(function_words_path) as f:
        function_words = set(f.read().splitlines())

    files = dict()
    for filename in os.listdir(directory):
        with open(os.path.join(directory, filename)) as f:
            text = f.read()

        # Count frequencies of the words that survive both filters.
        frequencies = dict()
        for token in nltk.word_tokenize(text):
            if not token.isalpha():
                continue
            word = token.lower()
            # Comparison happens after lower-casing, so the list entries
            # must be lower-case to match.
            if word in function_words:
                continue
            frequencies[word] = frequencies.get(word, 0) + 1
        files[filename] = frequencies

    return files

In [28]:
# Rebuild the count table with the load_data defined just above, which skips
# every word listed in function_words.txt.
corpus = load_data("../data/holmes")

In [29]:
# Print the five most frequent terms per file, now with function words excluded.
get_top_terms(calculate_tf(corpus))

speckled.txt
    holmes: 56.0000
    room: 25.0000
    see: 22.0000
    sister: 21.0000
    roylott: 20.0000
face.txt
    face: 24.0000
    cottage: 24.0000
    man: 23.0000
    little: 23.0000
    wife: 23.0000
twisted.txt
    man: 31.0000
    holmes: 29.0000
    clair: 27.0000
    little: 21.0000
    face: 20.0000
squires.txt
    holmes: 53.0000
    man: 35.0000
    cunningham: 31.0000
    inspector: 29.0000
    colonel: 25.0000
coronet.txt
    coronet: 27.0000
    holmes: 27.0000
    man: 27.0000
    think: 26.0000
    house: 23.0000
carbuncle.txt
    man: 39.0000
    holmes: 38.0000
    hat: 27.0000
    see: 27.0000
    goose: 26.0000
treaty.txt
    holmes: 67.0000
    phelps: 39.0000
    room: 36.0000
    come: 34.0000
    came: 28.0000
bachelor.txt
    simon: 40.0000
    lord: 35.0000
    holmes: 34.0000
    little: 26.0000
    lady: 24.0000
patient.txt
    holmes: 39.0000
    man: 28.0000
    blessington: 26.0000
    doctor: 18.0000
    street: 16.0000
bohemia.txt
    holmes: 48

In [31]:
def get_idfs(words: set, corpus: dict) -> dict:
    """Compute the inverse document frequency of each word.

    The value for a word is ``ln(number of files / number of files that
    contain the word)``.

    Args:
        words: Words to score.
        corpus: Mapping of filename -> {word: count}.

    Returns:
        dict mapping each word to its IDF as a float.

    Raises:
        ZeroDivisionError: If a word appears in none of the files.
    """
    total_files = len(corpus)
    idfs = {}
    for word in words:
        containing = 0
        for counts in corpus.values():
            if word in counts:
                containing += 1
        idfs[word] = math.log(total_files / containing)
    return idfs

In [35]:
def get_tfidf(idfs: dict, corpus: dict):
    """Score every word of every file as ``count * idf``.

    Args:
        idfs: Mapping of word -> inverse document frequency.
        corpus: Mapping of filename -> {word: count}.

    Returns:
        dict mapping each filename to a list of ``(word, score)`` tuples,
        in the iteration order of that file's count table.

    Raises:
        KeyError: If a word in ``corpus`` has no entry in ``idfs``.
    """
    return {
        name: [(term, count * idfs[term]) for term, count in counts.items()]
        for name, counts in corpus.items()
    }

In [36]:
# TF-IDF pipeline: collect the vocabulary, compute one IDF per word, then
# score every (file, word) pair as count * idf.
words = extract_words(corpus)
idfs = get_idfs(words, corpus)
tfidf_scores = get_tfidf(idfs, corpus)

In [37]:
# Print the five highest TF-IDF terms for each file.
get_top_terms(tfidf_scores)

speckled.txt
    roylott: 60.8904
    stoner: 57.8459
    ventilator: 42.6233
    stepfather: 36.5343
    stoke: 33.4897
face.txt
    cottage: 56.4330
    munro: 18.8110
    jack: 18.2405
    grant: 16.4596
    effie: 15.2226
twisted.txt
    clair: 82.2021
    neville: 57.8459
    lascar: 36.5343
    opium: 25.8651
    whitney: 24.3562
squires.txt
    cunningham: 94.3802
    alec: 57.8459
    acton: 45.6678
    william: 31.5063
    colonel: 31.3191
coronet.txt
    coronet: 82.2021
    arthur: 44.6761
    gems: 39.5788
    holder: 29.8481
    snow: 23.5138
carbuncle.txt
    goose: 61.1358
    geese: 51.7569
    horner: 39.5788
    ryder: 36.5343
    peterson: 33.4897
treaty.txt
    phelps: 118.7364
    joseph: 70.0240
    harrison: 60.8904
    holdhurst: 42.6233
    woking: 42.6233
bachelor.txt
    simon: 121.7809
    doran: 36.5343
    lestrade: 32.9193
    wedding: 30.5679
    lord: 29.6554
patient.txt
    blessington: 79.1576
    trevelyan: 48.7124
    brook: 24.3562
    consultation