In [1]:
import numpy as np 
import os 
import re
from collections import Counter

In [2]:
test_dir = './lab7/test'
def list_autors_and_books(directory):
    authors = {}
    for item in os.listdir(directory):
        author, _ = item.split('-')
        if author not in authors:
            authors[author] = []
        authors[author].append(item)
    test_set, train_set = [], []
    for author in authors:
        if len(authors[author]) < 2:
            raise ValueError("Insufficient number of books to create test set!")
        if len(authors[author]) > 7:
            for book in authors[author][:3]:
                test_set.append((author, book))
            for book in authors[author][3:8]:
                train_set.append((author, book))
        else:
            test_set.append((author, authors[author][0]))
            for book in authors[author][1:8]:
                train_set.append((author, book))
    return authors, test_set, train_set

if dict is [word_n][word_n-1, word_n-2, ...] then it's better results than
[word_n-1, word_n-2, ...][word]

P(w_i | w_i-1, w_i-2 ...) = C(w_i, w_i-1, w_i-2...)/C(w_i-1, w_i-2...)

In [3]:
authors, test_set, train_set = list_autors_and_books(test_dir)
for author in authors:
    print(f"{author}, books: {len(authors[author])}")
print(test_set)

Philip_K_Dick_, books: 7
Terry_Pratchett_, books: 19
Stephen_King_, books: 6
Sapkowski_Andrzej_, books: 5
Tolkien_J_R_R_, books: 3
Lem_Stanislaw_, books: 8
[('Philip_K_Dick_', 'Philip_K_Dick_-_Kosmiczne_marionetki.txt'), ('Terry_Pratchett_', 'Terry_Pratchett_-_Czarodzicielstwo.txt'), ('Terry_Pratchett_', 'Terry_Pratchett_-_Kosiarz.txt'), ('Terry_Pratchett_', 'Terry_Pratchett_-_Piramidy.txt'), ('Stephen_King_', 'Stephen_King_-_Marzenia_i_Koszmary_2.txt'), ('Sapkowski_Andrzej_', 'Sapkowski_Andrzej_-_Okruch_lodu.txt'), ('Tolkien_J_R_R_', 'Tolkien_J_R_R_-_O_Tuorze_i_jego_przybyciu_do_Gondolinu.txt'), ('Lem_Stanislaw_', 'Lem_Stanislaw_-_Summa_Technologiae.txt'), ('Lem_Stanislaw_', 'Lem_Stanislaw_-_Bajki_robotow.txt'), ('Lem_Stanislaw_', 'Lem_Stanislaw_-_Niezwyciezony.txt')]


In [4]:
def build_markov_chain(markov, text_sample, order=4):
    words = re.findall(r'\w+', open(text_sample, 'r').read().lower())
    for i in range(len(words)-order):
        ngram = words[i:i+order]
        x = ', '.join(ngram[:-1])
        if x not in markov:
            markov[x] = Counter()
        markov[x][ngram[-1]] += 1
        markov[x]['#sum'] += 1
        if ngram[-2] not in markov:
            markov[ngram[-2]] = Counter()
        markov[ngram[-2]][ngram[-1]] += 1
        markov[ngram[-2]]['#sum'] += 1
    return markov
markov = {}
build_markov_chain(markov, os.path.join(test_dir, 'Philip_K_Dick_-_Kosmiczne_marionetki.txt'))

{'philip, k, dick': Counter({'kosmiczne': 1, '#sum': 1}),
 'dick': Counter({'kosmiczne': 1, '#sum': 1}),
 'k, dick, kosmiczne': Counter({'marionetki': 1, '#sum': 1}),
 'kosmiczne': Counter({'marionetki': 1, '#sum': 2, 'bieguny': 1}),
 'dick, kosmiczne, marionetki': Counter({'przełożyła': 1, '#sum': 1}),
 'marionetki': Counter({'przełożyła': 1, '#sum': 1}),
 'kosmiczne, marionetki, przełożyła': Counter({'jadwiga': 1, '#sum': 1}),
 'przełożyła': Counter({'jadwiga': 1, '#sum': 1}),
 'marionetki, przełożyła, jadwiga': Counter({'andruszkiewicz': 1, '#sum': 1}),
 'jadwiga': Counter({'andruszkiewicz': 1, '#sum': 1}),
 'przełożyła, jadwiga, andruszkiewicz': Counter({'fiejtek': 1, '#sum': 1}),
 'andruszkiewicz': Counter({'fiejtek': 1, '#sum': 1}),
 'jadwiga, andruszkiewicz, fiejtek': Counter({'mojej': 1, '#sum': 1}),
 'fiejtek': Counter({'mojej': 1, '#sum': 1}),
 'andruszkiewicz, fiejtek, mojej': Counter({'siostrze': 1, '#sum': 1}),
 'mojej': Counter({'siostrze': 1,
          '#sum': 5,
       

In [5]:
order = 3
author_chains = Counter()
author_chains['global'] = {}
for author, book in train_set:
    if author not in author_chains:
        author_chains[author] = {} 
    author_chains[author] = build_markov_chain(author_chains[author],os.path.join(test_dir, book), order)
    # calculate global through reference
    author_chains['global'] = build_markov_chain(author_chains['global'],os.path.join(test_dir, book), order)

In [6]:
author_chains['global']

{'philip, k': Counter({'dick': 6, '#sum': 6}),
 'k': Counter({'dick': 6,
          '#sum': 70,
          'stuknięty': 1,
          'edy': 1,
          'ohl': 9,
          'tze': 1,
          'me': 1,
          'nie': 1,
          'cisza': 1,
          'u': 4,
          'a': 9,
          'n': 4,
          'o': 6,
          'i': 5,
          'r': 4,
          'dobrze': 1,
          'j': 1,
          'l': 2,
          'c': 1,
          'ą': 1,
          'w': 3,
          'z': 2,
          'który': 1,
          'profesor': 1,
          'cyberleutnant': 1,
          'czyli': 1,
          'do': 1,
          'zmieniona': 1}),
 'k, dick': Counter({'trzy': 1,
          '#sum': 6,
          'pani': 1,
          'galaktyczny': 1,
          '1969': 1,
          'paszcza': 1,
          'za': 1}),
 'dick': Counter({'trzy': 1,
          '#sum': 14,
          'pani': 1,
          'galaktyczny': 1,
          '1969': 1,
          'paszcza': 1,
          'za': 1,
          'o': 3,
          'greg': 1,
  

In [7]:
def backoff(chains, author, ngram):
    try:
        x = ', '.join(ngram[:-1])
        p = author_chains[author][x][ngram[-1]]\
                / author_chains['global'][x]["#sum"]
        return p
    except KeyError:
        try:
            backoff_weight = backoff_weight_chain(chains, author, ngram[-2], x)
            return backoff_weight*backoff(chains, author, ngram[:-1])
        except (IndexError, ZeroDivisionError):
            return 0
        
def backoff_weight_chain(chains, author, w_p1, w_p_all):
    top, bottom = 1, 1
    try:
        top = 1 - (chains[author][w_p_all]['#sum']/chains['global'][w_p_all]['#sum'])
    except KeyError:
        pass
    try:
        bottom = 1 - (chains[author][w_p1]['#sum']/chains['global'][w_p1]['#sum'])
    except KeyError:
        pass 
    return top/bottom

In [9]:
hit = 0
total = 0
tp, fp, tn, fn = 0, 0, 0, 0
total_authors = len(author_chains) -1 
for author_org, book in test_set:
    author_metrics = {author: 0 for author in author_chains}
    words = re.findall(r'\w+', open(os.path.join(test_dir, book), 'r').read().lower())[:1000] # sample
    for i in range(len(words)-order):
        ngram = words[i:i+order]
        for author in author_chains:
            p = backoff(author_chains, author, ngram)
            author_metrics[author]  += p
    max_author_val, max_author = 0, None
    for author in author_metrics:
        if author == 'global':
            continue
        if author_metrics[author] >= max_author_val:
            max_author_val, max_author = author_metrics[author], author
    if max_author == author_org:
        tp += 1
        tn += (total_authors -1)
        hit += 1
    else:
        tn += (total_authors - 2)
        fp += 1 
        fn += 1 
    total += 1
    print(f'Book {book} by {author_org}: {max_author} with {np.around(max_author_val, decimals=2)}')
print(f"Accuracy: {hit/total}, {hit}/{total}")
print(f"Recall {tp/(tp + fp)}, precision {tp/(tp+fn)}")

Book Philip_K_Dick_-_Kosmiczne_marionetki.txt by Philip_K_Dick_: Stephen_King_ with 11.22
Book Terry_Pratchett_-_Czarodzicielstwo.txt by Terry_Pratchett_: Terry_Pratchett_ with 16.7
Book Terry_Pratchett_-_Kosiarz.txt by Terry_Pratchett_: Terry_Pratchett_ with 19.0
Book Terry_Pratchett_-_Piramidy.txt by Terry_Pratchett_: Terry_Pratchett_ with 15.51
Book Stephen_King_-_Marzenia_i_Koszmary_2.txt by Stephen_King_: Stephen_King_ with 13.79
Book Sapkowski_Andrzej_-_Okruch_lodu.txt by Sapkowski_Andrzej_: Sapkowski_Andrzej_ with 4.99
Book Tolkien_J_R_R_-_O_Tuorze_i_jego_przybyciu_do_Gondolinu.txt by Tolkien_J_R_R_: Tolkien_J_R_R_ with 7.86
Book Lem_Stanislaw_-_Summa_Technologiae.txt by Lem_Stanislaw_: Lem_Stanislaw_ with 8.51
Book Lem_Stanislaw_-_Bajki_robotow.txt by Lem_Stanislaw_: Lem_Stanislaw_ with 4.42
Book Lem_Stanislaw_-_Niezwyciezony.txt by Lem_Stanislaw_: Lem_Stanislaw_ with 9.09
Accuracy: 0.9, 9/10
Recall 0.9, precision 0.9


In [10]:
print(tp, tn, fp, fn)

9 49 1 1
