In [12]:
import re

import numpy as np

x = re.compile(r'[^\\#]|\\[a-zA-Z]+|\\[^a-zA-Z]|#[1-9]?')
s = r"\abcd\efgh{ijklmno\pqrstuvwxyz\}\\#asd#12#"
print(x.findall(s))

['\\abcd', '\\efgh', '{', 'i', 'j', 'k', 'l', 'm', 'n', 'o', '\\pqrstuvwxyz', '\\}', '\\\\', '#', 'a', 's', 'd', '#1', '2', '#']


In [108]:
from texdocument import TexDocument, TextFragment
import abc

# import tools for natural language processing
import json
import math
import os
import pickle
import re
from collections import defaultdict
from copy import copy
from typing import List, Tuple, Optional, Union, Dict

import yaml
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import *

from parse_text import tokenize_text_nltk, math_env_names


expand_tags = {'IN','EX','DT','CC','PDT','WDT','WP','WP$','WRB','IE','EG', 'TO', 'PRP','PRP$'}

# statistics class for ngrams
# contains following stats:
# - ngrams by parts of speech obtained from nltk (number of occurences of each ngram)
# - for each i in 1...n each part of ngram(w[1], w[2], w[i-1], w[i+1],..., w[n]) stats of w[i] by parts of speech
# - for each word x stats of (w[1], w[2], w[i-1], x, w[i+1],..., w[n])
#        for different i for different parts of speech w[1], w[2],..., w[n]
class NgramStats:
    def __init__(self, n: int):
        self.words_stats = None
        self.n = n
        self.ngrams = defaultdict(lambda: 0)
        self.ngrams_by_pos = defaultdict(lambda: defaultdict(lambda: 0))
        self.ngrams_by_word = defaultdict(lambda: defaultdict(lambda: 0))
        self.words = defaultdict(lambda: 0)

    # add ngram to stats
    def add_ngram(self, ngram: List[Tuple[str, str]]):  # ngram is a list of tuples (word, part of speech)
        parts_of_speech = [ngram[i][1] for i in range(self.n)]  # type: List[Optional[str]]
        self.ngrams[tuple(parts_of_speech)] += 1
        for i in range(self.n):
            parts_of_speech[i] = '_'
            self.ngrams_by_pos[tuple(parts_of_speech)][ngram[i][1]] += 1
            self.ngrams_by_word[ngram[i][0]][tuple(parts_of_speech)] += 1
            parts_of_speech[i] = ngram[i][1]

    def add_tokenized_text(self, text: List[List[Tuple[str, str]]]):
        for sentence in text:
            # replace tags from expand_tags with corresponding words
            sentence = [(word, pos) if pos not in expand_tags else (word, '='+word) for word, pos in sentence]
            for i in range(len(sentence) - self.n):
                self.add_ngram(sentence[i:i + self.n])
            for i in range(len(sentence)):
                self.words[sentence[i][0]] += 1

    def collect_stats_for_words(self, excluded_ps=('EQN', 'EQNP', 'SOL', 'EOL')):
        self.words_stats = defaultdict(lambda: [])
        for word in self.words:
            if word == '' or not word[0].isalpha():
                continue
            stats = defaultdict(lambda: 0)
            total_num = 0
            for ngram, num in self.ngrams_by_word[word].items():
                total_num += 1
                ngram_stats = defaultdict(lambda: 0)
                total = 0
                for ps, num_ps in self.ngrams_by_pos[ngram].items():
                    if ps and ps not in excluded_ps and ps[0].isalpha():
                        ngram_stats[ps] += num_ps
                        total += num_ps
                for ps, num_ps in ngram_stats.items():
                    if num_ps > 0:
                        stats[ps] += math.log(num_ps)  # ?? do we need to divide by total?
            if stats:
                for key in stats:
                    stats[key] /= total_num
                max_stat = max(stats.values())
                sub = sum(math.exp(stat - max_stat) for stat in stats.values())
                probs = sorted([(key, math.exp(stat - max_stat) / sub) for key, stat in stats.items()],
                               reverse=True, key=lambda x: x[1])
                self.words_stats[word] = probs

        return self.words_stats

    def calc_specific_ngrams(self, threshold=2):  # determines ngrams specific for parts of speech
        self.ps_ngram_lists = defaultdict(lambda: [])
        for key, value in self.ngrams_by_pos.items():
            vals = sorted(value.items(), reverse=True, key=lambda x: x[1])
            vals = [(x,y) for x,y in vals if x and x[0].isalpha()]
            if not vals:
                continue
            v1 = vals[0][1]
            v2 = vals[1][1] if len(vals)>1 else 1
            if v1/v2>threshold:
                self.ps_ngram_lists[vals[0][0]].append((v1/v2, key))
        for key, value in self.ps_ngram_lists:
            self.ps_ngram_lists[key].sort(reverse=True)
        return self.ps_ngram_lists

    def calc_specific(self, word, accept_threshold=(100,10), reject_threshold=(1000,10), excluded_ps=(), include_ps=None):
        pos_results = defaultdict(lambda: (0, 0, 0, ()))
        rej_results = defaultdict(lambda: (1, 0, 0, ()))
        excluded_ps = set(excluded_ps)
        excluded_ps.update({'EQN', 'EQNP', 'SOL', 'EOL'})
        for ngram, count in self.ngrams_by_word[word].items():
            if include_ps:
                stats = sorted([(x,y) for x, y in self.ngrams_by_pos[ngram].items() if x and x[0].isalpha() and x in include_ps], reverse=True, key=lambda z: z[1])
            else:
                stats = sorted([(x,y) for x, y in self.ngrams_by_pos[ngram].items() if x and x[0].isalpha() and x not in excluded_ps], reverse=True, key=lambda z: z[1])
            total = sum(y for x,y in stats)
            if total < accept_threshold[0] or count<accept_threshold[1]:
                continue

            mx = stats[0][1]
            mx2 = stats[1][1] if len(stats)>1 else 1
            pos_results[stats[0][0]] = max(pos_results[stats[0][0]], (mx/mx2, total, count, ngram))
            for x, y in stats[1:]:
                pos_results[x] = max(pos_results[x], (y/mx, total, count, ngram))
                if total >= reject_threshold[0] and count>=reject_threshold[1]:
                    rej_results[x] = min(rej_results[x], (y/mx, total, count, ngram))
        return pos_results, rej_results



    def save_stats(self, path=None):
        path = path or f'word_stats_{self.n}.pickle'
        with open(path, 'wb') as f:
            pickle.dump(self.words_stats, f)

    def load_stats(self, path=None):
        path = path or f'word_stats_{self.n}.pickle'
        with open(path, 'rb') as f:
            self.words_stats = pickle.load(f)
        return self

    def print_stats(self, word, threshold=0.01):
        if word not in self.words_stats:
            print(f'{word} not found')
            return
        print(f'{word}:')
        ps_width = max(len(ps) for ps in self.words_stats[word])
        for ps, prob in self.words_stats[word]:
            if prob > threshold:
                print(f'{ps:<{ps_width}}: {prob:.3f}')


def prepare_tex_file_contents(filename):
    document = TexDocument(filename=filename)
    # collect text and math environments
    text_segments = []
    eqn_counter = 0
    for env in document.items_and_envs([TextFragment], math_env_names):
        if isinstance(env, TextFragment):
            text_segments.append(env.remove_formatting_macros())
        else:
            eqn_counter += 1
            text_segments.append(f"equation_{eqn_counter}")
            last_frag = env.items[-1]
            if isinstance(last_frag, TextFragment) and last_frag.text.count('.'):
                text_segments.append('.')

    return " ".join(text_segments)


# collect ngram stats from file
def collect_ngram_stats(n: int, path: str):
    stats = NgramStats(n)
    if path.endswith('.tex'):
        contents = prepare_tex_file_contents(path)
    else:
        with open(path, 'r') as f:
            contents = f.read()
    tokenized = tokenize_text_nltk(contents)
    print(f'{len(tokenized)} sentences; collecting stats for {n}-grams')
    stats.add_tokenized_text(tokenized)
    print('collecting stats for words')
    stats.collect_stats_for_words()
    print('done')
    return stats


# iterate over files in .tar file
import tarfile
def iterate_tar_contents(filename):
    tar = tarfile.open(filename)
    try:
        while member := tar.next():
            if member.isdir():
                continue
            content = tar.extractfile(member).read().decode("utf-8")
            yield member.name, content
    finally:
        tar.close()


# collect ngram stats from first max_num items of .tar file
def collect_ngram_stats_from_tar(n: int, path: str, max_num: int):
    stats = NgramStats(n)
    print(f'collecting {n}-gram stats from {path}')
    for i, (name, content) in enumerate(iterate_tar_contents(path)):
        stats.add_tokenized_text(tokenize_text_nltk(content))
        if i >= max_num:
            break
        if (i+1)%100 == 0:
            print(f'{i+1} files processed')
    print('collecting stats for words')
    stats.collect_stats_for_words()
    print('done')
    return stats


def collect_ngram_stats_from_texts(n, texts: Union[list, str], max_num=None, stats_for_words=False):
    if isinstance(texts, str):
        with open(texts, 'rb') as f:
            texts = pickle.load(f)
    stats = NgramStats(n)
    print(f'collecting {n}-gram stats from {len(texts)} texts')
    for i, text in enumerate(texts):
        stats.add_tokenized_text(text)
        if max_num is not None and i >= max_num:
            break
        if (i+1)%100 == 0:
            print(f'{i+1} texts processed')
    if stats_for_words:
        print('collecting stats for words')
        stats.collect_stats_for_words()
    print('done')
    return stats


def load_tag_texts(path: str, max_num: int):
    texts = []
    for i, (name, content) in enumerate(iterate_tar_contents(path)):
        if i >= max_num:
            break
        texts.append(tokenize_text_nltk(content))
        if (i+1)%100 == 0:
            print(f'{i+1} files processed')
    return texts

def print_pos_rej_results(pos_results, rej_results):
    pos_stats = sorted(pos_results.items(), reverse=True, key=lambda x:x[1])
    rej_stats = sorted(rej_results.items(), key=lambda x:x[1])
    print("Positive stats:")
    for key, (value, total, count, ngram) in pos_stats:
        print(f"\t{key:<5}: {value:10.3f}  | {' '.join(x or '_' for x in ngram):20}  :  {total}  {count}")

    print("Negative stats:")
    for key, (value, total, count, ngram) in rej_stats:
        print(f"\t{key:<5}: {value:10g}  | {' '.join(x or '_' for x in ngram):20}  :  {total}  {count}")

def join_pos_rej(pos, rej):
    rpos = pos[0]
    rrej = rej[0]
    for p in pos[1:]:
        for k,v in p.items():
            rpos[k] = max(rpos[k], v)
    for p in rej[1:]:
        for k,v in p.items():
            rrej[k] = min(rrej[k], v)
    print_pos_rej_results(rpos, rrej)

In [87]:
stats = collect_ngram_stats_from_texts(3, texts, max_num=1000)

collecting 3-gram stats from 1000 texts
100 texts processed
200 texts processed
300 texts processed
400 texts processed
500 texts processed
600 texts processed
700 texts processed
800 texts processed
900 texts processed
1000 texts processed
collecting stats for words
done


In [97]:
save_dict([stats], 'dict_3gram_1000.yml')

1000 words done
2000 words done
3000 words done
4000 words done
5000 words done
6000 words done


In [103]:
import gc
stats2=None
stats3=None
stats4=None
gc.collect()
stats2 = collect_ngram_stats_from_texts(2, texts, max_num=10000)
stats3 = collect_ngram_stats_from_texts(3, texts, max_num=10000)
stats4 = collect_ngram_stats_from_texts(4, texts, max_num=10000)

collecting 2-gram stats from 10000 texts
100 texts processed
200 texts processed
300 texts processed
400 texts processed
500 texts processed
600 texts processed
700 texts processed
800 texts processed
900 texts processed
1000 texts processed
1100 texts processed
1200 texts processed
1300 texts processed
1400 texts processed
1500 texts processed
1600 texts processed
1700 texts processed
1800 texts processed
1900 texts processed
2000 texts processed
2100 texts processed
2200 texts processed
2300 texts processed
2400 texts processed
2500 texts processed
2600 texts processed
2700 texts processed
2800 texts processed
2900 texts processed
3000 texts processed
3100 texts processed
3200 texts processed
3300 texts processed
3400 texts processed
3500 texts processed
3600 texts processed
3700 texts processed
3800 texts processed
3900 texts processed
4000 texts processed
4100 texts processed
4200 texts processed
4300 texts processed
4400 texts processed
4500 texts processed
4600 texts processed
47

In [111]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

stop_words = set(stopwords.words('english'))


def save_dict(statss: List[NgramStats], file: str, accept_threshold=(100, 10), reject_threshold=(100, 10), threshold=2, excluded_ps=('NNP', 'NNPS'), filtered_pos=None):
    res_dict = {}
    with open(file, 'w') as f:
        nw = 0
        for word in sorted(statss[0].words):
            if not word or not word.isalpha() or word[0].isupper():
                continue
            # if word is stopword, skip it
            if word in stop_words:
                continue
            #print(f'word={word}')
            pos = [None]*len(statss)
            include_ps = [p for p in filtered_pos[word] if p not in excluded_ps] if filtered_pos and word in filtered_pos else None
            for i, stats in enumerate(statss):
                pos[i], _ = NgramStats.calc_specific(stats, word, accept_threshold=accept_threshold, reject_threshold=reject_threshold, excluded_ps=excluded_ps, include_ps=include_ps)
            rpos = pos[0]
            for p in pos[1:]:
                for k,v in p.items():
                    rpos[k] = max(rpos[k], v)
            res = sorted([(v[0],k) for k,v in rpos.items() if v[0]>threshold], reverse=True)
            if res:
                res_dict[word] = {k: v for v,k in res}
                f.write(f'{word}: { {k:v for v,k in res} }\n')
                nw += 1
                if nw%1000 == 0:
                    print(f'{nw} words done')
                    f.flush()
    return res_dict

In [113]:
filtered2 = save_dict([stats2,stats3,stats4], 'dict_2-4gram_10000-f.yml', filtered_pos=filtered)

1000 words done
2000 words done
3000 words done
4000 words done
5000 words done
6000 words done
7000 words done
8000 words done
9000 words done
10000 words done
11000 words done
12000 words done
13000 words done
14000 words done
15000 words done


In [116]:
word = 'diffuse'
pos2, rej2 = stats2.calc_specific(word, accept_threshold=(100, 20), reject_threshold=(100, 5))
pos3, rej3 = stats3.calc_specific(word, accept_threshold=(100, 20), reject_threshold=(100, 5))
pos4, rej4 = stats4.calc_specific(word, accept_threshold=(100, 20), reject_threshold=(100, 5))
join_pos_rej([pos2,pos3,pos4], [rej2,rej3,rej4])

Positive stats:
	JJ   :      6.473  | =the _ JJ NN          :  94603  50
	VB   :      6.440  | =to _                 :  505933  57
	NN   :      6.124  | =a JJ _               :  366325  20
	NNP  :      4.468  | _ NNP                 :  4868162  131
	VBP  :      2.102  | NNS _                 :  775541  41
	SOL  :      1.368  | _ RB                  :  969914  37
	EQN  :      1.262  | =and _                :  795909  52
	VBZ  :      0.731  | _ RB                  :  969914  37
	VBN  :      0.718  | RB _                  :  774420  48
	NNS  :      0.637  | NN =of _              :  571061  64
	RB   :      0.529  | _ JJ                  :  1807637  166
	CD   :      0.305  | , _                   :  1737141  37
	MD   :      0.231  | _ RB                  :  969914  37
	VBG  :      0.205  | _ JJ NNS              :  278857  24
	VBD  :      0.140  | NNS _                 :  775541  41
	FW   :      0.107  | NNP _ NN              :  432452  23
	RBS  :      0.096  | =the _ JJ             :  16412

In [15]:
for w in ['commute','denote','recall','decrease','increases']:
    stats.print_stats(w)
    #stats3.print_stats(w)
    #stats4.print_stats(w)
    print("=============================================")

commute:
NN: 0.152
NNS: 0.095
RB: 0.083
JJ: 0.083
VBN: 0.065
NNP: 0.064
VBP: 0.053
VB: 0.049
VBZ: 0.049
CD: 0.038
VBG: 0.036
VBD: 0.030
MD: 0.023
JJR: 0.019
RBR: 0.018
FW: 0.017
RP: 0.016
JJS: 0.016
POS: 0.016
SYM: 0.016
NNPS: 0.015
RBS: 0.015
UH: 0.015
LS: 0.015
denote:
VB: 0.138
VBP: 0.119
VBZ: 0.073
NN: 0.072
RB: 0.063
VBG: 0.058
NNP: 0.054
JJ: 0.054
VBD: 0.046
NNS: 0.040
VBN: 0.039
CD: 0.032
MD: 0.023
JJR: 0.019
FW: 0.018
JJS: 0.018
RP: 0.018
RBR: 0.018
RBS: 0.017
POS: 0.017
NNPS: 0.016
SYM: 0.016
UH: 0.016
LS: 0.016
recall:
VB: 0.124
VBP: 0.109
NN: 0.098
JJ: 0.069
RB: 0.065
NNP: 0.063
VBG: 0.054
VBZ: 0.051
VBD: 0.047
VBN: 0.042
NNS: 0.038
CD: 0.032
MD: 0.021
JJR: 0.019
RP: 0.019
RBR: 0.018
JJS: 0.018
FW: 0.017
UH: 0.016
POS: 0.016
RBS: 0.016
NNPS: 0.016
SYM: 0.016
LS: 0.016
decrease:
NN: 0.359
NNS: 0.114
JJ: 0.072
NNP: 0.067
RB: 0.043
VB: 0.042
VBN: 0.034
VBZ: 0.033
VBG: 0.029
CD: 0.029
VBP: 0.025
VBD: 0.020
JJR: 0.015
MD: 0.015
JJS: 0.013
RBR: 0.013
RP: 0.012
FW: 0.011
RBS: 0.011

In [None]:
texts = load_tag_texts("/Users/Gleb/Desktop/Solver/2020-09-08-arxiv-extracts-nofallback-until-2007-068.tar", max_num=10000)

In [21]:
# dump texts to file
with open('texts.pickle', 'wb') as f:
    pickle.dump(texts, f)

In [98]:
# load texts from file
with open('texts.pickle', 'rb') as f:
    texts = pickle.load(f)

In [5]:
import gc

texts = list(texts[:1000])
gc.collect()

2467700

In [28]:
stats = collect_ngram_stats(1, 'tests/main.tex')

226 sentences; collecting stats for 1-grams
collecting stats for words
done


In [None]:
for key, value in sorted(stats.ngrams_by_word['commute'].items(), key=lambda x: x[1], reverse=True):
    print(f'{" ".join(str(x or "_") for x in key)}: {value}')
    for ps, num in sorted(stats.ngrams_by_pos[key].items(), key=lambda x: x[1], reverse=True)[:3]:
        print(f'\t{ps}: {num}')

In [10]:
sum(1 for w in stats.words if w and w[0].isalpha())

136524

In [16]:
with open('word_stats_2.dump', 'wb') as f:
    pickle.dump(stats, f)

AttributeError: Can't pickle local object 'NgramStats.collect_stats_for_words.<locals>.<lambda>'

In [17]:
set(sum(map(list, stats.ngrams.keys()), []))

{'#',
 '$',
 "''",
 '(',
 ')',
 ',',
 '.',
 ':',
 'CC',
 'CD',
 'DT',
 'EG',
 'EQN',
 'EQNP',
 'EX',
 'FOOTNOTE',
 'FW',
 'IE',
 'IN',
 'JJ',
 'JJR',
 'JJS',
 'LS',
 'MD',
 'NN',
 'NNP',
 'NNPS',
 'NNS',
 'PDT',
 'POS',
 'PRP',
 'PRP$',
 'RB',
 'RBR',
 'RBS',
 'RP',
 'SOL',
 'SYM',
 'TO',
 'UH',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 'WDT',
 'WP',
 'WP$',
 'WRB',
 '``'}

In [312]:
import random
#neural network predicting part of speech for a given word (input is a word)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

# output is probabilities of tags: 'FW', 'IE', 'JJ', 'JJR', 'JJS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'POS',
# 'RB', 'RBR', 'RBS', 'RP', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'
# network consists of k fully connected layers, each with n neurons, last layer has a neuron for each tag
# input is a word, output is a vector of probabilities for each tag
pos_list = ['FW', 'IE', 'JJ', 'JJR', 'JJS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'POS', 'RB', 'RBR', 'RBS', 'RP', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
class PosNet:
    def __init__(self, n: List[int], max_word_len: int = 20):
        self.k = len(n)-1
        self.n = n
        self.max_word_len = max_word_len
        self.net = nn.Sequential()
        # each letter of input word is mapped to a vector of length 26 (26 letters in alphabet)
        self.net.add_module('input', nn.Linear(max_word_len*26, n[0]))
        for i in range(self.k):
            self.net.add_module(f'fc{i}', nn.Linear(n[i], n[i+1]))
            self.net.add_module(f'relu{i}', nn.ReLU())
            self.net.add_module(f'dropout{i}', nn.Dropout(0.5))

        self.net.add_module('fc_out', nn.Linear(n[-1], len(pos_list)))
        self.net.add_module('relu_out', nn.ReLU())
        #self.net.add_module('sigmoid', nn.Sigmoid())
        self.net.add_module('softmax', nn.Softmax(dim=1))
        print(self.net)


    def train(self, data, epochs=10, test_data=None):
        self.net.train()
        optimizer = optim.Adam(self.net.parameters(), lr=0.01)

        # convert data to vector of size 27*max_word_len (word is aligned to the end)
        input = torch.tensor([self.get_letter_vector(w) for w, _ in data], dtype=torch.float32)
        # convert tags to vector of size len(pos_list)
        target = torch.tensor([self.get_tag_vector(t) for _, t in data], dtype=torch.float32)
        positive_idx = [i for i, t in enumerate(target) if sum(t) > 0]
        input = input[positive_idx]
        target = target[positive_idx]
        if test_data:
            test_input = torch.tensor([self.get_letter_vector(w) for w, _ in test_data], dtype=torch.float32)
            test_target = torch.tensor([self.get_tag_vector(t) for _, t in test_data], dtype=torch.float32)
            test_positive_idx = [i for i, t in enumerate(test_target) if sum(t) > 0]
            test_input = test_input[test_positive_idx]
            test_target = test_target[test_positive_idx]
        else:
            test_input = None
            test_target = None

        print(f'{len(input)} samples')
        test_res = []
        min_test = 1e10
        for epoch in range(epochs):
            # train
            optimizer.zero_grad()
            output = self.net(input)
            loss = F.mse_loss(output, target)
            #loss /= len(data)
            loss.backward()
            optimizer.step()
            if test_data:
                test_output = self.net(test_input)
                test_loss = F.mse_loss(test_output, test_target)
                test_res.append(test_loss.item())
            # print loss and check if it is converged
            if epoch % 10 == 9:
                if test_data:
                    mean_test = np.mean(test_res[-10:])
                    print(f'epoch {epoch+1:4}: loss {loss.item():7.4f};\t test loss {mean_test:7.4f}')
                    if len(test_res)>=20 and mean_test > min_test+0.001:
                        print('early stopping')
                        break
                    min_test = min(min_test, mean_test)
                else:
                    print(f'epoch {epoch+1}: loss {loss.item():.4f}')

        self.net.eval()
        if test_data:
            return test_res[-1]

    def validate(self, data):
        self.net.eval()
        input = torch.tensor([self.get_letter_vector(w) for w, _ in data], dtype=torch.float32)
        target = torch.tensor([self.get_tag_vector(t) for _, t in data], dtype=torch.float32)
        positive_idx = [i for i, t in enumerate(target) if sum(t) > 0]
        input = input[positive_idx]
        target = target[positive_idx]
        output = self.net(input)
        loss = F.mse_loss(output, target)
        print(f'loss {loss}')
        return loss

    def test(self, word, max_items=None):
        if max_items is None:
            max_items = len(pos_list)
        self.net.eval()
        input = torch.tensor([self.get_letter_vector(word)], dtype=torch.float32)
        output = self.net(input)[0]
        #print parts of speech for each word with highest probabilities
        res = sorted(zip(pos_list, output), key=lambda x: x[1], reverse=True)
        for i in range(min(max_items, len(res))):
            print(f'{i+1:3}. {res[i][0]:5} {res[i][1]:.4f}') # pos, probability
        return res

    def get_letter_vector(self, word):
        res = [0]*26*self.max_word_len
        start = (self.max_word_len - len(word))*26
        for i, c in enumerate(word):
            res[start+i*26+ord(c)-ord('a')] = 1
        return res

    def get_tag_vector(self, tags: Dict[str, float], threshold=5):
        res = torch.zeros(len(pos_list), dtype=torch.float32)
        for tag, prob in tags.items():
            if tag in pos_list and prob > threshold:
                res[pos_list.index(tag)] = 1
        if res.sum() > 0:
            res /= res.sum()
        return list(res)

    # redurns a list of tags with their probabilities in decreasing order
    def predict(self, word):
        self.net.eval()
        input = self.get_letter_vector(word)
        output = self.net(input)
        return [(pos_list[i], output[0, i].item()) for i in range(len(pos_list))]

train_data = [(k,v) for k,v in filtered.items() if len(k) <= 20 and k.isalpha() and all(ord('a')<=ord(c)<=ord('z') for c in k)]
# shuffle train_data
random.shuffle(train_data)
test_data = train_data[:1000]
train_data = train_data[1000:]

# train the network
net = PosNet(n=[100,50], max_word_len=20)
net.train(train_data, epochs=1000, test_data=test_data)

# test the network
test_loss = net.validate(test_data)
#print(f'test loss: {test_loss}')

Sequential(
  (input): Linear(in_features=520, out_features=100, bias=True)
  (fc0): Linear(in_features=100, out_features=50, bias=True)
  (relu0): ReLU()
  (dropout0): Dropout(p=0.5, inplace=False)
  (fc_out): Linear(in_features=50, out_features=21, bias=True)
  (relu_out): ReLU()
  (softmax): Softmax(dim=1)
)
9196 samples
epoch   10: loss  0.0237;	 test loss  0.0248
epoch   20: loss  0.0196;	 test loss  0.0208
epoch   30: loss  0.0177;	 test loss  0.0187
epoch   40: loss  0.0168;	 test loss  0.0182
epoch   50: loss  0.0163;	 test loss  0.0180
epoch   60: loss  0.0160;	 test loss  0.0180
epoch   70: loss  0.0156;	 test loss  0.0181
epoch   80: loss  0.0152;	 test loss  0.0181
epoch   90: loss  0.0150;	 test loss  0.0183
epoch  100: loss  0.0146;	 test loss  0.0186
epoch  110: loss  0.0144;	 test loss  0.0186
epoch  120: loss  0.0142;	 test loss  0.0186
epoch  130: loss  0.0140;	 test loss  0.0188
epoch  140: loss  0.0138;	 test loss  0.0188
epoch  150: loss  0.0137;	 test loss  0.0190

In [230]:
#net = PosNet(k=2, n=100, max_word_len=20)
net.net.eval()
net.net(torch.tensor([net.get_letter_vector('tensorly')], dtype=torch.float32)).detach().numpy()

array([[0.01450993, 0.01450993, 0.0496132 , 0.01450993, 0.01450993,
        0.01450993, 0.08943307, 0.01450993, 0.01450993, 0.01450993,
        0.01450993, 0.43111312, 0.01450993, 0.01450993, 0.01450993,
        0.1482599 , 0.01450993, 0.01450993, 0.026211  , 0.03772076,
        0.01450993]], dtype=float32)

In [199]:
del stats2
del stats3
del stats4
gc.collect()

21692284

In [258]:
net.net.state_dict()

OrderedDict([('input.weight',
              tensor([[ 0.0171, -0.0024, -0.0251,  ...,  0.1406, -0.2134,  0.0371],
                      [ 0.0186,  0.0428,  0.0091,  ...,  0.0467,  0.0746,  0.1979],
                      [ 0.0193,  0.0410,  0.0380,  ...,  0.0066,  0.1092, -0.0121],
                      ...,
                      [-0.0269,  0.0293, -0.0058,  ...,  0.0713, -0.1289,  0.0541],
                      [-0.0101,  0.0206, -0.0212,  ..., -0.1208, -0.0140, -0.1371],
                      [ 0.0188,  0.0199,  0.0094,  ...,  0.0652,  0.0213,  0.2003]])),
             ('input.bias',
              tensor([ 0.0643,  0.0928,  0.1025, -0.0282, -0.0124,  0.0623,  0.1137,  0.0053,
                       0.1082,  0.0482,  0.0822,  0.1060, -0.0467,  0.0660, -0.0989, -0.0980,
                      -0.1000, -0.0574,  0.0804,  0.1122, -0.0659, -0.0944,  0.0813, -0.1261,
                      -0.1304,  0.0544, -0.0825, -0.1079,  0.0181, -0.1151, -0.1173,  0.0976,
                       0.0352, -

In [325]:
'abcd'.translate(str.maketrans({'ab':'2','c':'5','d':'a'}))

ValueError: string keys in translate table must be of length 1

In [323]:
s = 'one two one two one'

print(s.translate(str.maketrans({'o': 'O', 't': 'T'})))

One TwO One TwO One


In [3]:
import os
import pickle
# read parse trees from files from directory results/*.pickle

def read_trees(dir):
    trees = []
    for f in os.listdir(dir):
        if f.endswith('.pickle'):
            with open(os.path.join(dir, f), 'rb') as f:
                trees.append(pickle.load(f))
    return trees

trees = read_trees('results')

ModuleNotFoundError: No module named '__mp_main__'

In [5]:
import pickle
mobypos_dict = {}
mobypos_comb_dict = {}
with open('dicts/mobypos.txt', 'r', encoding='ascii',errors='ignore') as f:
    for line in f:
        word, tag = line.split('\\')
        if ' ' in word:
            mobypos_comb_dict[word] = tag
        else:
            mobypos_dict[word] = tag

# dump the dictionaries to pickle file
with open('dicts/mobypos.pickle', 'wb') as f:
    pickle.dump((mobypos_dict, mobypos_comb_dict), f)

In [1]:
import tagger
dicts = tagger.Dictionaries()
dicts.load_from_src('dicts')
dicts.save_json_compressed('dicts/dicts.gz')

In [9]:
len(mobypos_comb_dict)

35939