In [1]:
from functional import seq
from functional.streams import Sequence
import pandas as pd
import numpy as np
import os
from typing import Dict, List, Tuple, Optional
import random
from collections import namedtuple
from random import shuffle
import nltk

In [2]:
ustawy_dir = "../lower_ustawy"
art_keyword = "art"
ustawy_files = seq(os.listdir(ustawy_dir)).map(
    lambda filename: open(ustawy_dir + "/" + filename).read())

ustawy_files.size()

1180

In [3]:
good, bad = ustawy_files.partition(lambda x: art_keyword in x[:2400])

In [4]:
good.size()

1178

In [5]:
bad.size()

2

In [6]:
bad[0][:300].split("\n")

['',
 'ustawa',
 'z dnia 11 października 2013 r ',
 'o wzajemnej pomocy przy dochodzeniu podatków  należności ',
 'celnych i innych należności pieniężnych',
 '',
 '     ',
 'font definitions   ',
 '  font face',
 '\t font family helvetica ',
 '\tpanose 1 2 11 5 4 2 2 2 2 2 4  ',
 ' font face',
 '\t font family courier ',
 '\tpanose 1 2 7 4 9 2 2 5 2 4 4  ',
 '']

In [7]:
bad[1][:300].split("\n")

['', '', '', '', '', 'brak tekstu w postaci elektronicznej ', '']

In [8]:
def is_change(ustawa:str) -> bool:
    return "o zmianie ustawy" in ustawa[:800]

In [9]:
changes, not_changes = good.partition(is_change)

In [10]:
Labeled =  namedtuple("Labeled","text is_change")

In [11]:
def strip_title(text:str) -> str:
    return text.split(art_keyword,maxsplit=1)[1]

In [12]:
labeled_changes = changes.map(lambda txt: Labeled(text = strip_title(txt), is_change = True))

In [13]:
labeled_not_changes = not_changes.map(lambda txt: Labeled(text = strip_title(txt), is_change = False))

In [14]:
data = (labeled_changes + labeled_not_changes).to_list()

In [15]:
shuffle(data)

In [16]:
def split(data):
    data_len = len(data)
    train_size = int(0.6 * data_len)
    validation_size = int(0.2 * data_len)

    train = data[:train_size]
    validation = data[train_size:train_size + validation_size]
    test = data[train_size + validation_size:]
    return train,test,validation
    
# train, test,validation = split(data)
# len(train),len(validation),len(test)

In [17]:
from random import choices # todo:bcm - take without replacements
def full_selector(text):
    return text

def percentage_selector(text):
    lines = text.split("\n")
    take_count = int(0.1* len(lines))
    return "\n".join(choices(lines,k= take_count))

def lines_selector(text):
    lines = text.split("\n")
    return "\n".join(choices(lines,k= 10))
    
def line_selector(text):
    lines = text.split("\n")
    lines_len = len(lines)
    return lines[random.randint(0,lines_len-1)]

In [18]:
Selector = namedtuple("Selector", "name selector")

In [19]:
selectors = seq([
    Selector(name = "full", selector = full_selector),
    Selector(name = "percentage", selector = percentage_selector),
    Selector(name = "lines", selector = lines_selector),
    Selector(name = "line", selector = line_selector),
])

## Learning things

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support

In [21]:
# TODO - use or loose it 
def grid_search(train_x, train_y, test_x, test_y,  parameters, pipeline):
    grid_search_tune = GridSearchCV(
        pipeline, parameters, cv=2, n_jobs=3, verbose=10)
    grid_search_tune.fit(train_x, train_y)

    print("Best parameters set:")
    print(grid_search_tune.best_estimator_.steps)

    print("Applying best classifier on test data:")
    best_clf = grid_search_tune.best_estimator_
    predictions = best_clf.predict(test_x)

    print(classification_report(test_y, predictions))

In [22]:
sw_file = open("polish.stopwords.txt")
stop_words = set(word[:-1] for word in sw_file.readlines())
sw_file.close()
list(stop_words)[:3]

['cały', 'cokolwiek', 'nad']

In [23]:
# nltk.download('punkt')
tokenized = [Labeled(text = nltk.word_tokenize(x.text), is_change= x.is_change) for x in data]

In [24]:
def map_text(mapper, data):
    return [Labeled(text = mapper(x.text), is_change= x.is_change) for x in data]

In [25]:
# train_x = [x.text for x in train]
# y_train = [x.is_change for x in train]
# test_x = [x.text for x in test]
# y_test = [x.is_change for x in test]

In [26]:
def split2(x,y):
    data_len = len(y)
    train_size = int(0.6 * data_len)
    validation_size = int(0.2 * data_len)

    train = x[:train_size]
    validation = x[train_size:train_size + validation_size]
    test = x[train_size + validation_size:]
    
    y_train = y[:train_size]
    y_validation = y[train_size:train_size + validation_size]
    y_test = y[train_size + validation_size:]
    
    
    return (train,test,validation), (y_train,y_test,y_validation)

In [27]:
xs = [x.text for x in data]
ys = [x.is_change for x in data]

In [28]:
from collections import Counter
word_counter = Counter()

for labeled_doc in tokenized:
    for word in labeled_doc.text:
        word_counter[word] += 1
    

In [29]:
words_for_vocab = int(8* len(word_counter))
words_for_vocab

489032

In [30]:
vocab = [word for word,_ in word_counter.most_common(words_for_vocab)]

In [31]:
idf = TfidfVectorizer(stop_words=stop_words,vocabulary=vocab)
idfed = idf.fit_transform(xs)

In [32]:
(tr_x,test_x,val_x),(tr_y,test_y,val_y)= split2(idfed,ys)

In [33]:
from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression()
logistic.fit(tr_x,tr_y)
logistic.score(test_x,test_y)



0.8438818565400844

In [34]:
svc = SVC(kernel= 'linear')#OneVsRestClassifier(SVC(), n_jobs=1)
svc.fit(tr_x,tr_y)
predicted_y = svc.predict(test_x)

In [35]:
def evaluate(predicted,expected):
    prec,recall,fbeta,support = precision_recall_fscore_support(y_true=expected,y_pred = predicted,average='weighted')
   

    print("F1 score: {}".format(fbeta))
    print("Precission: {}".format(prec))
    print("Recall: {}".format(recall))

In [36]:
evaluate(predicted_y,test_y)

F1 score: 0.8523996916698404
Precission: 0.8527519390335958
Recall: 0.8523206751054853


# Fasttext

In [37]:
import fastText

In [38]:
oneline_data = map_text(lambda text: text.replace("\n", " "),data)
oneline_data[0]

Labeled(text='  1   w ustawie z dnia 28 marca 2003 r  o transporcie kolejowym  dz  u  z 2007 r  nr 16  poz  94  z późn  zm  1    wprowadza się następujące zmiany  1 \xa0\xa0 w art  4  a \xa0\xa0 po pkt 6e dodaje się pkt 6f w brzmieniu   6f  pojazd kolejowy specjalny   pojazd kolejowy przeznaczony do utrzymania  naprawy lub budowy infrastruktury kolejowej  lub przeznaczony do prowadzenia działań ratowniczych    b \xa0\xa0 pkt 9a otrzymuje brzmienie   9a  usługa trakcyjna   działalność przewoźnika kolejowego polegającą na zapewnieniu pojazdu kolejowego z napędem wraz z obsługą maszynistów do wykonywania przewozu kolejowego albo zapewnienie obsługi maszynistów do prowadzenia pojazdu kolejowego z napędem    2 \xa0\xa0 w art  13  a \xa0\xa0 w ust  1a   \xa0\xa0\xa0 pkt 2 otrzymuje brzmienie   2 \xa0\xa0\xa0 wydawanie  przedłużanie ważności  zawieszanie  przywracanie i cofanie licencji maszynisty  aktualizacja danych zawartych w licencji maszynisty  wydawanie wtórników licencji maszynisty or

In [39]:
def to_fast_text(x: Labeled)-> str :
    label = "1" if x.is_change else "0"
    replaced = x.text.replace("\"","\"\"")
    return "__label__{}, \"{}\"\n".format(label,replaced)

In [40]:
data_len = len(oneline_data)
train_size = int(0.6 * data_len)
validation_size = int(0.2 * data_len)

In [41]:
ol_train = oneline_data[:train_size]
ol_test = oneline_data[train_size:train_size+ validation_size]
ol_val = oneline_data[validation_size+ train_size:]

In [42]:
def ol_to_file(name:str, ol:List[Labeled])-> None:
    file = open("data.bak/{}".format(name),"w")

    for t in ol:
        file.write(to_fast_text(t))
    file.close()

In [43]:
ol_to_file("train.csv",ol_train)
ol_to_file("test.csv",ol_test)
ol_to_file("val.csv",ol_val)

In [44]:
model = fastText.train_supervised("data.bak/train.csv",lr=0.7,epoch = 20)

In [45]:
test_result = model.test("data.bak/test.csv")

In [46]:
# TODO:bcm - compute my own precision
test_result

(235, 0.8127659574468085, 0.8127659574468085)

# Flair

In [47]:
ROOT_URL = 'https://s3.eu-central-1.amazonaws.com/borchmann'
FORWARD_FILE = 'cse/lm-polish-forward-v0.2.pt'
BACKWARD_FILE = 'cse/lm-polish-backward-v0.2.pt'
GLOVE_FILE = 'glove/poleval.txt'

FORWARD_LM = f'{ROOT_URL}/{FORWARD_FILE}'
BACKWARD_LM = f'{ROOT_URL}/{BACKWARD_FILE}'
GLOVE = f'{ROOT_URL}/{GLOVE_FILE}'

print(FORWARD_LM)
print(BACKWARD_LM)
print(GLOVE)

https://s3.eu-central-1.amazonaws.com/borchmann/cse/lm-polish-forward-v0.2.pt
https://s3.eu-central-1.amazonaws.com/borchmann/cse/lm-polish-backward-v0.2.pt
https://s3.eu-central-1.amazonaws.com/borchmann/glove/poleval.txt


In [48]:
# ! wget https://s3.eu-central-1.amazonaws.com/borchmann/cse/lm-polish-forward-v0.2.pt -O data.bak/polish_forward.pt
# ! wget https://s3.eu-central-1.amazonaws.com/borchmann/cse/lm-polish-backward-v0.2.pt -O data.bak/polish_backward.pt
# ! wget https://s3.eu-central-1.amazonaws.com/borchmann/glove/poleval.txt -O data.bak/glove.txt


In [49]:
from pathlib import Path

In [50]:
from flair.data_fetcher import NLPTaskDataFetcher
from flair.trainers import ModelTrainer
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.embeddings import StackedEmbeddings, CharLMEmbeddings, TokenEmbeddings   

AttributeError: module 'wrapt' has no attribute 'AdapterFactory'

In [None]:
corpus = NLPTaskDataFetcher.load_classification_corpus(
    Path('./data.bak'),
    test_file='test.csv',
    dev_file='val.csv',
    train_file='train.csv'
)

word_embeddings = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward-fast'),
    FlairEmbeddings('news-backward-fast')
]

document_embeddings = DocumentLSTMEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256
)

classifier = TextClassifier(
    document_embeddings,
    label_dictionary=corpus.make_label_dictionary(),
    multi_label=False
)

trainer = ModelTrainer(classifier, corpus)

trainer.train('./data.', max_epochs=10)

In [51]:
import wrapt

In [53]:
help(wrapt)

Help on package wrapt:

NAME
    wrapt

PACKAGE CONTENTS
    _wrappers
    arguments
    decorators
    importer
    wrappers

DATA
    __version_info__ = ('1', '9', '0')

VERSION
    1.9.0

FILE
    /usr/lib/python3/dist-packages/wrapt/__init__.py


