In [33]:
from functional import seq
from functional.streams import Sequence
import pandas as pd
import numpy as np
import os
from typing import Dict, List, Tuple, Optional
import random
from collections import namedtuple
from random import shuffle
import pandas as pd
from IPython.core.display import HTML

# Preprocessing

In [34]:
ustawy_dir = "../lower_ustawy"
art_keyword = "art"


def _read_ustawa(filename):
    """Return the whole text of `filename` inside `ustawy_dir`.

    The with-block closes the handle as soon as the text is read, so handles
    do not stay open while the sequence is evaluated.
    """
    with open(os.path.join(ustawy_dir, filename)) as ustawa_file:
        return ustawa_file.read()


# One string per file found in ustawy_dir.
ustawy_files = seq(os.listdir(ustawy_dir)).map(_read_ustawa)

ustawy_files.size()

1180

In [35]:
good, bad = ustawy_files.partition(lambda x: art_keyword in x[:2400])

In [36]:
good.size()

1178

In [37]:
bad.size()

2

In [38]:
bad[0][:300].split("\n")

['',
 'ustawa',
 'z dnia 11 października 2013 r ',
 'o wzajemnej pomocy przy dochodzeniu podatków  należności ',
 'celnych i innych należności pieniężnych',
 '',
 '     ',
 'font definitions   ',
 '  font face',
 '\t font family helvetica ',
 '\tpanose 1 2 11 5 4 2 2 2 2 2 4  ',
 ' font face',
 '\t font family courier ',
 '\tpanose 1 2 7 4 9 2 2 5 2 4 4  ',
 '']

In [39]:
bad[1][:300].split("\n")

['', '', '', '', '', 'brak tekstu w postaci elektronicznej ', '']

In [40]:
def is_change(ustawa:str) -> bool:
    """Tell whether the phrase "o zmianie ustawy" occurs in the first 800 characters of `ustawa`."""
    header = ustawa[:800]
    return header.find("o zmianie ustawy") != -1

In [41]:
changes, not_changes = good.partition(is_change)

In [42]:
Labeled =  namedtuple("Labeled","text is_change")

In [43]:
def strip_title(text:str) -> str:
    """Return everything that follows the first occurrence of `art_keyword` in `text`.

    Raises IndexError when the keyword does not occur in `text`.
    """
    pieces = text.split(art_keyword, maxsplit=1)
    return pieces[1]

In [44]:
labeled_changes = changes.map(lambda txt: Labeled(text = strip_title(txt), is_change = True))

In [45]:
labeled_not_changes = not_changes.map(lambda txt: Labeled(text = strip_title(txt), is_change = False))

In [46]:
data = (labeled_changes + labeled_not_changes).to_list()

In [47]:
# Seed the global RNG first: the train/validation/test parts are taken from
# `data` by position, so an unseeded shuffle would give different splits
# after every Restart & Run All.
random.seed(42)
shuffle(data)

## Selectors

In [48]:
# todo:bcm - use it or lose it
def split(data):
    """Slice `data` by position into a 60% / 20% / 20% partition.

    Returns the tuple ``(train, test, validation)``: train is the leading 60%,
    validation the following 20%, and test everything that remains.
    """
    total = len(data)
    train_end = int(0.6 * total)
    validation_end = train_end + int(0.2 * total)
    return data[:train_end], data[validation_end:], data[train_end:validation_end]
    
# train, test,validation = split(data)
# len(train),len(validation),len(test)

In [49]:
from random import choices # todo:bcm - take without replacements
def full_selector(text: str) -> str:
    """Selector that keeps the whole text: returns `text` unchanged."""
    return text

def percentage_selector(text, fraction=0.1):
    """Return a random subset of the lines of `text`, joined with newlines.

    Args:
        text: newline-separated document.
        fraction: share of the lines to keep (default 0.1, i.e. 10%); the
            resulting count is truncated to an int and clamped to the number
            of available lines.

    Returns:
        The chosen lines in random order, each source line used at most once.
        An empty string when the computed count is 0.
    """
    lines = text.split("\n")
    take_count = max(0, min(len(lines), int(fraction * len(lines))))
    # random.sample draws without replacement, so one line cannot be picked twice.
    return "\n".join(random.sample(lines, take_count))

def lines_selector(text, count=10):
    """Return up to `count` distinct randomly chosen lines of `text`, joined with newlines.

    Args:
        text: newline-separated document.
        count: number of lines to keep (default 10). When `text` has fewer
            lines than that, every line is returned exactly once.

    Returns:
        The chosen lines in random order.
    """
    lines = text.split("\n")
    take_count = max(0, min(count, len(lines)))
    # random.sample draws without replacement, so short texts are not padded
    # with repeated lines.
    return "\n".join(random.sample(lines, take_count))
    
def line_selector(text):
    """Return one line of `text`, picked at a random position."""
    candidates = text.split("\n")
    return candidates[random.randrange(len(candidates))]

In [50]:
Selector = namedtuple("Selector", "name selector")

In [51]:
selectors = seq([
    Selector(name = "full", selector = full_selector),
    Selector(name = "percentage", selector = percentage_selector),
    Selector(name = "lines", selector = lines_selector),
    Selector(name = "line", selector = line_selector),
])

# SVM + TF-IDF

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support

### Stop words

In [53]:
# One stop word per line. rstrip("\n") removes only the line terminator, so the
# last word stays intact when the file does not end with a newline, and the
# with-block closes the file even if reading fails.
with open("polish.stopwords.txt") as sw_file:
    stop_words = set(line.rstrip("\n") for line in sw_file)
list(stop_words)[:3]

['jakaś', 'mój', 'mozna']

In [54]:
def map_text(mapper, data):
    """Apply `mapper` to the text of every item in `data`.

    Returns a new list of Labeled tuples whose `text` is ``mapper(item.text)``
    and whose `is_change` is copied from the source item.
    """
    remapped = []
    for item in data:
        remapped.append(Labeled(text=mapper(item.text), is_change=item.is_change))
    return remapped

In [55]:
def split2(x,y):
    """Slice `x` and `y` by position into matching 60% / 20% / 20% parts.

    The cut points are derived from ``len(y)`` and applied to both inputs.

    Returns:
        ``((train, test, validation), (y_train, y_test, y_validation))``.
        Train is the leading 60%, validation the following 20% and test
        whatever remains.
    """
    total = len(y)
    train_end = int(0.6 * total)
    validation_end = train_end + int(0.2 * total)

    def _three_way(items):
        # Same (train, test, validation) ordering for features and labels.
        return items[:train_end], items[validation_end:], items[train_end:validation_end]

    return _three_way(x), _three_way(y)

## Evaluation

In [56]:
def grid_search(train_x, train_y,  parameters, pipeline):
    """Fit a GridSearchCV over `parameters` for `pipeline` on the given training data.

    The search is configured with cv=2, n_jobs=3, verbose=10 and
    return_train_score=True.

    Returns:
        ``(best_estimator_, best_params_, cv_results_)`` of the fitted search.
    """
    search_options = dict(cv=2, n_jobs=3, verbose=10, return_train_score=True)
    tuner = GridSearchCV(pipeline, parameters, **search_options)
    tuner.fit(train_x, train_y)
    return tuner.best_estimator_, tuner.best_params_, tuner.cv_results_

In [57]:
def evaluate(predicted,expected):
    """Print weighted-average F1, precision and recall of `predicted` against `expected`.

    `expected` is passed to precision_recall_fscore_support as ``y_true`` and
    `predicted` as ``y_pred``. Nothing is returned.
    """
    precision, recall, f_score, _support = precision_recall_fscore_support(
        y_true=expected, y_pred=predicted, average='weighted')

    report = (("F1 score", f_score), ("Precission", precision), ("Recall", recall))
    for label, value in report:
        print("{}: {}".format(label, value))

In [58]:
def present_results(results,clf,val_x,val_y):
    """Display the grid-search score table and score `clf` on held-out data.

    Args:
        results: the ``cv_results_`` mapping of a fitted grid search. It is
            only read; the caller's dict is left untouched.
        clf: fitted estimator exposing ``predict``.
        val_x: held-out inputs passed to ``clf.predict``.
        val_y: ground-truth labels for `val_x`.

    Side effects:
        Displays a DataFrame sorted by ``rank_test_score`` and prints the
        metrics reported by ``evaluate``.
    """
    # Timing columns, the raw parameter dicts and the score spreads are left
    # out of the summary table.
    dropped = {"mean_fit_time", "std_fit_time", "std_score_time", "mean_score_time",
               "params", "std_test_score", "std_train_score"}

    def _is_per_split_score(label):
        # Matches split<i>_test_score / split<i>_train_score for any number of folds.
        return label.startswith("split") and label.endswith(("_test_score", "_train_score"))

    # Build a filtered copy instead of popping keys: the input stays intact
    # and an absent column does not raise KeyError.
    summary = {
        label: values
        for label, values in results.items()
        if label not in dropped and not _is_per_split_score(label)
    }
    frame = pd.DataFrame(summary)

    display(
        frame.sort_values("rank_test_score", ascending=True)
    )

    print("On cross validation:")
    predicted = clf.predict(val_x)
    # evaluate() takes (predicted, expected): model output first, ground truth second.
    evaluate(predicted, val_y)

In [59]:
# xs,ys = [x.text for x in data], [x.is_change for x in data]
# (tr_x,test_x,val_x),(tr_y,test_y,val_y)= split2(xs,ys)

In [60]:
# pipeline = Pipeline([
#     ('tfidf', TfidfVectorizer(stop_words=stop_words)),
#     ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=3)),
# ])
# 
# parameters = {
#     'tfidf__max_df': ( 0.25, 0.5,0.75,),
#     'tfidf__ngram_range': [(1,2),(1, 3)],
#     "clf__estimator__C": [0.1,0.15,0.2,0.25,0.3],
# }

# best_clf, best_params, results  = grid_search(tr_x + test_x, tr_y+ test_y, parameters, pipeline)

In [61]:
# best_params

In [62]:
def evaluate_linear_svc(data, params, selector: "Selector"):
    """Grid-search a TF-IDF + LinearSVC pipeline on `data` reduced by `selector`.

    Args:
        data: list of Labeled items.
        params: parameter grid handed to ``grid_search``.
        selector: Selector whose ``selector`` function is applied to every
            text and whose ``name`` is shown as the section heading.

    The train and test parts returned by ``split2`` are concatenated for the
    grid search; the validation part is used for the final report.
    """
    mapped_data = map_text(selector.selector, data)
    xs = [x.text for x in mapped_data]
    ys = [x.is_change for x in mapped_data]
    (tr_x, test_x, val_x), (tr_y, test_y, val_y) = split2(xs, ys)
    pipeline = Pipeline([
        # scikit-learn documents stop_words as a list, so pass one instead of the set.
        ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
        ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=3)),
    ])

    display(HTML("<h2>Selector: {}</h2>".format(selector.name)))

    # Search the grid received as `params`, not the notebook-level `parameters` global.
    best_clf, best_params, results = grid_search(tr_x + test_x, tr_y + test_y, params, pipeline)
    # todo:bcm - results are the same :XXXX
    print(best_params)
    present_results(results, best_clf, val_x, val_y)

In [63]:
parameters = {
    'tfidf__max_df': ( 0.25, 0.5,0.75,),
    'tfidf__ngram_range': [(1,2),(1, 3)],
    "clf__estimator__C": [0.1,0.2,0.25,0.3],
}

In [64]:
raise Exception # To avoid losing results

Exception: 

In [65]:
selectors.for_each(lambda selector: evaluate_linear_svc(data,parameters,selector))

Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:   13.6s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:   49.2s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:  1.2min
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:  1.9min
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:  2.5min


KeyboardInterrupt: 

In [None]:
raise Exception

# Fasttext

Installation:
    >

In [None]:
import fastText

In [None]:
oneline_data = map_text(lambda text: text.replace("\n", " "),data)
# oneline_data[0]

In [None]:
def to_fast_text(x: Labeled)-> str :
    """Render one labelled document as a ``__label__<0|1>, "<text>"`` line.

    The label is "1" when ``x.is_change`` is truthy, otherwise "0". Double
    quotes inside the text are doubled and the line ends with a newline.
    """
    # NOTE(review): fastText's documented input is `__label__<X> <text>` split
    # on whitespace; the comma and quotes emitted here would presumably become
    # part of the label and tokens. Confirm this is intended.
    if x.is_change:
        label = "1"
    else:
        label = "0"
    quoted_text = x.text.replace("\"","\"\"")
    return "__label__{}, \"{}\"\n".format(label, quoted_text)

In [None]:
data_len = len(oneline_data)
train_size = int(0.6 * data_len)
validation_size = int(0.2 * data_len)

In [None]:
# Positional slices of oneline_data: the first train_size items are the
# training set, the next validation_size items the *test* set and the
# remainder the *validation* set.
# NOTE(review): split2 names the middle slice "validation" and the tail
# "test", so the roles are reversed here. Confirm this is intended.
ol_train = oneline_data[:train_size]
ol_test = oneline_data[train_size:train_size+ validation_size]
ol_val = oneline_data[validation_size+ train_size:]

In [None]:
def ol_to_file(name:str, ol:List[Labeled])-> None:
    """Write every item of `ol` to ``data.bak/<name>``, one to_fast_text line each.

    Args:
        name: file name inside the ``data.bak`` directory; an existing file
            is overwritten.
        ol: labelled documents to serialise.
    """
    # The with-block closes the file even if to_fast_text or a write raises.
    with open("data.bak/{}".format(name),"w") as file:
        file.writelines(to_fast_text(t) for t in ol)

In [None]:
ol_to_file("train.csv",ol_train)
ol_to_file("test.csv",ol_test)
ol_to_file("val.csv",ol_val)

In [None]:
model = fastText.train_supervised("data.bak/train.csv",lr=0.7,epoch = 20)

In [None]:
test_result = model.test("data.bak/test.csv")

In [None]:
# TODO:bcm - compute my own precision
test_result

# Flair

In [None]:
ROOT_URL = 'https://s3.eu-central-1.amazonaws.com/borchmann'
FORWARD_FILE = 'cse/lm-polish-forward-v0.2.pt'
BACKWARD_FILE = 'cse/lm-polish-backward-v0.2.pt'
GLOVE_FILE = 'glove/poleval.txt'

FORWARD_LM = f'{ROOT_URL}/{FORWARD_FILE}'
BACKWARD_LM = f'{ROOT_URL}/{BACKWARD_FILE}'
GLOVE = f'{ROOT_URL}/{GLOVE_FILE}'

print(FORWARD_LM)
print(BACKWARD_LM)
print(GLOVE)

In [None]:
# ! wget https://s3.eu-central-1.amazonaws.com/borchmann/cse/lm-polish-forward-v0.2.pt -O data.bak/polish_forward.pt
# ! wget https://s3.eu-central-1.amazonaws.com/borchmann/cse/lm-polish-backward-v0.2.pt -O data.bak/polish_backward.pt
# ! wget https://s3.eu-central-1.amazonaws.com/borchmann/glove/poleval.txt -O data.bak/glove.txt


In [None]:
from pathlib import Path

In [None]:
from flair.data_fetcher import NLPTaskDataFetcher
from flair.trainers import ModelTrainer
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.embeddings import StackedEmbeddings, CharLMEmbeddings, TokenEmbeddings   

In [None]:
# using #https://github.com/applicaai/poleval-2018.git

In [None]:
from flair.embeddings import TokenEmbeddings
from flair.data import Sentence
from typing import List
from gensim.models import KeyedVectors
import torch
import numpy as np

class KeyedWordEmbeddings(TokenEmbeddings):
    """Token embeddings looked up in vectors loaded with KeyedVectors.load_word2vec_format."""

    def __init__(self, embeddings):
        """Load the vectors found at `embeddings`; that value also serves as the embedding name."""
        self.name = embeddings
        self.static_embeddings = True
        vectors = KeyedVectors.load_word2vec_format(embeddings)
        self.precomputed_word_embeddings = vectors
        # Set of known words for constant-time membership tests.
        self.known_words = set(vectors.index2word)
        self.__embedding_length: int = vectors.vector_size
        super().__init__()

    @property
    def embedding_length(self) -> int:
        """Vector size of the loaded embeddings."""
        return self.__embedding_length

    def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:
        """Attach a vector to every token of every sentence and return `sentences`.

        Lookup order per token: exact text, then lower-cased text, then the
        '<unk>' entry of the loaded vectors.
        """
        vectors = self.precomputed_word_embeddings
        for sentence in sentences:
            for token in sentence.tokens:
                if token.text in self.known_words:
                    key = token.text
                elif token.text.lower() in self.known_words:
                    key = token.text.lower()
                else:
                    key = '<unk>'
                token.set_embedding(self.name, torch.FloatTensor(vectors[key]))
        return sentences


In [None]:
# embedding_types: List[TokenEmbeddings] = [
#     KeyedWordEmbeddings(GLOVE),
#     CharLMEmbeddings(FORWARD_LM),
#     CharLMEmbeddings(BACKWARD_LM)
# ]

# embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# takes forever

In [None]:
# corpus = NLPTaskDataFetcher.load_classification_corpus(
#     Path('./data.bak'),
#     test_file='test.csv',
#     dev_file='val.csv',
#     train_file='train.csv'
# )

# document_embeddings = DocumentLSTMEmbeddings(
#     embeddings,
#     hidden_size=512,
#     reproject_words=True,
#     reproject_words_dimension=256
# )

# classifier = TextClassifier(
#     document_embeddings,
#     label_dictionary=corpus.make_label_dictionary(),
#     multi_label=False
# )

# trainer = ModelTrainer(classifier, corpus)

# trainer.train('./data.', max_epochs=10)

In [None]:
#
" " + 1

In [None]:
from flair.data_fetcher import NLPTaskDataFetcher
from pathlib import Path
from flair.data import TaggedCorpus

# use your own data path
data_folder = Path('./data.bak')

# load corpus containing training, test and dev data
corpus: TaggedCorpus = NLPTaskDataFetcher.load_classification_corpus(data_folder,
                                                                     test_file='test.csv',
                                                                     dev_file='val.csv',
                                                                     train_file='train.csv')

In [None]:
import wrapt

In [None]:
help(wrapt)

In [None]:
from flair.data import TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

In [None]:
# 4. initialize embeddings
word_embeddings = [WordEmbeddings('pl'),
                   # comment in flair embeddings for state-of-the-art results
#                    FlairEmbeddings('polish-forward'),
                   # FlairEmbeddings('polish-backward'),
                   ]


document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(word_embeddings,
                                                                     hidden_size=512,
                                                                     reproject_words=True,
                                                                     reproject_words_dimension=256,
                                                                     ) 


In [None]:
label_dict = corpus.make_label_dictionary()
len(label_dict)

In [None]:

# 5. initialize sequence tagger
from flair.models import SequenceTagger

# tagger: SequenceTagger = SequenceTagger(hidden_size=256,
#                                         embeddings=embeddings,
#                                         tag_dictionary=tag_dictionary,
#                                         tag_type=tag_type,
#                                         use_crf=True)

# 6. initialize trainer

In [None]:

# 5. create the text classifier
# Single-label classifier on top of document_embeddings, using the labels
# collected in label_dict.
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)

# 6. initialize the text classifier trainer
trainer = ModelTrainer(classifier, corpus)

# 7. start the training
# NOTE(review): the output directory 'resources/taggers/ag_news' does not
# mention this notebook's data; presumably left over from an example. Confirm
# the path before relying on it.
trainer.train('resources/taggers/ag_news',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=5,
              max_epochs=150)

# 8. plot training curves (optional)
# Reads loss.tsv and weights.txt from the directory passed to trainer.train above.
from flair.visual.training_curves import Plotter
plotter = Plotter()
plotter.plot_training_curves('resources/taggers/ag_news/loss.tsv')
plotter.plot_weights('resources/taggers/ag_news/weights.txt')