In [134]:
from functional import seq
from functional.streams import Sequence
import pandas as pd
import numpy as np
import os
from typing import Dict, List, Tuple, Optional
import random

In [66]:
ustawy_dir = "../ustawy"
ustawy_files = seq(os.listdir(ustawy_dir)).map(
    lambda filename: open(ustawy_dir + "/" + filename).read())

In [70]:
good, bad = ustawy_files.partition(lambda x: "Art" in x[:2400])

In [94]:
bad.size()

2

In [74]:
bad[0][:300].split("\n")

['',
 'USTAWA',
 'z dnia 11 października 2013 r.',
 'o wzajemnej pomocy przy dochodzeniu podatków, należności ',
 'celnych i innych należności pieniężnych',
 '',
 '<!-/*',
 'Font Definitions */',
 ' @font-face',
 '\t{font-family:Helvetica;',
 '\tpanose-1:2 11 5 4 2 2 2 2 2 4;}',
 '@font-face',
 '\t{font-family:Courier;',
 '\tpanose-1:2 7 4 9 2 2 5 2 4 4;}',
 '']

In [76]:
bad[1][:300].split("\n")

['', '', '', '', '', 'Brak tekstu w postaci elektronicznej ', '']

In [77]:
def is_change(ustawa:str) -> bool:
    return "o zmianie ustawy" in ustawa[:800]

In [78]:
changes, not_changes = good.partition(is_change)

In [79]:
from collections import namedtuple

In [80]:
Labeled =  namedtuple("Labeled","text is_change")

In [104]:
def strip_title(text:str) -> str:
    return text.split("Art",maxsplit=1)[1]

In [105]:
labeled_changes = changes.map(lambda txt: Labeled(text = strip_title(txt), is_change = True))

In [106]:
labeled_not_changes = not_changes.map(lambda txt: Labeled(text = strip_title(txt), is_change = False))

In [114]:
data = (labeled_changes + labeled_not_changes).to_list()

In [115]:
from random import shuffle
shuffle(data)

In [118]:
data_len = len(data)
train_size = int(0.6 * data_len)
validation_size = int(0.2 * data_len)

train = data[:train_size]
validation = data[train_size:train_size + validation_size]
test = data[train_size + validation_size:]

In [122]:
len(train),len(validation),len(test)

(706, 235, 237)

In [138]:
from random import choices # todo:bcm - take without replacements
def full_selector(text):
    return text

def percentage_selector(text):
    lines = text.split("\n")
    take_count = int(0.1* len(lines))
    return "\n".join(choices(lines,k= take_count))

def lines_selector(text):
    lines = text.split("\n")
    return "\n".join(choices(lines,k= 10))
    
def line_selector(text):
    lines = text.split("\n")
    lines_len = len(lines)
    return lines[random.randint(0,lines_len-1)]

In [139]:
Selector = namedtuple("Selector", "name selector")

In [140]:
selectors = seq([
    Selector(name = "full", selector = full_selector),
    Selector(name = "percentage", selector = percentage_selector),
    Selector(name = "lines", selector = lines_selector),
    Selector(name = "line", selector = line_selector),
])

In [145]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report


In [158]:
def grid_search(train_x, train_y, test_x, test_y,  parameters, pipeline):
    grid_search_tune = GridSearchCV(
        pipeline, parameters, cv=2, n_jobs=3, verbose=10)
    grid_search_tune.fit(train_x, train_y)

    print("Best parameters set:")
    print(grid_search_tune.best_estimator_.steps)

    print("Applying best classifier on test data:")
    best_clf = grid_search_tune.best_estimator_
    predictions = best_clf.predict(test_x)

    print(classification_report(test_y, predictions))

In [159]:
stop_words = [] #todo:bcm

In [160]:
train_x = [x.text for x in train]
y_train = [x.is_change for x in train]
test_x = [x.text for x in test]
y_test = [x.is_change for x in test]

In [165]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
#     ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
    ('clf', OneVsRestClassifier(SVC(), n_jobs=1)),
])
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__estimator__kernel': ['rbf'],
    'clf__estimator__gamma': [1e-3, 1e-4],
    'clf__estimator__C': [1, 10]

#     "clf__estimator__C": [0.01, 0.1, 1],
#     "clf__estimator__class_weight": ['balanced', None],
}
grid_search(train_x, y_train, test_x, y_test, parameters, pipeline)

Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    7.9s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:   39.3s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:  1.1min
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:  1.8min
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:  2.4min
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  3.2min
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  4.0min
[Parallel(n_jobs=3)]: Done  55 tasks      | elapsed:  5.1min
[Parallel(n_jobs=3)]: Done  66 tasks      | elapsed:  6.0min
[Parallel(n_jobs=3)]: Done  72 out of  72 | elapsed:  6.7min finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.25, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=[], strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', OneVsRestClassifier(estimator=SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          n_jobs=1))]
Applying best classifier on test data:
              precision    recall  f1-score   support

       False       0.51      1.00      0.67       120
        True       0.00      0.00      0.00       117

   micro avg       0.51      0.51 

  'precision', 'predicted', average, warn_for)
