In [1]:
from functional import seq
from functional.streams import Sequence
import pandas as pd
import numpy as np
import os
from typing import Dict, List, Tuple, Optional
import random
from collections import namedtuple
from random import shuffle
import pandas as pd
from IPython.core.display import HTML

# Preprocessing

In [2]:
ustawy_dir = "../lower_ustawy"
art_keyword = "art"
ustawy_files = seq(os.listdir(ustawy_dir)).map(
    lambda filename: open(ustawy_dir + "/" + filename).read())

ustawy_files.size()

1180

In [3]:
good, bad = ustawy_files.partition(lambda x: art_keyword in x[:2400])

In [4]:
good.size()

1178

In [5]:
bad.size()

2

In [6]:
bad[0][:300].split("\n")

['',
 'ustawa',
 'z dnia 11 października 2013 r ',
 'o wzajemnej pomocy przy dochodzeniu podatków  należności ',
 'celnych i innych należności pieniężnych',
 '',
 '     ',
 'font definitions   ',
 '  font face',
 '\t font family helvetica ',
 '\tpanose 1 2 11 5 4 2 2 2 2 2 4  ',
 ' font face',
 '\t font family courier ',
 '\tpanose 1 2 7 4 9 2 2 5 2 4 4  ',
 '']

In [7]:
bad[1][:300].split("\n")

['', '', '', '', '', 'brak tekstu w postaci elektronicznej ', '']

In [8]:
def is_change(ustawa:str) -> bool:
    return "o zmianie ustawy" in ustawa[:800]

In [9]:
changes, not_changes = good.partition(is_change)

In [10]:
Labeled =  namedtuple("Labeled","text is_change")

In [11]:
def strip_title(text:str) -> str:
    return text.split(art_keyword,maxsplit=1)[1]

In [12]:
labeled_changes = changes.map(lambda txt: Labeled(text = strip_title(txt), is_change = True))

In [13]:
labeled_not_changes = not_changes.map(lambda txt: Labeled(text = strip_title(txt), is_change = False))

In [14]:
data = (labeled_changes + labeled_not_changes).to_list()

In [15]:
shuffle(data)

## Selectors

In [17]:
from random import choices
def full_selector(text):
    return text

def percentage_selector(text):
    lines = text.split("\n")
    take_count = int(0.1* len(lines))
    return "\n".join(choices(lines,k= take_count))

def lines_selector(text):
    lines = text.split("\n")
    return "\n".join(choices(lines,k= 10))
    
def line_selector(text):
    lines = text.split("\n")
    lines_len = len(lines)
    return lines[random.randint(0,lines_len-1)]

In [174]:
Selector = namedtuple("Selector", "name selector")

In [175]:
selectors = seq([
    Selector(name = "full", selector = full_selector),
    Selector(name = "percentage", selector = percentage_selector),
    Selector(name = "lines", selector = lines_selector),
    Selector(name = "line", selector = line_selector),
])

# SVM + TF-IDF

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support

### Stop words

In [21]:
sw_file = open("polish.stopwords.txt")
stop_words = set(word[:-1] for word in sw_file.readlines())
sw_file.close()
list(stop_words)[:3]

['lub', 'niz', 'możliwe']

### Utils

In [22]:
def map_text(mapper, data):
    return [Labeled(text = mapper(x.text), is_change= x.is_change) for x in data]

In [23]:
def split2(x,y):
    data_len = len(y)
    train_size = int(0.6 * data_len)
    validation_size = int(0.2 * data_len)

    train = x[:train_size]
    validation = x[train_size:train_size + validation_size]
    test = x[train_size + validation_size:]
    
    y_train = y[:train_size]
    y_validation = y[train_size:train_size + validation_size]
    y_test = y[train_size + validation_size:]
    
    
    return (train,test,validation), (y_train,y_test,y_validation)

## Evaluation

In [24]:
def grid_search(train_x, train_y,  parameters, pipeline):
    grid_search_tune = GridSearchCV(
        pipeline, parameters, cv=2, n_jobs=3, verbose=10, return_train_score =True)
    grid_search_tune.fit(train_x, train_y)

    
    return (
        grid_search_tune.best_estimator_,
        grid_search_tune.best_params_,
        grid_search_tune.cv_results_
    )

In [141]:
def evaluate(predicted,expected):
    prec,recall,fbeta,support = precision_recall_fscore_support(y_true=expected,y_pred = predicted,average='weighted')

    print("F1 score: {}".format(fbeta))
    print("Precission: {}".format(prec))
    print("Recall: {}".format(recall))
    return fbeata,prec,recall

In [26]:
def present_results(results,clf,val_x,val_y):
    other = seq(range(0,2))\
    .flat_map(lambda i : ["split{}_test_score".format(i),"split{}_train_score".format(i)])\
    .to_list()
    
    labels = ["mean_fit_time","std_fit_time","std_score_time","mean_score_time","params",
              "std_test_score","std_train_score"] + other
    
    seq(labels).for_each(lambda label: results.pop(label))
    frame = pd.DataFrame(results)
    
    display(
        frame.sort_values("rank_test_score",ascending = 1)
    )
    
    print("On cross validation:")
    true_val = clf.predict(val_x)
    evaluate(val_y,true_val)

In [30]:
def evaluate_linear_svc(data, params,selector:Selector):
    mapped_data =  map_text(selector.selector,data)
    xs,ys = [x.text for x in mapped_data], [x.is_change for x in mapped_data]
    (tr_x,test_x,val_x),(tr_y,test_y,val_y)= split2(xs,ys)
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=stop_words)),
        ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=3)),
    ])
    
    display(HTML("<h2>Selector: {}</h2>".format(selector.name)))
    
    best_clf, best_params, results  = grid_search(tr_x + test_x, tr_y+ test_y, parameters, pipeline)
    print(best_params)
    present_results(results,best_clf,val_x,val_y)

In [31]:
parameters = {
    'tfidf__max_df': ( 0.25, 0.5,0.75,),
    'tfidf__ngram_range': [(1,2),(1, 3)],
    "clf__estimator__C": [0.1,0.2,0.25,0.3],
}

##### Result

In [32]:
selectors.for_each(lambda selector: evaluate_linear_svc(data,parameters,selector))

Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:   11.0s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:   46.2s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:  1.2min
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:  2.0min
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:  2.6min
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  3.5min
[Parallel(n_jobs=3)]: Done  48 out of  48 | elapsed:  4.4min remaining:    0.0s
[Parallel(n_jobs=3)]: Done  48 out of  48 | elapsed:  4.4min finished


{'clf__estimator__C': 0.3, 'tfidf__max_df': 0.75, 'tfidf__ngram_range': (1, 3)}


Unnamed: 0,mean_test_score,mean_train_score,param_clf__estimator__C,param_tfidf__max_df,param_tfidf__ngram_range,rank_test_score
23,0.831389,0.99364,0.3,0.75,"(1, 3)",1
16,0.829268,0.980905,0.25,0.75,"(1, 2)",2
22,0.829268,0.987272,0.3,0.75,"(1, 2)",2
17,0.827147,0.991519,0.25,0.75,"(1, 3)",4
10,0.827147,0.977727,0.2,0.75,"(1, 2)",4
4,0.821845,0.971365,0.1,0.75,"(1, 2)",6
11,0.820785,0.987272,0.2,0.75,"(1, 3)",7
5,0.817603,0.97667,0.1,0.75,"(1, 3)",8
21,0.803818,0.99682,0.3,0.5,"(1, 3)",9
15,0.799576,0.992578,0.25,0.5,"(1, 3)",10


On cross validation:
F1 score: 0.8386506769825919
Precission: 0.8409459555341668
Recall: 0.8382978723404255


Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    1.0s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    3.8s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:    5.9s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:    9.8s
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:   12.9s
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:   18.1s
[Parallel(n_jobs=3)]: Done  48 out of  48 | elapsed:   23.6s remaining:    0.0s
[Parallel(n_jobs=3)]: Done  48 out of  48 | elapsed:   23.6s finished


{'clf__estimator__C': 0.1, 'tfidf__max_df': 0.75, 'tfidf__ngram_range': (1, 2)}


Unnamed: 0,mean_test_score,mean_train_score,param_clf__estimator__C,param_tfidf__max_df,param_tfidf__ngram_range,rank_test_score
4,0.629905,0.938469,0.1,0.75,"(1, 2)",1
5,0.623542,0.944834,0.1,0.75,"(1, 3)",2
0,0.621421,0.944832,0.1,0.25,"(1, 2)",3
10,0.61824,0.943775,0.2,0.75,"(1, 2)",4
2,0.617179,0.94165,0.1,0.5,"(1, 2)",5
1,0.616119,0.945891,0.1,0.25,"(1, 3)",6
8,0.616119,0.944834,0.2,0.5,"(1, 2)",6
16,0.613998,0.944834,0.25,0.75,"(1, 2)",8
23,0.612937,0.946953,0.3,0.75,"(1, 3)",9
3,0.612937,0.943773,0.1,0.5,"(1, 3)",9


On cross validation:
F1 score: 0.6768539797859985
Precission: 0.6775039409672413
Recall: 0.676595744680851


Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    1.1s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:    1.6s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:    2.5s
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:    3.2s
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:    4.2s


{'clf__estimator__C': 0.3, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}


[Parallel(n_jobs=3)]: Done  48 out of  48 | elapsed:    5.5s remaining:    0.0s
[Parallel(n_jobs=3)]: Done  48 out of  48 | elapsed:    5.5s finished


Unnamed: 0,mean_test_score,mean_train_score,param_clf__estimator__C,param_tfidf__max_df,param_tfidf__ngram_range,rank_test_score
20,0.705196,0.995756,0.3,0.5,"(1, 2)",1
14,0.704136,0.995756,0.25,0.5,"(1, 2)",2
22,0.699894,0.995756,0.3,0.75,"(1, 2)",3
18,0.698834,0.998941,0.3,0.25,"(1, 2)",4
8,0.697773,0.994694,0.2,0.5,"(1, 2)",5
16,0.696713,0.995756,0.25,0.75,"(1, 2)",6
4,0.695652,0.979846,0.1,0.75,"(1, 2)",7
2,0.695652,0.980905,0.1,0.5,"(1, 2)",7
12,0.693531,0.998941,0.25,0.25,"(1, 2)",9
6,0.692471,0.997879,0.2,0.25,"(1, 2)",10


On cross validation:
F1 score: 0.7152041847847291
Precission: 0.7161615133314206
Recall: 0.7148936170212766


Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Batch computation too fast (0.1350s.) Setting batch_size=2.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed:    0.6s
[Parallel(n_jobs=3)]: Done  18 tasks      | elapsed:    0.8s
[Parallel(n_jobs=3)]: Done  32 tasks      | elapsed:    1.7s


{'clf__estimator__C': 0.1, 'tfidf__max_df': 0.25, 'tfidf__ngram_range': (1, 2)}


[Parallel(n_jobs=3)]: Done  48 out of  48 | elapsed:    2.2s remaining:    0.0s
[Parallel(n_jobs=3)]: Done  48 out of  48 | elapsed:    2.2s finished


Unnamed: 0,mean_test_score,mean_train_score,param_clf__estimator__C,param_tfidf__max_df,param_tfidf__ngram_range,rank_test_score
0,0.603393,0.893955,0.1,0.25,"(1, 2)",1
2,0.603393,0.893955,0.1,0.5,"(1, 2)",1
4,0.603393,0.893955,0.1,0.75,"(1, 2)",1
10,0.599152,0.908802,0.2,0.75,"(1, 2)",4
8,0.599152,0.908802,0.2,0.5,"(1, 2)",4
6,0.599152,0.908802,0.2,0.25,"(1, 2)",4
9,0.598091,0.910922,0.2,0.5,"(1, 3)",7
11,0.598091,0.910922,0.2,0.75,"(1, 3)",7
7,0.598091,0.910922,0.2,0.25,"(1, 3)",7
13,0.597031,0.910922,0.25,0.25,"(1, 3)",10


On cross validation:
F1 score: 0.6431880651161931
Precission: 0.6483879824407159
Recall: 0.6425531914893617


# Fasttext

In [36]:
import fastText

In [24]:
oneline_data = map_text(lambda text: text.replace("\n", " "),data)

In [31]:
def split_ol_data(ol_data):
    
    data_len = len(ol_data)
    train_size = int(0.6 * data_len)
    validation_size = int(0.2 * data_len)
    ol_train = ol_data[:train_size]
    ol_test = ol_data[train_size:train_size+ validation_size]
    ol_val = ol_data[validation_size+ train_size:]
    return ol_train,ol_test,ol_val

In [33]:
def to_fast_text(x: Labeled)-> str :
    label = "1" if x.is_change else "0"
    replaced = x.text.replace("\"","\"\"")
    return "__label__{}, \"{}\"\n".format(label,replaced)

def ol_to_file(name:str, ol:List[Labeled])-> None:
    file = open("data.bak/{}".format(name),"w")

    for t in ol:
        file.write(to_fast_text(t))
    file.close()

In [32]:
ol_train,ol_test,ol_val = split_ol_data(oneline_data)

In [29]:
ol_to_file("train.csv",ol_train)
ol_to_file("test.csv",ol_test)
ol_to_file("val.csv",ol_val)

In [109]:
from typing import Dict, List
from copy import deepcopy

def printAdPass(p,r):
    print(p)
    return r

def params_permutation(params:Dict[str,List[object]]) -> List[Dict[str,object]]: 
    param_left = len(params)
    if (param_left == 1):
        key, values = list(params.items())[0]
        return seq(values).map(lambda val: {key:val}).to_list()
    else:
        p = seq(params.items())
        key,values = p.head()
        rest = params_permutation(p.tail().to_dict())
        
        def append_and_return(d,k,value):
            if(k in d):
                raise Exception
            ala = deepcopy(d)
            ala[k]=value
            return ala
        
        return deepcopy(seq(values)\
            .flat_map(lambda value: seq(rest)\
                      .map(lambda d: append_and_return(d,key,value))
                     ).to_list())


        


In [180]:
def grid_search_fast_text(data_path: str, testx: List[str],
                          testy: List[str], params_list: Dict[str, List]):
    best_model = None
    best_params = None
    best_f1 = 0
    best_score = None

    
    max_runs = len(params_list)
    print_interval = max_runs // 10
    run_count = 0
    
    results = []
    
    for params in params_list:
        model = fastText.train_supervised(data_path, **params)

        if run_count% print_interval == 0:
            print("Run : {}/{}".format(run_count,max_runs))
        run_count = run_count + 1
        
        predicted, _ = model.predict(test_x)
        predicted_2 = seq(predicted).map(lambda x: x[0][:-1]).to_list()

        prec, recall, fbeta, _ = precision_recall_fscore_support(
            y_true=testy, y_pred=predicted_2, average='weighted')
        
        if fbeta > best_f1:
            best_model = model
            best_params = params
            best_f1 = fbeta
            best_score = fbeta,prec,recall
            
        score = {"f1":fbeta,"prec": prec, "recall": recall}
        score.update(params)
        results.append(score)
    
    
            
            
    return best_model,best_params, best_score,results

In [189]:
fs_params = {
    "lr" : [0.4,0.5,0.6,0.7,0.8], #,0.9,1.1],
#     "wordNgrams" : [1],#,2,3],
    "dim": [50,75,100,150],
    "loss": ["ns"],#"softmax",hs","ns","ova"]
}
params_list = params_permutation(fs_params)

In [191]:
def evaluate_fast_text(data,params_list,selector:Selector):
    selected = map_text(selector.selector,data)
    ol_selected = map_text(lambda text: text.replace("\n"," "),selected)
    
    ol_train,ol_test,ol_val = split_ol_data(ol_selected)
    ol_to_file("train.csv",ol_train)
    ol_to_file("test.csv",ol_test)
    ol_to_file("val.csv",ol_val)
    
    test_x = [x.text for x in ol_test]
    test_y = ["__label__1" if x.is_change else "__label__0" for x in ol_test]
    
    display(HTML("<h2>Selector: {}</h2>".format(selector.name)))
    
    best_model,best_params, best_score,results = \
        grid_search_fast_text('data.bak/train.csv',testx=test_x,testy=test_y,params_list=params_list)
    print("Test F1 score: {}".format(best_score[0]))
    print("Test Precision: {}".format(best_score[1]))
    print("Test Recall: {}".format(best_score[2]))
    
    
    val_x= [x.text for x in ol_val]
    val_y = ["__label__1" if x.is_change else "__label__0" for x in ol_val]
    
    
    predicted, _ = model.predict(val_x)
    predicted_2 = seq(predicted).map(lambda x: x[0][:-1]).to_list()
    
    
    prec, recall, fbeta, _ = precision_recall_fscore_support(
        y_true=val_y, y_pred=predicted_2, average='weighted')
    
    print("Validation F1 score: {}".format(fbeta))
    print("Validation Precision: {}".format(prec))
    print("Validation Recall: {}".format(recall))
    
    display(pd.DataFrame(results).sort_values("f1",ascending = 0))
    

In [192]:
selectors.for_each(lambda selector: evaluate_fast_text(data,params_list,selector))

Run : 0/20
Run : 2/20
Run : 4/20
Run : 6/20
Run : 8/20
Run : 10/20
Run : 12/20
Run : 14/20
Run : 16/20
Run : 18/20
Test F1 score: 0.8382333135641387
Test Precision: 0.8382904524290934
Test Recall: 0.8382978723404255
Validation F1 score: 0.8566527063601904
Validation Precision: 0.8572534442469572
Validation Recall: 0.8565400843881856


Unnamed: 0,dim,f1,loss,lr,prec,recall
12,50,0.838233,ns,0.7,0.83829,0.838298
16,50,0.837738,ns,0.8,0.840251,0.838298
19,150,0.834067,ns,0.8,0.834121,0.834043
7,150,0.834012,ns,0.5,0.834013,0.834043
17,75,0.82983,ns,0.8,0.829994,0.829787
15,150,0.829787,ns,0.7,0.829787,0.829787
14,100,0.829787,ns,0.7,0.829787,0.829787
6,100,0.825589,ns,0.5,0.825915,0.825532
8,50,0.825589,ns,0.6,0.825915,0.825532
13,75,0.825557,ns,0.7,0.825612,0.825532


Run : 0/20
Run : 2/20
Run : 4/20
Run : 6/20
Run : 8/20
Run : 10/20
Run : 12/20
Run : 14/20
Run : 16/20
Run : 18/20
Test F1 score: 0.8100267568833732
Test Precision: 0.8250740931166266
Test Recall: 0.8127659574468085
Validation F1 score: 0.7243249063001903
Validation Precision: 0.726080569158274
Validation Recall: 0.7257383966244726


Unnamed: 0,dim,f1,loss,lr,prec,recall
19,150,0.810027,ns,0.8,0.825074,0.812766
8,50,0.80629,ns,0.6,0.817477,0.808511
3,150,0.805059,ns,0.4,0.824246,0.808511
18,100,0.804255,ns,0.8,0.804255,0.804255
5,75,0.803355,ns,0.5,0.806712,0.804255
4,50,0.802168,ns,0.5,0.812137,0.804255
17,75,0.799746,ns,0.8,0.800219,0.8
15,150,0.799459,ns,0.7,0.826972,0.804255
12,50,0.79941,ns,0.7,0.801189,0.8
0,50,0.798046,ns,0.4,0.806881,0.8


Run : 0/20
Run : 2/20
Run : 4/20
Run : 6/20
Run : 8/20
Run : 10/20
Run : 12/20
Run : 14/20
Run : 16/20
Run : 18/20
Test F1 score: 0.7999908085613497
Test Precision: 0.8238656754678287
Test Recall: 0.8042553191489362
Validation F1 score: 0.7121313325264585
Validation Precision: 0.7128209267048071
Validation Recall: 0.7130801687763713


Unnamed: 0,dim,f1,loss,lr,prec,recall
18,100,0.799991,ns,0.8,0.823866,0.804255
12,50,0.797681,ns,0.7,0.80862,0.8
7,150,0.79074,ns,0.5,0.817665,0.795745
16,50,0.788864,ns,0.8,0.828422,0.795745
14,100,0.787431,ns,0.7,0.837233,0.795745
8,50,0.781406,ns,0.6,0.811535,0.787234
5,75,0.780755,ns,0.5,0.814999,0.787234
11,150,0.774541,ns,0.6,0.820099,0.782979
6,100,0.769717,ns,0.5,0.817357,0.778723
9,75,0.769717,ns,0.6,0.817357,0.778723


Run : 0/20
Run : 2/20
Run : 4/20
Run : 6/20
Run : 8/20
Run : 10/20
Run : 12/20
Run : 14/20
Run : 16/20
Run : 18/20
Test F1 score: 0.5693320538528424
Test Precision: 0.7526863864383324
Test Recall: 0.6297872340425532
Validation F1 score: 0.5768329634042347
Validation Precision: 0.6061772883588205
Validation Recall: 0.5991561181434599


Unnamed: 0,dim,f1,loss,lr,prec,recall
13,75,0.569332,ns,0.7,0.752686,0.629787
16,50,0.555765,ns,0.8,0.782452,0.625532
10,100,0.548871,ns,0.6,0.781022,0.621277
17,75,0.548871,ns,0.8,0.781022,0.621277
6,100,0.548871,ns,0.5,0.781022,0.621277
18,100,0.541899,ns,0.8,0.779607,0.617021
4,50,0.541899,ns,0.5,0.779607,0.617021
14,100,0.541899,ns,0.7,0.779607,0.617021
9,75,0.541899,ns,0.6,0.779607,0.617021
11,150,0.538733,ns,0.6,0.758527,0.612766


# Flair 
> Fails even on 12 GPU memory

In [22]:
from pathlib import Path

In [27]:
from flair.data_fetcher import NLPTaskDataFetcher
from flair.trainers import ModelTrainer
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.embeddings import StackedEmbeddings, CharLMEmbeddings, TokenEmbeddings   

In [28]:
# using #https://github.com/applicaai/poleval-2018.git

In [26]:
from flair.embeddings import TokenEmbeddings
from typing import List
import torch
import numpy as np

In [24]:
from flair.data_fetcher import NLPTaskDataFetcher
from pathlib import Path
from flair.data import TaggedCorpus

# use your own data path
data_folder = Path('./data.bak')

# load corpus containing training, test and dev data
corpus: TaggedCorpus = NLPTaskDataFetcher.load_classification_corpus(data_folder,
                                                                     test_file='test.csv',
                                                                     dev_file='val.csv',
                                                                     train_file='train.csv')

2019-05-13 18:56:16,851 Reading data from data.bak
2019-05-13 18:56:16,852 Train: data.bak/train.csv
2019-05-13 18:56:16,853 Dev: data.bak/val.csv
2019-05-13 18:56:16,853 Test: data.bak/test.csv


In [29]:
from flair.data import TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

In [30]:
# 4. initialize embeddings
word_embeddings = [WordEmbeddings('pl'),
                   # comment in flair embeddings for state-of-the-art results
#                    FlairEmbeddings('polish-forward'),
                   # FlairEmbeddings('polish-backward'),
                   ]


document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(word_embeddings,
                                                                     hidden_size=512,
                                                                     reproject_words=True,
                                                                     reproject_words_dimension=256,
                                                                     ) 


2019-05-13 18:57:51,476 this function is deprecated, use smart_open.open instead


In [31]:
label_dict = corpus.make_label_dictionary()
len(label_dict)

2

In [33]:
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)
trainer = ModelTrainer(classifier, corpus)

In [35]:
trainer.train('./data.bak',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=5,
              max_epochs=150)

# 8. plot training curves (optional)
from flair.visual.training_curves import Plotter
plotter = Plotter()
plotter.plot_training_curves('./data.bak/loss.tsv')
plotter.plot_weights('./data.bak/weights.txt')

2019-05-13 18:59:42,131 ----------------------------------------------------------------------------------------------------
2019-05-13 18:59:42,132 Evaluation method: MICRO_F1_SCORE
2019-05-13 18:59:42,133 ----------------------------------------------------------------------------------------------------


RuntimeError: CUDA out of memory. Tried to allocate 1.69 GiB (GPU 0; 3.95 GiB total capacity; 1.69 GiB already allocated; 1.59 GiB free; 17.19 MiB cached)