# PolEval - Punctuation restoration from read text

Głównym celem projektu jest stworzenie narzędzia, który będzie przywracał znaki interpunkcyjne - | . | , | ? | ! | etc., zgodnie z opisem zadania na [oficjalnej stronie konkursu PolEval](https://beta.poleval.pl/challenge/punctuation-restoration).

### Wczytanie bibliotek### 




In [1]:
import pandas as pd
import numpy as np

In [2]:
import tensorflow as tf
from tensorflow.python.client import device_lib

print("GPUs Available: ", (tf.test.is_gpu_available()))
device_lib.list_local_devices()

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPUs Available:  True


[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 9140373945506772626
 xla_global_id: -1,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 5654970368
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 9438706159457170788
 physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3080 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6"
 xla_global_id: 416903419]

In [3]:
pd.set_option('display.expand_frame_repr', False)

### Zbiór danych### 




In [4]:
data=pd.read_csv('data/train/in.tsv', names = ['FileId', 'ASROutput'], sep='\t')
data.head()

Unnamed: 0,FileId,ASROutput
0,wikinews228460,"w wywiadzie dla "" polski "" jarosław kaczyński ..."
1,wikinews183471,dowody prokuratury przedstawiające że janusz k...
2,wikinews188040,w czasie długiego weekendu 1 3 maja 2009 w jad...
3,wikinews178804,drugie zwycięstwo w tegorocznym giro d'italia ...
4,wikinews231649,takiego mrozu nie było w polsce od dawna słupk...


In [5]:
data=pd.read_csv('data/train/expected.tsv', names = ['ASROutput'], sep='\t')
pd.set_option('max_colwidth', 200)
data.head()

Unnamed: 0,ASROutput
0,"w wywiadzie dla "" polski "" jarosław kaczyński podkreślił, że informacje dotyczące radosława sikorskiego zagrażają interesowi państwa. "" to naprawdę wszystko, co mogę na ten temat powiedzieć ""- odp..."
1,"dowody prokuratury przedstawiające, że janusz kaczmarek, były szef mswia, kłamał, wypierając się spotkania z ryszardem krauze 5 lipca- nie osłabiły determinacji opozycji. teraz opozycja żąda przes..."
2,"w czasie długiego weekendu, 1- 3 maja 2009 w jadwisinie nad jeziorem zegrzyńskim stowarzyszenie wikimedia polska organizuje konferencję poświęconą zarówno wikimediom, m. in. wikipedii, jak i rucho..."
3,"drugie zwycięstwo w tegorocznym giro d'italia odniósł danilo di luca z grupy liquigas, zmieniając tym samym paolo bettiniego na pozycji lidera. wyścig przebiegał dzisiaj 223- kilometrową trasą z c..."
4,"takiego mrozu nie było w polsce od dawna. słupki rtęci w znaczącej części polski spadły nie tylko poniżej 10 stopni na minusie, ale nawet poniżej 20 bądź 30 tak zimno jest od czwartku m. in. w suw..."


In [6]:
PUNCTUATION = ['"-', ',', '.', ':', ';', '?', '!', '"', '%', '&', '(', ')', "'"]

In [7]:
import nltk
from nltk.tokenize import wordpunct_tokenize

DATA_SHAPE = data.shape[0]
data["ASROutput"] = data["ASROutput"].apply(lambda x: wordpunct_tokenize(x))

In [8]:
data["ASROutput"].head()

0    [w, wywiadzie, dla, ", polski, ", jarosław, kaczyński, podkreślił, ,, że, informacje, dotyczące, radosława, sikorskiego, zagrażają, interesowi, państwa, ., ", to, naprawdę, wszystko, ,, co, mogę, ...
1    [dowody, prokuratury, przedstawiające, ,, że, janusz, kaczmarek, ,, były, szef, mswia, ,, kłamał, ,, wypierając, się, spotkania, z, ryszardem, krauze, 5, lipca, -, nie, osłabiły, determinacji, opo...
2    [w, czasie, długiego, weekendu, ,, 1, -, 3, maja, 2009, w, jadwisinie, nad, jeziorem, zegrzyńskim, stowarzyszenie, wikimedia, polska, organizuje, konferencję, poświęconą, zarówno, wikimediom, ,, m...
3    [drugie, zwycięstwo, w, tegorocznym, giro, d, ', italia, odniósł, danilo, di, luca, z, grupy, liquigas, ,, zmieniając, tym, samym, paolo, bettiniego, na, pozycji, lidera, ., wyścig, przebiegał, dz...
4    [takiego, mrozu, nie, było, w, polsce, od, dawna, ., słupki, rtęci, w, znaczącej, części, polski, spadły, nie, tylko, poniżej, 10, stopni, na, minusie, ,, ale, nawet, poniżej,

In [9]:
train_set = data["ASROutput"]
train_set = [item for sublist in train_set for item in sublist]
train_set = np.array(train_set)
train_set.shape

(189129,)

In [10]:
TOKEN_COUNT = train_set.shape[0]
drop_indices = [idx for idx in range(TOKEN_COUNT) if train_set[idx] in PUNCTUATION ]
labels = [train_set[idx+1] if idx+1 in drop_indices else 'NO' for idx in range(TOKEN_COUNT)]
labels = np.array(labels)
X_ = np.delete(train_set, drop_indices)
labels = np.delete(labels, drop_indices)

assert (X_.shape[0] == len(labels))

In [11]:
# print example

vocab_punct = {'Token': X_.tolist(), 'Punctuation': labels}
df_dataset_view = pd.DataFrame(data=vocab_punct)
df_dataset_view[:20]

Unnamed: 0,Token,Punctuation
0,w,NO
1,wywiadzie,NO
2,dla,""""
3,polski,""""
4,jarosław,NO
5,kaczyński,NO
6,podkreślił,","
7,że,NO
8,informacje,NO
9,dotyczące,NO


### Word2Vec### 


In [12]:
import fasttext
import fasttext.util

ft = fasttext.load_model('cc.pl.300.bin')



In [13]:
WORD_2_VEC_SIZE = 300

In [14]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(labels)
y = le.transform(labels)

In [15]:
print(y[:15])
print(labels[:15])

[12 12  1  1 12 12  7 12 12 12 12 12 12 12  8]
['NO' 'NO' '"' '"' 'NO' 'NO' ',' 'NO' 'NO' 'NO' 'NO' 'NO' 'NO' 'NO' '.']


In [16]:
from collections import Counter
Counter(labels)

Counter({'NO': 140460,
         '"': 1684,
         ',': 9852,
         '.': 10009,
         '"-': 365,
         ':': 905,
         '?': 750,
         "'": 140,
         '%': 119,
         '!': 116,
         ';': 89,
         '(': 8,
         ')': 11})

In [17]:
y.shape

(164508,)

In [18]:
X = np.empty((X_.shape[0], WORD_2_VEC_SIZE), dtype = float)
for idx, token in np.ndenumerate(X_):
    X[idx] = ft.get_word_vector(token)

In [19]:
X.shape

(164508, 300)

In [20]:
X[:10]

array([[ 0.15531263, -0.04308021, -0.00168156, ...,  0.04180686,
        -0.00038691, -0.10296766],
       [ 0.04996644,  0.03045041, -0.03490978, ...,  0.01671311,
         0.04654586, -0.02967763],
       [-0.11573004,  0.0370084 ,  0.04243099, ...,  0.01563751,
         0.10110958, -0.02245275],
       ...,
       [ 0.25485411,  0.13717936, -0.02394456, ..., -0.07372162,
        -0.10366685,  0.01764201],
       [-0.02322206,  0.00295874,  0.02591213, ...,  0.00349834,
        -0.0202674 ,  0.04200821],
       [-0.02845111,  0.00157207,  0.02760693, ...,  0.01035177,
        -0.01641942,  0.02249819]])

###Model

Set floating point precision to 16-*bit*

In [21]:
from tensorflow.keras import mixed_precision
import tensorflow.keras.backend as K
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)
K.set_epsilon(1e-4)

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 3080 Laptop GPU, compute capability 8.6


In [22]:
from keras.models import Sequential
from keras.layers import Input, CuDNNLSTM, Dense, LSTM, Flatten, Dropout
import keras.backend as K

model = Sequential()
model.add(CuDNNLSTM(256, return_sequences=True, input_shape=(X.shape[1], 1)))
model.add(CuDNNLSTM(128, return_sequences=True))
model.add(Flatten())
model.add(Dropout(.2))
model.add(Dense(32,activation='relu'))
model.add(Dropout(.2))
model.add(Dense(len(le.classes_), activation='softmax'))

model.compile(loss=['sparse_categorical_crossentropy'], optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 cu_dnnlstm (CuDNNLSTM)      (None, 300, 256)          265216    
                                                                 
 cu_dnnlstm_1 (CuDNNLSTM)    (None, 300, 128)          197632    
                                                                 
 flatten (Flatten)           (None, 38400)             0         
                                                                 
 dropout (Dropout)           (None, 38400)             0         
                                                                 
 dense (Dense)               (None, 32)                1228832   
                                                                 
 dropout_1 (Dropout)         (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 13)                4

In [23]:
history = model.fit(X, y, epochs=10, batch_size=256, validation_split=0.2, verbose=2)

Epoch 1/10
515/515 - 31s - loss: 0.6054 - accuracy: 0.8540 - val_loss: 0.5366 - val_accuracy: 0.8582 - 31s/epoch - 60ms/step
Epoch 2/10
515/515 - 27s - loss: 0.5540 - accuracy: 0.8575 - val_loss: 0.5205 - val_accuracy: 0.8600 - 27s/epoch - 52ms/step
Epoch 3/10
515/515 - 27s - loss: 0.5366 - accuracy: 0.8585 - val_loss: 0.5179 - val_accuracy: 0.8603 - 27s/epoch - 52ms/step
Epoch 4/10
515/515 - 27s - loss: 0.5272 - accuracy: 0.8590 - val_loss: 0.5055 - val_accuracy: 0.8605 - 27s/epoch - 52ms/step
Epoch 5/10
515/515 - 27s - loss: 0.5222 - accuracy: 0.8592 - val_loss: 0.5109 - val_accuracy: 0.8606 - 27s/epoch - 52ms/step
Epoch 6/10
515/515 - 27s - loss: 0.5167 - accuracy: 0.8595 - val_loss: 0.5021 - val_accuracy: 0.8607 - 27s/epoch - 52ms/step
Epoch 7/10
515/515 - 27s - loss: 0.5112 - accuracy: 0.8593 - val_loss: 0.5017 - val_accuracy: 0.8606 - 27s/epoch - 52ms/step
Epoch 8/10
515/515 - 27s - loss: 0.5074 - accuracy: 0.8594 - val_loss: 0.5000 - val_accuracy: 0.8606 - 27s/epoch - 52ms/step


In [24]:
from keras.models import Sequential
from keras.layers import Input, CuDNNLSTM, Dense, LSTM, Flatten, Bidirectional, Dropout
from sklearn.utils import class_weight

class_weights = {
    0: 1.,
    1: 0.8,
    2: 1.,
    3: 1.,
    4: 1.,
    5: 1.,
    6: 1.,
    7: 0.8,
    8: 0.8,
    9: 0.8,
    10: 1.,
    11: 1.,
    12: 0.1
}

early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=2)

model_bi = Sequential()
model_bi.add(Bidirectional(CuDNNLSTM(256, return_sequences=True), input_shape=(X.shape[1], 1)))
model_bi.add(Bidirectional(CuDNNLSTM(128, return_sequences=True)))
model_bi.add(Flatten())
model_bi.add(Dropout(.2))
model_bi.add(Dense(32,activation='relu'))
model_bi.add(Dropout(.2))
model_bi.add(Dense(len(le.classes_), activation='softmax'))

model_bi.compile(loss=['sparse_categorical_crossentropy'], optimizer='adam', metrics=['accuracy'])
model_bi.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (None, 300, 512)         530432    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 300, 256)         657408    
 nal)                                                            
                                                                 
 flatten_1 (Flatten)         (None, 76800)             0         
                                                                 
 dropout_2 (Dropout)         (None, 76800)             0         
                                                                 
 dense_2 (Dense)             (None, 32)                2457632   
                                                                 
 dropout_3 (Dropout)         (None, 32)               

In [25]:
history_bi = model_bi.fit(X, y, epochs=50, batch_size=1024, validation_split=0.2, class_weight = class_weights, callbacks = [early_stopping_callback], verbose=2)

Epoch 1/50
129/129 - 77s - loss: 0.3047 - accuracy: 0.6483 - val_loss: 0.8879 - val_accuracy: 0.7162 - 77s/epoch - 598ms/step
Epoch 2/50
129/129 - 32s - loss: 0.2769 - accuracy: 0.6785 - val_loss: 0.9491 - val_accuracy: 0.6484 - 32s/epoch - 245ms/step
Epoch 3/50
129/129 - 32s - loss: 0.2679 - accuracy: 0.6874 - val_loss: 0.8202 - val_accuracy: 0.7021 - 32s/epoch - 246ms/step
Epoch 4/50
129/129 - 32s - loss: 0.2630 - accuracy: 0.6954 - val_loss: 0.7965 - val_accuracy: 0.7304 - 32s/epoch - 247ms/step
Epoch 5/50
129/129 - 32s - loss: 0.2584 - accuracy: 0.7022 - val_loss: 0.7825 - val_accuracy: 0.7340 - 32s/epoch - 248ms/step
Epoch 6/50
129/129 - 36s - loss: 0.2549 - accuracy: 0.7080 - val_loss: 0.8203 - val_accuracy: 0.7231 - 36s/epoch - 281ms/step
Epoch 7/50
129/129 - 37s - loss: 0.2519 - accuracy: 0.7106 - val_loss: 0.8216 - val_accuracy: 0.7170 - 37s/epoch - 289ms/step
Epoch 8/50
129/129 - 36s - loss: 0.2496 - accuracy: 0.7081 - val_loss: 0.7840 - val_accuracy: 0.7056 - 36s/epoch - 281

Test models

In [26]:
data_test=pd.read_csv('data/test-A/expected.tsv', names = ['ASROutput'], sep='\t')
data.head()

Unnamed: 0,ASROutput
0,"[w, wywiadzie, dla, "", polski, "", jarosław, kaczyński, podkreślił, ,, że, informacje, dotyczące, radosława, sikorskiego, zagrażają, interesowi, państwa, ., "", to, naprawdę, wszystko, ,, co, mogę, ..."
1,"[dowody, prokuratury, przedstawiające, ,, że, janusz, kaczmarek, ,, były, szef, mswia, ,, kłamał, ,, wypierając, się, spotkania, z, ryszardem, krauze, 5, lipca, -, nie, osłabiły, determinacji, opo..."
2,"[w, czasie, długiego, weekendu, ,, 1, -, 3, maja, 2009, w, jadwisinie, nad, jeziorem, zegrzyńskim, stowarzyszenie, wikimedia, polska, organizuje, konferencję, poświęconą, zarówno, wikimediom, ,, m..."
3,"[drugie, zwycięstwo, w, tegorocznym, giro, d, ', italia, odniósł, danilo, di, luca, z, grupy, liquigas, ,, zmieniając, tym, samym, paolo, bettiniego, na, pozycji, lidera, ., wyścig, przebiegał, dz..."
4,"[takiego, mrozu, nie, było, w, polsce, od, dawna, ., słupki, rtęci, w, znaczącej, części, polski, spadły, nie, tylko, poniżej, 10, stopni, na, minusie, ,, ale, nawet, poniżej, 20, bądź, 30, tak, z..."


In [27]:
import ast

data_test["ASROutput"] = data_test["ASROutput"].apply(lambda x: wordpunct_tokenize(x))
test_set = data_test["ASROutput"]
test_set = [item for sublist in test_set for item in sublist]
test_set = np.array(test_set)

In [28]:
TOKEN_COUNT_TEST = test_set.shape[0]
drop_indices_test = [idx for idx in range(TOKEN_COUNT_TEST) if test_set[idx] in PUNCTUATION ]
labels_test = [test_set[idx+1] if idx+1 in drop_indices_test else 'NO' for idx in range(TOKEN_COUNT_TEST)]
labels_test = np.array(labels_test)
X_test_ = np.delete(test_set, drop_indices_test)
labels_test = np.delete(labels_test, drop_indices_test)
assert (X_test_.shape[0] == len(labels_test))
y_test = le.transform(labels_test)

In [29]:
X_test = np.empty((X_test_.shape[0], WORD_2_VEC_SIZE), dtype = float)
for idx, token in np.ndenumerate(X_test_):
    X_test[idx] = ft.get_word_vector(token)

###Test model **1**

In [30]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)



In [31]:
y_preds = y_pred.argmax(axis=1)
len(le.classes_)
classes = list(le.classes_)
classes.remove(classes[5]) # this class does not exist in test dataset

In [32]:
print(classification_report(y_test, y_preds, target_names=classes))

              precision    recall  f1-score   support

           !       0.00      0.00      0.00        23
           "       0.00      0.00      0.00       334
          "-       0.00      0.00      0.00        92
           %       0.00      0.00      0.00        33
           '       0.00      0.00      0.00        27
           )       0.00      0.00      0.00         1
           ,       0.00      0.00      0.00      2440
           .       0.80      0.09      0.16      2485
           :       0.00      0.00      0.00       321
           ;       0.00      0.00      0.00        10
           ?       0.00      0.00      0.00       134
          NO       0.86      1.00      0.92     34860

    accuracy                           0.86     40760
   macro avg       0.14      0.09      0.09     40760
weighted avg       0.78      0.86      0.80     40760



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
y_preds = list(le.inverse_transform(y_preds))

In [34]:
vocab_punct_test = {'Token': X_test_.tolist(), 'Punctuation': labels_test, "Prediction": y_preds, "Probability scores": np.max(y_pred, axis = 1)}
df_dataset_view_test = pd.DataFrame(data=vocab_punct_test)

In [35]:
pd.set_option('display.max_rows', 500)
df_dataset_view_test[:5]

Unnamed: 0,Token,Punctuation,Prediction,Probability scores
0,co,NO,NO,0.986328
1,znaczy,",",NO,0.610352
2,że,NO,NO,0.96582
3,beginki,"""",NO,0.661133
4,padły,NO,NO,0.955078


In [36]:
df_dataset_view_test["match_count"] = df_dataset_view_test.groupby(['Punctuation', 'Prediction'])["Prediction"].transform("count")
df_dataset_view_test["total"] = df_dataset_view_test.groupby(['Punctuation'])["Punctuation"].transform("count")
df_dataset_view_test["match_ratio"] = round(df_dataset_view_test["match_count"] / df_dataset_view_test["total"] * 100, 2)
df_dataset_view_test[:100]

Unnamed: 0,Token,Punctuation,Prediction,Probability scores,match_count,total,match_ratio
0,co,NO,NO,0.986328,34816,34860,99.87
1,znaczy,",",NO,0.610352,2434,2440,99.75
2,że,NO,NO,0.96582,34816,34860,99.87
3,beginki,"""",NO,0.661133,332,334,99.4
4,padły,NO,NO,0.955078,34816,34860,99.87
5,ofiarą,NO,NO,0.862793,34816,34860,99.87
6,reformacji,NO,NO,0.709473,34816,34860,99.87
7,"""?",NO,NO,0.75293,34816,34860,99.87
8,grzesie2k,NO,NO,0.745117,34816,34860,99.87
9,wpis,NO,NO,0.777832,34816,34860,99.87


In [37]:
y_pred_bi = model_bi.predict(X_test)
y_preds_bi = y_pred_bi.argmax(axis=1)



In [38]:
print(classification_report(y_test, y_preds_bi, target_names=classes))

              precision    recall  f1-score   support

           !       0.03      0.04      0.03        23
           "       0.11      0.13      0.12       334
          "-       0.01      0.02      0.02        92
           %       0.22      0.06      0.10        33
           '       0.16      0.22      0.19        27
           )       0.00      0.00      0.00         1
           ,       0.15      0.30      0.20      2440
           .       0.19      0.46      0.27      2485
           :       0.17      0.14      0.15       321
           ;       0.00      0.00      0.00        10
           ?       0.08      0.07      0.07       134
          NO       0.92      0.76      0.83     34860

    accuracy                           0.70     40760
   macro avg       0.17      0.18      0.16     40760
weighted avg       0.81      0.70      0.74     40760



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
y_preds_bi = list(le.inverse_transform(y_preds_bi))
vocab_punct_test_bi = {'Token': X_test_.tolist(), 'Punctuation': labels_test, "Prediction": y_preds_bi, "Probability scores": np.max(y_pred_bi, axis = 1)}
df_dataset_view_test_bi = pd.DataFrame(data=vocab_punct_test_bi)

In [40]:
df_dataset_view_test_bi["match_count"] = df_dataset_view_test_bi.groupby(['Punctuation', 'Prediction'])["Prediction"].transform("count")
df_dataset_view_test_bi["total"] = df_dataset_view_test_bi.groupby(['Punctuation'])["Punctuation"].transform("count")
df_dataset_view_test_bi["match_ratio"] = round(df_dataset_view_test_bi["match_count"] / df_dataset_view_test_bi["total"] * 100, 2)
df_dataset_view_test_bi[:100]

Unnamed: 0,Token,Punctuation,Prediction,Probability scores,match_count,total,match_ratio
0,co,NO,NO,0.707031,26468,34860,75.93
1,znaczy,",",NO,0.348877,999,2440,40.94
2,że,NO,NO,0.749023,26468,34860,75.93
3,beginki,"""",NO,0.452881,187,334,55.99
4,padły,NO,NO,0.995605,26468,34860,75.93
5,ofiarą,NO,"""",0.559082,324,34860,0.93
6,reformacji,NO,",",0.581055,3552,34860,10.19
7,"""?",NO,"""",0.62793,324,34860,0.93
8,grzesie2k,NO,NO,0.681152,26468,34860,75.93
9,wpis,NO,.,0.772949,4064,34860,11.66


In [41]:
STATISTIC_COLUMNS = ["match_count", "total", "match_ratio"]
df_dataset_view_test = df_dataset_view_test.drop(columns = STATISTIC_COLUMNS)
df_dataset_view_test_bi = df_dataset_view_test_bi.drop(columns = STATISTIC_COLUMNS)
df_compare = df_dataset_view_test.join(df_dataset_view_test_bi, lsuffix='_model_1', rsuffix='_model_2')
df_compare = df_compare.drop(columns = ['Token_model_2', 'Punctuation_model_2'])
df_compare = df_compare.rename(columns={"Token_model_1": "Token", "Punctuation_model_1": "Punctuation"})

In [44]:
df_compare[100:150]

Unnamed: 0,Token,Punctuation,Prediction_model_1,Probability scores_model_1,Prediction_model_2,Probability scores_model_2
100,prawdą,NO,NO,0.519043,",",0.935059
101,o,NO,NO,0.989746,NO,0.875977
102,epoce,",",NO,0.852051,NO,0.750977
103,obiektywizmem,NO,NO,0.572754,.,0.541504
104,historycznym,.,NO,0.642578,",",0.368164
105,w,NO,NO,0.987793,NO,0.961426
106,książce,NO,NO,0.650391,"""",0.927246
107,obok,NO,NO,0.929199,NO,0.655273
108,ułomności,NO,NO,0.541016,",",0.393555
109,merytorycznych,NO,NO,0.578125,.,0.978516
