### Import Functions

In [1]:
from ipynb.fs.full.utilities import *
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing
import pickle
from sklearn.metrics import classification_report

### Data Prep

In [2]:
# Load the prepared feature (X) and label (y) JSON files for the Train_60
# split and for the Dev split. `load_json` comes from the utilities notebook.
X_train_60, y_train_60 = (
    load_json('../../data/Train_60_X.json'),
    load_json('../../data/Train_60_y.json'),
)
X_dev, y_dev = (
    load_json('../../data/Dev_X.json'),
    load_json('../../data/Dev_y.json'),
)

In [3]:
# Run the utilities helper on the training features/labels.
# NOTE(review): presumably a consistency check between X and y; its exact
# behavior lives in the utilities notebook - confirm there.
verify_prepped_data(X_train_60, y_train_60)

In [4]:
# Run the same utilities helper on the dev features/labels.
# NOTE(review): presumably a consistency check between X and y; its exact
# behavior lives in the utilities notebook - confirm there.
verify_prepped_data(X_dev, y_dev)

In [5]:
def _flatten(nested):
    """Concatenate a list of per-sentence lists into a single flat list, keeping order."""
    return [item for group in nested for item in group]


# The classifier works on individual tokens, so collapse the
# sentence -> token nesting of both the features and the labels.
X_train_60_flat = _flatten(X_train_60)
y_train_60_flat = _flatten(y_train_60)
X_dev_flat = _flatten(X_dev)
y_dev_flat = _flatten(y_dev)

In [6]:
# Sanity check: after flattening there must be exactly one label per training token.
# The cell only displays the boolean; it does not stop execution on a mismatch.
len(X_train_60_flat) == len(y_train_60_flat)

True

In [7]:
# Sanity check: after flattening there must be exactly one label per dev token.
# The cell only displays the boolean; it does not stop execution on a mismatch.
len(X_dev_flat) == len(y_dev_flat)

True

In [8]:
# Inspect the first training sentence: a list with one feature dict per token.
X_train_60[0]

[{'word': '^',
  'sent_len': 6,
  'upos': 'PART',
  'first_word_in_sent': 'нека',
  'contains_interrogative_word': False,
  'contains_interrogative_particle': False,
  'contains_imperative_verb': False,
  'contains_repetitive_conj_before': False,
  'between_two_verbs': False,
  'word_xpos_type': 'Te',
  'BOS': True,
  'next_word': 'нека',
  'next_word_upos': 'PART',
  'next_word_xpos_type': 'Tv',
  'word_after_next_word': 'не',
  'word_after_next_word_upos': 'PART',
  'word_after_next_word_xpos_type': 'Tn',
  'contains_publicistic_word': False},
 {'word': 'нека',
  'sent_len': 6,
  'upos': 'PART',
  'first_word_in_sent': 'нека',
  'contains_interrogative_word': False,
  'contains_interrogative_particle': False,
  'contains_imperative_verb': False,
  'contains_repetitive_conj_before': False,
  'between_two_verbs': False,
  'word_xpos_type': 'Tv',
  'prev_word': '^',
  'prev_word_upos': 'PART',
  'prev_word_xpos_type': 'Te',
  'next_word': 'не',
  'next_word_upos': 'PART',
  'next_word_x

In [9]:
# Inspect the first dev sentence: a list with one feature dict per token.
X_dev[0]

[{'word': '^',
  'sent_len': 42,
  'upos': 'CCONJ',
  'first_word_in_sent': 'но',
  'contains_interrogative_word': False,
  'contains_interrogative_particle': False,
  'contains_imperative_verb': False,
  'contains_repetitive_conj_before': False,
  'between_two_verbs': False,
  'word_xpos_type': 'punct',
  'BOS': True,
  'next_word': 'но',
  'next_word_upos': 'CCONJ',
  'next_word_xpos_type': 'Cc',
  'word_after_next_word': 'синовете',
  'word_after_next_word_upos': 'NOUN',
  'word_after_next_word_xpos_type': 'Nc',
  'word_after_next_word_xpos_gender_number_article': 'mpd',
  'contains_publicistic_word': True},
 {'word': 'но',
  'sent_len': 42,
  'upos': 'CCONJ',
  'first_word_in_sent': 'но',
  'contains_interrogative_word': False,
  'contains_interrogative_particle': False,
  'contains_imperative_verb': False,
  'contains_repetitive_conj_before': False,
  'between_two_verbs': False,
  'word_xpos_type': 'Cc',
  'prev_word': '^',
  'prev_word_upos': 'CCONJ',
  'prev_word_xpos_type': 'pu

In [10]:
# Turn the per-token feature dicts into a numeric feature matrix.
# The vectorizer is fitted on the training tokens only; the dev tokens are
# mapped with that same fitted vectorizer so both matrices share one feature space.
# NOTE(review): the `_svm` suffix looks like a leftover name - these matrices
# are fed to LogisticRegression in the cells below.
v = DictVectorizer()
X_train_60_svm = v.fit_transform(X_train_60_flat)
X_dev_svm = v.transform(X_dev_flat)

In [11]:
# Encode the string labels as integer ids.
# The encoder is fitted on the training labels; the dev labels are mapped with
# the same encoder, so a dev label never seen in training would raise here.
le = preprocessing.LabelEncoder()
y_train_60_svm = le.fit_transform(y_train_60_flat)
y_dev_svm = le.transform(y_dev_flat)

In [12]:
# Show the label vocabulary learned by the encoder; the position of each entry
# is its integer id (the empty string '' is first, i.e. id 0).
le.classes_

array(['', '!', '"', '",', '".', '(', ')', '),', ').', ',', '-', '.',
       '...', ':', ':"', ';', '?'], dtype='<U3')

### LogReg Model

In [14]:
%%time
# Baseline: logistic regression with library defaults (only n_jobs is set).
# NOTE(review): max_iter is left at its default here; the later sections raise
# it to 500 and 1000 and obtain much better scores.
clf = LogisticRegression(n_jobs=-1)
clf.fit(X_train_60_svm, y_train_60_svm)

CPU times: user 349 ms, sys: 633 ms, total: 982 ms
Wall time: 5min 41s


LogisticRegression(n_jobs=-1)

In [15]:
# Persist the fitted baseline model so it can be reloaded without retraining.
pkl_filename = '../../data/logreg_model.pkl'
with open(pkl_filename, mode='wb') as model_file:
    pickle.dump(clf, model_file)

In [16]:
# Encoded class ids known to the classifier, used as `labels=` in the reports below.
labels = list(clf.classes_)

In [17]:
# Predict an encoded label id for every dev token.
y_pred = clf.predict(X_dev_svm)

In [18]:
# Human-readable label strings, in encoder order, used as row names in the reports below.
target_names = list(le.classes_)

In [19]:
# Per-class precision/recall/F1 on the dev set for every class.
# zero_division=0 keeps the scores of never-predicted classes at 0.0 (the same
# numbers as before) without emitting an undefined-metric warning in the output.
print(classification_report(y_dev_svm, y_pred, labels=labels,
                            target_names=target_names, digits=3, zero_division=0))

              precision    recall  f1-score   support

                  0.930     0.994     0.961    240418
           !      0.000     0.000     0.000       373
           "      0.000     0.000     0.000      2243
          ",      0.000     0.000     0.000       848
          ".      0.000     0.000     0.000       300
           (      0.990     0.608     0.754      1693
           )      0.959     0.094     0.171      1003
          ),      0.000     0.000     0.000       385
          ).      0.000     0.000     0.000       271
           ,      0.803     0.416     0.548     18651
           -      0.000     0.000     0.000       935
           .      0.866     0.988     0.923     11435
         ...      0.000     0.000     0.000        70
           :      0.000     0.000     0.000       935
          :"      0.000     0.000     0.000        68
           ;      0.000     0.000     0.000       628
           ?      0.000     0.000     0.000       453

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
# Drop the empty-string class from the report row names.
# Building a new list (instead of list.remove) keeps this cell idempotent:
# re-running it no longer raises ValueError once '' is already gone.
target_names = [name for name in target_names if name != '']

In [21]:
# Drop the encoded id of the empty-string class (id 0, the first entry of le.classes_).
# Building a new list (instead of list.remove) keeps this cell idempotent:
# re-running it no longer raises ValueError once 0 is already gone.
labels = [label for label in labels if label != 0]

In [22]:
# Same report restricted to the remaining (punctuation) classes, so the
# micro/macro averages are not dominated by the empty-string class.
# zero_division=0 keeps the scores of never-predicted classes at 0.0 (the same
# numbers as before) without emitting an undefined-metric warning in the output.
print(classification_report(y_dev_svm, y_pred, labels=labels,
                            target_names=target_names, digits=3, zero_division=0))

              precision    recall  f1-score   support

           !      0.000     0.000     0.000       373
           "      0.000     0.000     0.000      2243
          ",      0.000     0.000     0.000       848
          ".      0.000     0.000     0.000       300
           (      0.990     0.608     0.754      1693
           )      0.959     0.094     0.171      1003
          ),      0.000     0.000     0.000       385
          ).      0.000     0.000     0.000       271
           ,      0.803     0.416     0.548     18651
           -      0.000     0.000     0.000       935
           .      0.866     0.988     0.923     11435
         ...      0.000     0.000     0.000        70
           :      0.000     0.000     0.000       935
          :"      0.000     0.000     0.000        68
           ;      0.000     0.000     0.000       628
           ?      0.000     0.000     0.000       453

   micro avg      0.847     0.501     0.629     40291
   macro avg      0.226   

#### Result: Poor results — most punctuation classes are never predicted

### LogReg - higher max_iter

In [23]:
%%time
# Same model as the baseline, but the solver is allowed up to 500 iterations.
clf = LogisticRegression(n_jobs=-1, max_iter=500)
clf.fit(X_train_60_svm, y_train_60_svm)

CPU times: user 333 ms, sys: 632 ms, total: 965 ms
Wall time: 25min 26s


LogisticRegression(max_iter=500, n_jobs=-1)

In [24]:
# Persist the max_iter=500 model under its own file name (v2).
pkl_filename = '../../data/logreg_v2_model.pkl'
with open(pkl_filename, mode='wb') as model_file:
    pickle.dump(clf, model_file)

In [25]:
# Evaluate the current `clf` (the max_iter=500 fit above) on the dev set.
y_pred = clf.predict(X_dev_svm)
labels = list(clf.classes_)
target_names = list(le.classes_)

# Report 1: every class, including the empty-string class ''.
# zero_division=0 keeps the scores of never-predicted classes at 0.0 (the same
# numbers as the default) without the undefined-metric warning in the output.
print(classification_report(y_dev_svm, y_pred, labels=labels,
                            target_names=target_names, digits=3, zero_division=0))

# Report 2: all classes except ''. The id of '' is looked up through the
# encoder instead of hard-coding 0, and new lists are built rather than
# mutating the ones already passed to the first report.
empty_label_id = le.transform([''])[0]
labels = [label for label in labels if label != empty_label_id]
target_names = [name for name in target_names if name != '']
print(classification_report(y_dev_svm, y_pred, labels=labels,
                            target_names=target_names, digits=3, zero_division=0))

              precision    recall  f1-score   support

                  0.950     0.992     0.971    240418
           !      0.408     0.083     0.138       373
           "      0.833     0.013     0.026      2243
          ",      0.688     0.521     0.593       848
          ".      0.091     0.003     0.006       300
           (      0.916     0.804     0.857      1693
           )      0.810     0.737     0.772      1003
          ),      0.816     0.577     0.676       385
          ).      0.877     0.845     0.861       271
           ,      0.813     0.566     0.667     18651
           -      0.618     0.036     0.069       935
           .      0.932     0.982     0.956     11435
         ...      0.000     0.000     0.000        70
           :      0.791     0.381     0.514       935
          :"      0.000     0.000     0.000        68
           ;      0.042     0.003     0.006       628
           ?      0.734     0.792     0.762       453

    accuracy              

#### Result: Better results with more iterations

### LogReg - even higher max_iter

In [26]:
%%time
# Same model again, with the iteration cap doubled to 1000.
clf = LogisticRegression(n_jobs=-1, max_iter=1000)
clf.fit(X_train_60_svm, y_train_60_svm)

CPU times: user 353 ms, sys: 638 ms, total: 991 ms
Wall time: 56min 53s


LogisticRegression(max_iter=1000, n_jobs=-1)

In [27]:
# Persist the max_iter=1000 model under its own file name (v3).
pkl_filename = '../../data/logreg_v3_model.pkl'
with open(pkl_filename, mode='wb') as model_file:
    pickle.dump(clf, model_file)

In [28]:
# Evaluate the current `clf` (the max_iter=1000 fit above) on the dev set.
y_pred = clf.predict(X_dev_svm)
labels = list(clf.classes_)
target_names = list(le.classes_)

# Report 1: every class, including the empty-string class ''.
# zero_division=0 keeps the scores of never-predicted classes at 0.0 (the same
# numbers as the default) without the undefined-metric warning in the output.
print(classification_report(y_dev_svm, y_pred, labels=labels,
                            target_names=target_names, digits=3, zero_division=0))

# Report 2: all classes except ''. The id of '' is looked up through the
# encoder instead of hard-coding 0, and new lists are built rather than
# mutating the ones already passed to the first report.
empty_label_id = le.transform([''])[0]
labels = [label for label in labels if label != empty_label_id]
target_names = [name for name in target_names if name != '']
print(classification_report(y_dev_svm, y_pred, labels=labels,
                            target_names=target_names, digits=3, zero_division=0))

              precision    recall  f1-score   support

                  0.956     0.991     0.974    240418
           !      0.500     0.123     0.198       373
           "      0.578     0.059     0.108      2243
          ",      0.724     0.544     0.621       848
          ".      0.625     0.017     0.032       300
           (      0.930     0.867     0.897      1693
           )      0.848     0.805     0.826      1003
          ),      0.803     0.616     0.697       385
          ).      0.915     0.875     0.894       271
           ,      0.813     0.617     0.701     18651
           -      0.579     0.087     0.151       935
           .      0.934     0.987     0.960     11435
         ...      0.000     0.000     0.000        70
           :      0.802     0.506     0.620       935
          :"      0.400     0.029     0.055        68
           ;      0.325     0.041     0.073       628
           ?      0.834     0.753     0.791       453

    accuracy              

In [29]:
# Map the predicted ids back to their label strings and let the utilities
# helper rebuild text from the dev tokens plus those predicted labels.
actual_y_pred = le.inverse_transform(y_pred)
punctuated_sentences = scikit_punctuate(X_dev_flat, actual_y_pred)

# Only print a leading slice of the result to keep the cell output readable.
preview_length = 10735
print(punctuated_sentences[:preview_length])

но синовете им не умъртви, защото постъпи според писаното в закона в книгата на мойсей, където господ заповяда и каза: бащите да не умират заради синовете и синовете да не умират заради бащите, а всеки да умира за собствения си грях. но когато ти говоря ще отворя устата ти и ти им кажи: така казва господ бог, който слуша:" нека слуша, а който не слуша, нека не слуша; защото са бунтовен дом. защото кой е по-голям този, който седи на трапезата ли или онзи, който слугува? а след това ще се насели, както в предишните дни, заявява господ. а той беше на задната част заспал на възглавница. ефрем е като птица славата му ще отлети. и така той си отиде от него на известно разстояние. не бягам при халдейците. и израилевите синове излязоха от египетската земя строени. а когато ахитофел видя, че съветът му не се изпълни оседла магарето си и стана и отиде у дома си в своя град и нареди домашните си работи и се обеси. ако умре някое животно от добитъка, който можете да ядете, който се допре до мършат

#### Result: Better results

### LogReg - solver='saga'

In [30]:
%%time
clf = LogisticRegression(n_jobs=-1, max_iter=1000, solver='saga')
clf.fit(X_train_60_svm, y_train_60_svm)

CPU times: user 1h 23min 33s, sys: 5.91 s, total: 1h 23min 39s
Wall time: 1h 23min 40s




LogisticRegression(max_iter=1000, n_jobs=-1, solver='saga')

In [31]:
# Persist the SAGA-solver model under its own file name (v4).
pkl_filename = '../../data/logreg_v4_model.pkl'
with open(pkl_filename, mode='wb') as model_file:
    pickle.dump(clf, model_file)

In [32]:
# Evaluate the current `clf` (the solver='saga' fit above) on the dev set.
y_pred = clf.predict(X_dev_svm)
labels = list(clf.classes_)
target_names = list(le.classes_)

# Report 1: every class, including the empty-string class ''.
# zero_division=0 keeps the scores of never-predicted classes at 0.0 (the same
# numbers as the default) without the undefined-metric warning in the output.
print(classification_report(y_dev_svm, y_pred, labels=labels,
                            target_names=target_names, digits=3, zero_division=0))

# Report 2: all classes except ''. The id of '' is looked up through the
# encoder instead of hard-coding 0, and new lists are built rather than
# mutating the ones already passed to the first report.
empty_label_id = le.transform([''])[0]
labels = [label for label in labels if label != empty_label_id]
target_names = [name for name in target_names if name != '']
print(classification_report(y_dev_svm, y_pred, labels=labels,
                            target_names=target_names, digits=3, zero_division=0))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.954     0.993     0.973    240418
           !      0.533     0.064     0.115       373
           "      0.716     0.033     0.062      2243
          ",      0.725     0.557     0.630       848
          ".      0.500     0.003     0.007       300
           (      0.944     0.852     0.896      1693
           )      0.861     0.781     0.819      1003
          ),      0.892     0.579     0.702       385
          ).      0.955     0.860     0.905       271
           ,      0.834     0.601     0.699     18651
           -      0.684     0.058     0.107       935
           .      0.929     0.990     0.959     11435
         ...      0.000     0.000     0.000        70
           :      0.878     0.452     0.597       935
          :"      0.000     0.000     0.000        68
           ;      0.294     0.008     0.016       628
           ?      0.830     0.764     0.795       453

    accuracy              

#### Result: Greater precision with a lower recall

### LogReg - without penalty

In [14]:
%%time
# Same setup as the max_iter=1000 run, but with regularization switched off.
# NOTE(review): the string 'none' is only accepted by older scikit-learn
# releases; newer ones expect penalty=None - confirm against the pinned version.
clf = LogisticRegression(n_jobs=-1, max_iter=1000, penalty='none')
clf.fit(X_train_60_svm, y_train_60_svm)

CPU times: user 388 ms, sys: 577 ms, total: 964 ms
Wall time: 49min 20s


LogisticRegression(max_iter=1000, n_jobs=-1, penalty='none')

In [15]:
# Persist the unpenalized model under its own file name (v5).
pkl_filename = '../../data/logreg_v5_model.pkl'
with open(pkl_filename, mode='wb') as model_file:
    pickle.dump(clf, model_file)

In [16]:
# Evaluate the current `clf` (the penalty='none' fit above) on the dev set.
y_pred = clf.predict(X_dev_svm)
labels = list(clf.classes_)
target_names = list(le.classes_)

# Report 1: every class, including the empty-string class ''.
# zero_division=0 keeps the scores of never-predicted classes at 0.0 (the same
# numbers as the default) without the undefined-metric warning in the output.
print(classification_report(y_dev_svm, y_pred, labels=labels,
                            target_names=target_names, digits=3, zero_division=0))

# Report 2: all classes except ''. The id of '' is looked up through the
# encoder instead of hard-coding 0, and new lists are built rather than
# mutating the ones already passed to the first report.
empty_label_id = le.transform([''])[0]
labels = [label for label in labels if label != empty_label_id]
target_names = [name for name in target_names if name != '']
print(classification_report(y_dev_svm, y_pred, labels=labels,
                            target_names=target_names, digits=3, zero_division=0))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.956     0.991     0.973    240418
           !      0.506     0.107     0.177       373
           "      0.585     0.069     0.124      2243
          ",      0.720     0.559     0.629       848
          ".      0.583     0.023     0.045       300
           (      0.938     0.864     0.899      1693
           )      0.850     0.798     0.823      1003
          ),      0.759     0.631     0.689       385
          ).      0.895     0.882     0.888       271
           ,      0.813     0.612     0.698     18651
           -      0.551     0.081     0.142       935
           .      0.934     0.987     0.959     11435
         ...      0.000     0.000     0.000        70
           :      0.765     0.519     0.618       935
          :"      0.500     0.029     0.056        68
           ;      0.267     0.037     0.064       628
           ?      0.804     0.786     0.795       453

    accuracy              

#### Result: A slight rise in recall and a fall in precision. I interpret this as meaning that removing the penalty is not a good idea.