# Klasyfikacja tekstów sieciami neuronowymi

Ten plik pozwala na seryjne testowanie klasyfikatorów i pozyskanie wyników (w formacie tabel LaTeX-owych i macierzy pomyłek) dla określonych zbiorów zapisanych w formacie pickle. Uruchomienie wszystkich komórek notatnika powoduje wygenerowanie wyników dla wszystkich określonych zbiorów zapamiętanych w formacie pickle.

### Działania przygotowawcze: podłączenie Dysku Google, instalacja wymaganych pakietów, dołączenie katalogu projektu do ścieżki systemowej i import potrzebnych klas z projektu

In [None]:
# Mount Google Drive in the Colab runtime so the project files are reachable.
from google.colab import drive
drive.mount('/content/gdrive')
# Path to the working directory (edit it if it does not match your Drive layout)
%cd gdrive/MyDrive/praca_inzynierska/authorship_analysis_project/training_models

Mounted at /content/gdrive
/content/gdrive/MyDrive/authorship_anaysis/authorship_analysis/training_models


In [None]:
# Install the packages listed in the requirements file one directory up.
! pip install -r ../requrements.txt
# Install kenlm from the master-branch archive published on GitHub.
! pip install https://github.com/kpu/kenlm/archive/master.zip
# Download the spaCy model pl_core_news_lg.
! python -m spacy download pl_core_news_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting https://github.com/kpu/kenlm/archive/master.zip (from -r ../requrements.txt (line 9))
  Downloading https://github.com/kpu/kenlm/archive/master.zip (553 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m553.5/553.5 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting langid (from -r ../requrements.txt (line 1))
  Downloading langid-1.1.6.tar.gz (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting lime (from -r ../requrements.txt (line 3))
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

In [None]:
import os
import sys
# Make the parent of the working directory importable (added to sys.path only
# once), then print the resulting path and the full search path for inspection.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
print(module_path)
print(sys.path)

/content/gdrive/MyDrive/authorship_anaysis/authorship_analysis
['/content', '/env/python', '/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/usr/local/lib/python3.10/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.10/dist-packages/IPython/extensions', '/root/.ipython', '/content/gdrive/MyDrive/authorship_anaysis/authorship_analysis']


In [None]:
from data_preparation import CorpusPreparation
from pipelines import Pipeline, Explain
from data_fetchers import EpochsFetcher, BookSet



### Zbiory treningowe i testowe

Lista dostępnych zbiorów zapisanych jako pickle. Zbiory oznaczone są liczbami naturalnymi: cyfra dziesiątek oznacza sposób podziału zbioru na zbiory testowe i uczące z określoną liczbą słów w próbce, a cyfra jedności oznacza wybrany preprocessing.

In [None]:
import pickle
from os.path import exists
# Common prefix of the pickled dataset files; the dataset number is appended.
datasets_filepath = 'datasets/features_'

# Probe dataset numbers 0-99 and print the header of every pickle that exists.
for i in range(100):
  path = datasets_filepath + str(i)
  if not exists(path):
    continue
  print(path)
  with open(path, 'rb') as f:
    data = pickle.load(f)
  # Each pickle holds a 5-element sequence; only its first element (the
  # header) is reported here, the remaining four are ignored.
  head_data, _, _, _, _ = data
  print(f'test_size: {head_data[0]}, train_size: {head_data[1]}, number of words in paragraph: {head_data[2]}, preprocessing operations: {head_data[3]}, authors: {head_data[4]}')

datasets/features_0
test_size: 20, train_size: 100, number of words in paragraph: 15, preprocessing operations: ['anonymize'], authors: ['Daniel Naborowski', 'Mikołaj Sęp Szarzyński', 'Elżbieta Drużbacka', 'Adam Mickiewicz', 'Juliusz Słowacki', 'Cyprian Kamil Norwid', 'Adam Asnyk', 'Maria Konopnicka', 'Kazimierz Przerwa-Tetmajer', 'Bolesław Leśmian', 'Jan Kasprowicz']
datasets/features_1
test_size: 20, train_size: 100, number of words in paragraph: 15, preprocessing operations: ['anonymize', 'lower_text', 'remove_stop_words'], authors: ['Daniel Naborowski', 'Mikołaj Sęp Szarzyński', 'Elżbieta Drużbacka', 'Adam Mickiewicz', 'Juliusz Słowacki', 'Cyprian Kamil Norwid', 'Adam Asnyk', 'Maria Konopnicka', 'Kazimierz Przerwa-Tetmajer', 'Bolesław Leśmian', 'Jan Kasprowicz']
datasets/features_2
test_size: 20, train_size: 100, number of words in paragraph: 15, preprocessing operations: ['anonymize'], authors: ['Daniel Naborowski', 'Mikołaj Sęp Szarzyński', 'Elżbieta Drużbacka', 'Adam Mickiewicz'

### Określenie rodzajów preprocessingu i wybór zbiorów, deklaracja zmiennych

In [None]:
# --- Experiment selection -------------------------------------------------
# Preprocessing variants to run; each value indexes preps_descr and forms the
# units digit of the dataset file number.
preps_num = [0,1,2,3,5] # selected preprocessing variants
# Short labels of the preprocessing variants, indexed by preprocessing number.
preps_descr = ['a', 'als', 'alo_NVAdj', 'alp', 'alsp', 'alomp_NVAdj']
# Split variants to run; each value forms the tens digit of the dataset file number.
div_num= [5,6,7] #[0,1,2,3,4,5,6,7] # selected split numbers (sample-size variants)
# Placeholder; recomputed for every (split, preprocessing) pair in the classification loop.
number_of_dataset = 11

# --- State filled in by the classification loop ---------------------------
# These module-level names are read by the helper functions defined below and
# are overwritten for every dataset that is loaded.
mixed = None
test_size = None
train_size = None
words_num_in_par = None
# Text representations passed to Pipeline.
representations = ['emb'] # options noted by the author: 'bow' / 'wp' / 'emb'
# Model identifiers passed to Pipeline.
models= ['nn']
# Network architecture name; also embedded in the output file names.
layers_arch = 'embed_basic' # 'basic', 'lstm', 'embed_basic', 'embed_glove_lstm'
authors = []
preprocessing_list = []
train_classes = []
prep_descr = ''
kinds_descr = ''
latex_set_str = ''


### Definicje funkcji

In [None]:
def get_books_epoch_list():
  """Return the book lists of the current author set, one list per author group.

  Reads the module-level ``authors`` list (printed first, as a progress trace)
  and recognises exactly two author sets:

  * the three prose authors, fetched with kind ``'Epika'`` as a single group;
  * the eleven poets, fetched with kind ``'Liryka'`` and split into four
    groups (the function name suggests one group per literary epoch).

  Returns:
    A list with one element per author group. Each element is the value
    returned by ``BookSet.list_of_books_to_author_title_list`` for the books
    of that group.

  Raises:
    ValueError: if ``authors`` matches neither supported author set. The
      check runs before the book set is fetched, so an unsupported
      configuration fails fast with a clear message instead of an
      ``UnboundLocalError`` at the ``return`` statement.
  """
  print(authors)

  prose_authors = ['Eliza Orzeszkowa', 'Henryk Sienkiewicz', 'Bolesław Prus']
  poetry_author_groups = [
      ['Daniel Naborowski', 'Mikołaj Sęp Szarzyński', 'Elżbieta Drużbacka'],
      ['Adam Mickiewicz', 'Juliusz Słowacki', 'Cyprian Kamil Norwid'],
      ['Adam Asnyk', 'Maria Konopnicka'],
      ['Kazimierz Przerwa-Tetmajer', 'Bolesław Leśmian', 'Jan Kasprowicz'],
  ]

  if authors == prose_authors:
    kinds = ['Epika']
    author_groups = [authors]
  elif authors == [name for group in poetry_author_groups for name in group]:
    # The supported poet list is the four groups concatenated in order.
    kinds = ['Liryka']
    author_groups = poetry_author_groups
  else:
    raise ValueError(f'Unsupported authors list: {authors!r}')

  bookset = BookSet()
  bookset.fetch()
  return [
      bookset.list_of_books_to_author_title_list(
          bookset.get_books_by_authors_list_kinds(group, kinds))
      for group in author_groups
  ]

In [None]:
def get_existing_authors():
  """Return the authors that occur in ``train_classes``, in class-name order.

  Walks the author groups returned by ``get_books_epoch_list()`` in order and,
  within each group, the distinct authors alphabetically. An author is kept
  only if it appears in the module-level ``train_classes``. If an author shows
  up again in a later group with strictly more books than recorded so far, it
  is moved to the end of the result and its book count is updated.

  Returns:
    A list of author names without duplicates.
  """
  known_authors = set(train_classes)
  best_count = {}   # author -> largest per-group book count seen so far
  ordered = []

  for epoch_books in get_books_epoch_list():
    names = [name for name, _title in epoch_books]
    for name in sorted(set(names)):
      if name not in known_authors:
        continue
      count = names.count(name)
      if name not in best_count:
        ordered.append(name)
        best_count[name] = count
      elif count > best_count[name]:
        # Re-position the author behind everything collected so far.
        ordered.remove(name)
        ordered.append(name)
        best_count[name] = count
  return ordered

In [None]:
def conf_path():
  """Build the path of the figure file written by ``run_pipeline``.

  The name is assembled from the module-level settings ``layers_arch``,
  ``train_size``, ``test_size``, ``words_num_in_par``, ``prep_descr`` and
  ``kinds_descr``. A ``_div`` marker is inserted when ``mixed`` is falsy.

  Returns:
    A relative ``.jpg`` path under ``../figures/``.
  """
  div = '' if mixed else '_div'
  return f'../figures/aut_nn_x_fig_{layers_arch}_{train_size}_{test_size}_{words_num_in_par}{div}_{prep_descr}_{kinds_descr}.jpg'

def get_latex_set_str(p: int, d: int):
  """Return the LaTeX math label of the dataset for preprocessing ``p`` and split ``d``.

  The label has the form ``$\\mathbb{S}_{PREP}$`` where ``PREP`` is the part of
  ``preps_descr[p]`` before the first underscore, upper-cased, and ``S`` is
  chosen from the split number: ``Y`` for ``d < 2``, ``Y'`` for ``2 <= d < 5``
  and ``X`` otherwise.

  Args:
    p: index into the module-level ``preps_descr`` list.
    d: split number.

  Returns:
    The label as a string, including the surrounding ``$`` delimiters.
  """
  prep_code, _, _ = preps_descr[p].partition('_')
  if d >= 5:
    symbol = 'X'
  elif d >= 2:
    symbol = "Y'"
  else:
    symbol = 'Y'
  # chr(92) is the backslash that starts the \mathbb command.
  return ''.join(['$', chr(92), 'mathbb{', symbol, '}_{', prep_code.upper(), '}$'])

In [None]:
def get_nn_args_dict():
  """Build the network argument lists passed to ``Pipeline`` as ``ann_args_dict``.

  Returns:
    A dict keyed by ``(representation, model)`` tuples. Each value is the list
    ``[select_k, epochs, batch_size, layer_units, layers_arch]``. Only the
    first configured representation/model pair gets an entry.
  """
  batch_size = layer_units = 64
  # Author's note on the selection size per dataset family: X 8000, Y' 5000, Y 1000.
  select_k = {'bow': 5000, 'wp': 5000}
  epochs = {'bow': 32, 'wp': 32}

  def build_args(kind):
    return [select_k[kind], epochs[kind], batch_size, layer_units, layers_arch]

  # NOTE(review): the first representation always receives the 'bow' settings,
  # whatever representations[0] actually is - confirm this is intended.
  return {
      (representations[0], models[0]): build_args('bow'),
      # Disabled by the author: (representations[1], models[0]): build_args('wp')
  }

In [None]:
def run_pipeline():
  """Run the classification pipeline on the currently loaded dataset and save its outputs.

  Reads the module-level ``train_set``, ``test_set``, ``train_classes``,
  ``test_classes``, ``representations``, ``models``, ``layers_arch`` and
  ``latex_set_str``. After ``Pipeline.pipelines()`` it calls
  ``accuracy_latex_format`` with a ``.tex`` path under ``../results/`` and
  ``save_img`` with the path from ``conf_path()``.
  """
  class_names = get_existing_authors()
  pipeline = Pipeline(
    corpus_train=train_set,
    corpus_test=test_set,
    classes_train=train_classes,
    classes_test=test_classes,
    class_names=class_names,
    representations=representations,
    models=models,
    ann_args_dict=get_nn_args_dict(),
  )
  pipeline.pipelines()
  pipeline.accuracy_latex_format(f'../results/nn_{layers_arch}_x_table.tex', latex_set_str)
  pipeline.save_img(conf_path())
  # Disabled by the author: expl = Explain(pipeline)

### Pętla klasyfikacji

In [None]:
# For every (preprocessing, split) pair: load the pickled dataset, publish its
# contents through the module-level variables read by the helper functions,
# then run the pipeline.
for p in preps_num:
  for d in div_num:
    # File number = split digit followed by preprocessing digit; split 0 has
    # no leading digit.
    number_of_dataset = f'{d}{p}' if d > 0 else str(p)
    mixed = d < 5
    # NOTE(review): 'l' / 'e' presumably correspond to the 'Liryka' / 'Epika'
    # kinds used in get_books_epoch_list - confirm.
    kinds_descr = 'l' if mixed else 'e'
    latex_set_str = get_latex_set_str(p, d)
    prep_descr = preps_descr[p]
    file_name_read = datasets_filepath + number_of_dataset
    with open(file_name_read, 'rb') as f:
      data = pickle.load(f)
    head_data, train_set, train_classes, test_set, test_classes = data
    test_size, train_size, words_num_in_par, preprocessing_list, authors = head_data
    print(get_books_epoch_list())
    run_pipeline()

Output hidden; open in https://colab.research.google.com to view.