In [1]:
import pandas as pd
import numpy as np

In [2]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Activation, Dropout, LSTM, Bidirectional, TimeDistributed, InputLayer
from tensorflow.keras.layers import Embedding, Conv1D, Input, concatenate, SpatialDropout1D, Flatten
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import optimizers
import nltk
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from collections import Counter

In [None]:
df = pd.read_json("filtered_and_field_1.json")
df.head()

In [None]:
target = [column for column in df.columns if "field_" in column]
X_train, X_test, y_train, y_test = train_test_split(df["cleaned_abstract"], df[target],
                                                   train_size=0.4, test_size=0.1,
                                                   random_state=42)

In [None]:
X_test

In [None]:
vocab = Counter()
for sent in X_train:
    sent = [word.lower() for word in sent.split()]
    vocab.update(sent)

In [7]:
filtered_vocab = {word for word in vocab if vocab[word] > 5}

word2id = {word: i + 2 for i, word in enumerate(filtered_vocab)}
word2id['pad'] = 0
word2id['unk'] = 1  

id2word = {i: word for word, i in word2id.items()}

In [8]:
def data2ints(data, smth2id):
    int_data = []
    for seq in data:
        int_seq = []
        for i in seq:
            int_seq.append(smth2id.get(i.lower(), 1))
  
        int_data.append(int_seq)
    return int_data

In [9]:
X_train_ids, X_test_ids = data2ints(X_train, word2id), data2ints(X_test, word2id)

In [10]:
sent_max_len = max(len(x) for x in X_train_ids)

sent_max_len

2999

In [11]:
X_train_pad, X_test_pad = pad_sequences(X_train_ids, maxlen=sent_max_len, padding='post'), pad_sequences(X_test_ids, maxlen=sent_max_len, padding='post')

In [12]:
#y_train = y_train["field_Biology"]
#y_test = y_test["field_Biology"]

In [13]:
y_test_replaced = [elem if elem == 1 else 0 for elem in y_test["field_Biology"]]
y_train_replaced = [elem if elem == 1 else 0 for elem in y_train["field_Biology"]]

y_train_cat = to_categorical(y_train_replaced, num_classes=2)
y_test_cat = to_categorical(y_test_replaced, num_classes=2)

In [14]:
import tensorflow as tf

metrics = [tf.keras.metrics.Precision(), tf.keras.metrics.Recall(), tf.keras.metrics.Accuracy()]

# Архитектура 1

модель берет слова, пропускает их через Embedding слой. По эмбеддингам проходит biLSTM, на выходе линейный слой и выходной слой.

Embedding слой обучается внутри модели -- 1 балл

подгружаются обученные эмбеддинги для русского языка ** -- 1 балл

fasttext эмбеддинги обучаются на всем корпусе с нуля *** -- 2 балла

In [15]:
X_train.shape, y_train_cat.shape, X_test.shape, y_test_cat.shape

((847375, 2999), (847375, 2), (211844, 2999), (211844, 2))

In [36]:
word_in = Input(shape=(sent_max_len))
emb_word = Embedding(input_dim=len(word2id), output_dim=20, input_length=sent_max_len, mask_zero=True)(word_in)
lstm = Bidirectional(LSTM(units=128))(emb_word)
out = Dense(2, activation="softmax")(lstm)

model = Model(inputs=word_in, outputs=out)

optimizer = optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=metrics)

In [37]:
model.summary()

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 2999)]            0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 2999, 20)          560840    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               152576    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 514       
Total params: 713,930
Trainable params: 713,930
Non-trainable params: 0
_________________________________________________________________


In [38]:
y_train_cat

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [39]:
model.fit(X_train, y_train_cat, validation_data=(X_test, y_test_cat), batch_size=128, epochs=2, verbose=1)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fe01cc88400>

С accuracy проблема: модель предсказывает числа типа 9.7562476e-1, а сравнивает с 0 и 1. Давайте округлим и пересчитаем

In [40]:
pred = model.predict(X_test[:1000])  # полностью предсказывает долго

rounded_pred = []
for pair in pred:
  rounded_pred.append([round(pair[0]), round(pair[1])])

In [25]:
from sklearn.metrics import precision_score, recall_score, accuracy_score

In [42]:
prec = precision_score(rounded_pred, y_test_cat[:1000], average="macro")
rec = recall_score(rounded_pred, y_test_cat[:1000], average="macro")
acc = accuracy_score(rounded_pred, y_test_cat[:1000])

print(f"Precision: {prec}")
print(f"Recall: {rec}")
print(f"Accuracy: {acc}")

Precision: 0.5016425107474342
Recall: 0.589859437751004
Accuracy: 0.679


# Архитектура 2

In [47]:
# изменим размерность эмбеддингов и лстм, добавим дропаут

word_in = Input(shape=(sent_max_len))
emb_word = Embedding(input_dim=len(word2id), output_dim=50, input_length=sent_max_len, mask_zero=True)(word_in)
lstm = Bidirectional(LSTM(units=256, recurrent_dropout=0.15))(emb_word)
out = Dense(2, activation="softmax")(lstm)

model = Model(inputs=word_in, outputs=out)

optimizer = optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=metrics)

In [48]:
model.summary()

Model: "functional_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 2999)]            0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 2999, 50)          1402100   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 512)               628736    
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 1026      
Total params: 2,031,862
Trainable params: 2,031,862
Non-trainable params: 0
_________________________________________________________________


In [49]:
model.fit(X_train, y_train_cat, validation_data=(X_test, y_test_cat), batch_size=128, epochs=2, verbose=1)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fe0413779a0>

In [50]:
pred = model.predict(X_test[:1000])

rounded_pred = []
for pair in pred:
  rounded_pred.append([round(pair[0]), round(pair[1])])


prec = precision_score(rounded_pred, y_test_cat[:1000], average="macro")
rec = recall_score(rounded_pred, y_test_cat[:1000], average="macro")
acc = accuracy_score(rounded_pred, y_test_cat[:1000])

print(f"Precision: {prec}")
print(f"Recall: {rec}")
print(f"Accuracy: {acc}")

Precision: 0.5048426538936222
Recall: 0.6183428635497253
Accuracy: 0.68


# Архитектура 3

In [15]:
# вернем размерности, оставим дропаут

word_in = Input(shape=(sent_max_len))
emb_word = Embedding(input_dim=len(word2id), output_dim=20, input_length=sent_max_len, mask_zero=True)(word_in)
lstm = Bidirectional(LSTM(units=128, recurrent_dropout=0.15))(emb_word)
out = Dense(2, activation="softmax")(lstm)

model = Model(inputs=word_in, outputs=out)

optimizer = optimizers.SGD(learning_rate=0.01, momentum=0.9)
model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=metrics)

In [None]:
model.fit(X_train_pad, y_train_cat, validation_data=(X_test_pad, y_test_cat), batch_size=128, epochs=1, verbose=1)

1055/6621 [===>..........................] - ETA: 6:41:04 - loss: 0.6208 - precision: 0.6867 - recall: 0.6867 - accuracy: 0.0000e+00

In [None]:
pred = model.predict(X_test_pad[:1000])

rounded_pred = []
for pair in pred:
  rounded_pred.append([round(pair[0]), round(pair[1])])


prec = precision_score(rounded_pred, y_test_cat[:1000], average="macro")
rec = recall_score(rounded_pred, y_test_cat[:1000], average="macro")
acc = accuracy_score(rounded_pred, y_test_cat[:1000])

print(f"Precision: {prec}")
print(f"Recall: {rec}")
print(f"Accuracy: {acc}")

In [None]:
from sklearn.metrics import classification_report

print(classification_report(rounded_pred, y_test_cat[:1000]))

На биологии и половине данных:

Precision: 0.83 среднее (0.94 и 0.72)
Recall: 0.86 среднее (0.88 и 0.85)
Accuracy: 0.87

Для TF-IDF + Logreg было для класса 1 0.88 0.80 0.90

# Эмбеддинги fasttext с нуля

In [68]:
X_train, X_test, y_train, y_test = train_test_split(df["cleaned_abstract"], df[target],
                                                   train_size=0.008, test_size=0.002,
                                                   random_state=42)

In [69]:
corpus = np.concatenate([X_train, X_test])

In [70]:
import gensim

ft = gensim.models.FastText(corpus, size=300, iter=5)

In [74]:
ft_weights = np.array([ft.wv.__getitem__(id2word[id_]) for id_ in id2word])

In [75]:
word_in = Input(shape=(sent_max_len))
emb_word = Embedding(input_dim=len(word2id), output_dim=300, trainable=False, weights=[ft_weights])(word_in)
lstm = Bidirectional(LSTM(units=128, recurrent_dropout=0.15))(emb_word)
out = Dense(2, activation="softmax")(lstm)

model = Model(inputs=word_in, outputs=out)

optimizer = optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=metrics)

In [76]:
model.summary()

Model: "functional_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         [(None, 2999)]            0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 2999, 300)         8412600   
_________________________________________________________________
bidirectional_6 (Bidirection (None, 256)               439296    
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 514       
Total params: 8,852,410
Trainable params: 439,810
Non-trainable params: 8,412,600
_________________________________________________________________


In [78]:
X_train, X_test = pad_sequences(X_train_ids, maxlen=sent_max_len, padding='post'), pad_sequences(X_test_ids, maxlen=sent_max_len, padding='post')

model.fit(X_train, y_train_cat, validation_data=(X_test, y_test_cat), batch_size=128, epochs=1, verbose=1)



<tensorflow.python.keras.callbacks.History at 0x7fe003c70370>

In [80]:
pred = model.predict(X_test[:1000])

rounded_pred = []
for pair in pred:
  rounded_pred.append([round(pair[0]), round(pair[1])])


prec = precision_score(rounded_pred, y_test_cat[:1000], average="macro", zero_division=0)
rec = recall_score(rounded_pred, y_test_cat[:1000], average="macro", zero_division=0)
acc = accuracy_score(rounded_pred, y_test_cat[:1000])

print(f"Precision: {prec}")
print(f"Recall: {rec}")
print(f"Accuracy: {acc}")

Precision: 0.5
Recall: 0.3395
Accuracy: 0.679


# Два входа

In [84]:
chars = set([letter for word in filtered_vocab for letter in word])
n_chars = len(chars)
print(len(chars), chars)

162 {'5', 'υ', '≃', 'в', '8', 'á', '’', '©', 'õ', '½', '∩', 'ù', 'с', 'å', 'm', 'º', 'μ', 'ℒ', 'р', '≡', 'ν', 'ℵ', 'l', '≫', 'é', '»', '̇', '∘', 'b', 'z', 'ü', 'ϕ', '∞', 'δ', '°', 'м', 'ū', '«', '≥', '∧', 'β', '—', 'o', 'ç', '𝔇', 'í', 'κ', 'g', '̈', '1', 'd', 'ℝ', '⊐', '\x96', '𝔼', '§', '⊕', 'v', 'a', '7', '“', 'ƒ', 'γ', '♂', 'į', 'ω', 'y', 'и', 'α', 'à', 'ε', 'ń', '⋆', '∑', '⊙', 'i', 'ó', 'x', 's', 'ě', '–', 'ô', 'c', 'j', 'ú', 'ö', '±', 'n', '⋅', 'r', 'ρ', 'p', 'τ', 'q', 'λ', 'ā', '×', 'u', '…', 'ê', 'è', 'ã', '≳', '⊂', '·', 'ṁ', 'σ', '≠', '≈', '̊', '²', '•', 'η', 'f', '2', 'т', 'µ', '‖', 'ξ', 'ζ', 'ï', 't', '3', '∼', 'π', '≤', '∈', 'h', '‘', 'đ', 'â', '¼', '0', 'k', 'ñ', 'у', 'ℓ', '′', 'ä', 'ß', '®', '≦', 'ψ', '6', 'θ', '✓', '˙', '−', 'ḥ', 'e', '∪', 'ϵ', 'χ', '→', '”', 'ű', '4', '\ue700', '€', 'φ', 'w', '9'}


In [85]:
char2id = {c: i + 2 for i, c in enumerate(chars)}
char2id["pad"] = 0
char2id["unk"] = 1

id2char = {i: char for char, i in char2id.items()}

In [86]:
char_max_len = max(len(x) for x in filtered_vocab)
print("максимальная длина слова:", char_max_len)

максимальная длина слова: 28


In [88]:
X_train, X_test, y_train, y_test = train_test_split(df["cleaned_abstract"], df[target],
                                                   train_size=0.008, test_size=0.002,
                                                   random_state=42)

X_train, X_test = pad_sequences(X_train_ids, maxlen=sent_max_len, padding='post'), pad_sequences(X_test_ids, maxlen=sent_max_len, padding='post')

In [89]:
def make_X_char(sentences):
  X_char = []
  for sentence in sentences:
      sent_seq = []
      for i in range(sent_max_len):
          word_seq = []
          for j in range(char_max_len):
              try:
                  word_seq.append(char2id[sentence[i][j].lower()])
              except:
                  word_seq.append(char2id["pad"])
          sent_seq.append(word_seq)
      X_char.append(np.array(sent_seq))
  return np.array(X_char)


X_char_train, X_char_test = make_X_char(X_train), make_X_char(X_test)

In [92]:
word_in = Input(shape=(sent_max_len))
emb_word = Embedding(input_dim=len(word2id), output_dim=20, input_length=sent_max_len, mask_zero=True)(word_in)

char_in = Input(shape=(sent_max_len, char_max_len))
emb_char = TimeDistributed(Embedding(input_dim=len(char2id), output_dim=10, input_length=char_max_len))(char_in)
char_enc = TimeDistributed(Conv1D(filters=12, kernel_size=3))(emb_char)
char_flat = TimeDistributed(Flatten())(char_enc)

x = concatenate([emb_word, char_flat])
main_lstm = Bidirectional(LSTM(units=128,
                               recurrent_dropout=0.15)
                         )(x)
out = Dense(2, activation="softmax")(main_lstm)

model = Model(inputs=[char_in, word_in], outputs=out)


optimizer = optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=metrics)

In [93]:
model.summary()

Model: "functional_15"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_10 (InputLayer)           [(None, 2999, 28)]   0                                            
__________________________________________________________________________________________________
time_distributed (TimeDistribut (None, 2999, 28, 10) 1640        input_10[0][0]                   
__________________________________________________________________________________________________
input_9 (InputLayer)            [(None, 2999)]       0                                            
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, 2999, 26, 12) 372         time_distributed[0][0]           
______________________________________________________________________________________

In [95]:
X_char_train.shape, X_train.shape, y_train_cat.shape

((16947, 2999, 28), (16947, 2999), (16947, 2))

In [96]:
model.fit([X_char_train, X_train], y_train_cat, validation_data=([X_char_test, X_test], y_test_cat), batch_size=128, epochs=1, verbose=1)



<tensorflow.python.keras.callbacks.History at 0x7fe02602ee20>

In [97]:
from sklearn.metrics import precision_score, recall_score, accuracy_score

pred = model.predict([X_char_test[:1000], X_test[:1000]])

rounded_pred = []
for pair in pred:
  rounded_pred.append([round(pair[0]), round(pair[1])])


prec = precision_score(rounded_pred, y_test_cat[:1000], average="macro")
rec = recall_score(rounded_pred, y_test_cat[:1000], average="macro")
acc = accuracy_score(rounded_pred, y_test_cat[:1000])

print(f"Precision: {prec}")
print(f"Recall: {rec}")
print(f"Accuracy: {acc}")

Precision: 0.5
Recall: 0.3395
Accuracy: 0.679


  _warn_prf(average, modifier, msg_start, len(result))


In [46]:
model.fit([X_char_train, X_train], y_train_cat, validation_data=([X_char_test, X_test], y_test_cat), batch_size=128, epochs=1, verbose=1)



<tensorflow.python.keras.callbacks.History at 0x7fbc69067580>

In [47]:
from sklearn.metrics import precision_score, recall_score, accuracy_score

pred = model.predict([X_char_test[:1000], X_test[:1000]])

rounded_pred = []
for pair in pred:
  rounded_pred.append([round(pair[0]), round(pair[1])])


prec = precision_score(rounded_pred, y_test_cat[:1000], average="macro")
rec = recall_score(rounded_pred, y_test_cat[:1000], average="macro")
acc = accuracy_score(rounded_pred, y_test_cat[:1000])

print(f"Precision: {prec}")
print(f"Recall: {rec}")
print(f"Accuracy: {acc}")

Precision: 0.6472903566457063
Recall: 0.6686637717803551
Accuracy: 0.648


In [98]:
df.head()

Unnamed: 0,DOI,title,abstract,subject,cleaned_abstract,filtered_subject,all_fields,field_Art,field_Biology,field_Business,field_Chemistry,field_Geology,field_Humanities,field_Math,field_Medicine,field_Physics,field_Psychology,field_Social,field_Tech
0,10.1163/1568525043083505,aristotle fr. 44 rose: midas and silenus,<jats:sec><jats:title>Abstract</jats:title><ja...,"[Classics, Linguistics and Language, Archaeolo...",abstract scholars have identified two supposed...,"[Archaeology, Classics, History, Language and ...","[Humanities, Humanities, Humanities, Humanitie...",0,0,0,0,0,1,0,0,0,0,0,0
1,10.1163/1568525043083532,loca loquuntur. lucretius' explanation of the ...,<jats:sec><jats:title>Abstract</jats:title><ja...,"[Classics, Linguistics and Language, Archaeolo...",abstract a discussion of the second part of lu...,"[Archaeology, Classics, History, Language and ...","[Humanities, Humanities, Humanities, Humanitie...",0,0,0,0,0,1,0,0,0,0,0,0
2,10.1163/1568525043083541,poverty and demography: the case of the gracch...,<jats:sec><jats:title>Abstract</jats:title><ja...,"[Classics, Linguistics and Language, Archaeolo...",abstract according to many ancient historians ...,"[Archaeology, Classics, History, Language and ...","[Humanities, Humanities, Humanities, Humanitie...",0,0,0,0,0,1,0,0,0,0,0,0
3,10.1163/1568525043083514,old persian in athens revisited (ar. ach. 100),<jats:sec><jats:title>Abstract</jats:title><ja...,"[Classics, Linguistics and Language, Archaeolo...",abstract the old persian line in aristophanes ...,"[Archaeology, Classics, History, Language and ...","[Humanities, Humanities, Humanities, Humanitie...",0,0,0,0,0,1,0,0,0,0,0,0
4,10.1163/1568527053083412,religion and violence: what can sociology offer?,<jats:sec><jats:title>Abstract</jats:title><ja...,"[Religious studies, History]",abstract this essay presents a sketch of a soc...,"[History, Religious studies]","[Humanities, Humanities]",0,0,0,0,0,1,0,0,0,0,0,0
