In [1]:
import numpy as np
import pandas as pd
import pickle
import keras
import re
import tensorflow as tf
from matplotlib import pyplot as plt

from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from pymystem3 import Mystem
from sklearn.model_selection import train_test_split
# Module-level pymystem3 lemmatiser. The text-cleaning cell further down
# assigns `stemmer = Mystem()` again, replacing this instance once it runs.
stemmer = Mystem()

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# CSV that is read into `df`; later cells access its 'date', 'text' and 'tags' columns.
filename = 'data/lenta-ru-news.csv'
# Pickled tokenizer that is loaded below and used for texts_to_sequences
# (presumably a keras Tokenizer — confirm against the code that produced it).
tokenizer_filename = 'data/tokenizer_lenta.dump'

In [3]:
# Load the whole CSV into memory as `df`.
# NOTE(review): the truncated warning in this cell's output looks like pandas'
# DtypeWarning for a mixed-type column — if so, pass an explicit `dtype=` or
# `low_memory=False`; confirm against the full warning text.
df = pd.read_csv(filename)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
def year_extraction(row):
    """Return the year of ``row`` as an int.

    The year is read from the first four characters of ``row['date']``.
    """
    date_value = row['date']
    year_prefix = date_value[:4]
    return int(year_prefix)

In [5]:
# Derive the year from the first four characters of the 'date' column.
# Vectorised string slicing replaces the row-wise `df.apply(..., axis=1)`
# (wrapped in a redundant lambda), which builds a Series per row and is much
# slower on a large frame. The result is the same integer column.
df['year'] = df['date'].str[:4].astype(int)


In [6]:
def year_condition(row):
    """Tell whether ``row['year']`` is greater than 1999."""
    year = row['year']
    is_after_1999 = year > 1999
    return is_after_1999


In [7]:
# Keep only the rows whose 'year' is greater than 1999.
df = df.loc[df['year'].gt(1999)]

In [8]:
# Restore the previously saved tokenizer from disk.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
with open(tokenizer_filename, mode='rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)


In [9]:
# Vocabulary size; passed as the input dimension of the Embedding layer below.
# NOTE(review): token ids produced by the tokenizer must stay below this value —
# confirm the pickled tokenizer was built with a matching word limit.
MAX_NB_WORDS = 50000
# Length every tokenised text is padded (or cut) to by pad_sequences.
MAX_SEQUENCE_LENGTH = 1254
# Size of each embedding vector in the Embedding layer.
EMBEDDING_DIM = 100

In [10]:
# Punctuation that clean_text replaces with a single space.
# Raw string: '\[', '\]' and '\|' are not valid Python string escapes, so the
# non-raw literal triggers an invalid-escape warning on current interpreters.
# The compiled pattern is unchanged.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Characters that clean_text deletes outright: ASCII digits, lowercase Latin
# letters, '#', '+' and '_'.
BAD_SYMBOLS_RE = re.compile(r'[0-9a-z#+_]')
# Russian stop words from the NLTK corpus; clean_text drops these words.
STOPWORDS = set(stopwords.words('russian'))

# Lemmatiser used by clean_text. NOTE(review): `stemmer` is already created in
# the imports cell, so this second instantiation looks redundant — confirm.
stemmer = Mystem()

def clean_text(text):
    """Normalise one raw text before tokenisation.

    Steps, in order: lowercase the text; replace every character matched by
    REPLACE_BY_SPACE_RE with a space; delete every character matched by
    BAD_SYMBOLS_RE; remove any remaining 'x'; split on whitespace and drop
    words found in STOPWORDS; replace each remaining word with the first
    item returned by ``stemmer.lemmatize`` for that word.

    Args:
        text: The text to clean. Values that are not ``str`` (for example a
            float NaN) are converted with ``str()`` first.

    Returns:
        The processed words joined by single spaces.
    """
    # Explicit type check instead of the former bare `except:`, which would
    # also have swallowed KeyboardInterrupt/SystemExit and unrelated errors.
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    text = text.replace('x', '')
    kept_words = (word for word in text.split() if word not in STOPWORDS)
    return " ".join(stemmer.lemmatize(word)[0] for word in kept_words)

In [11]:
def read_text_dataframe_generator(count):
    """Return the cleaned form of the first ``count`` entries of ``df['text']``.

    Despite its name, this builds and returns a complete list rather than a
    generator.
    """
    return [clean_text(raw_text) for raw_text in df['text'][:count]]

In [12]:
# The loaded tokenizer's word_index mapping.
word_index = tokenizer.word_index
# Report how many distinct tokens the mapping holds.
print('Found %s unique tokens.' % len(word_index))

Found 386981 unique tokens.


In [16]:
# Load the dumped texts.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
with open('data/x_text_dumped_array.dump', mode='rb') as texts_file:
    x = pickle.load(texts_file)

# Convert texts to integer sequences and pad them to a common length.
X = keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(x),
    maxlen=MAX_SEQUENCE_LENGTH,
)
print('Shape of data tensor:', X.shape)
print('Text to sequences completed')

# One-hot encode the 'tags' column; one output column per distinct tag.
Y = pd.get_dummies(df['tags']).values
print('Dummies completed: ', Y.shape)

# Hold out 10% of the samples with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.1, random_state=69
)

Shape of data tensor: (797889, 1254)
Text to sequences completed
Dummies completed:  (797889, 94)


In [17]:
# Persist the padded sequence matrix X to disk.
with open('data/padded_x.dump', mode='wb') as padded_file:
    pickle.dump(X, padded_file)

In [18]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, SpatialDropout1D, LSTM

# Embedding -> spatial dropout -> LSTM -> softmax over the columns of Y.
model = Sequential([
    Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(Y.shape[1], activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Training settings consumed by the fit cells below.
epochs = 100
batch_size = 128

In [20]:
# Train on the training split for `epochs` passes with mini-batches of `batch_size`.
# NOTE(review): no validation data is passed, so X_test/Y_test are not
# evaluated during training.
model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size)

Epoch 1/100
 41344/718100 [>.............................] - ETA: 2:11:31 - loss: 2.0103 - accuracy: 0.5727

KeyboardInterrupt: 

In [70]:
def predict_generator(batch_size=1, loop=True):
    """Yield ``(X, y)`` batches built on the fly from ``df`` and ``Y``.

    For each batch the raw texts are cleaned with ``clean_text``, converted
    by the tokenizer and padded to ``MAX_SEQUENCE_LENGTH``. The targets are
    the rows of the global matrix ``Y`` at the same positions.

    Args:
        batch_size: Number of rows per yielded batch. The default of 1 keeps
            the original one-sample-per-step output shape.
        loop: When true (the default) iteration restarts from the first row
            after the last one, because Keras' ``fit_generator`` expects its
            generator to loop over the data indefinitely. Pass ``False`` to
            make a single pass and then stop.

    Yields:
        A tuple ``(X, y)``: padded sequences of shape
        ``(rows, MAX_SEQUENCE_LENGTH)`` and the matching slice of ``Y``.
        The last batch of a pass may hold fewer than ``batch_size`` rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (raised when the
            first item is requested).
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    # Positional access to the text column avoids building a Series per row
    # (as iterrows does) and keeps rows aligned with Y by position.
    texts = df['text']
    n_rows = len(texts)
    if n_rows == 0:
        # Nothing to yield; also prevents an endless empty loop below.
        return

    while True:
        for start in range(0, n_rows, batch_size):
            stop = start + batch_size  # slicing clamps at the end of the data
            cleaned_batch = [clean_text(raw) for raw in texts.iloc[start:stop]]
            sequences = tokenizer.texts_to_sequences(cleaned_batch)
            batch_x = keras.preprocessing.sequence.pad_sequences(
                sequences, maxlen=MAX_SEQUENCE_LENGTH
            )
            batch_y = Y[start:stop]
            yield batch_x, batch_y
        if not loop:
            break

In [71]:
# Pull a single (X, y) pair from the generator and print it for inspection.
pr = predict_generator()
print(next(pr))


(array([[  0,   0,   0, ..., 334, 300, 750]], dtype=int32), array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]], dtype=uint8))


In [None]:
# Train from the on-the-fly generator instead of the pre-padded arrays.
# NOTE(review): predict_generator() yields one sample per step, yet
# steps_per_epoch divides the row count by batch_size, so each "epoch" covers
# only about 1/batch_size of the rows. The generator also iterates over all of
# `df`, presumably including the rows held out in X_test. Confirm intent.
model.fit_generator(predict_generator(), steps_per_epoch=round(len(df.index)/batch_size), epochs=200)


Epoch 1/200
 223/6234 [>.............................] - ETA: 1:59:56 - loss: 0.3639 - accuracy: 0.9821

In [46]:
# Row count of the filtered frame (the quantity steps_per_epoch is derived from).
len(df.index)

797889