In [1]:
import tensorflow as tf

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to only allocate 1GB of memory on the first GPU
    try:
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096)])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized
        print(e)

In [None]:
# !pip install tensorflow==2.0.0 --force-reinstall

In [2]:
import random
import os
import re
import numpy as np
import torch
import pandas as pd
import fasttext
from numpy import asarray
from numpy import zeros
import seaborn as sns
import matplotlib.pyplot as plt
import json


import keras
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.np_utils import to_categorical
from keras import optimizers
from keras import backend as K
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D 
from keras.utils import plot_model
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping

from keras.callbacks import History
from keras.callbacks import ModelCheckpoint

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score, mean_squared_error, log_loss
from sklearn.model_selection import cross_validate

from collections import defaultdict





Using TensorFlow backend.


In [3]:
checkpoint = ModelCheckpoint('../classifier.hdf5', monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)
stopping = EarlyStopping(monitor='val_loss', verbose=1, patience=10)
history = History()

In [4]:
df = pd.read_csv('all_train_data.csv')
df['label'] = df['label'] - 1
df = df.replace(r'\n',' ', regex=True)

In [None]:
ft = fasttext.load_model('ft_native_300_ru_twitter_nltk_word_tokenize.bin')

In [5]:
%%time

from nltk import ngrams

def get_3grams(text):
    return [''.join(ngram) for ngram in ngrams(text, 3)]

if False:
    token2index_3gram = {}
    for text in df['text']:
        for ngram in get_3grams(text):
            if ngram not in token2index_3gram:
                token2index_3gram[ngram] = len(token2index_3gram)
    
    embedding_matrix_3gram = np.zeros((len(token2index_3gram) + 1, ft.get_dimension()))
    for token, index in token2index_3gram.items():
        embedding_matrix_3gram[index] = ft.get_word_vector(token)

CPU times: user 204 ms, sys: 47.7 ms, total: 251 ms
Wall time: 316 ms


In [None]:
embedding_matrix_3gram.shape

In [None]:
# %%time

# tokens = ft.get_labels()
# embedding_matrix = np.zeros((len(tokens) + 1, ft.get_dimension()))
# token2index = {}
# for i, token in enumerate(tokens):
#     print(i, end='\r')
#     embedding_matrix[i] = ft.get_word_vector(token)
#     token2index[token] = i

In [None]:
#import json

#with open('token2index_3gram.json', 'w') as fp:
    #json.dump(token2index_3gram, fp)

In [6]:
import json

with open('token2index.json', 'r') as fp:
    token2index = json.load(fp)

In [None]:
#np.save('embedding_matrix_3gram.npy', embedding_matrix_3gram)

In [7]:
embedding_matrix_3gram = np.load('embedding_matrix_3gram.npy')

In [None]:
token2index

In [None]:
embedding_matrix_3gram.shape

In [None]:
from sklearn.model_selection import train_test_split

X = []

for row in text:
    indexes = []
    for elem in row:
        token_index = token2index.get(elem, embedding_matrix.shape[0] - 1)
        indexes.append(token_index)
    X.append(indexes)

X = pad_sequences(list(X))
       
y = pd.get_dummies(labels)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)

In [8]:
from keras.layers import GlobalMaxPooling1D, MaxPooling1D, LSTM, SpatialDropout1D

class BaseIntentModel:
    def __init__(self, seed=42): 
        self.seed = seed
        self.model = self.build_model()
        self.seed_everything(seed)
        
    def fit(self, X_train, Y_train, X_val, Y_val):
        raise NotImplementedError
        
    def fit_all_data(self, X_train, Y_train):
        raise NotImplementedError
    
    def predict(self, X):
        raise NotImplementedError
    
    def build_model(self):
        raise NotImplementedError
        
    def seed_everything(self, seed):
        random.seed(seed)
        os.environ['PYTHONHASHSEED'] = str(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = True
        
        
class NeuralNetFastText(BaseIntentModel):
    
    
    def __init__(self):
        self.embedding_matrix_3gram = np.load('embedding_matrix_3gram.npy')
        super(NeuralNetFastText, self).__init__()
        self.token2index = json.load(open('token2index_3gram.json', 'rb'))
    
    def prepare_x(self, X_raw):
        
        data = X_raw['text']
        text = [re.sub(r'([^\s\w]|_)+', '', sentence) for sentence in data]
        text = [get_3grams(sentence) for sentence in text]

        X = []
        for row in text:
            indexes = []
            for elem in row:
                token_index = self.token2index.get(elem, self.embedding_matrix_3gram.shape[0] - 1)
                indexes.append(token_index)
            X.append(indexes)
        X = pad_sequences(list(X), maxlen=200)
        return X
    
    def prepare_y(self, Y_raw):
        Y = keras.utils.to_categorical(Y_raw, num_classes=14, dtype='float32')
        return Y
        
    
    def build_model(self):
        
        model = Sequential()
        model.add(Embedding(embedding_matrix_3gram.shape[0], 100,input_length=200,weights=[embedding_matrix_3gram],trainable=False))
        model.add(SpatialDropout1D(0.2))
        model.add(LSTM(300, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
        model.add(LSTM(100, dropout=0.1, recurrent_dropout=0.1))
        
       
        model.add(Dense(14))
        model.add(Activation('sigmoid'))
        
        return model
    
    
    def fit(self, X_train, Y_train, X_val, Y_val):
        X_train = self.prepare_x(X_train)
        X_val = self.prepare_x(X_val)
        Y_train = self.prepare_y(Y_train)
        Y_val = self.prepare_y(Y_val)
        
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        
        callbacks_list = [checkpoint, stopping, history]
        print(X_train.shape, Y_train.shape)
        print(X_val.shape, Y_val.shape)
        self.model.fit(X_train, Y_train, batch_size=5, epochs=20, callbacks=callbacks_list, validation_data=(X_val, Y_val), verbose=0)
        
        
    def fit_all_data(self, X_train, Y_train):
        self.model.fit_all_data(X_train, Y_train)
    
    def predict(self, X):
        X = self.prepare_x(X)
        Y_pred = self.model.predict(X)
        return Y_pred
        

In [9]:
c = NeuralNetFastText()


In [10]:
c.model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 100)          1635000   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 200, 100)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 200, 300)          481200    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_1 (Dense)              (None, 14)                1414      
_________________________________________________________________
activation_1 (Activation)    (None, 14)                0         
Total params: 2,278,014
Trainable params: 643,014
Non-trainable params: 1,635,000
______________________________________

In [11]:
def stratify(df):
    """
    Принимает на вход датафрейм. Отдает разбитый на стратифай фолды датафрейм(список).
    """
    result=[]
    skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=123)
    for train_index, val_index in skf.split(df, df['label']):
        result.append((train_index, val_index))
    return result

In [12]:
def scoring(Y_true, Y_pred):
        Y_true = np.array(Y_true)
        Y_pred = np.array(Y_pred)
    
        print('---------------Classification_report------------------\n', classification_report(Y_true, [_.argmax() for _ in np.array(Y_pred)], digits=4))
        print('RMSE:', mean_squared_error(to_categorical(Y_true, num_classes=14), Y_pred, squared=False))
        print('Logloss:', log_loss(Y_true, Y_pred))

In [13]:
def run_prediction(df, holdout_train_index, holdout_index):
    df_holdout = df.loc[holdout_index]
    X_holdout = df_holdout[['text']] 
    Y_holdout = df_holdout['label']
    
    holdout_predict = np.zeros((df_holdout.shape[0], 14))
    oof_true = []
    oof_pred = []

    for train_index, val_index in stratify(df.loc[holdout_train_index]):
        """
        Извлекаем по очереди K-fold
        """
        df_val = df.loc[val_index]
        df_train = df.loc[train_index]           
                   
        """
        Извлекаем по очереди K-fold для Holdout
        """  
        X_train = df_train[['text']] 
        Y_train = df_train['label']
        X_val = df_val[['text']]
        Y_val = df_val['label']           
        
        """
        Фитим модель на X_train и y_val на каждом фолде
        """
        clf = NeuralNetFastText()
        clf.fit(X_train, Y_train, X_val, Y_val)
        
        """
        Предиктим и скорим модель на каждом фолде
        """
        Y_pred = clf.predict(X_val)
    #     scoring(Y_val, Y_pred)
    
        """
        OUT OF FOLD
        """
        oof_true.extend(Y_val)
        oof_pred.extend(Y_pred)
    
        holdout_predict += 0.2 * clf.predict(X_holdout)
        
    return oof_true, oof_pred, Y_holdout, holdout_predict

In [14]:
for i, (holdout_train_index, holdout_index) in enumerate(stratify(df)):
    oof_true, oof_pred, Y_holdout, holdout_predict = run_prediction(df, holdout_train_index, holdout_index)
    print(f'Fold {i}:')
    scoring(oof_true, oof_pred)
    scoring(Y_holdout, holdout_predict)

(12793, 200) (12793, 14)
(3199, 200) (3199, 14)
(12793, 200) (12793, 14)
(3199, 200) (3199, 14)
(12794, 200) (12794, 14)
(3198, 200) (3198, 14)
(12794, 200) (12794, 14)
(3198, 200) (3198, 14)
(12794, 200) (12794, 14)
(3198, 200) (3198, 14)
Fold 0:
---------------Classification_report------------------
               precision    recall  f1-score   support

           0     0.7200    0.7937    0.7550      4037
           1     0.6865    0.6147    0.6486      2201
           2     0.6096    0.5469    0.5765      1739
           3     0.6401    0.4590    0.5346       891
           4     0.7900    0.9001    0.8415      1542
           5     0.5778    0.2600    0.3586       100
           6     0.6048    0.5498    0.5760       231
           7     0.6932    0.7000    0.6966       710
           8     0.7680    0.8084    0.7877      2088
           9     0.8186    0.7548    0.7854       526
          10     0.7270    0.6694    0.6970       366
          11     0.5730    0.5721    0.5725    

(12794, 200) (12794, 14)
(3199, 200) (3199, 14)
(12794, 200) (12794, 14)
(3199, 200) (3199, 14)
(12794, 200) (12794, 14)
(3199, 200) (3199, 14)
(12795, 200) (12795, 14)
(3198, 200) (3198, 14)
(12795, 200) (12795, 14)
(3198, 200) (3198, 14)
Fold 4:
---------------Classification_report------------------
               precision    recall  f1-score   support

           0     0.7270    0.7719    0.7488      4037
           1     0.6677    0.6535    0.6605      2202
           2     0.6355    0.5394    0.5835      1739
           3     0.6183    0.4781    0.5392       891
           4     0.7820    0.9073    0.8400      1542
           5     0.6296    0.3400    0.4416       100
           6     0.5854    0.5195    0.5505       231
           7     0.7243    0.6662    0.6941       710
           8     0.7582    0.8170    0.7865      2088
           9     0.8408    0.7529    0.7944       526
          10     0.7616    0.6721    0.7141       366
          11     0.5697    0.5891    0.5793    