In [1]:
import os
import csv
import re
import pickle
from collections import Counter, namedtuple

import numpy as np
import pandas as pd

from gensim.models import KeyedVectors
from keras.preprocessing import sequence
from keras.models import Model, load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import LSTM, Bidirectional, Dropout, Dense, Input, Embedding
from keras.layers.merge import concatenate

Using TensorFlow backend.


In [2]:
# Locations of the dataset splits, the trained model, the vocabulary cache and
# the pre-trained word2vec vectors, all relative to the working directory.
DIR = "data"
TRAIN_FILENAME = os.path.join(DIR, "train.txt")
TEST_FILENAME = os.path.join(DIR, "test.txt")
VAL_FILENAME = os.path.join(DIR, "validation.txt")
# "Emb300" (not "Emb30"): this has to match the file name that
# TuringRNN.train() generates for the default 300-dimensional embeddings,
# which is also the file the prediction cell loads.
MODEL_FILENAME = os.path.join("models", "LSTM256_Emb300_Dense128_dropout0.3.h5")
VOCABULARY_PATH = "vocabulary.pickle"
W2V_PATH = "resources/GoogleNews-vectors-negative300.bin.gz"

In [3]:
# Record types shared by the preprocessing code.
# Sample: one raw row of a dataset file.
Sample = namedtuple('Sample', ['id', 'context', 'response', 'answer'])
# ExtendedSample: a row whose context and response were split into tweets.
ExtendedSample = namedtuple('ExtendedSample', ['id', 'tweets', 'answer'])
# Tweet: one utterance together with the speaker marker that introduced it.
Tweet = namedtuple('Tweet', ['speaker', 'text'])


# Contraction rules, applied in this order. "what's" and "can't" must come
# before the generic "'s" and "n't" rules that would otherwise consume them.
_CONTRACTION_RULES = (
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
)


def clean_text(text):
    """
    Normalise a raw utterance.

    Removes the "@@ " and "<at>" markers, lowercases the text and expands
    common English contractions.

    The text is lowercased *before* the contraction rules run. The rules are
    written in lowercase, so lowercasing afterwards left capitalised forms
    such as "What's", "I'm" or "Can't" unexpanded or wrongly expanded.

    :param text: raw string.
    :return: cleaned, lowercased string. Whitespace is not collapsed, so the
        replacements may leave doubled spaces behind.
    """
    text = text.replace("@@ ", "")
    text = text.replace("<at>", "")
    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)
    return text

def get_tweets(text):
    """
    Split a dialogue string into Tweet records, one per speaker marker.

    Every tweet spans from the start of a speaker marker up to the start of
    the next marker (or the end of the string). The marker itself is removed
    from the tweet text and stored as the speaker. Anything that precedes the
    first marker is dropped, and a string without markers gives an empty list.

    :param text: dialogue string containing speaker markers.
    :return: list of Tweet tuples in order of appearance.
    """
    markers = ["<first_speaker>", "<second_speaker>", "<minor_speaker>", "<third_speaker>"]
    starts = sorted(hit.start() for marker in markers for hit in re.finditer(marker, text))
    ends = starts[1:] + [len(text)]

    tweets = []
    for begin, end in zip(starts, ends):
        chunk = text[begin:end]
        # A chunk starts at one marker and stops before the next one, so
        # exactly one marker can be found in it.
        speaker = next(marker for marker in markers if marker in chunk)
        tweets.append(Tweet(text=chunk.replace(speaker, ""), speaker=speaker))
    return tweets

def samples(filename):
    """
    Lazily read a tab-separated dataset file and yield one ExtendedSample per row.

    The first row is treated as a header. When the header has exactly three
    columns (no answer column), ``answer`` is None on every yielded sample.
    Context and response are passed through clean_text() and split into
    tweets with get_tweets(); the tweets of the context come first.

    The file is closed when the generator is exhausted or closed. An empty
    file yields nothing and blank rows are skipped.

    :param filename: path to the UTF-8 encoded, tab-separated file.
    :return: generator of ExtendedSample tuples.
    """
    # newline="" is the way the csv module documentation asks files to be opened.
    with open(filename, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f, delimiter='\t')
        header = next(reader, None)
        if header is None:
            # Empty file: a bare next(reader) would raise StopIteration, which
            # inside a generator surfaces as RuntimeError.
            return
        needs_answer_column = len(header) == 3
        for row in reader:
            if not row:
                continue
            if needs_answer_column:
                row.append(None)
            raw = Sample._make(row)  # type: Sample
            context = clean_text(raw.context)
            response = clean_text(raw.response)
            yield ExtendedSample(id=raw.id,
                                 tweets=get_tweets(context) + get_tweets(response),
                                 answer=raw.answer)

In [4]:
class Vocabulary(object):
    """
    Indexed vocabulary: a two-way word <-> index mapping with occurrence counts.

    Index 0 is always occupied by the "NotAWord" placeholder. The vocabulary
    is restored from ``dump_filename`` on construction when that file exists.
    """

    # Placeholder that always occupies index 0.
    PAD_WORD = "NotAWord"

    def __init__(self, dump_filename):
        """
        :param dump_filename: path of the pickle file used by save() and load().
        """
        self.dump_filename = dump_filename
        self.reset()

        if os.path.isfile(self.dump_filename):
            self.load()

    def save(self) -> None:
        """Pickle the whole vocabulary object to ``dump_filename``."""
        with open(self.dump_filename, "wb") as f:
            pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)

    def load(self):
        """
        Replace this object's state with the state pickled in ``dump_filename``.

        SECURITY: pickle.load can execute arbitrary code, so only load dumps
        that were produced by save() on a trusted machine.
        """
        with open(self.dump_filename, "rb") as f:
            vocab = pickle.load(f)
        self.__dict__.update(vocab.__dict__)

    def add_word(self, word):
        """
        Register one occurrence of ``word`` and return its index.

        The word gets a new index on first sight. Every call increments the
        occurrence counter; previously only the first call did, which left
        every count at 1 and made shrink() unable to rank words by frequency.
        """
        index = self.word_to_index.get(word)
        if index is None:
            index = len(self.index_to_word)
            self.index_to_word.append(word)
            self.word_to_index[word] = index
        self.count_word(word)
        return index

    def count_word(self, word):
        """Increment the occurrence counter of ``word`` without indexing it."""
        self.counter[word] += 1

    def get_word_index(self, word) -> int:
        """Return the index of ``word``, or -1 when the word is unknown."""
        return self.word_to_index.get(word, -1)

    def get_word(self, index):
        """Return the word stored at ``index``."""
        return self.index_to_word[index]

    def size(self):
        """Return the number of indexed words, placeholder included."""
        return len(self.index_to_word)

    def reset(self):
        """Drop all words and counts, keeping only the placeholder at index 0."""
        self.word_to_index = {self.PAD_WORD: 0}
        self.index_to_word = [self.PAD_WORD]
        self.counter = Counter({self.PAD_WORD: 1})

    def shrink(self, num):
        """
        Keep only the ``num`` most frequent words (plus the placeholder).

        Words are re-indexed in order of decreasing frequency starting at 1.
        Their counts are preserved, and the placeholder does not use up one
        of the ``num`` slots.
        """
        ranked = [(word, count) for word, count in self.counter.most_common()
                  if word != self.PAD_WORD]
        self.reset()
        for word, count in ranked[:num]:
            self.word_to_index[word] = len(self.index_to_word)
            self.index_to_word.append(word)
            self.counter[word] = count
def collect_vocabulary(vocabulary, filename):
    """
    Feed every whitespace-separated word of every tweet in ``filename`` into
    ``vocabulary`` via add_word().

    Prints the running sample count after each 100000 samples.

    :param vocabulary: object exposing add_word(word).
    :param filename: dataset file readable by samples().
    """
    for seen, sample in enumerate(samples(filename), start=1):
        for tweet in sample.tweets:
            for word in tweet.text.split():
                vocabulary.add_word(word)
        if seen % 100000 == 0:
            print(seen)

# A size of 1 means only the placeholder is present, i.e. no cached dump was
# loaded: collect the words of all three splits and cache the result.
vocabulary = Vocabulary(VOCABULARY_PATH)
if vocabulary.size() <= 1:
    for split_filename in (TEST_FILENAME, TRAIN_FILENAME, VAL_FILENAME):
        collect_vocabulary(vocabulary, split_filename)
    vocabulary.save()
    print(vocabulary.size())
# The dump on disk keeps the full vocabulary; only the in-memory copy is shrunk.
vocabulary.shrink(100000)
print(vocabulary.size())

100001


In [None]:
def text_to_indices(text, vocabulary):
    """
    Convert a whitespace-tokenised string into a list of vocabulary indices.

    Words the vocabulary does not know (get_word_index() returns -1) are
    mapped to ``vocabulary.size()``, i.e. one past the last valid index.

    :param text: string to convert.
    :param vocabulary: object exposing get_word_index(word) and size().
    :return: list of int indices, one per word.
    """
    lookup = vocabulary.get_word_index
    resolved = (lookup(token) for token in text.split())
    return [idx if idx != -1 else vocabulary.size() for idx in resolved]


def _pad_batch(data, labels, answers, maxlen):
    """Pad one chunk of variable-length samples into fixed-width arrays."""
    padded_data = sequence.pad_sequences(data, maxlen=maxlen)
    # dtype is passed explicitly: pad_sequences defaults to int32, which
    # silently cast the float32 speaker flags.
    padded_labels = sequence.pad_sequences(labels, maxlen=maxlen, dtype="float32")
    padded_labels = padded_labels.reshape(padded_labels.shape[0], padded_labels.shape[1], 1)
    return (padded_data, padded_labels, answers)


def collect_data(filename, vocabulary, n=None, maxlen=100):
    """
    Yield padded model inputs for the samples of ``filename`` in chunks.

    Each sample's tweets are flattened into one sequence of word indices and
    a parallel sequence of speaker flags. A flag is 1.0 when the word was
    written by the speaker of the sample's last tweet, and 0.0 otherwise.

    :param filename: dataset file readable by samples().
    :param vocabulary: vocabulary passed to text_to_indices().
    :param n: number of samples per yielded chunk; None puts the whole file
        into a single chunk.
    :param maxlen: length every sequence is padded or truncated to.
    :return: generator of ``(data, labels, answers)`` tuples: ``data`` is an
        int array of shape (samples, maxlen), ``labels`` a float32 array of
        shape (samples, maxlen, 1), and ``answers`` the list of raw answer
        values exactly as samples() produced them.

    At least one tuple is always yielded, even for a file without samples.
    No empty trailing chunk is produced when the sample count is an exact
    multiple of ``n``.
    """
    data, labels, answers = [], [], []
    yielded = False
    for sample in samples(filename):
        # A sample without any tweet becomes an empty (all-padding) sequence
        # instead of raising IndexError, so outputs stay aligned with ids.
        response_speaker = sample.tweets[-1].speaker if sample.tweets else None
        word_indices = []
        speaker_labels = []
        for tweet in sample.tweets:
            indices = text_to_indices(tweet.text, vocabulary)
            word_indices += indices
            # text_to_indices gives one index per word, so one flag per index.
            speaker_labels += [float(tweet.speaker == response_speaker)] * len(indices)
        data.append(np.array(word_indices, dtype="int32"))
        labels.append(np.array(speaker_labels, dtype="float32"))
        answers.append(sample.answer)
        if n is not None and len(data) == n:
            yield _pad_batch(data, labels, answers, maxlen)
            yielded = True
            data, labels, answers = [], [], []
    if data or not yielded:
        yield _pad_batch(data, labels, answers, maxlen)

In [None]:
def load_w2v(embeddings_filename):
    """
    Load word vectors stored in the binary word2vec format.

    :param embeddings_filename: path to the binary word2vec file.
    :return: the object returned by KeyedVectors.load_word2vec_format().
    """
    return KeyedVectors.load_word2vec_format(embeddings_filename, binary=True)

def get_weights(w2v, vocabulary, embedding_dim=300):
    """
    Build the initial embedding matrix for ``vocabulary``.

    The matrix has ``vocabulary.size() + 1`` rows. The extra last row belongs
    to the out-of-vocabulary index that text_to_indices() emits. Rows start
    out uniformly random in [-0.1, 0.1) and are overwritten with the
    pre-trained vector whenever the word is present in ``w2v``. Row 0 (the
    placeholder, also the value pad_sequences pads with by default) is always
    all zeros.

    :param w2v: mapping from word to vector supporting ``word in w2v`` and
        ``w2v[word]``. Both forms work on gensim KeyedVectors before and
        after gensim 4, which removed the ``vocab`` attribute.
    :param vocabulary: object exposing size() and index_to_word.
    :param embedding_dim: vector width; must equal the width of ``w2v``'s vectors.
    :return: float array of shape (vocabulary.size() + 1, embedding_dim).
    """
    weights = np.random.uniform(low=-0.1, high=0.1, size=(vocabulary.size() + 1, embedding_dim))
    weights[0] = 0.0
    for i, word in enumerate(vocabulary.index_to_word):
        if i == 0:
            # Never let a pre-trained vector overwrite the padding row.
            continue
        if word in w2v:
            weights[i] = w2v[word]
    return weights

# Load the pre-trained vectors and derive the embedding matrix from them.
w2v = load_w2v(embeddings_filename=W2V_PATH)
weights = get_weights(w2v=w2v, vocabulary=vocabulary)

In [None]:
class TuringRNN:
    """
    Binary classifier over padded (word index, speaker flag) sequences.

    Architecture: frozen pre-trained embedding concatenated with the speaker
    flag -> bidirectional recurrent layer -> dropout -> dense ReLU layer ->
    dropout -> single sigmoid unit.
    """

    def __init__(self, rnn=LSTM, units_rnn=256, units_dense=128, dropout=0.3, batch_size=128, emb_size=300,
                 maxlen=100):
        """
        :param rnn: recurrent layer class wrapped in Bidirectional.
        :param units_rnn: number of units of the recurrent layer.
        :param units_dense: number of units of the hidden dense layer.
        :param dropout: rate of the two Dropout layers (the recurrent dropout
            inside the RNN is fixed at 0.3).
        :param batch_size: batch size for fit, evaluate and predict.
        :param emb_size: embedding width; only used in the saved model's file
            name, the real width comes from the weight matrix given to build().
        :param maxlen: length of the padded input sequences.
        """
        self.rnn = rnn
        self.batch_size = batch_size
        self.units_rnn = units_rnn
        self.units_dense = units_dense
        self.dropout = dropout
        self.model = None
        self.maxlen = maxlen
        self.emb_size = emb_size

    def build(self, vocabulary, weigths) -> None:
        """
        Build and compile the model.

        :param vocabulary: not used; kept so existing calls keep working.
        :param weigths: embedding matrix of shape (rows, embedding width). The
            misspelled name is kept so keyword callers do not break.
        """
        # Use the argument: the body used to read the global `weights` instead.
        weights = weigths
        word_input = Input(shape=(self.maxlen,), dtype='int32')
        speaker_labels = Input(shape=(self.maxlen, 1), dtype='float32')
        word_emb = Embedding(weights.shape[0], weights.shape[1], weights=[weights, ], trainable=False)(word_input)
        emb = concatenate([word_emb, speaker_labels], axis=-1)
        encoded = Bidirectional(self.rnn(self.units_rnn, recurrent_dropout=0.3))(emb)
        merged = Dropout(self.dropout)(encoded)

        dense1 = Dense(self.units_dense, activation='relu')(merged)
        dense1 = Dropout(self.dropout)(dense1)

        predictions = Dense(1, activation='sigmoid')(dense1)
        model = Model(inputs=[word_input, speaker_labels], outputs=predictions)
        model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
        # summary() prints by itself and returns None, so it is not wrapped in print().
        model.summary()
        self.model = model

    def train(self, train_filename, val_filename, vocabulary, epochs=50, chunk_size=100000) -> None:
        """
        Train the model.

        The training file is streamed in chunks of ``chunk_size`` samples.
        Every chunk is fitted for one Keras epoch and the model is saved after
        each chunk. After every pass over the training file the model is
        evaluated on the validation file and the result is printed.

        :param train_filename: training file readable by collect_data().
        :param val_filename: validation file readable by collect_data().
        :param vocabulary: vocabulary passed to collect_data().
        :param epochs: number of passes over the training file.
        :param chunk_size: number of samples fitted at once.
        """
        x_val_words, x_val_speakers, y_val = next(collect_data(val_filename, vocabulary, None))
        # Answers come from the file as strings; make them a numeric array.
        y_val = np.asarray(y_val, dtype="float32")

        models_dir = os.path.join(os.getcwd(), "models")
        # model.save() does not create missing directories.
        os.makedirs(models_dir, exist_ok=True)
        filename = "{rnn}{units_rnn}_Emb{emb}_Dense{units_dense}_dropout{dropout}.h5"
        filename = filename.format(rnn=self.rnn.__name__, units_rnn=self.units_rnn,
                                   units_dense=self.units_dense, dropout=self.dropout,
                                   emb=self.emb_size)
        filename = os.path.join(models_dir, filename)

        for i in range(epochs):
            chunks = collect_data(train_filename, vocabulary, chunk_size)
            for j, (x_train_words, x_train_speakers, y_train) in enumerate(chunks):
                if len(y_train) == 0:
                    # Nothing to fit on an empty chunk.
                    continue
                print("Big epoch: ", i, j)
                self.model.fit([x_train_words, x_train_speakers],
                               np.asarray(y_train, dtype="float32"),
                               epochs=1,
                               batch_size=self.batch_size,
                               shuffle=True,
                               verbose=1)
                self.model.save(filename)
            scores = self.model.evaluate([x_val_words, x_val_speakers], y_val, batch_size=self.batch_size)
            print("Validation: ", scores)

    def load(self, filename: str) -> None:
        """Load a previously saved model from ``filename``."""
        self.model = load_model(filename)

    def predict(self, x, x_labels, ids=None):
        """
        Predict scores and write them to ``answer.csv`` in the working directory.

        :param x: padded word-index array.
        :param x_labels: padded speaker-flag array.
        :param ids: identifiers for the 'id' column, one per row of ``x``;
            defaults to 0..N-1. The method used to read an undefined name
            ``ids`` and always raised NameError.
        :return: the DataFrame that was written, with columns 'id' and 'Bob'.
        """
        preds = self.model.predict([x, x_labels], batch_size=self.batch_size, verbose=1)
        scores = preds.ravel()
        if ids is None:
            ids = range(len(scores))
        submission = pd.DataFrame({'id': list(ids), 'Bob': scores})
        submission.to_csv(os.path.join(os.getcwd(), 'answer.csv'), index=False)
        return submission

In [None]:
# Build a fresh model on top of the embedding matrix and train it on the
# training split, evaluating on the validation split.
rnn = TuringRNN(batch_size=256)
rnn.build(vocabulary=vocabulary, weigths=weights)
rnn.train(train_filename=TRAIN_FILENAME, val_filename=VAL_FILENAME, vocabulary=vocabulary)

In [None]:
import json
from collections import namedtuple


# Defined once at module level rather than inside add_message(), so that all
# messages share a single type instead of getting a new class per call.
Message = namedtuple("Message", "user_id text")


class Dialog:
    """
    One conversation between two users: ids, context and the ordered messages.
    """

    def __init__(self, dialog_id, context, first_user_id, second_user_id,
                 first_user_is_bot=None, second_user_is_bot=None):
        """
        :param dialog_id: dialog identifier; converted with int().
        :param context: context attached to the dialog.
        :param first_user_id: id of the first user.
        :param second_user_id: id of the second user.
        :param first_user_is_bot: whether the first user is a bot, None if unknown.
        :param second_user_is_bot: whether the second user is a bot, None if unknown.
        """
        self.dialog_id = int(dialog_id)
        self.first_user_id = first_user_id
        self.second_user_id = second_user_id
        self.context = context
        self.messages = []
        self.first_user_is_bot = first_user_is_bot
        self.second_user_is_bot = second_user_is_bot

    def add_message(self, user_id, text):
        """Append a message written by ``user_id`` to the dialog."""
        self.messages.append(Message(user_id, text))

    def get_first_user_messages(self):
        """Return the texts of the first user's messages, in order."""
        return [message.text for message in self.messages if message.user_id == self.first_user_id]

    def get_second_user_messages(self):
        """Return the texts of the second user's messages, in order."""
        return [message.text for message in self.messages if message.user_id == self.second_user_id]

    def get_messages(self):
        """Return the list of Message tuples (the internal list, not a copy)."""
        return self.messages

    def get_context(self):
        """Return the dialog context."""
        return self.context

    def __str__(self):
        # format() also copes with user ids that are not strings, where plain
        # string concatenation raised TypeError.
        return "{} {} {}".format(self.dialog_id, self.first_user_id, self.second_user_id)

    def __repr__(self):
        return self.__str__()


def parse(filename):
    """
    Read dialogs from a JSON file.

    The file may hold either a JSON array of dialog objects or a single
    dialog object. Each dialog object must provide "dialogId", "context",
    "users" (at least two entries with an "id") and "thread" (messages with
    "userId" and "text"). A user's optional "userType" decides the bot flag:
    anything other than "Human" counts as a bot, and a missing "userType"
    leaves the flag at None.

    :param filename: path to the UTF-8 encoded JSON file.
    :return: list of Dialog objects; empty for an empty or whitespace-only file.
    """
    with open(filename, "r", encoding="utf-8") as f:
        text = f.read()
    if not text.strip():
        return []

    # Decide on the parsed value rather than on the first character, so
    # leading whitespace cannot route an array down the single-dialog path.
    payload = json.loads(text)
    raw_dialogs = payload if isinstance(payload, list) else [payload]

    result = []
    for raw in raw_dialogs:
        users = raw["users"]
        first_user = users[0]
        second_user = users[1]
        first_user_is_bot = None
        second_user_is_bot = None
        if "userType" in first_user:
            first_user_is_bot = first_user["userType"] != "Human"
        if "userType" in second_user:
            second_user_is_bot = second_user["userType"] != "Human"

        dialog = Dialog(dialog_id=raw["dialogId"], context=raw["context"],
                        first_user_id=first_user["id"], second_user_id=second_user["id"],
                        first_user_is_bot=first_user_is_bot, second_user_is_bot=second_user_is_bot)
        for message in raw["thread"]:
            dialog.add_message(message["userId"], message["text"])
        result.append(dialog)
    return result

In [None]:
def collect(json_filename, maxlen=100):
    """
    Turn the dialogs of a JSON file into padded model inputs.

    Every dialog becomes one sequence of word indices plus a parallel
    sequence of speaker flags, 1.0 for words written by the user with id
    "Bob" and 0.0 otherwise. The flags are emitted per word; they used to be
    emitted per message, which left the two inputs misaligned. Message texts
    go through clean_text(), the same normalisation samples() applies to the
    training data. Indices are looked up in the module-level ``vocabulary``.

    :param json_filename: file readable by parse().
    :param maxlen: length every sequence is padded or truncated to.
    :return: ``(data, labels)``: an int array of shape (dialogs, maxlen) and
        a float32 array of shape (dialogs, maxlen, 1).
    """
    dialogs = parse(json_filename)
    data = []
    labels = []
    for dialog in dialogs:
        word_indices = []
        speaker_labels = []
        for message in dialog.messages:
            indices = text_to_indices(clean_text(message.text), vocabulary)
            word_indices += indices
            speaker_labels += [float(message.user_id == "Bob")] * len(indices)
        data.append(np.array(word_indices, dtype="int32"))
        labels.append(np.array(speaker_labels, dtype="float32"))
    data = sequence.pad_sequences(data, maxlen=maxlen)
    # pad_sequences defaults to int32; keep the flags as float32.
    labels = sequence.pad_sequences(labels, maxlen=maxlen, dtype="float32")
    labels = labels.reshape(labels.shape[0], labels.shape[1], 1)
    return (data, labels)

# Score the dialogs of one JSON dump with the previously trained model.
rnn = TuringRNN(batch_size=256)
rnn.load("models/LSTM256_Emb300_Dense128_dropout0.3.h5")
# collect() returns a (data, labels) tuple while predict() takes the two
# arrays as separate arguments, so the tuple is unpacked first.
x_words, x_speaker_labels = collect("data/day0.json")
rnn.predict(x_words, x_speaker_labels)