In [2]:
import csv
import re
from collections import Counter

import pandas as pd
import numpy as np
import json
import io
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.utils import shuffle
from scipy.sparse import csc_matrix
from scipy.sparse import coo_matrix
from sklearn.preprocessing import OneHotEncoder

In [3]:
train_data = []
with open('train_data.json') as f:
    for line in f:
        train_data.append(json.loads(line))
len(train_data)

22676

In [79]:
test_data = []
with open('test_data.json') as f:
    for line in f:
        test_data.append(json.loads(line))
len(test_data)

5670

In [80]:
to_test = Counter()
for d in test_data:
    for m in d['Marks']:
        to_test[m['Mark']] += 1
to_test = list(to_test.keys())
to_test

['.', '"', '»', '…', '?', '!']

In [81]:
test_data[-100]

{'Marks': [{'Index': 26060, 'Mark': '.', 'Pos': 91},
  {'Index': 26061, 'Mark': '.', 'Pos': 282},
  {'Index': 26062, 'Mark': '.', 'Pos': 350},
  {'Index': 26063, 'Mark': '.', 'Pos': 432}],
 'Paragraph': 'На первой площадке она встретила доктора, когда он уже возвращался, неся на руках астронома. Он остановился и своим острым, как скальпель, языком отрезал несколько слов, не очень громко Миссис Паркер застыла в неловкой позе, как платье из негнущейся материи, соскользнувшее с гвоздя. С тех пор чувство неловкости в душе и теле осталось у нее навсегда. Время от времени любопытные жильцы спрашивали, что же это ей сказал тогда доктор.'}

In [7]:
vocab = {}
for x in train_data:
    p = x['Paragraph']
    for c in p:
        if c not in vocab:
            vocab[c] = len(vocab)
for x in test_data:
    p = x['Paragraph']
    for c in p:
        if c not in vocab:
            vocab[c] = len(vocab)

In [8]:
len(vocab)

883

In [9]:
from collections import namedtuple
Sample = namedtuple('Sample', ['before','mark','after','is_end'])

In [10]:
def get_window(string, i):
    prev = ''.join([z for z in string[max(i-window_before, 0):i]])
    prev = '*' * (window_before - len(prev)) + prev
    after = ''.join([z for z in string[i+1:i+1+window_after]])
    after = after + '$' * (window_after - len(after))
    return prev, after, string[i]

In [93]:
window_before = 10
window_after = 10
samples = []
for x in train_data:
    paragraph = x['Paragraph']
    sentences = x['Sentences']
    for i, c in enumerate(paragraph):
        if c in to_test:
            before, after, mark = get_window(paragraph, i)
            end = any([s.endswith(paragraph[i-10:i+1]) for s in sentences])
            samples.append(Sample(before, mark, after, end))

In [95]:
def sample2input(s):
    w = s.before+s.mark+s.after
    u = [int(i.isupper()) for i in w]
    a = [int(i.isalpha()) for i in w]
    m = [int(i in to_test) for i in w]
    return [vocab[i] for i in w] + u + a

In [96]:
X = []
Y = []
for i, s in enumerate(samples):
    X.append(sample2input(s))
    Y.append(s.is_end)

In [97]:
test_X = []
for x in test_data:
    paragraph = x['Paragraph']
    marks = x['Marks']
    for m in marks:
        pos = m['Pos']
        ind = m['Index']
        before, after, mark = get_window(paragraph, pos)        
        test_X.append(sample2input(Sample(before, mark, after, False)))
len(test_X)

26476

In [98]:
enc = OneHotEncoder()
enc.fit(X + test_X)

OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [99]:
X = enc.transform(X)
test_X = enc.transform(test_X)

In [100]:
len(vocab)

883

In [74]:
# train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.2)
# clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100, 50, 25, 10))
# clf.fit(train_X, train_Y)
# res = []
# for x in test_X:
#     xx = np.array(x).tolist()
#     res.append(clf.predict(xx)[0])
# metrics.roc_auc_score(test_Y, res), metrics.f1_score(test_Y, res)

In [101]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(200, 100, 50, 10))
clf.fit(X, Y)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(200, 100, 50, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [102]:
ans = dict()
for i, x in enumerate(test_X):
    xx = np.array(x).tolist()
    ans[i+1] = int(clf.predict(xx)[0])

In [103]:
len(ans)

26476

In [104]:
with open('sampleSubmission.csv', 'w') as fout:
    writer = csv.writer(fout)
    writer.writerow(['Id', 'Mark'])
    for i, mark in ans.items():
        writer.writerow([i, int(mark)])