In [1]:
import csv
import re
from collections import Counter

import pandas as pd
import numpy as np
import json
import io
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.utils import shuffle

In [2]:
# Load the training set: the file holds one JSON document per line.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default locale encoding.
with open('train_data.json', encoding='utf-8') as f:
    train_data = [json.loads(line) for line in f]
len(train_data)

22676

In [45]:
# Load the test set: the file holds one JSON document per line.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default locale encoding.
with open('test_data.json', encoding='utf-8') as f:
    test_data = [json.loads(line) for line in f]
len(test_data)

5670

In [4]:
# Collect every distinct 'Mark' value that occurs in the test records,
# in order of first appearance (Counter keeps insertion order).
mark_counts = Counter(
    item['Mark']
    for record in test_data
    for item in record['Marks']
)
to_test = list(mark_counts)
to_test

['.', '"', '»', '…', '?', '!']

In [47]:
# Display the last test record to inspect its structure.
test_data[-1]

{'Marks': [{'Index': 26474, 'Mark': '.', 'Pos': 138},
  {'Index': 26475, 'Mark': '.', 'Pos': 254},
  {'Index': 26476, 'Mark': '.', 'Pos': 347}],
 'Paragraph': 'По данным ОВД-Инфо, отмена приговора Валерию Юрцеву — практически единственная успешная апелляция по делам задержанных 5 декабря 2011 года. Несколько десятков задержанных подавали апелляции в районные суды, однако жалобы были оставлены без удовлетворения. Сейчас дела задержанных 5 декабря готовятся для подачи в Европейский суд по правам человека.'}

In [32]:
# Character -> integer id table. Ids are handed out in order of first
# appearance, scanning the training paragraphs first and the test ones after.
vocab = {}
for dataset in (train_data, test_data):
    for record in dataset:
        for ch in record['Paragraph']:
            # len(vocab) is evaluated before any insertion, so a new
            # character receives the next unused id; known ones are untouched.
            vocab.setdefault(ch, len(vocab))

In [33]:
# Number of distinct keys currently stored in vocab.
len(vocab)

883

In [34]:
from collections import namedtuple

# One candidate mark: the text before it, the mark character itself, the
# text after it, and the is_end label attached to it.
Sample = namedtuple('Sample', 'before mark after is_end')

In [35]:
def get_window(string, i, n_before=None, n_after=None):
    """Return the fixed-width character context around position ``i``.

    Parameters
    ----------
    string : str
        Text to take the context from.
    i : int
        Index of the character of interest within ``string``.
    n_before, n_after : int, optional
        Number of characters to return on the left / right of position ``i``.
        When omitted, the module-level ``window_before`` / ``window_after``
        values are used (looked up at call time).

    Returns
    -------
    tuple of (str, str, str)
        ``(prev, after, mark)`` where ``prev`` is the text preceding ``i``
        left-padded with ``'*'`` to ``n_before`` characters, ``after`` is the
        text following ``i`` right-padded with ``'$'`` to ``n_after``
        characters, and ``mark`` is ``string[i]``.

    Raises
    ------
    IndexError
        If ``i`` is not a valid index into ``string``.
    """
    if n_before is None:
        n_before = window_before
    if n_after is None:
        n_after = window_after
    # Clamp the left edge at 0 so a mark near the start of the text does not
    # produce a negative slice start; the missing characters are padded.
    prev = ''.join(string[max(i - n_before, 0):i]).rjust(n_before, '*')
    after = ''.join(string[i + 1:i + 1 + n_after]).ljust(n_after, '$')
    return prev, after, string[i]

In [36]:
# Context sizes (in characters) that get_window reads when it is called
# without explicit sizes.
window_before = 6
window_after = 6

# One Sample per occurrence of a character listed in to_test, labelled by
# whether some entry of the record's 'Sentences' ends with the text leading
# up to (and including) that character.
samples = []
for x in train_data:
    paragraph = x['Paragraph']
    sentences = x['Sentences']
    for i, c in enumerate(paragraph):
        if c in to_test:
            before, after, mark = get_window(paragraph, i)
            # Compare on (up to) the last 11 characters ending at the mark.
            # The start must be clamped at 0: for i < 10 a negative start
            # counts from the end of the paragraph and usually yields '',
            # and str.endswith('') is always True, which would label every
            # mark near the start of a paragraph as a sentence end.
            tail = paragraph[max(i - 10, 0):i + 1]
            end = any(s.endswith(tail) for s in sentences)
            samples.append(Sample(before, mark, after, end))

In [22]:
def sample2input(sample, char_index=None):
    """Encode a sample's character window as a flat list of integers.

    The window is ``sample.before + sample.mark + sample.after``. The result
    is the concatenation of three equally long lists computed over that
    window: the lookup-table id of each character, a 0/1 flag for
    ``str.isupper`` and a 0/1 flag for ``str.isalpha``.

    Parameters
    ----------
    sample : object with ``before``, ``mark`` and ``after`` string attributes
        The sample to encode. The function reads this argument only; it does
        not depend on any module-level loop variable.
    char_index : mapping of str to int, optional
        Character lookup table. Defaults to the module-level ``vocab``.

    Returns
    -------
    list of int
        ``ids + upper_flags + alpha_flags``.

    Raises
    ------
    KeyError
        If a character of the window (including any padding characters it
        contains) is not a key of the lookup table.
    """
    if char_index is None:
        char_index = vocab
    w = sample.before + sample.mark + sample.after
    ids = [char_index[ch] for ch in w]
    upper_flags = [int(ch.isupper()) for ch in w]
    alpha_flags = [int(ch.isalpha()) for ch in w]
    return ids + upper_flags + alpha_flags

In [23]:
# Build the model inputs X (one encoded window per sample) and the matching
# targets Y (each sample's is_end value), in the order of `samples`.
X = []
Y = []
# This is a plain module-level loop, so `s` remains bound to the last sample
# after it finishes.
for s in samples:
    X.append(sample2input(s))
    Y.append(s.is_end)

In [44]:
# Disabled hold-out evaluation: 80/20 split of (X, Y), fit the same MLP
# configuration on the training part, predict the held-out part one row at a
# time, then report (ROC AUC, F1). Note that ROC AUC is computed here from the
# hard predict() outputs, not from predicted probabilities.
# NOTE(review): the tuple recorded under this cell presumably comes from a run
# made while this code was still active - confirm before quoting it.
# train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.2)
# clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(20, 15, 10))
# clf.fit(train_X, train_Y)
# res = []
# for x in test_X:
#     xx = np.array(x).reshape(1, -1)
#     res.append(clf.predict(xx)[0])
# metrics.roc_auc_score(test_Y, res), metrics.f1_score(test_Y, res)

(0.92600943569318106, 0.96140196831668678)

In [49]:
# Encoded window for every mark listed in the test records, keyed by the
# mark's 'Index'. The label slot of the Sample is a placeholder (False).
test_X = {}
for record in test_data:
    text = record['Paragraph']
    for entry in record['Marks']:
        left, right, symbol = get_window(text, entry['Pos'])
        encoded = sample2input(Sample(left, symbol, right, False))
        test_X[entry['Index']] = encoded

In [50]:
# Fit the classifier on all labelled samples. random_state pins the weight
# initialisation so that re-running the notebook reproduces the same model.
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(20, 15, 10),
                    random_state=0)
clf.fit(X, Y)

# Predict every test mark in a single batched call instead of one predict()
# call per row; ans maps each mark id to its predicted label as 0/1.
ans = dict()
mark_ids = list(test_X)
if mark_ids:  # predict() rejects an empty array, so skip it when there is nothing to score
    predictions = clf.predict(np.array([test_X[k] for k in mark_ids]))
    ans = {k: int(p) for k, p in zip(mark_ids, predictions)}

In [51]:
# Number of entries in ans (one per mark id that received a prediction).
len(ans)

26476

In [52]:
# Write the predictions as a two-column CSV: mark id, predicted label.
# newline='' is required by the csv module: the writer emits its own '\r\n'
# terminators, and without it Windows would translate them into '\r\r\n'.
with open('sampleSubmission.csv', 'w', newline='') as fout:
    writer = csv.writer(fout)
    writer.writerow(['Id', 'Mark'])
    for i, mark in ans.items():
        writer.writerow([i, int(mark)])