In [74]:
import csv
import re
from collections import Counter

import pandas as pd
import numpy as np
import json
import io
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.utils import shuffle
from scipy.sparse import csc_matrix
from scipy.sparse import coo_matrix

In [2]:
train_data = []
with open('train_data.json') as f:
    for line in f:
        train_data.append(json.loads(line))
len(train_data)

22676

In [3]:
test_data = []
with open('test_data.json') as f:
    for line in f:
        test_data.append(json.loads(line))
len(test_data)

5670

In [4]:
to_test = Counter()
for d in test_data:
    for m in d['Marks']:
        to_test[m['Mark']] += 1
to_test = list(to_test.keys())
to_test

['.', '"', '»', '…', '?', '!']

In [5]:
test_data[-100]

{'Marks': [{'Index': 26060, 'Mark': '.', 'Pos': 91},
  {'Index': 26061, 'Mark': '.', 'Pos': 282},
  {'Index': 26062, 'Mark': '.', 'Pos': 350},
  {'Index': 26063, 'Mark': '.', 'Pos': 432}],
 'Paragraph': 'На первой площадке она встретила доктора, когда он уже возвращался, неся на руках астронома. Он остановился и своим острым, как скальпель, языком отрезал несколько слов, не очень громко Миссис Паркер застыла в неловкой позе, как платье из негнущейся материи, соскользнувшее с гвоздя. С тех пор чувство неловкости в душе и теле осталось у нее навсегда. Время от времени любопытные жильцы спрашивали, что же это ей сказал тогда доктор.'}

In [6]:
vocab = {}
for x in train_data:
    p = x['Paragraph']
    for c in p:
        if c not in vocab:
            vocab[c] = len(vocab)
for x in test_data:
    p = x['Paragraph']
    for c in p:
        if c not in vocab:
            vocab[c] = len(vocab)

In [7]:
len(vocab)

883

In [8]:
from collections import namedtuple
Sample = namedtuple('Sample', ['before','mark','after','is_end'])

In [9]:
def get_window(string, i):
    prev = ''.join([z for z in string[max(i-window_before, 0):i]])
    prev = '*' * (window_before - len(prev)) + prev
    after = ''.join([z for z in string[i+1:i+1+window_after]])
    after = after + '$' * (window_after - len(after))
    return prev, after, string[i]

In [10]:
window_before = 6
window_after = 6
samples = []
for x in train_data:
    paragraph = x['Paragraph']
    sentences = x['Sentences']
    for i, c in enumerate(paragraph):
        if c in to_test:
            before, after, mark = get_window(paragraph, i)
            end = any([s.endswith(paragraph[i-10:i+1]) for s in sentences])
            samples.append(Sample(before, mark, after, end))

In [35]:
x = lil_matrix((11505, 1), dtype=np.int8)

In [36]:
x[1]= 1
x[10000] = 1

In [38]:
x.shape

(11505, 1)

In [75]:
def sample2input(s):
    w = s.before+s.mark+s.after
    u = [int(i.isupper()) for i in w]
    a = [int(i.isalpha()) for i in w]
    sz_w = len(vocab) * len(w)
    sz = sz_w + len(u) + len(a)
    x = dok_matrix((sz, 1), dtype=np.int8)
    for i, c in enumerate(w):
        x[vocab[c]*(i+1)] = 1
    for i, c in enumerate(u):
        x[sz_w+i] = c
    for i, c in enumerate(a):
        x[sz_w+len(u)+i] = c
    return x

In [76]:
X = []
Y = []
for i, s in enumerate(samples):
    if i % 10000 == 0:
        print (i)
    X.append(sample2input(s))
    Y.append(s.is_end)

0


TypeError: 'coo_matrix' object does not support item assignment

In [70]:
len(vocab)

883

In [73]:
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.2)
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(30, 25, 20, 10))
clf.fit(train_X, train_Y)
res = []
for x in test_X:
    xx = np.array(x).reshape(1, -1)
    res.append(clf.predict(xx)[0])
metrics.roc_auc_score(test_Y, res), metrics.f1_score(test_Y, res)



TypeError: float() argument must be a string or a number, not 'dok_matrix'

In [30]:
test_X = dict()
for x in test_data:
    paragraph = x['Paragraph']
    marks = x['Marks']
    for m in marks:
        pos = m['Pos']
        ind = m['Index']
        before, after, mark = get_window(paragraph, pos)        
        test_X[ind] = sample2input(Sample(before, mark, after, False))

In [32]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(30, 25, 20, 10))
clf.fit(X, Y)
ans = dict()
for i, x in test_X.items():
    xx = np.array(x).reshape(1, -1)
    ans[i] = int(clf.predict(xx)[0])

In [33]:
len(ans)

26476

In [35]:
with open('sampleSubmission.csv', 'w') as fout:
    writer = csv.writer(fout)
    writer.writerow(['Id', 'Mark'])
    for i, mark in ans.items():
        writer.writerow([i, int(mark)])