In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy as sc
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
f = open('task2_lemmas_train')
lines = []
for line in f:
    lines.append(line[:-2].decode('utf-8'))
    
train_data = []
for line in lines[1:]:
    array = line.split(',')
    train_data.append([array[1], zip(map(lambda x: x[:-2], array[2:]), map(lambda x: x[-1], array[2:]))])

In [3]:
f = open('task2_lemmas_test')
test_words = []
for line in f:
    test_words.append(line[:-2].decode('utf-8').split(',')[-1])
test_words = test_words[1:]

In [4]:
def shortest_of(strings):
    return min(strings, key=len)

def long_substr(strings):
    substr = ""
    if not strings:
        return substr
    reference = shortest_of(strings) 
    length = len(reference)
    for i in xrange(length):
        for j in xrange(i + len(substr) + 1, length + 1):
            candidate = reference[i:j]  
            if all(candidate in text for text in strings):
                substr = candidate
    return substr

def prefix(word, root):
    for i in range(len(word)-len(root) + 1):
        if word[i:i+len(root)] == root:
            return word[:i]
        
def suffix(word, root):
    ret = prefix(word[::-1], root[::-1])
    return ret[::-1] if ret else u''

In [5]:
train_dataset = [[], [], [], [], []]
suffixes = []
for elem in train_data:
    strings = [elem[0]] + map(lambda x: x[0], elem[1])
    root = long_substr(strings)
    if root != '' and len(root) > 1:
        if root[0] == '-':
            root = root[1:]
    train_dataset[0].append(elem[0])
    train_dataset[1].append(suffix(elem[0], root))
    train_dataset[2].append(elem[1][0][0])
    train_dataset[3].append(suffix(elem[1][0][0], root))
    train_dataset[4].append(elem[1][0][1])
    for x in strings:
        suff = suffix(x, root)
        if suff:
            suffixes.append(suff) 
        
suffixes = sc.unique(suffixes)

In [6]:
for i in range(5):
    print i, train_dataset[i][:5]

0 [u'vergognerete', u'amnistiavate', u'menomazione', u'sfaldavamo', u'sfodererei']
1 [u'erete', u'vate', u'', u'vamo', u'erei']
2 [u'vergognare', u'amnistiare', u'menomazione', u'sfaldare', u'sfoderare']
3 [u'are', u're', u'', u're', u'are']
4 [u'V', u'V', u'N', u'V', u'V']


In [23]:
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 7), lowercase = True)
X = cv.fit_transform(train_dataset[0])
print np.mean(cross_val_score(LogisticRegression(), X, train_dataset[4], verbose=2))

[CV]  ................................................................
[CV] ................................................. , total=  21.4s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   21.5s remaining:    0.0s


[CV] ................................................. , total=  27.6s
[CV]  ................................................................
[CV] ................................................. , total=  23.7s
0.962879318504


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.2min finished


In [24]:
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 7), lowercase = True)
X = cv.fit_transform(train_dataset[0])
print np.mean(cross_val_score(LogisticRegression(), X, train_dataset[3], verbose=2))



[CV]  ................................................................
[CV] ................................................. , total= 4.8min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.8min remaining:    0.0s


[CV] ................................................. , total= 5.2min
[CV]  ................................................................
[CV] ................................................. , total= 5.3min
0.94684858034


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 15.4min finished


In [25]:
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 7), lowercase = True)
X = cv.fit_transform(train_dataset[0])
print np.mean(cross_val_score(LogisticRegression(), X, train_dataset[1], verbose=2))

[CV]  ................................................................
[CV] ................................................. , total=17.8min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 17.8min remaining:    0.0s


[CV] ................................................. , total=20.0min
[CV]  ................................................................
[CV] ................................................. , total=20.8min
0.915086629488


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 58.5min finished


In [7]:
train_suff_len = map(len, train_dataset[1])

In [27]:
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 7), lowercase = True)
X = cv.fit_transform(train_dataset[0])
print np.mean(cross_val_score(LogisticRegression(), X, train_suff_len, verbose=2))

[CV]  ................................................................
[CV] ................................................. , total= 1.3min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.3min remaining:    0.0s


[CV] ................................................. , total= 1.2min
[CV]  ................................................................
[CV] ................................................. , total= 1.3min
0.92195723801


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.8min finished


Полный ответ

In [9]:
%%time
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 7), lowercase = True)
X = cv.fit_transform(train_dataset[0])
ind = 4*X.shape[0]/5
clf = LogisticRegression()
clf.fit(X[:ind], train_dataset[4][:ind])
class_pred = clf.predict(X[ind:])
print 'class predicted'
clf.fit(X[:ind], train_dataset[3][:ind])
ending_pred = clf.predict(X[ind:])
print 'ending predicted'
clf.fit(X[:ind], train_suff_len[:ind])
len_suff_pred = clf.predict(X[ind:])
print 'len predicted'

class predicted
ending predicted
len predicted


In [11]:
print len(class_pred), len(ending_pred), len(len_suff_pred) 

23728 23728 23728


In [12]:
predictions = []
for i in range(len(len_suff_pred)):
    predictions.append(train_dataset[0][ind+i][:-len_suff_pred[i]] + ending_pred[i] + '+' + class_pred[i])

In [18]:
true_values = []
for i in range(len(len_suff_pred)):
    true_values.append(train_dataset[2][ind+i] + '+' + train_dataset[4][ind+i])

In [21]:
accuracy_score(true_values, predictions)

0.81393290627107218