In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy as sc
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the training table. The first line is skipped as a header; for every
# other record we keep the word form (column 1) and a list of (lemma, tag)
# pairs built from the remaining columns. Each of those columns is presumably
# "<lemma>+<tag>": the last character is taken as the tag and the last two
# characters are cut off to obtain the lemma.
with open('task2_lemmas_train') as f:
    # Strip only the line terminator. Blindly slicing off two characters would
    # eat real data on an LF-terminated line or on a final line that has no
    # trailing newline. The `with` block also guarantees the file is closed.
    lines = [line.rstrip('\r\n').decode('utf-8') for line in f]

train_data = []
for line in lines[1:]:
    array = line.split(',')
    lemma_tag_pairs = [(x[:-2], x[-1]) for x in array[2:]]
    train_data.append([array[1], lemma_tag_pairs])

In [3]:
# Load the test word forms: the last comma-separated field of every line,
# with the first (header) line dropped afterwards.
with open('task2_lemmas_test') as f:
    # rstrip removes just the line terminator (CRLF or LF) instead of always
    # chopping two characters; the `with` block closes the file for us.
    test_words = [line.rstrip('\r\n').decode('utf-8').split(',')[-1] for line in f]
test_words = test_words[1:]

In [4]:
def shortest_of(strings):
    """Return the shortest string of a non-empty collection (first one on ties)."""
    return min(strings, key=len)

def long_substr(strings):
    """Return the longest substring that occurs in every string of `strings`.

    Candidates are taken from the shortest string, scanning start positions
    left to right; among equally long common substrings the first one found
    wins. Returns "" when `strings` is empty or nothing is shared.
    """
    substr = ""
    if not strings:
        return substr
    reference = shortest_of(strings)
    length = len(reference)
    for i in range(length):
        # Only candidates strictly longer than the current best are of interest.
        for j in range(i + len(substr) + 1, length + 1):
            candidate = reference[i:j]
            if all(candidate in text for text in strings):
                substr = candidate
            else:
                # Extending a candidate that is already missing from some
                # string can never make it match, so stop growing from `i`.
                break
    return substr

def prefix(word, root):
    """Return the part of `word` before the first occurrence of `root`.

    Returns None when `root` does not occur in `word`.
    """
    position = word.find(root)
    if position < 0:
        return None
    return word[:position]

def suffix(word, root):
    """Return the part of `word` after the last occurrence of `root`.

    Returns u'' when `root` is absent or nothing follows it.
    """
    start = word.rfind(root)
    if start < 0:
        return u''
    tail = word[start + len(root):]
    return tail if tail else u''

In [5]:
# Column-oriented training table, one entry per training record:
#   [0] word form
#   [1] ending of the word form after the shared root
#   [2] first lemma
#   [3] ending of the first lemma after the shared root
#   [4] tag of the first lemma
train_dataset = [[], [], [], [], []]
suffixes = []
for elem in train_data:
    # The root is the longest substring shared by the word form and all its lemmas.
    strings = [elem[0]] + [pair[0] for pair in elem[1]]
    root = long_substr(strings)
    # Drop a leading hyphen from roots longer than one character.
    if len(root) > 1 and root[0] == '-':
        root = root[1:]
    train_dataset[0].append(elem[0])
    train_dataset[1].append(suffix(elem[0], root))
    train_dataset[2].append(elem[1][0][0])
    train_dataset[3].append(suffix(elem[1][0][0], root))
    train_dataset[4].append(elem[1][0][1])
    # Collect every non-empty ending seen in the word form or its lemmas.
    for x in strings:
        suff = suffix(x, root)
        if suff:
            suffixes.append(suff)

# np.unique is the function that the deprecated scipy-namespace alias
# `sc.unique` forwarded to; it returns the sorted distinct endings.
suffixes = np.unique(suffixes)

In [6]:
for i in range(5):
    print i, train_dataset[i][:5]

0 [u'vergognerete', u'amnistiavate', u'menomazione', u'sfaldavamo', u'sfodererei']
1 [u'erete', u'vate', u'', u'vamo', u'erei']
2 [u'vergognare', u'amnistiare', u'menomazione', u'sfaldare', u'sfoderare']
3 [u'are', u're', u'', u're', u'are']
4 [u'V', u'V', u'N', u'V', u'V']


<h2>Предсказание части речи</h2>

In [23]:
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 7), lowercase = True)
X = cv.fit_transform(train_dataset[0])
print np.mean(cross_val_score(LogisticRegression(), X, train_dataset[4], verbose=2))

[CV]  ................................................................
[CV] ................................................. , total=  21.4s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   21.5s remaining:    0.0s


[CV] ................................................. , total=  27.6s
[CV]  ................................................................
[CV] ................................................. , total=  23.7s
0.962879318504


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.2min finished


In [24]:
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 8), lowercase = True)
X = cv.fit_transform(train_dataset[0])
print np.mean(cross_val_score(LogisticRegression(C=50), X, train_dataset[4], verbose=2))

[CV]  ................................................................
[CV] ................................................. , total=  31.3s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   31.3s remaining:    0.0s


[CV] ................................................. , total=  36.2s
[CV]  ................................................................
[CV] ................................................. , total=  40.8s
0.967506762878


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.8min finished


In [27]:
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 10), lowercase = True)
X = cv.fit_transform(train_dataset[0])
print np.mean(cross_val_score(LogisticRegression(C=50), X, train_dataset[4], verbose=2))

[CV]  ................................................................
[CV] ................................................. , total=  34.1s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   34.1s remaining:    0.0s


[CV] ................................................. , total=  41.3s
[CV]  ................................................................
[CV] ................................................. , total=  46.0s
0.966520587274


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.0min finished


In [None]:
scores = []
c_range = range(4, 10)
for i in c_range:
    cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, i), lowercase = True)
    X = cv.fit_transform(train_dataset[0])
    scores.append(np.mean(cross_val_score(LogisticRegression(), X, train_dataset[4])))
    print i, scores[-1]
index = np.argmax(scores)
print c_range[index], scores[index]
plt.plot(c_range, score)
plt.show()

<h2>Предсказание суффикса</h2>

In [24]:
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 7), lowercase = True)
X = cv.fit_transform(train_dataset[0])
print np.mean(cross_val_score(LogisticRegression(), X, train_dataset[3], verbose=2))



[CV]  ................................................................
[CV] ................................................. , total= 4.8min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.8min remaining:    0.0s


[CV] ................................................. , total= 5.2min
[CV]  ................................................................
[CV] ................................................. , total= 5.3min
0.94684858034


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 15.4min finished


In [26]:
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 10), lowercase = True)
X = cv.fit_transform(train_dataset[0])
print np.mean(cross_val_score(LogisticRegression(C=50), X, train_dataset[3], verbose=2))

[CV]  ................................................................
[CV] ................................................. , total= 6.5min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  6.5min remaining:    0.0s


[CV] ................................................. , total= 7.6min
[CV]  ................................................................
[CV] ................................................. , total= 7.7min
0.949908308836


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 21.8min finished


In [18]:
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 7), lowercase = True)
X = cv.fit_transform(train_dataset[0])
print np.mean(cross_val_score(LogisticRegression(C=50), X, train_dataset[3], verbose=2))

[CV]  ................................................................
[CV] ................................................. , total= 4.8min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.8min remaining:    0.0s


[CV] ................................................. , total= 5.5min
[CV]  ................................................................
[CV] ................................................. , total= 5.6min
0.951164005539


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 15.8min finished


In [28]:
scores = []
c_range = range(4, 10)
for i in c_range:
    cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, i), lowercase = True)
    X = cv.fit_transform(train_dataset[0])
    scores.append(np.mean(cross_val_score(LogisticRegression(C=50), X, train_dataset[3])))
    print i, scores[-1]
index = np.argmax(scores)
print c_range[index], scores[index]
plt.plot(c_range, score)
plt.show()

4 0.93828457213
5 0.947573101747
6 0.950118631636
7 0.951164005539
8 0.951088196506
9 0.950481355555
7 0.951164005539


NameError: name 'score' is not defined

<h2>Предсказание куска, который нужно отрезать</h2>

In [8]:
# Target for the "how much to cut" model: length of every word-form ending.
# Kept as a plain list because later cells concatenate it with other lists.
train_suff_len = list(map(len, train_dataset[1]))
# Length of every word form, as a NumPy array.
w_length_train = np.array([len(word) for word in train_dataset[0]])

9274

In [34]:
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 8), lowercase = True)
X = cv.fit_transform(train_dataset[0])
print np.mean(cross_val_score(LogisticRegression(C=50), X, train_suff_len, verbose=2))

[CV]  ................................................................
[CV] ................................................. , total= 2.2min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.2min remaining:    0.0s


[CV] ................................................. , total= 2.2min
[CV]  ................................................................
[CV] ................................................. , total= 2.6min
0.9267869874


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  7.0min finished


In [10]:
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 8), lowercase = True, max_df=0.5)
X = cv.fit_transform(train_dataset[0])
print np.mean(cross_val_score(LogisticRegression(C=50), X, train_suff_len, verbose=2))

[CV]  ................................................................
[CV] ................................................. , total= 4.1min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.1min remaining:    0.0s


[CV] ................................................. , total= 3.6min
[CV]  ................................................................
[CV] ................................................. , total= 3.9min
0.927705723738


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 11.7min finished


In [25]:
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 10), lowercase = True, max_df=0.5)
X = cv.fit_transform(train_dataset[0])
print np.mean(cross_val_score(LogisticRegression(C=50), X, train_suff_len, verbose=2))

[CV]  ................................................................
[CV] ................................................. , total= 2.0min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.0min remaining:    0.0s


[CV] ................................................. , total= 2.0min
[CV]  ................................................................
[CV] ................................................. , total= 1.9min
0.928793025848


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  5.9min finished


In [20]:
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 8), lowercase = True, max_df=0.9)
X = cv.fit_transform(train_dataset[0])
print np.mean(cross_val_score(LogisticRegression(C=50), X, train_suff_len, verbose=2))

[CV]  ................................................................
[CV] ................................................. , total= 2.2min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.2min remaining:    0.0s


[CV] ................................................. , total= 2.2min
[CV]  ................................................................
[CV] ................................................. , total= 2.5min
0.927427570278


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  6.9min finished


In [21]:
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 8), lowercase = True, max_df=0.5)
X = cv.fit_transform(train_dataset[0])
print np.mean(cross_val_score(LogisticRegression(C=30), X, train_suff_len, verbose=2))

[CV]  ................................................................
[CV] ................................................. , total= 2.0min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.0min remaining:    0.0s


[CV] ................................................. , total= 1.8min
[CV]  ................................................................
[CV] ................................................. , total= 2.0min
0.927570863321


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  5.9min finished


<h2>Полный ответ</h2>

In [9]:
%%time
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 7), lowercase = True)
X = cv.fit_transform(train_dataset[0])
ind = 4*X.shape[0]/5
clf = LogisticRegression()
clf.fit(X[:ind], train_dataset[4][:ind])
class_pred = clf.predict(X[ind:])
print 'class predicted'
clf.fit(X[:ind], train_dataset[3][:ind])
ending_pred = clf.predict(X[ind:])
print 'ending predicted'
clf.fit(X[:ind], train_suff_len[:ind])
len_suff_pred = clf.predict(X[ind:])
print 'len predicted'

predictions = []
for i in range(len(len_suff_pred)):
    cutted_word = train_dataset[0][ind+i][:-len_suff_pred[i]] if len_suff_pred[i] > 0 else train_dataset[0][ind+i]
    predictions.append(cutted_word + ending_pred[i] + '+' + class_pred[i])

true_values = []
for i in range(len(len_suff_pred)):
    true_values.append(train_dataset[2][ind+i] + '+' + train_dataset[4][ind+i])
    
accuracy_score(true_values, predictions)

class predicted
ending predicted
len predicted


<h2> Дополнения в процессе </h2>

In [29]:
%%time
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 8), lowercase = True)
X = cv.fit_transform(train_dataset[0])
ind = 4*X.shape[0]/5
clf = LogisticRegression(C=50)
clf.fit(X[:ind], train_dataset[4][:ind])
class_pred = clf.predict(X[ind:])
print 'class predicted'

class predicted
CPU times: user 2min 53s, sys: 3.51 s, total: 2min 56s
Wall time: 51.1 s


In [30]:
def encode(array):
    """Map tags to integer codes: 'N' -> 0, 'V' -> 1, 'A' -> 2.

    Parameters
    ----------
    array : iterable of str
        Tags to encode.

    Returns
    -------
    numpy.ndarray
        One code per input element, in the same order.

    Raises
    ------
    ValueError
        If a tag is not one of 'N', 'V', 'A'. Skipping such a tag would
        silently shorten the result and shift every following code onto the
        wrong row.
    """
    codes = {'N': 0, 'V': 1, 'A': 2}
    encoded = []
    for elem in array:
        if elem not in codes:
            raise ValueError('unknown tag: %r' % (elem,))
        encoded.append(codes[elem])
    return np.array(encoded)

In [31]:
%%time
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 10), lowercase = True, max_df=0.5)
X = cv.fit_transform(train_dataset[0])

ohe = OneHotEncoder(sparse=False)
dummy_features = ohe.fit_transform(encode(train_dataset[4][:ind] + class_pred.tolist()).reshape(-1, 1))
X = csr_matrix(hstack([X, csr_matrix(dummy_features)]))

clf.fit(X[:ind], train_suff_len[:ind])
len_suff_pred = clf.predict(X[ind:])
print 'len predicted'

len predicted
CPU times: user 10min 54s, sys: 12.5 s, total: 11min 6s
Wall time: 2min 56s


In [32]:
%%time
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 7), lowercase = True)
X = cv.fit_transform(train_dataset[0])
X = csr_matrix(hstack([X, csr_matrix(train_suff_len[:ind] + len_suff_pred.tolist()).transpose()]))

clf.fit(X[:ind], train_dataset[3][:ind])
ending_pred = clf.predict(X[ind:])
print 'ending predicted'

predictions = []
for i in range(len(len_suff_pred)):
    cutted_word = train_dataset[0][ind+i][:-len_suff_pred[i]] if len_suff_pred[i] > 0 else train_dataset[0][ind+i]
    predictions.append(cutted_word + ending_pred[i] + '+' + class_pred[i])

true_values = []
for i in range(len(len_suff_pred)):
    true_values.append(train_dataset[2][ind+i] + '+' + train_dataset[4][ind+i])
    
print accuracy_score(true_values, predictions)

ending predicted
0.914784221173
CPU times: user 40min 45s, sys: 1min 2s, total: 41min 47s
Wall time: 11min


0.9185 c С=50

In [12]:
print accuracy_score(true_values, predictions)

0.912213418746
