In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy as sc
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import OneHotEncoder

In [2]:
f = open('task2_lemmas_train')
lines = []
for line in f:
    lines.append(line[:-2].decode('utf-8'))
    
train_data = []
for line in lines[1:]:
    array = line.split(',')
    train_data.append([array[1], zip(map(lambda x: x[:-2], array[2:]), map(lambda x: x[-1], array[2:]))])
    
f = open('task2_lemmas_test')
test_words = []
for line in f:
    test_words.append(line[:-2].decode('utf-8').split(',')[-1])
test_words = test_words[1:]

In [3]:
def shortest_of(strings):
    return min(strings, key=len)

def long_substr(strings):
    substr = ""
    if not strings:
        return substr
    reference = shortest_of(strings) 
    length = len(reference)
    for i in xrange(length):
        for j in xrange(i + len(substr) + 1, length + 1):
            candidate = reference[i:j]  
            if all(candidate in text for text in strings):
                substr = candidate
    return substr

def prefix(word, root):
    for i in range(len(word)-len(root) + 1):
        if word[i:i+len(root)] == root:
            return word[:i]
        
def suffix(word, root):
    ret = prefix(word[::-1], root[::-1])
    return ret[::-1] if ret else u''

def encode(array):
    encoded = []
    for elem in array:
        if elem == 'N':
            encoded.append(0)
        if elem == 'V':
            encoded.append(1)
        if elem == 'A':
            encoded.append(2)
    return np.array(encoded)

In [4]:
train_dataset = [[], [], [], [], []]
suffixes = []
for elem in train_data:
    strings = [elem[0]] + map(lambda x: x[0], elem[1])
    root = long_substr(strings)
    if root != '' and len(root) > 1:
        if root[0] == '-':
            root = root[1:]
    train_dataset[0].append(elem[0])
    train_dataset[1].append(suffix(elem[0], root))
    train_dataset[2].append(elem[1][0][0])
    train_dataset[3].append(suffix(elem[1][0][0], root))
    train_dataset[4].append(elem[1][0][1])
    for x in strings:
        suff = suffix(x, root)
        if suff:
            suffixes.append(suff) 
        
suffixes = sc.unique(suffixes)

train_suff_len = map(len, train_dataset[1])
w_length_train = []
for word in train_dataset[0]:
    w_length_train.append(len(word))
w_length_train = np.array(w_length_train)

<h2> Stack </h2>

In [9]:
%%time
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 8), lowercase = True)
X = cv.fit_transform(train_dataset[0])
ind = 4*X.shape[0]/5
ind1 = 4*ind/5
clf = LogisticRegression(C=50)
clf.fit(X[:ind1], train_dataset[4][:ind1])
class_pred = clf.predict(X[ind:])
print 'class predicted'

cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 10), lowercase = True, max_df=0.5)
X = cv.fit_transform(train_dataset[0])

ohe = OneHotEncoder(sparse=False)
dummy_features = ohe.fit_transform(encode(train_dataset[4][:ind] + class_pred.tolist()).reshape(-1, 1))
X = csr_matrix(hstack([X, csr_matrix(dummy_features)]))

clf.fit(X[:ind1], train_suff_len[:ind1])
len_suff_pred = clf.predict(X[ind:])
print 'len predicted'

cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 7), lowercase = True)
X = cv.fit_transform(train_dataset[0])
X = csr_matrix(hstack([X, csr_matrix(train_suff_len[:ind] + len_suff_pred.tolist()).transpose()]))

clf.fit(X[:ind1], train_dataset[3][:ind1])
ending_pred = clf.predict(X[ind:])
print 'ending predicted'

predictions = []
for i in range(len(len_suff_pred)):
    cutted_word = train_dataset[0][ind+i][:-len_suff_pred[i]] if len_suff_pred[i] > 0 else train_dataset[0][ind+i]
    predictions.append(cutted_word + ending_pred[i] + '+' + class_pred[i])

true_values = []
for i in range(len(len_suff_pred)):
    true_values.append(train_dataset[2][ind+i] + '+' + train_dataset[4][ind+i])

print accuracy_score(true_values, predictions)

class predicted
len predicted
ending predicted
0.904585300067
CPU times: user 46min 2s, sys: 1min 8s, total: 47min 10s
Wall time: 12min 55s


In [10]:
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 8), lowercase = True)
X = cv.fit_transform(train_dataset[0])
ohe = OneHotEncoder(sparse=False)
dummy_features = ohe.fit_transform(encode(train_dataset[4][:ind] + class_pred.tolist()).reshape(-1, 1))
X = csr_matrix(hstack([X, csr_matrix(dummy_features), csr_matrix(train_suff_len[:ind] + len_suff_pred.tolist()).transpose()]))
#dummy_features = ohe.fit_transform(encode(train_dataset[3][:ind] + ending_pred.tolist()).reshape(-1, 1))
#X = csr_matrix(hstack(pX, csr_matrix(dummy_features)))
clf = LogisticRegression(C=50)
clf.fit(X[ind1:ind], train_dataset[4][ind1:ind])
class_pred = clf.predict(X[ind:])
print 'class predicted'

cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 10), lowercase = True, max_df=0.5)
X = cv.fit_transform(train_dataset[0])
ohe = OneHotEncoder(sparse=False)
dummy_features = ohe.fit_transform(encode(train_dataset[4][:ind] + class_pred.tolist()).reshape(-1, 1))
X = csr_matrix(hstack([X, csr_matrix(dummy_features), csr_matrix(train_suff_len[:ind] + len_suff_pred.tolist()).transpose()]))
#dummy_features = ohe.fit_transform(encode(train_dataset[3][:ind] + ending_pred.tolist()).reshape(-1, 1))
#X = csr_matrix(hstack(pX, csr_matrix(dummy_features)))
clf.fit(X[ind1:ind], train_suff_len[ind1:ind])
len_suff_pred = clf.predict(X[ind:])
print 'len predicted'

cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 7), lowercase = True)
X = cv.fit_transform(train_dataset[0])
ohe = OneHotEncoder(sparse=False)
dummy_features = ohe.fit_transform(encode(train_dataset[4][:ind] + class_pred.tolist()).reshape(-1, 1))
X = csr_matrix(hstack([X, csr_matrix(dummy_features), csr_matrix(train_suff_len[:ind] + len_suff_pred.tolist()).transpose()]))
#dummy_features = ohe.fit_transform(encode(train_dataset[3][:ind] + ending_pred.tolist()).reshape(-1, 1))
#X = csr_matrix(hstack(pX, csr_matrix(dummy_features)))

clf.fit(X[ind1:ind], train_dataset[3][ind1:ind])
ending_pred = clf.predict(X[ind:])
print 'ending predicted'

predictions = []
for i in range(len(len_suff_pred)):
    cutted_word = train_dataset[0][ind+i][:-len_suff_pred[i]] if len_suff_pred[i] > 0 else train_dataset[0][ind+i]
    predictions.append(cutted_word + ending_pred[i] + '+' + class_pred[i])

true_values = []
for i in range(len(len_suff_pred)):
    true_values.append(train_dataset[2][ind+i] + '+' + train_dataset[4][ind+i])

print accuracy_score(true_values, predictions)

class predicted
len predicted
ending predicted
0.882880984491


In [8]:
ohe = OneHotEncoder(sparse=False)
dummy_features = ohe.fit_transform(np.chararray(train_dataset[3][:ind] + ending_pred.tolist()).reshape(-1, 1))

ValueError: sequence too large; cannot be greater than 32