In [47]:
import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sc
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder

In [48]:
# Load the training file. Each data row is split on ',': column 1 is the word
# form and every column from index 2 on is a tagged lemma whose last character
# is the part-of-speech tag and whose last two characters are dropped to get the
# lemma text. The first line of each file is skipped as a header.
#
# io.open decodes to unicode on both Python 2 and 3, and the `with` blocks make
# sure the handles are closed. Line endings are stripped with rstrip instead of
# a blind [:-2], so a final line without a trailing CRLF keeps all its characters.
# Blank lines are ignored rather than crashing the column indexing below.
with io.open('task2_lemmas_train', encoding='utf-8') as f:
    lines = [line.rstrip(u'\r\n') for line in f if line.strip()]

train_data = []
for line in lines[1:]:
    array = line.split(',')
    # Materialise the (lemma, tag) pairs as a list: later cells index into it.
    tagged_lemmas = [(tagged[:-2], tagged[-1]) for tagged in array[2:]]
    train_data.append([array[1], tagged_lemmas])

# The test file only contributes its last column (the word to lemmatise).
with io.open('task2_lemmas_test', encoding='utf-8') as f:
    test_words = [line.rstrip(u'\r\n').split(',')[-1] for line in f if line.strip()]
test_words = test_words[1:]

In [49]:
def shortest_of(strings):
    """Return the shortest item of `strings` (the first one on ties).

    Raises ValueError when `strings` is empty.
    """
    items = list(strings)
    lengths = [len(item) for item in items]
    # list.index returns the first position holding the minimum length.
    return items[lengths.index(min(lengths))]

def long_substr(strings):
    """Return the longest substring that occurs in every string of `strings`.

    Candidates are slices of the shortest string. A candidate only replaces
    the current best when it is strictly longer, so on ties the slice that
    starts earliest in the shortest string is returned.

    Returns "" when `strings` is empty or the strings share no characters.
    """
    substr = ""
    if not strings:
        return substr
    reference = shortest_of(strings)
    length = len(reference)
    for i in range(length):
        # Start just past the current best length: shorter slices cannot win.
        for j in range(i + len(substr) + 1, length + 1):
            candidate = reference[i:j]
            if not all(candidate in text for text in strings):
                # Every longer slice from this start contains `candidate`,
                # so none of them can be common either; stop extending.
                break
            substr = candidate
    return substr

def prefix(word, root):
    """Return the part of `word` that precedes the first occurrence of `root`.

    Returns None when `root` does not occur in `word`.
    """
    position = word.find(root)
    if position < 0:
        return None
    return word[:position]
        
def suffix(word, root):
    """Return the part of `word` that follows the last occurrence of `root`.

    Returns an empty unicode string when `root` does not occur in `word` or
    when nothing follows it.
    """
    position = word.rfind(root)
    if position < 0:
        return u''
    tail = word[position + len(root):]
    if not tail:
        return u''
    return tail

def encode(array):
    """Map tags to integer codes: 'N' -> 0, 'V' -> 1, 'A' -> 2.

    Elements that are none of these three tags are skipped, so the returned
    numpy array can be shorter than `array`.
    """
    tag_codes = {'N': 0, 'V': 1, 'A': 2}
    return np.array([tag_codes[tag] for tag in array if tag in tag_codes])

In [50]:
# Build the per-word training columns:
#   [0] word form                    [1] suffix of the word form after the root
#   [2] first lemma of the word      [3] suffix of that lemma after the root
#   [4] tag of the first lemma
# The "root" is the longest substring shared by the word form and all of its
# lemmas. `suffixes` collects every non-empty suffix seen along the way.
train_dataset = [[], [], [], [], []]
suffixes = []
for elem in train_data:
    # Explicit list so the concatenation also works where map() is lazy.
    strings = [elem[0]] + [tagged[0] for tagged in elem[1]]
    root = long_substr(strings)
    # Drop a leading '-' from multi-character roots.
    if len(root) > 1 and root[0] == '-':
        root = root[1:]
    first_lemma, first_tag = elem[1][0][0], elem[1][0][1]
    train_dataset[0].append(elem[0])
    train_dataset[1].append(suffix(elem[0], root))
    train_dataset[2].append(first_lemma)
    train_dataset[3].append(suffix(first_lemma, root))
    train_dataset[4].append(first_tag)
    for x in strings:
        suff = suffix(x, root)
        if suff:
            suffixes.append(suff)

# numpy.unique is the real implementation; the scipy-namespace alias is deprecated.
suffixes = np.unique(suffixes)

In [51]:
# Length of each training word's suffix; used as the target of the suffix-length
# model. Built as a real list (not a lazy map) because it is consumed more than once.
train_suff_len = [len(suff) for suff in train_dataset[1]]

In [35]:
%%time
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 8), lowercase = True)
X = cv.fit_transform(train_dataset[0] + test_words)
X_train, X_test = X[:len(train_dataset[0])], X[len(train_dataset[0]):]
clf = LogisticRegression(C=50)
clf.fit(X_train, train_dataset[4])
class_pred = clf.predict(X_test)
print 'class predicted'

cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 10), lowercase = True, max_df=0.5)
X = cv.fit_transform(train_dataset[0] + test_words)
X = csr_matrix(hstack([X, csr_matrix(train_suff_len + len_suff_pred.tolist()).transpose()]))
X_train, X_test = X[:len(train_dataset[0])], X[len(train_dataset[0]):]
clf.fit(X_train, train_suff_len)
len_suff_pred = clf.predict(X_test)
print 'len predicted'

cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 7), lowercase = True)
X = cv.fit_transform(train_dataset[0] + test_words)
ohe = OneHotEncoder(sparse=False)
dummy_features = ohe.fit_transform(encode(train_dataset[4] + class_pred.tolist()).reshape(-1, 1))
X = csr_matrix(hstack([X, csr_matrix(dummy_features)]))
X_train, X_test = X[:len(train_dataset[0])], X[len(train_dataset[0]):]
clf.fit(X_train, train_dataset[3])
ending_pred = clf.predict(X_test)
print 'ending predicted'

class predicted
ending predicted
len predicted
CPU times: user 54min 20s, sys: 1min 15s, total: 55min 35s
Wall time: 14min 24s


In [56]:
# Assemble "<stem><ending>+<tag>" for every test word: cut the predicted number
# of trailing characters (none when the prediction is 0), then append the
# predicted ending and tag.
predictions = []
for idx, n_cut in enumerate(len_suff_pred):
    word = test_words[idx]
    if n_cut > 0:
        stem = word[:-n_cut]
    else:
        stem = word
    predictions.append(stem + ending_pred[idx] + '+' + class_pred[idx])

In [55]:
# Write the submission: one row per test word, indexed by a 1-based "Id".
# The text stays unicode and to_csv does the UTF-8 encoding, instead of
# encoding each value to bytes by hand (which yields b'...' literals or a lazy
# map object on Python 3).
new_sub = pd.DataFrame(
    {"Category": predictions},
    index=pd.Index(np.arange(len(predictions)) + 1, name="Id"),
)
new_sub.to_csv('t_sub.csv', encoding='utf-8')

In [40]:
# Sanity check: there should be exactly one prediction per test word.
print('%d %d' % (len(predictions), len(test_words)))
print(test_words[:10])

29661 29661
[u'gettonan', u'incidentali', u'involtino', u'lievi', u'comunistizzasse', u'vidimerebbe', u'imbrodan', u'strillar', u'cifrasti', u'compassavano']


In [57]:
# Show the first five entries of each intermediate result next to the inputs.
for preview in (class_pred, ending_pred, len_suff_pred, predictions, test_words):
    print(preview[:5])

[u'V' u'A' u'V' u'N' u'V']
[u're' u'e' u'are' u'o' u're']
[1 1 3 1 3]
[u'gettonare+V', u'incidentale+A', u'involtare+V', u'lievo+N', u'comunistizzare+V']
[u'gettonan', u'incidentali', u'involtino', u'lievi', u'comunistizzasse']
