In [77]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, Activation
from keras.callbacks import ModelCheckpoint
import numpy as np
from random import randint
import h5py

In [128]:
np.random.seed(7)

In [78]:
def vectors(file):
    dct = {}
    with open(file, 'r', encoding = 'utf-8') as rfile:
        for line in rfile:
            line = line.split()
            dct[line[0]] = np.asarray(line[1:], dtype='float32')
    return(dct)

In [79]:
def wv(word,dct):
    if word in dct:
        vector = dct[word]
    else:
        vector = np.random.uniform(-1,1,(100,)).astype(np.float32)
    return(vector)

In [115]:
def data_prepare(file,dct):
    count = 0
    #creating X: number_of_samples x 3(pivot,comparison,feature) x 100(word_vector length)
    X = np.zeros((0,3,100), dtype=np.float)
    y = np.zeros((0,2), dtype=np.bool)
    count = 0
    with open(file,'r',encoding= 'utf-8') as rfile:
        for line in rfile:
            count += 1
            line = line.rstrip().split(',')
            words = line[:3]
            answer = line[3]

            sample = np.zeros((0,100), dtype=np.float)
            for word in words:
                sample = np.append(sample,[wv(word,dct)], axis = 0)
            X = np.append(X,[sample], axis = 0)

            if answer == '1':
                y = np.append(y,[[0,1]], axis = 0)
            else:
                y = np.append(y,[[1,0]], axis = 0)
            
    return(X,y)

In [81]:
def answers(dct,model):
    answer_file = 'res/answer.txt'
    val_file = 'ref/validation.txt'
    count = 0
    with open(answer_file,'w', encoding = 'utf-8') as ans_file:
        with open(val_file,'r',encoding = 'utf-8') as rfile:
            for line in rfile:
                count += 1
                if count >= 1723:
                    line = line.rstrip().split(',')
                    words = line[:3]
                    sample = np.zeros((0,100), dtype=np.float)
                    for word in words:
                        sample = np.append(sample,[wv(word,dct)], axis = 0)
                    sample = np.expand_dims(sample, axis = 0)
                    output = model.predict(sample)
                    answer = np.argmax(output) #index of the higher probability
                    ans_file.write('{},{},{},{}\n'.format(words[0],words[1],words[2],answer))

In [8]:
%time dct = vectors('glove/glove.6B.100d.txt')

Wall time: 25.4 s


In [82]:
len(dct)

400000

In [116]:
#%time X,y = data_prepare('train/train.txt',dct)

%time X,y = data_prepare('ref/validation.txt',dct)
X_tn = X[:-1000]
X_ts = X[-1000:]
y_tn = y[:-1000]
y_ts = y[-1000:]

Wall time: 5.26 s


In [121]:
len(y_ts)

1000

In [157]:
print('Build model...')
model = Sequential()
model.add(LSTM(1024, input_shape=(3, 100)))
model.add(Dropout(0.2))
for i in range(1):
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(0.2))
model.add(Dense(2))
model.add(Activation('softmax'))

Build model...


In [158]:
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [159]:
filepath="weights-improvement-{epoch:02d}.hdf5"
checkpoint = ModelCheckpoint(filepath, verbose=1, save_best_only=True, mode='max', monitor='val_acc')
callbacks_list = [checkpoint]

In [160]:
json_string = model.to_json()
with open("model.json", "w") as text_file:
    text_file.write(json_string)

In [161]:
% time model.fit(X_tn, y_tn, epochs=20, batch_size=16, validation_data=(X_ts, y_ts), verbose=0, callbacks=callbacks_list)


Epoch 00000: val_acc improved from -inf to 0.60600, saving model to weights-improvement-00.hdf5
Epoch 00001: val_acc improved from 0.60600 to 0.66700, saving model to weights-improvement-01.hdf5
Epoch 00002: val_acc did not improve
Epoch 00003: val_acc did not improve
Epoch 00004: val_acc improved from 0.66700 to 0.68700, saving model to weights-improvement-04.hdf5
Epoch 00005: val_acc did not improve
Epoch 00006: val_acc did not improve
Epoch 00007: val_acc did not improve
Epoch 00008: val_acc improved from 0.68700 to 0.70100, saving model to weights-improvement-08.hdf5
Epoch 00009: val_acc improved from 0.70100 to 0.70400, saving model to weights-improvement-09.hdf5
Epoch 00010: val_acc did not improve
Epoch 00011: val_acc did not improve
Epoch 00012: val_acc improved from 0.70400 to 0.70600, saving model to weights-improvement-12.hdf5
Epoch 00013: val_acc improved from 0.70600 to 0.71800, saving model to weights-improvement-13.hdf5
Epoch 00014: val_acc did not improve
Epoch 00015: v

<keras.callbacks.History at 0x8100b3c8>

In [162]:
#model.save_weights('73_my_model_weights.h5')

In [163]:
answers(dct,model)

In [164]:
def predict(triple,dct):
    x = np.zeros((0,3,100), dtype=np.float)
    sample = np.zeros((0,100), dtype=np.float)
    for word in triple:
        sample = np.append(sample,[wv(word,dct)], axis = 0)
    #print(sample.shape)
    x = np.append(x,[sample], axis = 0)
    #print(x.shape)
    #print(x)
    preds = model.predict(x, verbose=0)[0]
    preds = preds.tolist()
    answer = preds[1]
    #answer = np.argmax(preds)
    #print(answer)
    return answer

In [19]:
from gensim.models import KeyedVectors
from sklearn.metrics import classification_report
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.pipeline import Pipeline
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
from nltk.corpus import brown
from nltk import bigrams
from collections import Counter

In [165]:
c = Counter(bigrams(list(brown.words())))

def get_brown_bigram_cooccurence(w1, w2, c=c):
    return c[(w1,w2)]
brown_ic = wordnet_ic.ic('ic-brown.dat')

In [22]:
startTime = datetime.now()
print('Loading w2v model...')
wvmodel = KeyedVectors.load_word2vec_format('w2vmodels/GoogleNews-vectors-negative300.bin', binary=True)
wvmodel.init_sims(replace=True)
print('w2v model is loaded after', datetime.now() - startTime)

Loading w2v model...
w2v model is loaded after 0:02:47.697484


In [166]:
# Just to make it a bit more readable
WN_NOUN = 'n'
WN_VERB = 'v'
WN_ADJECTIVE = 'a'
WN_ADJECTIVE_SATELLITE = 's'
WN_ADVERB = 'r'

In [167]:
def convert(word, from_pos, to_pos):
    """ Transform words given from/to POS tags """

    synsets = wn.synsets(word, pos=from_pos)

    # Word not found
    if not synsets:
        return []

    # Get all lemmas of the word (consider 'a'and 's' equivalent)
    lemmas = [l for s in synsets
                for l in s.lemmas()
                if s.name().split('.')[1] == from_pos
                    or from_pos in (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE)
                        and s.name().split('.')[1] in (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE)]

    # Get related forms
    derivationally_related_forms = [(l, l.derivationally_related_forms()) for l in lemmas]

    # filter only the desired pos (consider 'a' and 's' equivalent)
    related_noun_lemmas = [l for drf in derivationally_related_forms
                             for l in drf[1]
                             if l.synset().name().split('.')[1] == to_pos
                                or to_pos in (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE)
                                    and l.synset.name().split('.')[1] in (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE)]

    # Extract the words from the lemmas
    words = [l.name() for l in related_noun_lemmas]
    len_words = len(words)

    # Build the result in the form of a list containing tuples (word, probability)
    result = [(w, float(words.count(w))/len_words) for w in set(words)]
    result.sort(key=lambda w: -w[1])

    # return all the possibilities sorted by probability
    return result

In [177]:
def get_wordnet_features(triples):
    X_orig = list()
    for triple in triples:
        norm_third = triple[2]
        try:
            first = wn.synsets(triple[0])[0]
            second = wn.synsets(triple[1])[0]
            try:
                third_orig = wn.synsets(norm_third)[0]
            except IndexError:
                # print(wn.synsets(norm_third), norm_third)
                X_orig.append('no_stuff')
                continue
            if third_orig.pos() != 'n':
                try:
                    third_n = convert(norm_third, third_orig.pos(), WN_NOUN)[0][0]
                except IndexError:
                    # print(convert(norm_third, third_orig.pos(), WN_NOUN), norm_third, third_orig.pos())
                    X_orig.append('no_stuff')
                    continue
            else:
                third_n = third_orig.name().split('.')[0]
                #print(third_orig.name().split('.')[0])
            third = wn.synsets(third_n)[0]
            dist_one = wn.wup_similarity(first, third)
            dist_two = wn.wup_similarity(second, third)
            dist_bet = wn.wup_similarity(first, second)
            path_one = first.path_similarity(third)
            path_two = second.path_similarity(third)
            path_bet = first.path_similarity(second)
            res_one = first.res_similarity(third, brown_ic)
            res_two = second.res_similarity(third, brown_ic)
            #res_bet = first.res_similarity(second, brown_ic)
            lin_one = first.lin_similarity(third, brown_ic)
            lin_two = second.lin_similarity(third, brown_ic)
            lin_bet = first.lin_similarity(second, brown_ic)
            lch_one = first.lch_similarity(third, brown_ic)
            lch_two = second.lch_similarity(third, brown_ic)
            lch_bet = first.lch_similarity(second, brown_ic)
            vector = [dist_one, dist_two, path_one, path_two, res_one, res_two, lin_one, lin_two, lch_one, lch_two, \
                      dist_bet, path_bet, lin_bet, lch_bet]
            if None in vector:
                print(triple, vector, first, second, first.path_similarity(third))
            X_orig.append(vector)
        except IndexError as e:
            print(e)

    #print(np.array([row for row in X_orig if row != 'no_stuff']))
    good_only = np.mean(np.array([row for row in X_orig if row != 'no_stuff']), axis=0)
    #print(good_only.tolist())
    X_orig_with_means = []
    for row in X_orig:
        if row == 'no_stuff':
            X_orig_with_means.append(good_only.tolist())
        else:
            X_orig_with_means.append(row)

    X = np.array(X_orig_with_means, dtype=float)
    X = np.nan_to_num(X)
    return X_orig_with_means

In [168]:
def split_triples(data_set):
    """Парсит триплы данных в массив кортежей"""
    data_set = open(data_set, 'r', encoding='utf-8')
    triples = list()
    for triple in data_set:
        triple = triple.split(',')
        word1 = triple[0]
        word2 = triple[1]
        feature = triple[2]
        triples.append((word1, word2, feature))
    data_set.close()
    return triples

In [169]:
def get_truth(data_set):
    """Достает правильные ответы из данных = truth"""
    data_set = open(data_set, 'r', encoding='utf-8')
    truth = list()
    for triple in data_set:
        triple = triple.split(',')
        answer = triple[3]
        truth.append(answer)
    data_set.close()
    return truth

In [215]:
def get_all_features(data_set):
    """Считает similarity по гугл-n-грамам для всех пар в трипле.
    Если такого слова нет в n-грамах, пишет 'not_in_voc'ю
    Результат кладет в список списков - так и подается на обучение классификатору."""
    features_data = list()
    triples = split_triples(data_set)
    wordndet_features = get_wordnet_features(triples)
    n = 0
    for triple in triples:
        word1 = triple[0]
        word2 = triple[1]
        feature = triple[2]
        #lstm prediction as feature
        lstm_predict = predict(triple,dct)
        #coocurence in brown corpus as feature
        coc_one = get_brown_bigram_cooccurence(word1, feature)
        coc_two = get_brown_bigram_cooccurence(word2, feature)
        #most important feature
        #important_features = [len(word1), len(word2), len(feature),len(word1)/len(word2), len(word1)/len(feature), len(word2)/len(feature)]
        #vector similarity as feature
        try:
            word1_and_feature_sim = wvmodel.wv.similarity(word1, feature)
        except KeyError:
            word1_and_feature_sim = 'not_in_voc'
        try:
            word2_and_feature_sim = wvmodel.wv.similarity(word2, feature)
        except KeyError:
            word2_and_feature_sim = 'not_in_voc'
        try:
            word1_and_word2_sim = wvmodel.wv.similarity(word1, word2)
        except KeyError:
            word1_and_word2_sim = 'not_in_voc'
        features_data.append([word1_and_feature_sim, word2_and_feature_sim, word1_and_word2_sim,lstm_predict, coc_one, coc_two] + wordndet_features[n])
        n += 1
    return features_data

In [216]:
def calculate_average_similarity(similarity_data):
    """Считает среднюю similarity по гугл-n-грамам по всем данным"""
    similarity_sum = 0
    norm_sim_amount = 0
    for feature_set in similarity_data:
        for feature in feature_set[:3]:
            if feature != 'not_in_voc':
                similarity_sum += feature
                norm_sim_amount += 1
    average_similarity = similarity_sum / norm_sim_amount
    return average_similarity

In [217]:
def clean_similarity_data(data_set):
    """Заменяет 'not_in_voc' на среднюю similarity в данных"""
    all_features_data = get_all_features(data_set)
    average_similarity = calculate_average_similarity(all_features_data)
    print('\nAverage similarity for {0} is'.format(data_set), average_similarity, '\n')
    similarity_data_clean = list()
    for feature_set in all_features_data:
        feature_set_clean = [average_similarity if x == 'not_in_voc' else x for x in feature_set]
        similarity_data_clean.append(feature_set_clean)
    return similarity_data_clean

In [218]:
def split_data(train_set):
    # загружаем данные (обучение + тест в одном файле)
    # собираем фичи - X
    data = clean_similarity_data(train_set)
    # делим данные - первые 1772 на train, последние 1000 - на test
    X_train = data[:-1000]
    X_test = data[-1000:]

    # собираем ответы - y
    y = [int(x.split('\n')[0]) for x in get_truth(train_set)]
    # делим ответы - первые 1772 train, последние 1000 - test
    y_train = y[:-1000]
    y_test = y[-1000:]

    # делим все данные на train и test так, чтобы в test было 1000
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1000, random_state=33)
    return X_train, X_test, y_train, y_test

In [219]:
def train_regression_print_results(X_train, X_test, y_train, y_test, seed, C):

    # тренируем логистическую регрессию
    clf_logreg = linear_model.LogisticRegression(random_state=seed, C=C)
    clf_logreg.fit(X_train, y_train)

    # оцениваем результат
    predictions = clf_logreg.predict(X_test)
    print(classification_report(y_test, predictions))
    # print('f1: ' + str(f1_score(y_test, predictions))) - эта хрень не то показывает

    print('X_train length:', len(X_train))
    print('y_test lenght:', len(y_test), '\npredictions length:', len(predictions))
    return predictions

In [220]:
def print_results_in_file(predictions, y_test):
    # записываем результат в нужную папку для запуска официального evaluation скрипта
    res = open('./trial/res/answer.txt', 'w', encoding='utf-8')
    truth_res = open('./trial/ref/truth.txt', 'w', encoding='utf-8')
    print('\nWriting results to "answer.txt".')
    triples = split_triples(train_set)[-1000:]
    for i in range(len(predictions)):
        w1 = triples[i][0]
        w2 = triples[i][1]
        feat = triples[i][2]
        result = predictions[i]
        truth = y_test[i]
        res.write(w1 + ',' + w2 + ',' + feat + ',' + str(result) + '\n')
        truth_res.write(w1 + ',' + w2 + ',' + feat + ',' + str(truth) + '\n')
    res.close()
    truth_res.close()
    print('\nThe results are written to "answer.txt".')

In [221]:
train_set = './training/validation.txt'

%time X_train, X_test, y_train, y_test = split_data(train_set)


Average similarity for ./training/validation.txt is 0.287926773692 

Wall time: 57.7 s


In [222]:
from sklearn.preprocessing import StandardScaler

X_train_scaled = StandardScaler().fit_transform(X_train)
X_test_scaled = StandardScaler().fit_transform(X_test)

print('X_train[0]:', X_train[0])
print('X_train_scaled[0]:', X_train_scaled[0])

X_train[0]: [0.52668244997625446, 0.69899092361566229, 0.53967745076214657, 0.9999985694885254, 0, 0, 0.7058823529411765, 0.875, 0.16666666666666666, 0.3333333333333333, 6.204848558361875, 8.233151896511865, 0.659789269855249, 0.7876718705275318, 1.845826690498331, 2.538973871058276, 0.7058823529411765, 0.16666666666666666, 0.6541370794030843, 1.845826690498331]
X_train_scaled[0]: [ 1.99951746  3.38860009  0.59187768  1.030703   -0.06553302 -0.08412156
  1.59813406  2.39367351  0.64550426  3.14105495  2.54419379  3.8748521
  2.64426243  3.43104137  1.28918164  2.87152386  0.47113111 -0.02697961
  1.00714807  0.29933199]


In [223]:
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM",
         # "Gaussian Process",
         "Decision Tree", "Random Forest",
         "Neural Net 1", "Neural Net 0.001", "Neural Net 0.0001", "Neural Net 0.00001",
         "AdaBoost Decision Tree", "AdaBoost Random Forest",
         "Naive Bayes",
         "QDA"]

classifiers = [
    KNeighborsClassifier(10),
    SVC(kernel="linear", C=100),
    SVC(gamma=2, C=100),
    # GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=20),
    RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1),
    MLPClassifier(alpha=1), MLPClassifier(alpha=0.001), MLPClassifier(alpha=0.0001),
    MLPClassifier(alpha=0.00000001),
    AdaBoostClassifier(DecisionTreeClassifier(max_depth=30), n_estimators=100, learning_rate=1.5, algorithm="SAMME"),
    AdaBoostClassifier(RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1), n_estimators=1000, random_state=42),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

In [224]:
for name, clf in zip(names, classifiers):
    print(name)
    clf.fit(X_train_scaled, y_train)
    predictions = clf.predict(X_test_scaled)
    score = classification_report(y_test, predictions)
    if name == 'Nearest Neighbors':
        print_results_in_file(predictions, y_test)
    print(score, '\n')
    try:
        print(clf.feature_importances_)
    except AttributeError as e:
        print(e)

Nearest Neighbors

Writing results to "answer.txt".

The results are written to "answer.txt".
             precision    recall  f1-score   support

          0       0.72      0.78      0.75       509
          1       0.75      0.69      0.72       491

avg / total       0.74      0.74      0.74      1000
 

'KNeighborsClassifier' object has no attribute 'feature_importances_'
Linear SVM
             precision    recall  f1-score   support

          0       0.74      0.72      0.73       509
          1       0.72      0.74      0.73       491

avg / total       0.73      0.73      0.73      1000
 

'SVC' object has no attribute 'feature_importances_'
RBF SVM
             precision    recall  f1-score   support

          0       0.68      0.81      0.74       509
          1       0.76      0.61      0.67       491

avg / total       0.72      0.71      0.71      1000
 

'SVC' object has no attribute 'feature_importances_'
Decision Tree
             precision    recall  f1-score   s