In [1]:
import pandas as pd
import numpy as np
import warnings
from argparse import ArgumentParser
from pymystem3 import Mystem
from scipy.stats import spearmanr

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

from tqdm import tqdm
from compounds_utils import acquiring
from gensim.models import FastText

from compounds_utils import apply_distance, average_normalized, average_standard

from scipy.spatial import distance

  return f(*args, **kwds)


In [23]:
# def make_train_data(w1vecs, w2vecs, compvecs):
#     train = np.concatenate((np.array(w1vecs), np.array(w2vecs), np.array(compvecs)), axis=1)
#     print('Classification data created with shape', train.shape)
#     return StandardScaler().fit_transform(train)

def make_train_data(w1vecs, w2vecs, compvecs):
    dist = apply_distance(np.array(w1vecs), np.array(w2vecs), np.array(compvecs), distance.cosine, average_normalized)
    dist = dist.reshape(-1, 1)
    train = np.concatenate((np.array(w1vecs), np.array(w2vecs), np.array(compvecs), dist), axis=1)
    return StandardScaler().fit_transform(train)

In [24]:
compounds_path = './workdir/annotation_small_selected.csv'
model_path = './workdir/models/model_fasttext_300_mc5'
model_words_path = './workdir/models/model_fasttext_nocompounds_300_mc5'

model = FastText.load(model_path)
comp = pd.read_csv(compounds_path)
#model_words = FastText.load(model_words_path)
model_words = None

w1, w2, c, true = acquiring(comp, model, 'Катя (short list)', model_words=model_words)

vecs = make_train_data(w1, w2, c)

Number of examples:  201


In [27]:
accuracies = []
precision1 = []
precision0 = []
recall1 = []
recall0 = []
f11 = []
f10 = []
spearman = []
rocaucs = []

Cs = [1]
kernels = [1]
for C in Cs:
    for kernel in kernels:
        print(C, kernel)
        for state in tqdm(range(71, 71+25)):
            X_train, X_test, y_train, y_test = train_test_split(vecs, true, test_size=.25, random_state=state)

            #clf = SVC(C=1, kernel='linear', random_state=51, class_weight='balanced')
            #clf = SVC(C=1, kernel='linear', random_state=51)
            #clf = MLPClassifier(alpha=1, solver='lbfgs', hidden_layer_sizes=(200,80,20, ), random_state=42)
            #clf = DecisionTreeClassifier(max_depth=10, max_features=20, random_state=42)
            clf = GaussianNB()
            clf.fit(X_train, y_train)
            pred = clf.predict(X_test)
            accuracies.append(accuracy_score(pred, y_test))
            precision1.append(precision_score(pred, y_test))
            precision0.append(precision_score(pred, y_test, pos_label=0))
            recall1.append(recall_score(pred, y_test))
            recall0.append(recall_score(pred, y_test, pos_label=0))
            f11.append(f1_score(pred, y_test))
            f10.append(f1_score(pred, y_test, pos_label=0))
            #print(pred, y_test)
            try:
                rocaucs.append(roc_auc_score(pred, y_test))
            except:
                pass
            with warnings.catch_warnings():
                warnings.filterwarnings('error')
                try:
                    corr = spearmanr(pred, y_test)[0]
                    spearman.append(corr)
                except Warning:
                    spearman.append(0)


        print('accuracy=', '%.4f' % np.mean(accuracies))
        print('precision=', ['%.4f' % np.mean(precision1), '%.4f' % np.mean(precision0)])
        print('recall=', ['%.4f' % np.mean(recall1), '%.4f' % np.mean(recall0)])
        print('f1=',['%.4f' % np.mean(f11), '%.4f' % np.mean(f10)])
        print('spearman=', '%.4f' % np.mean(spearman))
        print('roc_auc=', '%.4f' % np.mean(rocaucs))

        print('.....................................')

 36%|███▌      | 9/25 [00:00<00:00, 86.76it/s]

1 1


100%|██████████| 25/25 [00:00<00:00, 96.17it/s]

accuracy= 0.8431
precision= ['0.8917', '0.5776']
recall= ['0.9191', '0.5137']
f1= ['0.9040', '0.5261']
spearman= 0.4455
roc_auc= 0.7164
.....................................



