# Concurrent Learning of Semantic Relations
## Authors : hidden

Notebook to reproduce the results of the paper with respect to identifying the semantic relations between words.

Note: the results may not be identical to those reported in the paper because of randomization; in any case, the differences should be very small. 

In [None]:
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)
import numpy as np, pandas as pd
from collections import Counter
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))
import matplotlib.pyplot as plt
%matplotlib inline
from keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.dummy import DummyClassifier
# np.random.seed(44)
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
# Configure the TensorFlow session used by Keras: set the per-process GPU
# memory fraction to 0.3 instead of relying on the default allocation.
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.3
# Register the configured session as the one the Keras backend will use.
set_session(tf.Session(config=config))


## Step 1: Read the data and perform the lexical split

In [None]:
# Load the word pairs and normalise the column names used by the rest of the notebook.
dff = pd.read_csv("./RUMEN/RumenPairs.txt").rename(
    columns={"W1": "w1", "W2": "w2", "rel": "Category"}
)

In [None]:
def get_names(cat):
    """Translate a numeric relation code into its label.

    0 -> "RANDOM", 1 -> "HYPER", 2 -> "SYN". Any other value yields None.
    """
    for code, label in ((0, "RANDOM"), (1, "HYPER"), (2, "SYN")):
        if cat == code:
            return label
    return None
# Replace the numeric relation codes with their readable labels.
dff["Category"] = dff["Category"].map(get_names)

In [None]:
# One frame per relation. Each is an explicit copy because new columns are
# assigned to these frames later on; assigning to a slice of `dff` would raise
# pandas' SettingWithCopyWarning and leave it unclear which object is modified.
df = dff.loc[dff.Category == "SYN"].copy()
df2 = dff.loc[dff.Category == "HYPER"].copy()
df3 = dff.loc[dff.Category == "RANDOM"].copy()

In [None]:
# Distinct words (from either side of a pair) for the SYN, HYPER and RANDOM frames.
words_coord = list(set(pd.concat([df.w1, df.w2]).tolist()))
words_hyper = list(set(pd.concat([df2.w1, df2.w2]).tolist()))
words_rando = list(set(pd.concat([df3.w1, df3.w2]).tolist()))

In [None]:
# load embeddings and mark the words that are in the embeddings dictionary

def get_vector_representation_of_word_pairs(dataframe, embeddings_voca):
    """Turn each word pair into the concatenation of its two word vectors.

    Args:
        dataframe: frame with columns ``w1``, ``w2`` and ``Category``.
        embeddings_voca: mapping from word to its embedding vector.

    Returns:
        ``(x, y)`` where row ``i`` of ``x`` is the ``w1`` vector followed by
        the ``w2`` vector of pair ``i`` and ``y`` holds the ``Category`` values.

    Raises:
        KeyError: if a word is missing from ``embeddings_voca``.
    """
    first_vectors, second_vectors = (
        [embeddings_voca[token] for token in dataframe[column].values]
        for column in ("w1", "w2")
    )
    features = np.hstack((first_vectors, second_vectors))
    labels = dataframe.Category.values
    return features, labels

def load_embeddings(path, dimension):
    """Read a whitespace-separated text embedding file into a dictionary.

    Each non-empty line is expected to hold a token followed by ``dimension``
    numbers. The token is everything before the last ``dimension`` fields, so
    tokens that themselves contain spaces are kept (re-joined with single
    spaces).

    Args:
        path: location of the UTF-8 encoded embedding file.
        dimension: number of vector components at the end of every line.

    Returns:
        dict mapping each token to a float ``numpy`` array.

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    """
    vectors = {}
    # The context manager closes the file; iterating the handle streams the
    # file line by line instead of materialising it as one big string.
    with open(path, encoding="utf8") as handle:
        for line in handle:
            elems = line.split()
            if not elems:
                # Blank line: skip it rather than registering an empty token.
                continue
            token = " ".join(elems[:-dimension])
            vectors[token] = np.array(elems[-dimension:]).astype(float)
    return vectors

embeddings = load_embeddings("./glove.6B.300d.txt", 300)
# Flag the pairs for which both words have an embedding.
for frame in (df, df2, df3):
    frame["known_words"] = frame.apply(
        lambda row: row["w1"] in embeddings and row["w2"] in embeddings,
        axis=1,
    )

In [None]:
# Perform the lexical split using the vocabulary of the corpus

words_ = sorted(list(set(words_coord+words_hyper+words_rando)))
words_train, words_test =train_test_split(words_, test_size=0.4, random_state=1344)

# Membership is tested twice per row below; sets make each test O(1) instead
# of a linear scan over the word lists.
train_vocabulary, test_vocabulary = set(words_train), set(words_test)

# Given the words in the train and test parts, mark a pair as training (or
# testing) when both of its words belong to the train (or test) vocabulary and
# both have an embedding. Pairs that straddle the two vocabularies get neither flag.
for pairs in (df, df2, df3):
    pairs["is_train"] = pairs.apply(
        lambda row: row["w1"] in train_vocabulary
        and row["w2"] in train_vocabulary
        and bool(row["known_words"]),
        axis=1,
    )
    pairs["is_test"] = pairs.apply(
        lambda row: row["w1"] in test_vocabulary
        and row["w2"] in test_vocabulary
        and bool(row["known_words"]),
        axis=1,
    )


# Notebook output: total pairs in `df`, test pairs, train pairs, and pairs kept overall.
df.shape[0], df.is_test.astype(int).sum(), df.is_train.astype(int).sum(), df.is_test.astype(int).sum() + df.is_train.astype(int).sum()


In [None]:
# Build the model inputs: for every pair, the embedding of w1 followed by the embedding of w2.
# "Coord" is built from `df`, "Hyper" from `df2` and "Rando" from `df3`.
xtrainCoord, ytrainCoord = get_vector_representation_of_word_pairs(df[df.is_train == True], embeddings)
xtestCoord, ytestCoord = get_vector_representation_of_word_pairs(df[df.is_test == True], embeddings)

xtrainHyper, ytrainHyper = get_vector_representation_of_word_pairs(df2[df2.is_train == True], embeddings)
xtestHyper, ytestHyper = get_vector_representation_of_word_pairs(df2[df2.is_test == True], embeddings)

xtrainRando, ytrainRando = get_vector_representation_of_word_pairs(df3[df3.is_train == True], embeddings)
xtestRando, ytestRando = get_vector_representation_of_word_pairs(df3[df3.is_test == True], embeddings)

In [None]:
from sklearn.utils import shuffle

In [None]:
# Problem 1: "Coord" pairs (label 1) vs. random pairs (label 0).
# Problem 2: "Hyper" pairs (label 1) vs. random pairs (label 0).
x_train_1 = np.vstack((xtrainCoord, xtrainRando))
x_train_2 = np.vstack((xtrainHyper, xtrainRando))
y_train_1 = [1] * len(xtrainCoord) + [0] * len(xtrainRando)
y_train_2 = [1] * len(xtrainHyper) + [0] * len(xtrainRando)

x_test_1 = np.vstack((xtestCoord, xtestRando))
x_test_2 = np.vstack((xtestHyper, xtestRando))
y_test_1 = [1] * len(xtestCoord) + [0] * len(xtestRando)
y_test_2 = [1] * len(xtestHyper) + [0] * len(xtestRando)

# Shuffle features and labels together with a fixed seed.
x_train_1, y_train_1 = shuffle(x_train_1, y_train_1, random_state=1234)
x_train_2, y_train_2 = shuffle(x_train_2, y_train_2, random_state=1234)
x_test_1, y_test_1 = shuffle(x_test_1, y_test_1, random_state=1234)
x_test_2, y_test_2 = shuffle(x_test_2, y_test_2, random_state=1234)

# Sanity check: every feature matrix still lines up with its labels.
assert len(x_train_1) == len(y_train_1)
assert len(x_train_2) == len(y_train_2)
assert len(x_test_1) == len(y_test_1)
assert len(x_test_2) == len(y_test_2)
print(len(x_train_1), len(x_train_2))

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.layers import Input, Dense, Dropout, BatchNormalization
from keras.models import Model
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelBinarizer, LabelEncoder


In [None]:
def get_my_nn_model():
    """Build and compile the feed-forward baseline.

    The network takes a 600-dimensional input, has one hidden layer of 50
    sigmoid units and a single sigmoid output unit. It is compiled with the
    rmsprop optimizer and the binary cross-entropy loss.
    """
    layers = [
        Dense(50, activation='sigmoid', input_dim=600),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(loss='binary_crossentropy', optimizer='rmsprop')
    return network

def get_n_most_confident_predictions(my_preds, n, y_train_data, x_unlabeled):
    """Select the most confidently predicted unlabeled rows for self-learning.

    The budget of ``n`` pseudo-labels is divided between class 0 and class 1
    in proportion to the class frequencies of ``y_train_data`` (each share is
    truncated to an integer, so fewer than ``n`` rows may be returned). The
    rows with the lowest scores receive label 0 and the rows with the highest
    scores receive label 1.

    Args:
        my_preds: one prediction score per row of ``x_unlabeled``; any shape
            that flattens to that length (e.g. ``(len(x_unlabeled), 1)``).
        n: total number of pseudo-labels requested.
        y_train_data: current training labels (0/1), used only for the class
            proportions.
        x_unlabeled: 2-D array of unlabeled feature rows.

    Returns:
        ``(new_x, new_y, remaining)``: the selected rows (class-0 picks
        first), their pseudo-labels, and the unlabeled rows that were not
        selected, in their original order.

    Raises:
        ValueError: if ``y_train_data`` is empty, or if the number of scores
            does not match the number of unlabeled rows.
    """
    y_train_data = np.asarray(y_train_data)
    x_unlabeled = np.asarray(x_unlabeled)
    n_labeled = y_train_data.shape[0]
    if n_labeled == 0:
        raise ValueError("y_train_data must contain at least one label")

    scores = np.asarray(my_preds).reshape(-1)
    pool_size = len(x_unlabeled)
    if len(scores) != pool_size:
        raise ValueError("my_preds must hold exactly one score per unlabeled row")

    class_shares = np.array([
        (y_train_data == 0).sum() / n_labeled,
        (y_train_data == 1).sum() / n_labeled,
    ])
    n_negative, n_positive = (max(int(q), 0) for q in (class_shares * n).astype(int))

    # If more rows are requested than exist, shrink both quotas
    # proportionally so that no row is selected for both classes.
    requested = n_negative + n_positive
    if requested > pool_size:
        n_negative = pool_size * n_negative // requested
        n_positive = pool_size - n_negative

    negative_to_positive = np.argsort(scores)
    negative_idx = negative_to_positive[:n_negative]
    # Explicit start index: a plain ``[-n_positive:]`` would select *every*
    # row when n_positive is 0.
    positive_idx = negative_to_positive[pool_size - n_positive:]
    selected = np.concatenate((negative_idx, positive_idx))

    keep_mask = np.ones(pool_size, dtype=bool)
    keep_mask[selected] = False

    new_y = np.array([0] * n_negative + [1] * n_positive, dtype=int)
    return x_unlabeled[selected], new_y, x_unlabeled[keep_mask]


In [None]:
def get_my_multitask_nn_models():
    """Build the two task models that share a hidden layer.

    Both models read the same 600-dimensional input and the same 50-unit
    sigmoid hidden layer; each adds its own single-unit sigmoid output
    ('coord_output' and 'hyper_output'). Because the hidden layer object is
    shared, fitting either model updates it for both.

    Returns:
        (model_coord, model_hyper), each compiled with rmsprop and binary
        cross-entropy.
    """
    pair_input = Input(shape=(600,))
    shared_hidden = Dense(50, activation='sigmoid')(pair_input)

    heads = [
        Dense(1, activation='sigmoid', name=head_name)(shared_hidden)
        for head_name in ('coord_output', 'hyper_output')
    ]
    task_models = [Model(inputs=[pair_input], outputs=[head]) for head in heads]
    for task_model in task_models:
        task_model.compile(optimizer='rmsprop', loss='binary_crossentropy')

    model_coord, model_hyper = task_models
    return model_coord, model_hyper

In [None]:
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module was deprecated and later removed from scikit-learn.
from sklearn.model_selection import GridSearchCV

**Note**: the results may not be identical to those reported in the paper because of randomization; in any case, the differences should be very small. 

In [None]:
# Run the single-task baselines on both binary problems and keep the data
# splits in `data` so that later experiments can reuse exactly the same ones.
data = {}
for name, x_train, y_train, x_test, y_test in zip(["Coord-Random", "Hyper-Random"], [x_train_1, x_train_2], [y_train_1, y_train_2], [x_test_1, x_test_2], [y_test_1, y_test_2]):   
    # Perform the splits in train, validation, unlabeled:
    # 60% of the training pairs are set aside as "unlabeled", and 30% of the
    # remainder becomes the validation split.
    x_train, x_unlabeled, y_train, y_unlabeled = train_test_split(x_train, y_train, stratify=y_train, test_size=0.6, random_state=1234,)
    x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, stratify=y_train,  test_size=0.30, random_state=1234,)
    print("Problem", name, "\nBasic Stats:", len(x_train), len(x_valid), len(x_unlabeled), len(x_test), end=" \n")

    # Start with the accuracy and MaF1 scores of a Dummy Classifier
    clf = DummyClassifier(strategy="most_frequent")
    clf.fit(x_train, y_train)
    preds = clf.predict(x_test)
    print("Majority classifier:", accuracy_score(y_test, preds), f1_score(y_test, preds, average="macro"), end=" \n")

    
    # Logistic Regression with default params
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    preds = clf.predict(x_test)
    print("Logistic Regression:", accuracy_score(y_test, preds), f1_score(y_test, preds, average="macro"), end=" \n")
    
    # NN baseline (early stopping on the validation split)
    model = get_my_nn_model()
    model.fit(x_train, y_train, epochs=500, validation_data=(x_valid, y_valid), verbose=False, callbacks=[EarlyStopping(patience=5)])
    preds = model.predict_classes(x_test, verbose=False)
    print("NN baseline:", accuracy_score(y_test, preds), f1_score(y_test, preds, average="macro"), end=" \n")
    
    # Keep the train/validation/unlabeled/test splits so that they can be used with multitask learning and the results are comparable between them.
    # "x_valid"/"y_valid" hold the real validation split (not the unlabeled pool).
    # This snapshot is taken before self-learning extends x_train / shrinks x_unlabeled below.
    data[name]={"x_train": x_train, "y_train":y_train, "x_unlabeled":x_unlabeled, "y_unlabeled":y_unlabeled,  "x_valid":x_valid, "y_valid":y_valid, "x_test":x_test,  "y_test":y_test } 
    
    # NN baseline + self learning: in each of 20 rounds, pseudo-label the most
    # confident unlabeled pairs, add them to the training set and retrain from scratch.
    accuracy_scores, mif_scores = [], []
    for i in range(20):
        tmpx, tmpy, x_unlabeled = get_n_most_confident_predictions(model.predict(x_unlabeled), 10, y_train, x_unlabeled) # use 10 labels each time
        x_train = np.vstack((x_train, tmpx))
        y_train = np.concatenate((y_train, tmpy))
        model = get_my_nn_model()
        # `epochs` (not the deprecated `nb_epoch`), consistent with the baseline fit above.
        model.fit(x_train, y_train, epochs=500, validation_data=(x_valid, y_valid), verbose=False,
                  callbacks=[EarlyStopping(patience=5)],)
        preds = model.predict_classes(x_test, verbose=0)
        accuracy_scores.append(accuracy_score(y_test, preds))
        # Despite the name, this list holds macro-averaged F1 scores.
        mif_scores.append( f1_score(y_test, preds, average="macro") )
    
    # Reports the best test accuracy and best test macro-F1 over the 20 rounds.
    print("NN baseline + self-learning", max(accuracy_scores), max(mif_scores)) 
    

In [None]:
# Repeat the process with multi-task learning: two models that are trained in
# alternation, one epoch each, on the splits stored in `data`.
model_coord, model_hyper = get_my_multitask_nn_models()

# Per-epoch score histories. Every entry is a [validation score, test score] pair.
# NOTE: the "mif_*" lists hold f1_score(..., average="macro") values.
acc_scores_coord,mif_scores_coord, acc_scores_hyper,  mif_scores_hyper= [], [], [], []

for epoch in range(250):
    # One training epoch per task, Coord-Random first, then Hyper-Random.
    model_coord.fit(data["Coord-Random"]["x_train"], data["Coord-Random"]["y_train"], epochs=1, validation_data=None, verbose=False, )
    model_hyper.fit(data["Hyper-Random"]["x_train"], data["Hyper-Random"]["y_train"], epochs=1, validation_data=None, verbose=False, )
    # Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions on the test split.
    preds_coord = (model_coord.predict(data["Coord-Random"]["x_test"], verbose=0)> 0.5).astype(int)
    preds_hyper = (model_hyper.predict(data["Hyper-Random"]["x_test"], verbose=0)> 0.5).astype(int)

    # Same thresholding on the split stored under the "x_valid" key.
    preds_coord_valid = (model_coord.predict(data["Coord-Random"]["x_valid"], verbose=0)> 0.5).astype(int)
    preds_hyper_valid = (model_hyper.predict(data["Hyper-Random"]["x_valid"], verbose=0)> 0.5).astype(int)

    # Record [validation, test] accuracy and macro-F1 for this epoch.
    acc_scores_coord.append([accuracy_score(data["Coord-Random"]["y_valid"], preds_coord_valid), accuracy_score(data["Coord-Random"]["y_test"], preds_coord)])
    acc_scores_hyper.append([accuracy_score(data["Hyper-Random"]["y_valid"], preds_hyper_valid), accuracy_score(data["Hyper-Random"]["y_test"], preds_hyper)])
    mif_scores_coord.append([f1_score(data["Coord-Random"]["y_valid"], preds_coord_valid, average="macro") , f1_score(data["Coord-Random"]["y_test"], preds_coord, average="macro")])
    mif_scores_hyper.append([f1_score(data["Hyper-Random"]["y_valid"], preds_hyper_valid, average="macro") , f1_score(data["Hyper-Random"]["y_test"], preds_hyper, average="macro")])

# Monitor validation score in the score tuple and print accordingly
# NOTE(review): max() compares the [validation, test] pairs lexicographically, so the
# epoch with the best validation score is chosen and ties are broken by the test score.
print("Multitask (Coord-Random):", max(acc_scores_coord)[1], max(mif_scores_coord)[1], "\nMultitask (Hyper-Random):",  max(acc_scores_hyper)[1], max(mif_scores_hyper)[1])


# Multitask + self-learning: 20 rounds; each round pseudo-labels unlabeled pairs for
# both tasks, extends the training sets stored in `data`, and retrains fresh models.
acc_scores_coord,mif_scores_coord, acc_scores_hyper,  mif_scores_hyper= [], [], [], []
for i in range(20):
    # Move the most confident Coord-Random predictions from the unlabeled pool to the training set.
    tmpx, tmpy, data["Coord-Random"]["x_unlabeled"] = get_n_most_confident_predictions(model_coord.predict(data["Coord-Random"]["x_unlabeled"]), 10, data["Coord-Random"]["y_train"], data["Coord-Random"]["x_unlabeled"])
    data["Coord-Random"]["x_train"] = np.vstack((data["Coord-Random"]["x_train"], tmpx))
    data["Coord-Random"]["y_train"] = np.concatenate((data["Coord-Random"]["y_train"], tmpy))

    # Same step for the Hyper-Random task.
    tmpx, tmpy, data["Hyper-Random"]["x_unlabeled"] = get_n_most_confident_predictions(model_hyper.predict(data["Hyper-Random"]["x_unlabeled"]), 10, data["Hyper-Random"]["y_train"], data["Hyper-Random"]["x_unlabeled"])
    data["Hyper-Random"]["x_train"] = np.vstack((data["Hyper-Random"]["x_train"], tmpx))
    data["Hyper-Random"]["y_train"] = np.concatenate((data["Hyper-Random"]["y_train"], tmpy))
    # Per-epoch [validation, test] histories for this round only.
    acc_scores_coord_, mif_scores_coord_, acc_scores_hyper_,  mif_scores_hyper_ = [], [], [], []

    # Start the round from freshly built models.
    model_coord, model_hyper = get_my_multitask_nn_models()

    for epoch in range(500):
        # Alternate one epoch per task, then score both tasks on the test and "x_valid" splits.
        model_coord.fit(data["Coord-Random"]["x_train"], data["Coord-Random"]["y_train"], epochs=1, validation_data=None, verbose=False, )
        model_hyper.fit(data["Hyper-Random"]["x_train"], data["Hyper-Random"]["y_train"], epochs=1, validation_data=None, verbose=False, )
        preds_coord = (model_coord.predict(data["Coord-Random"]["x_test"], verbose=0)> 0.5).astype(int)
        preds_hyper = (model_hyper.predict(data["Hyper-Random"]["x_test"], verbose=0)> 0.5).astype(int)

        preds_coord_valid = (model_coord.predict(data["Coord-Random"]["x_valid"], verbose=0)> 0.5).astype(int)
        preds_hyper_valid = (model_hyper.predict(data["Hyper-Random"]["x_valid"], verbose=0)> 0.5).astype(int)

        acc_scores_coord_.append([accuracy_score(data["Coord-Random"]["y_valid"], preds_coord_valid), accuracy_score(data["Coord-Random"]["y_test"], preds_coord)])
        acc_scores_hyper_.append([accuracy_score(data["Hyper-Random"]["y_valid"], preds_hyper_valid), accuracy_score(data["Hyper-Random"]["y_test"], preds_hyper)])
        mif_scores_coord_.append([f1_score(data["Coord-Random"]["y_valid"], preds_coord_valid, average="macro") , f1_score(data["Coord-Random"]["y_test"], preds_coord, average="macro")])
        mif_scores_hyper_.append([f1_score(data["Hyper-Random"]["y_valid"], preds_hyper_valid, average="macro") , f1_score(data["Hyper-Random"]["y_test"], preds_hyper, average="macro")])


    # Keep, for this round, the [validation, test] pair with the highest validation score.
    acc_scores_coord.append(max(acc_scores_coord_))
    mif_scores_coord.append(max(mif_scores_coord_))
    acc_scores_hyper.append(max(acc_scores_hyper_))
    mif_scores_hyper.append(max(mif_scores_hyper_))
# Report the test score of the best round, again selected through the validation score.
print("\nMultitask + Self-learning(Coord-Random):", max(acc_scores_coord)[1], max(mif_scores_coord)[1], "\nMultitask + Self-learning(Hyper-Random):",  max(acc_scores_hyper)[1],  max(mif_scores_hyper)[1])