# Causally Denoise Word Embeddings Using Half-Sibling Regression

## Load word embedding

In [1]:
import gdown
import codecs
import numpy as np
import nltk
from heapq import nlargest
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import spearmanr
import os, csv, re, requests, scipy
import tensorflow as tf
import pandas as pd
import functools as ft
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

In [2]:
# Download the Word2Vec subset used throughout this notebook.
url = 'https://drive.google.com/uc?id=1iLd0Wz0bVazXvJiGZvEJ67QCgcj4DMCM'
output = 'data/small_word2vec.txt'

# Make sure the target folder exists before gdown tries to write into it.
os.makedirs(os.path.dirname(output), exist_ok=True)

# The file is roughly 400 MB, so skip the transfer when an earlier run already fetched it.
if not os.path.exists(output):
    gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1iLd0Wz0bVazXvJiGZvEJ67QCgcj4DMCM
To: E:\CityU Spring 2019\Research\word-vector-NLP\denoise\Code Causally Denoise Word Embeddings Using Half-Sibling Regression\data\small_word2vec.txt
400MB [07:20, 909kB/s]  


'data/small_word2vec.txt'

In [3]:
def loadWordVecs(model_str, expected_dim=300):
    """Load word vectors from ``data/small_<model_str>.txt`` into a dictionary.

    Every usable line holds a word, one space, and then the space-separated
    vector components. Words are lower-cased before being used as keys, so
    when two lines differ only by case the later line wins.

    Parameters
    ----------
    model_str : str
        Embedding name used to build the input path.
    expected_dim : int, optional
        Vector length considered normal (default 300). Vectors of any other
        length are still stored, but the word and its shape are printed.

    Returns
    -------
    dict
        Maps each lower-cased word to a 1-D ``float32`` NumPy array.
    """
    word_dictionary = {}

    input_file_destination = 'data/small_' + model_str + '.txt'

    # The context manager closes the file even if parsing fails half-way.
    with codecs.open(input_file_destination, 'r', 'utf-8') as f:
        for line in f:
            parts = line.split(" ", 1)

            # Blank lines or lines without a vector part cannot be parsed; skip them.
            if len(parts) < 2:
                continue

            transformed_key = parts[0].lower()
            vector = np.fromstring(parts[1], dtype="float32", sep=" ")
            word_dictionary[transformed_key] = vector

            if vector.shape[0] != expected_dim:
                print(transformed_key, vector.shape)

    return word_dictionary

# Original (unprocessed) word2vec vectors, read from data/small_word2vec.txt.
orig_word2vec = loadWordVecs('word2vec')

In [4]:
# Registry of the unprocessed embeddings, keyed by embedding name.
orig_model = {'word2vec': orig_word2vec}

## Load stop words (function words)

In [5]:
# Make sure NLTK's stop-word corpus is available locally.
nltk.download('stopwords')

# All English stop words.
STOP = list(nltk.corpus.stopwords.words("english"))

# Every word of the embedding vocabulary that is not a stop word.
nonStop = list(set(orig_word2vec).difference(STOP))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ZKY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def ensemble_wordvec_mat(wordVecModel_str, wordList, orig_model = orig_model):
    """Collect the vectors of the requested words into a dict and a matrix.

    Parameters
    ----------
    wordVecModel_str : str
        Key of the embedding inside ``orig_model``.
    wordList : iterable of str
        Words to extract. Words missing from the embedding are ignored and
        duplicates are collapsed.
    orig_model : dict, optional
        Maps embedding names to ``{word: vector}`` dictionaries. The default
        is the notebook-level ``orig_model`` as it exists when this function
        is defined.

    Returns
    -------
    newDict : dict
        ``{word: vector[:]}`` for every selected word. Note that for NumPy
        arrays ``[:]`` yields a view of the stored vector, not a copy.
    x_collector : numpy.ndarray
        The selected vectors stacked as columns. Column ``k`` belongs to the
        ``k``-th key of ``newDict``.
    """
    vectors = orig_model[wordVecModel_str]

    # Sorting fixes the word order. Plain set iteration order for strings can
    # change from one interpreter run to the next, which made the column order
    # (and everything derived from it, e.g. the saved file) non-reproducible.
    feasibleWordList = sorted(set(vectors.keys()) & set(wordList))

    newDict = {word: vectors[word][:] for word in feasibleWordList}

    # put the word vectors in columns
    x_collector = np.array([vectors[word] for word in feasibleWordList]).T

    return newDict, x_collector


In [7]:
# Assemble the word -> vector dictionary and the column-wise vector matrix,
# once for the stop words and once for the non-stop words.
StopWordDict, StopWordVecs = ensemble_wordvec_mat('word2vec', STOP)
nonStopWordDict, nonStopWordVecs = ensemble_wordvec_mat('word2vec', nonStop)

# Half-Sibling Ridge Regression

In [8]:
def HSR_RR(InputVec,TargetVec,TargetDict, alpha=50):
    """Remove from the target vectors the part a ridge regression can predict.

    Solves ``W = (X^T X + alpha * I)^{-1} X^T Y`` with ``X = InputVec`` and
    ``Y = TargetVec``, and returns the residual ``Y - X W`` column by column.

    Parameters
    ----------
    InputVec : numpy.ndarray
        Predictor vectors stored as columns.
    TargetVec : numpy.ndarray
        Vectors to be cleaned, stored as columns, with the same number of
        rows as ``InputVec``.
    TargetDict : dict
        Words belonging to the columns of ``TargetVec``. The ``k``-th key must
        correspond to column ``k``. The dictionary itself is not modified.
    alpha : float, optional
        Ridge regularisation strength (default 50).

    Returns
    -------
    dict
        Shallow copy of ``TargetDict`` whose values are replaced by the
        corresponding residual columns.

    Raises
    ------
    ValueError
        If ``TargetDict`` and ``TargetVec`` disagree on the number of words.
    """
    if len(TargetDict) != TargetVec.shape[1]:
        raise ValueError(
            "TargetDict has %d words but TargetVec has %d columns"
            % (len(TargetDict), TargetVec.shape[1])
        )

    gram = InputVec.T @ InputVec + alpha * np.eye(InputVec.shape[1])

    # Solve the linear system instead of forming the explicit inverse: same
    # result mathematically, but cheaper and numerically better behaved.
    W = np.linalg.solve(gram, InputVec.T) @ TargetVec

    post_TargetVec = TargetVec - InputVec @ W  # residual = cleaned target vectors

    post_TargetDict = TargetDict.copy()
    for i, w in enumerate(TargetDict):
        post_TargetDict[w] = post_TargetVec[:, i]

    return post_TargetDict


## Carry out half-sibling regression for content-word vectors

In [9]:
# Predict the non-stop-word vectors from the stop-word vectors and keep the residuals.
post_nonStopWordDict = HSR_RR(StopWordVecs,nonStopWordVecs,nonStopWordDict)

## Carry out half-sibling regression for stop-word vectors

In [10]:
# We use some content-word vectors to predict stop-word vectors. To this end, we first extract commonly used content words.
# The vocabulary file can be downloaded from
# https://github.com/PrincetonML/SIF/blob/master/auxiliary_data/enwiki_vocab_min200.txt
wikiWordsPath = 'data/enwiki_vocab_min200.txt'
wikiWords = {}

# The file is only read, so open it read-only; "r+" needlessly asked for write access as well.
with open(wikiWordsPath, "r") as f_in:
    for line in f_in:
        fields = line.split(' ')  # "<word> <count>"
        wikiWords[fields[0]] = int(fields[1])

# Words that are in both the Wikipedia vocabulary and the embedding, minus the stop words.
freq_content_word = list(set(wikiWords.keys()) & set(orig_word2vec.keys()))
non_stop_freq_content_word = list(set(freq_content_word) - set(STOP))

wikiWords_nsfc = {word: wikiWords[word] for word in non_stop_freq_content_word}

# The 1000 entries with the largest counts serve as regression features.
feature_nonStop = nlargest(1000, wikiWords_nsfc, key=wikiWords_nsfc.get)

# Stack the (unprocessed) vectors of the feature words as columns.
nonStopWordVecs_features = np.array([nonStopWordDict[word] for word in feature_nonStop]).T

In [11]:
# Predict the stop-word vectors from the selected content-word vectors and keep the residuals.
post_StopWordDict = HSR_RR(nonStopWordVecs_features,StopWordVecs,StopWordDict)

In [12]:
# Merge stop and non-stop word vectors into a single dictionary.
# If a word were present in both, the entry from post_StopWordDict would win.
post_word2vec = {**post_nonStopWordDict, **post_StopWordDict} # merge stop and non-stop word vectors into a single dictionary

# Evaluation

## Word Similarity

In [13]:
# File names of the word-similarity benchmarks evaluated below.
dataSets = [
    'EN-RG-65.txt',
    'EN-WS-353-ALL.txt',
    'EN-RW-STANFORD.txt',
    'EN-MEN-TR-3k.txt',
    'EN-MTurk-287.txt',
    'EN-SIMLEX-999.txt',
    'EN-SimVerb-3500.txt',
]


def similarity_eval(dataSetAddress, wordVecModel_str):
    """Spearman correlation between human and embedding-based similarity ranks.

    Parameters
    ----------
    dataSetAddress : str
        Path of a whitespace-separated file whose lines start with
        ``word_i word_j score``. Lines with fewer than three fields are
        skipped, as are pairs with a word missing from the embedding.
    wordVecModel_str : str or mapping
        Either the name of a variable holding a ``{word: vector}`` mapping
        (resolved with ``eval``) or such a mapping itself.

    Returns
    -------
    float
        Spearman's rho between each pair's position when ordered by human
        score (highest first) and its position when ordered by cosine
        distance (smallest first).
    """
    # NOTE(review): eval() executes whatever expression the string contains.
    # Only pass trusted variable names, or hand in the mapping directly.
    if isinstance(wordVecModel_str, str):
        wordVecModel = eval(wordVecModel_str)
    else:
        wordVecModel = wordVecModel_str
    vocab = set(wordVecModel.keys())

    pair_list = []
    # The context manager guarantees the data set file is closed again.
    with open(dataSetAddress, "r") as fread_simlex:
        for line in fread_simlex:
            tokens = line.split()
            if len(tokens) < 3:
                continue  # blank or incomplete line
            word_i, word_j = tokens[0], tokens[1]
            score = float(tokens[2])
            if word_i in vocab and word_j in vocab:
                pair_list.append(((word_i, word_j), score))

    # order the pairs from highest score (most similar) to lowest score (least similar)
    pair_list.sort(key=lambda x: -x[1])

    extracted_list = []
    for word_pair, _ in pair_list:
        word_i, word_j = word_pair
        similarity = cosine_similarity(
            wordVecModel[word_i].reshape(1, -1), wordVecModel[word_j].reshape(1, -1)
        )
        # Store a plain float rather than a 1x1 array so sorting is cheap and unambiguous.
        extracted_list.append((word_pair, float(1 - similarity[0][0])))

    extracted_list.sort(key=lambda x: x[1])

    # Position of each pair in the model ordering. setdefault keeps the first
    # position for repeated pairs, which is what list.index() returned before,
    # but the lookup is now constant-time instead of a linear scan per pair.
    model_position = {}
    for position, (word_pair, _) in enumerate(extracted_list):
        model_position.setdefault(word_pair, position)

    spearman_original_list = list(range(len(pair_list)))
    spearman_target_list = [model_position[word_pair] for word_pair, _ in pair_list]

    spearman_rho = spearmanr(spearman_original_list, spearman_target_list)

    return spearman_rho[0]

In [14]:
# Report, for every benchmark file, the Spearman correlation obtained with the
# post-processed vectors and with the original vectors.
for dataset in dataSets:
    dataSetAddress = 'data/wordSimData/' +  dataset
    print('evaluating the data set', dataset)
    print('word2vec + StopWordPost : %.4f' %  similarity_eval(dataSetAddress, 'post_word2vec'))
    print('word2vec + Orig : %.4f' %  similarity_eval(dataSetAddress, 'orig_word2vec'))
    print('\n')

evaluating the data set EN-RG-65.txt
word2vec + StopWordPost : 0.7569
word2vec + Orig : 0.7494


evaluating the data set EN-WS-353-ALL.txt
word2vec + StopWordPost : 0.7059
word2vec + Orig : 0.6999


evaluating the data set EN-RW-STANFORD.txt
word2vec + StopWordPost : 0.6033
word2vec + Orig : 0.5997


evaluating the data set EN-MEN-TR-3k.txt
word2vec + StopWordPost : 0.7726
word2vec + Orig : 0.7706


evaluating the data set EN-MTurk-287.txt
word2vec + StopWordPost : 0.6854
word2vec + Orig : 0.6831


evaluating the data set EN-SIMLEX-999.txt
word2vec + StopWordPost : 0.4672
word2vec + Orig : 0.4427


evaluating the data set EN-SimVerb-3500.txt
word2vec + StopWordPost : 0.3978
word2vec + Orig : 0.3659




## Semantic Textual Similarity

In [15]:
def load_sts_dataset(filename):
    """Load sentence pairs and human similarity scores from one STS file.

    Two tab-separated layouts are recognised by their field count:

    * 7 or 9 fields: task in field 1, year in field 2, score in field 4 and
      the sentences in fields 5 and 6;
    * 6 or 8 fields: task in field 0, year in field 1, score in field 3 and
      the sentences in fields 4 and 5.

    Lines with any other field count are reported with a printed message and
    skipped.

    Parameters
    ----------
    filename : str
        Path of the tab-separated STS file.

    Returns
    -------
    pandas.DataFrame
        Columns ``year-task`` (digits of the year field, a dash, and the task
        name), ``sent_1``, ``sent_2`` and ``sim`` (float).
    """
    sent_pairs = []
    # Plain open() replaces tf.gfile.GFile, which no longer exists in
    # TensorFlow 2.x; reading a local text file needs no TensorFlow at all.
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            ts = line.strip().split("\t")
            if len(ts) == 7 or len(ts) == 9:
                sent_pairs.append((re.sub("[^0-9]", "", ts[2]) + '-' + ts[1] , ts[5], ts[6], float(ts[4])))
            elif len(ts) == 6 or len(ts) == 8:
                sent_pairs.append((re.sub("[^0-9]", "", ts[1]) + '-' + ts[0] , ts[4], ts[5], float(ts[3])))
            else:
                print('data format is wrong!!!')
    return pd.DataFrame(sent_pairs, columns=["year-task", "sent_1", "sent_2", "sim"])


def load_all_sts_dataset():
    """Read the five STS files and stack them into one DataFrame.

    The files are loaded with ``load_sts_dataset`` in the order train, dev,
    test, other, mt and concatenated as-is, so each part keeps its own row
    index.
    """
    stsbenchmarkDir = 'data/stsbenchmark/'
    # NOTE(review): the companion files are read from the same folder as the benchmark files.
    stscompanionDir = 'data/stsbenchmark/'

    sources = (
        (stsbenchmarkDir, "sts-train.csv"),
        (stsbenchmarkDir, "sts-dev.csv"),
        (stsbenchmarkDir, "sts-test.csv"),
        (stscompanionDir, "sts-other.csv"),
        (stscompanionDir, "sts-mt.csv"),
    )
    frames = [load_sts_dataset(os.path.join(folder, name)) for folder, name in sources]

    return pd.concat(frames)

# All STS sentence pairs in a single DataFrame.
sts_all = load_all_sts_dataset()




def load_sts_by_year_task():
    """Split the notebook-level ``sts_all`` frame by its ``year-task`` label.

    Returns a dict mapping every distinct ``year-task`` value, in order of
    first appearance, to the rows of ``sts_all`` carrying that label.
    """
    labels = sts_all['year-task']

    # A positional boolean mask picks the same rows as the list of matching positions.
    return {
        year_task: sts_all[(labels == year_task).to_numpy()]
        for year_task in labels.unique()
    }

# STS pairs grouped by their "year-task" label.
sts_by_year_task = load_sts_by_year_task()




def load_sts_by_year():
    """Split the notebook-level ``sts_all`` frame by year only.

    For each year from 2012 to 2017 the rows whose ``year-task`` label starts
    with that year are collected, merging the different tasks of the year.
    In the returned copies the ``year-task`` column is overwritten with the
    bare year; ``sts_all`` itself is left untouched.
    """
    labels = list(sts_all['year-task'])
    sts_by_year = {}

    for year in ('2012', '2013', '2014', '2015', '2016', '2017'):
        in_year = np.array([label.startswith(year) for label in labels], dtype=bool)
        # assign() works on a new frame, so the source rows keep their labels.
        sts_by_year[year] = sts_all[in_year].assign(**{'year-task': year})

    return sts_by_year

# sts_by_year_task was already built right after load_sts_by_year_task was
# defined, so the identical second call that used to sit here is dropped.
sts_by_year = load_sts_by_year()

# STS 2015 "answers-students" test pairs. Each usable row is
# "<score>\t<sentence 1>\t<sentence 2>"; rows with another field count are skipped.
filename = 'data/stsbenchmark/2015-answers-students.test.tsv'
sent_pairs = []
# Plain open() replaces tf.gfile.GFile, which no longer exists in TensorFlow 2.x.
with open(filename, "r", encoding="utf-8") as f:
    for line in f:
        ts = line.strip().split("\t")
        if len(ts) == 3:
            sent_pairs.append((ts[1], ts[2], float(ts[0])))
answers_students_2015 =  pd.DataFrame(sent_pairs, columns=["sent_1", "sent_2", "sim"])

In [16]:
def download_sick(f):
    """Download a tab-separated SICK file and return it as a DataFrame.

    The first line of the response (the header) is dropped, and only lines
    with exactly five tab-separated fields are kept.

    Parameters
    ----------
    f : str
        URL of the SICK file.

    Returns
    -------
    pandas.DataFrame
        Columns ``idx``, ``sent_1``, ``sent_2``, ``sim`` (numeric) and
        ``label``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not respond within 60 seconds.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    reply = requests.get(f, timeout=60)
    # Fail loudly on 4xx/5xx instead of parsing an error page into an empty frame.
    reply.raise_for_status()
    response = reply.text

    lines = response.split("\n")[1:]
    lines = [l.split("\t") for l in lines if len(l) > 0]
    lines = [l for l in lines if len(l) == 5]

    df = pd.DataFrame(lines, columns=["idx", "sent_1", "sent_2", "sim", "label"])
    df['sim'] = pd.to_numeric(df['sim'])
    return df
    
# Annotated SICK test set, fetched over the network.
sick_all = download_sick("https://raw.githubusercontent.com/alvations/stasis/master/SICK-data/SICK_test_annotated.txt")

In [17]:
class Sentence:
    """A raw sentence together with its lower-cased NLTK tokens."""

    def __init__(self, sentence):
        # raw: the sentence exactly as given
        self.raw = sentence

        # Replace curly single quotes with a straight apostrophe before tokenising.
        normalized_sentence = sentence
        for curly_quote in ("‘", "’"):
            normalized_sentence = normalized_sentence.replace(curly_quote, "'")

        # tokens: lower-cased output of nltk.word_tokenize
        # (NLTK's tokenizer data must be installed for this call to work)
        self.tokens = list(map(str.lower, nltk.word_tokenize(normalized_sentence)))
        
def run_benchmark(sentences1, sentences2, model_str):
    """Cosine similarity of averaged word vectors for aligned sentence pairs.

    Parameters
    ----------
    sentences1, sentences2 : sequence
        Objects exposing a ``tokens`` list, paired position by position.
    model_str : str or mapping
        Either the name of a variable holding a ``{word: vector}`` mapping
        (resolved with ``eval``) or such a mapping itself.

    Returns
    -------
    list
        One cosine similarity per sentence pair. A sentence without any
        usable token is represented by the zero vector.
    """
    # NOTE(review): eval() executes whatever expression the string contains.
    # Only pass trusted variable names, or hand in the mapping directly.
    model = eval(model_str) if isinstance(model_str, str) else model_str

    # Size the zero vector after the model's own vectors instead of assuming
    # 300 dimensions; 300 is kept only as the fallback for an empty model.
    wv_len = len(next(iter(model.values()))) if model else 300

    def embed(tokens):
        # Keep tokens known to the model; islower() additionally drops tokens
        # without any cased character (e.g. numbers and punctuation).
        usable = [token for token in tokens if token in model and token.islower()]
        if not usable:
            return np.zeros(wv_len)
        return np.average([model[token] for token in usable], axis=0)

    sims = []
    for sent1, sent2 in zip(sentences1, sentences2):
        embedding1 = embed(sent1.tokens)
        embedding2 = embed(sent2.tokens)
        sims.append(cosine_similarity(embedding1.reshape(1, -1), embedding2.reshape(1, -1))[0][0])

    return sims

def run_experiment(df, benchmarks):
    """Score every benchmark on one data set with Pearson correlation.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``sent_1``, ``sent_2`` and ``sim``.
    benchmarks : iterable of (label, method)
        ``method(sentences1, sentences2)`` must return one similarity per
        sentence pair. The label is not used here.

    Returns
    -------
    list
        For each benchmark, the Pearson correlation with ``df['sim']``,
        multiplied by 100 and rounded to two decimals.
    """
    sentences1 = [Sentence(text) for text in df['sent_1']]
    sentences2 = [Sentence(text) for text in df['sent_2']]
    gold = df['sim']

    def pearson_percent(method):
        predicted = method(sentences1, sentences2)
        return round(scipy.stats.pearsonr(predicted, gold)[0] * 100,2)

    return [pearson_percent(method) for _, method in benchmarks]

In [18]:
# Each benchmark is a (label, scorer) pair; the scorer is run_benchmark bound
# to either the post-processed or the original vectors.
benchmarks = [("post-word2vec", ft.partial(run_benchmark, model_str= 'post_word2vec')),
             ("orig-word2vec", ft.partial(run_benchmark, model_str= 'orig_word2vec'))]

# Pearson scores per data set; each value lists one score per benchmark, in benchmark order.
pearson_results_year_task = {}

# One entry per STS year/task combination.
for year_task in sts_all['year-task'].unique():
    print('STS-' + year_task)
    pearson_results_year_task['STS-' + year_task] = run_experiment(sts_by_year_task[year_task], benchmarks)  
    
# The two additional data sets that are not part of sts_all.
pearson_results_year_task['SICK'] = run_experiment(sick_all, benchmarks) 

pearson_results_year_task['2015-answers_students'] = run_experiment(answers_students_2015, benchmarks)

STS-2012-MSRvid
STS-2014-images
STS-2015-images
STS-2014-deft-forum
STS-2012-MSRpar
STS-2014-deft-news
STS-2013-headlines
STS-2014-headlines
STS-2015-headlines
STS-2016-headlines
STS-2017-track5.en-en
STS-2015-answers-forums
STS-2016-answer-answer
STS-2012-surprise.OnWN
STS-2013-FNWN
STS-2013-OnWN
STS-2014-OnWN
STS-2014-tweet-news
STS-2015-belief
STS-2016-plagiarism
STS-2016-question-question
STS-2012-SMTeuroparl
STS-2012-surprise.SMTnews
STS-2016-postediting


In [19]:

# Turn the results into a table: one row per data set, one column per benchmark.
pearson_results_year_task_df = pd.DataFrame(pearson_results_year_task)
pearson_results_year_task_df = pearson_results_year_task_df.transpose()
# Replace the positional column labels with the benchmark labels.
pearson_results_year_task_df = pearson_results_year_task_df.rename(columns={i:b[0] for i, b in enumerate(benchmarks)})

# Display the rows listed below, in this order (the result is shown, not stored).
pearson_results_year_task_df.reindex(['STS-2012-MSRpar', 'STS-2012-MSRvid', 'STS-2012-surprise.OnWN', 'STS-2012-SMTeuroparl', 'STS-2012-surprise.SMTnews','STS-2013-FNWN', 'STS-2013-OnWN', 'STS-2013-headlines',  'STS-2014-OnWN', 'STS-2014-deft-forum','STS-2014-deft-news', 'STS-2014-headlines', 'STS-2014-tweet-news',  'STS-2014-images', 'STS-2015-answers-forums', '2015-answers_students', 'STS-2015-belief',  'STS-2015-headlines', 'STS-2015-images', 'SICK'])


Unnamed: 0,post-word2vec,orig-word2vec
STS-2012-MSRpar,34.42,41.78
STS-2012-MSRvid,79.63,76.27
STS-2012-surprise.OnWN,71.27,70.62
STS-2012-SMTeuroparl,40.32,31.2
STS-2012-surprise.SMTnews,50.09,51.07
STS-2013-FNWN,49.09,39.68
STS-2013-OnWN,75.57,67.98
STS-2013-headlines,63.65,63.29
STS-2014-OnWN,81.4,74.85
STS-2014-deft-forum,46.73,41.3


## Downstream task -- Sentiment Analysis

In [20]:
# Every *_data list holds [text, label] entries; the matching *_label list
# holds just the integer labels in the same order.

# Amazon Review: CSV with a header row, label in column 0 and text in column 1.
with open('data/SentimentAnalysis/train_amazon_10000.csv', encoding = 'utf8') as f:
    reader = csv.reader(f)
    headers = next(reader, None)
    AR_data = [[row[1], int(row[0])] for row in reader]

AR_label = [int(sample[1]) for sample in AR_data]

# Customer Review: one review per line, negatives (label 0) first, then positives (label 1).
CR_data = []

with open('data/SentimentAnalysis/custrev.neg', encoding = 'utf-8') as f:
    CR_data.extend([row, 0] for row in f)

with open('data/SentimentAnalysis/custrev.pos', encoding = 'utf-8') as f:
    CR_data.extend([row, 1] for row in f)

CR_label = [int(sample[1]) for sample in CR_data]

# IMDB: CSV read without skipping a header; column 0 equal to 'pos' means label 1, anything else 0.
with open('data/SentimentAnalysis/imdb_train_10000_new.csv', encoding = 'utf8') as f:
    reader = csv.reader(f)
    IMDB_data = [[row[1], 1 if row[0] == 'pos' else 0] for row in reader]

IMDB_label = [int(sample[1]) for sample in IMDB_data]

# SST: CSV read without skipping a header; label in column 0 and text in column 1.
with open('data/SentimentAnalysis/sst_all.csv', encoding = 'utf-8-sig') as f:
    reader = csv.reader(f)
    SST_data = [[row[1], int(row[0])] for row in reader]

SST_label = [int(sample[1]) for sample in SST_data]


In [21]:
def convert_to_sentence_emb(sentence1, model_str):
    """Embed a sentence as the average of its word vectors.

    Parameters
    ----------
    sentence1 : str
        Raw sentence; it is tokenised through ``Sentence``.
    model_str : str or mapping
        Either the name of a variable holding a ``{word: vector}`` mapping
        (resolved with ``eval``) or such a mapping itself.

    Returns
    -------
    numpy.ndarray
        Mean vector of the usable tokens, or the zero vector when the
        sentence has none.
    """
    # NOTE(review): eval() executes whatever expression the string contains.
    # Only pass trusted variable names, or hand in the mapping directly.
    model = eval(model_str) if isinstance(model_str, str) else model_str

    # Size the zero vector after the model's own vectors instead of assuming
    # 300 dimensions; 300 is kept only as the fallback for an empty model.
    wv_len = len(next(iter(model.values()))) if model else 300

    # Keep tokens known to the model; islower() additionally drops tokens
    # without any cased character (e.g. numbers and punctuation).
    tokens1 = [token for token in Sentence(sentence1).tokens if token in model and token.islower()]

    if not tokens1:
        return np.zeros(wv_len)

    return np.average([model[token] for token in tokens1], axis=0)

In [22]:
def to_sent_emb_list(dataset, model_str):
    """Embed the text (element 0) of every sample in ``dataset``.

    Returns a list with one ``convert_to_sentence_emb`` result per sample,
    in dataset order.
    """
    return [
        convert_to_sentence_emb(dataset[position][0], model_str)
        for position in range(len(dataset))
    ]

In [23]:
# Sentence embeddings of each sentiment data set, built from the post-processed vectors.
AR_word2vec = to_sent_emb_list(AR_data, 'post_word2vec')
CR_word2vec = to_sent_emb_list(CR_data, 'post_word2vec')
IMDB_word2vec = to_sent_emb_list(IMDB_data, 'post_word2vec')
SST_word2vec = to_sent_emb_list(SST_data, 'post_word2vec')

In [24]:
def LR_crossval(model, label, random_state=0):
    """Five-fold cross-validated logistic regression on sentence embeddings.

    Parameters
    ----------
    model : array-like
        Feature matrix, one sentence embedding per row.
    label : array-like
        Class label of each row.
    random_state : int, optional
        Seed handed to the classifier (default 0). The ``sag`` solver is
        stochastic, so without a fixed seed the scores change between runs.

    Returns
    -------
    list
        ``[mean train score, mean test score]`` over the five folds.
    """
    # NOTE(review): recent scikit-learn releases deprecate the `multi_class`
    # argument; confirm against the installed version before upgrading.
    LRClassifier = LogisticRegression(solver='sag',multi_class = 'multinomial', random_state=random_state)
    LR_cv_results = cross_validate(LRClassifier, model, label, cv=5, return_train_score=True)

    return [np.mean(LR_cv_results['train_score']), np.mean(LR_cv_results['test_score'])]


In [25]:
# Each printed pair is [mean train score, mean test score] from LR_crossval.
print('AR: ', LR_crossval(AR_word2vec, AR_label))
print('CR: ', LR_crossval(CR_word2vec, CR_label))
print('IMDB: ', LR_crossval(IMDB_word2vec, IMDB_label))
print('SST: ', LR_crossval(SST_word2vec, SST_label))

AR:  [0.8466000771730482, 0.8376999831999958]
CR:  [0.8246745350004291, 0.7823881871175774]
IMDB:  [0.8474499646644527, 0.8433998841499711]
SST:  [0.8240477182009938, 0.8056305235810155]


## Save

In [26]:
def save_wv(word_vector_str):
    """Write a word-vector dictionary to ``small_HSR_RR_<word_vector_str>.txt``.

    Each output line is the word followed by its vector components, all
    separated by single spaces. An existing file of the same name is
    replaced.

    Parameters
    ----------
    word_vector_str : str
        Name of the variable holding the ``{word: vector}`` dictionary. It is
        resolved with ``eval`` and also used to build the file name.
    """
    # NOTE(review): eval() executes whatever expression the string contains;
    # only pass trusted variable names.
    word_dictionary = eval(word_vector_str)

    output_path = 'small_HSR_RR_' + word_vector_str + '.txt'

    # Report the file that is actually written.
    print('writing to', output_path)

    # 'w' rather than 'a': in append mode every re-run of the cell added a
    # second full copy of all vectors to the same file.
    with open(output_path, 'w', encoding = 'utf8') as the_file:
        for word, wordVec in word_dictionary.items():
            wordVecString = " ".join(str(x) for x in wordVec)
            the_file.write(word + ' ' + wordVecString  + '\n')

In [27]:
# Persist the post-processed vectors as a text file in the working directory.
save_wv('post_word2vec')

writing to HSR_RR_post_word2vec
