## Semi-supervised Outcome value prediction from Behaviour Change Data

In this notebook, we set up a regression/classification pipeline to predict the outcome value under a semi-supervised setting. The objective is to see if adding automatically extracted data can help improve the outcome value predictions.

In [1]:
import sys, getopt, os
sys.path.insert(0, "ov-predict/src/")

import numpy as np

#external libs
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import mean_squared_error
import statistics
from sklearn.metrics import confusion_matrix
from keras import backend as k
from math import sqrt

#our libraries
from model.lstm import buildModel
from model.lstm import rmse
from preprocessing.InputHelper import InputHelper
from preprocessing.InputHelper import mapToNonUniformlySpacedIntervals
from preprocessing.InputHelper import transformLabels
from common.utils import plotHistogram
from common.utils import getSelectedData
from common.utils import printWordVecs
from common.utils import convertSoftmaxToLabels
from common.utils import computePerIntervalStats
from common.utils import computeTwoStagedRMSE

Using TensorFlow backend.


In [2]:
#Globals
SEED = 314159
NUM_EXPERIMENTS = 1
MAXLEN=100
FOLD=5
NUM_EXPERIMENTS = 1
EPOCHS = 3
NUM_CLASSES=0

DATAFILE="../../core/prediction/sentences/train.tsv"
NOISY_DATAFILE="../../core/prediction/sentences/train.noisy.tsv"
NODEVECFILE_REF="../../core/prediction/graphs/nodevecs/nodes_and_words_ref.vec"
NODEVECFILE_NOISY="../../core/prediction/graphs/nodevecs/nodes_and_words_extracted.vec"
MERGE_VEC_FILE = 'mergedvec.txt'

In [3]:
#Keep a dictionary (keyed by document name) of corresponding pairs of RCTArms (DataInstances)
#e.g. for document study['ABC.pdf'] --> {ref-data, noisy-data}

# the class def
from datadef.rct import RCTArm
from datadef.rct import RCTArms

In [4]:
def mergeVecFiles(refVecFile, extractedVecFile, mergedFileName):
    with open(refVecFile) as ref_f:
        r_content = ref_f.readlines()

    with open(extractedVecFile) as ext_f:
        e_content = ext_f.readlines()
    
    totalwords = str(int(r_content[0].split(' ')[0]) + int(e_content[0].split(' ')[0]))
    dim = r_content[0].split(' ')[1]
    
    header = totalwords + ' ' + dim
    
    with open(mergedFileName, 'w') as f:
        f.write(header)
        
        for item in r_content[1:]:
            f.write("%s" % item)        
        for item in e_content[1:]:
            f.write("%s" % item)

In [5]:
def processFold(fold_number, model, x_train, y_train, x_test, y_test, maxlen, num_classes, epochs):

    x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)
    x_test = pad_sequences(x_test, padding='post', maxlen=maxlen)
    BATCH_SIZE = 1

    print ("Training model...")
    model.fit(x_train, y_train,
        epochs=epochs,
        verbose=True,         
        #validation_split=0.1,
        batch_size=BATCH_SIZE)

    loss, accuracy = model.evaluate(x_test, y_test, verbose=True)
    if (num_classes > 0):
        print("Fold {}: Cross-entropy loss: {:.4f}, Accuracy: {:.4f}".format(fold_number, loss, accuracy))
    else:
        print("Fold {}: Loss: {:.4f}, RMSE: {:.4f}".format(fold_number, loss, accuracy))

    '''
    y_preds = model.predict(x_test)
    y_preds = convertSoftmaxToLabels(y_preds)
    
    # in this part of the code use the true values (in case of classification) to predict values and compute rmse...
    if ptype=='m':
        # perform and evaluate 2-step regression... classify and then sample a value around the median from the interval
        accuracy = computeTwoStagedRMSE(num_classes, fold_number, y_preds, y_train_vals, y_test_vals)
    '''
    
    return accuracy


In [6]:
from model.lstm import create_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
import random

class RCT_Ref_Noise_Pairs:
    
    def __init__(self, datafile, noisy_datafile):        
        rct_ref_dict = self.loadData(datafile)
        rct_noisy_dict = self.loadData(noisy_datafile)
    
        ref_keys = set(rct_ref_dict.keys())
        noisy_keys = set(rct_noisy_dict.keys())
        common_keys = ref_keys.intersection(noisy_keys)
        
        self.numinstances = len(common_keys)
        self.rcts = {}
        
        for key in common_keys:
            self.rcts[key] = [rct_ref_dict[key], rct_noisy_dict[key]]

    def loadData(self, datafile):
        rct_dict = {}    
        rcts = RCTArms(datafile)
        #rcts.convertWordsToIds()
        
        for rct in rcts.rcts:
            rct_dict[rct.docname] = rct
        return rct_dict

    def printAllPairs(self):
        for key in self.rcts:
            print (key + ', ' + str(self.rcts[key][0]) + ', ' + str(self.rcts[key][1]))

    def formXY(self, maxlen=50):        

        all_text_ref = []
        self.Y_ref = []
        for key in self.rcts:
            all_text_ref.append(self.rcts[key][0].text)
            self.Y_ref.append(self.rcts[key][0].ov)
            
        all_text_noisy = []
        self.Y_noisy = []
        for key in self.rcts:
            all_text_noisy.append(self.rcts[key][1].text)
            self.Y_noisy.append(self.rcts[key][1].ov)
        
        self.keras_tokenizer = Tokenizer(num_words=None, filters=[], lower=False, split=' ')
        
        all_text = all_text_ref[:] # copy ref into all_text
        all_text.extend(all_text_noisy)
        
        self.keras_tokenizer.fit_on_texts(all_text)
        
        self.vsize = len(self.keras_tokenizer.word_index) + 1
        
        self.X_ref = self.keras_tokenizer.texts_to_sequences(all_text_ref)
        self.X_ref = pad_sequences(self.X_ref, padding='post', maxlen=50)        
        self.X_noisy = self.keras_tokenizer.texts_to_sequences(all_text_ref)
        self.X_noisy = pad_sequences(self.X_noisy, padding='post', maxlen=50)
        
        self.X_ref_array = np.asarray(self.X_ref)
        self.Y_ref_array = np.asarray(self.Y_ref, dtype=np.float64)

    # Choose random indexes from train_indexes and replace the ref data with noise
    # For baseline for the noise injection experiments, we use only
    # a fraction of the clean data (as given by the SNR). The noise in this case is 0,
    # i.e., no additional data is provided as input.
    def getXYForFold(self, train_indexes, test_indexes, snr, clean_indexes=None):
        num_clean_instances = int(snr * len(train_indexes))
        clean_indexes_map = {}
        include_noise = False
        
        if clean_indexes==None:
            clean_indexes = random.choices(train_indexes, k=num_clean_instances)
            include_noise = True
            
        for i in clean_indexes:
            clean_indexes_map[i] = i
        
        X_train = []
        X_test = []
        Y_train = []
        Y_test = []
        
        for index in train_indexes:
            if index in clean_indexes_map:
                X_train.append(self.X_ref[index])
                Y_train.append(self.Y_ref[index])
            elif include_noise:
                X_train.append(self.X_noisy[index])
                Y_train.append(self.Y_noisy[index])
            
        for index in test_indexes:
            X_test.append(self.X_ref[index])
            Y_test.append(self.Y_ref[index])
            
        return X_train, Y_train, X_test, Y_test, clean_indexes
        
    def runCrossFoldExperiment(self, model, n_splits, snr, MAXLEN, NUM_CLASSES, epochs=20):
        self.formXY(MAXLEN)  # get the data in XY form
        avg_over_experiments = 0
        avg_over_experiments_baseline = 0
        
        for i in range (0, NUM_EXPERIMENTS):        
            fold_info = KFold(n_splits=n_splits, random_state=SEED)
            
            n = 0
            avg_metric_value = 0
            baseline_metric_value = 0 # only with clean data
            
            for train_indexes, test_indexes in fold_info.split(self.X_ref_array, self.Y_ref_array):                
                X_train, Y_train, X_test, Y_test, clean_indexes = self.getXYForFold(train_indexes, test_indexes, snr, None)
                
                print ("|train_with_noise| = {}, |test| = {}".format(len(X_train), len(X_test)))
                
                Y_train, Y_test = transformLabels(Y_train, Y_test, NUM_CLASSES, useMedians=True)                
                avg_metric_value += processFold(n, model, X_train, Y_train, X_test, Y_test, MAXLEN, NUM_CLASSES, epochs)

                #baseline with only true data (subset given by snr)
                X_train, Y_train, X_test, Y_test, clean_indexes = self.getXYForFold(train_indexes, test_indexes, snr, clean_indexes)
                
                print ("|train_refonly| = {}, |test| = {}".format(len(X_train), len(X_test)))
                
                Y_train, Y_test = transformLabels(Y_train, Y_test, NUM_CLASSES, useMedians=True)                
                baseline_metric_value += processFold(n, model, X_train, Y_train, X_test, Y_test, MAXLEN, NUM_CLASSES, epochs)
                
                n += 1 # next fold
                
            avg_metric_value /= n
            baseline_metric_value /= n
            
            avg_over_experiments += avg_metric_value
            avg_over_experiments_baseline += baseline_metric_value
            
        return avg_over_experiments_baseline/NUM_EXPERIMENTS, avg_over_experiments/NUM_EXPERIMENTS
    

In [7]:
def process(DATAFILE, NOISY_DATAFILE, NODEVECFILE_REF, NODEVECFILE_NOISY, NUM_CLASSES, SNR):
    
    mergeVecFiles(NODEVECFILE_REF, NODEVECFILE_NOISY, MERGE_VEC_FILE)
    inpH = InputHelper()
    inpH.convertWordsToIds(MERGE_VEC_FILE)
    
    inpH.loadW2V(MERGE_VEC_FILE)
    
    ref_noise_pairs = RCT_Ref_Noise_Pairs(DATAFILE, NOISY_DATAFILE) 
    #ref_noise_pairs.printAllPairs()
    
    #create model
    model = create_model(inpH, NUM_CLASSES, MAXLEN)
    
    baseline, eval_metric_val = ref_noise_pairs.runCrossFoldExperiment(model, FOLD, SNR, MAXLEN, NUM_CLASSES, epochs=EPOCHS) 
    print ("SNR: {}, Baseline: {}, With-Extracted: {}".format(SNR, baseline, eval_metric_val))


In [9]:
if __name__ == "__main__":
    #main(sys.argv[1:])
    for SNR in [0.4, 0.5, 0.6]:
        process(DATAFILE, NOISY_DATAFILE, NODEVECFILE_REF, NODEVECFILE_NOISY, NUM_CLASSES, SNR)

Collecting node names...
Collected node names...
Converting words to ids...
Finished converting words to ids...
Loading W2V data...
loaded word2vec for 31173 nodes
1 words out of 31174 not found
DEBUG: shape of embedding: (31174, 333)
DEBUG: include_wordvecs = False
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 333)          10380942  
_________________________________________________________________
bidirectional_4 (Bidirection (None, 128)               203776    
_________________________________________________________________
output_vals (Dense)          (None, 1)                 129       
Total params: 10,584,847
Trainable params: 203,905
Non-trainable params: 10,380,942
_________________________________________________________________
|train_with_noise| = 51, |test| = 13
Training model...




Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 0: Loss: 12.2749, RMSE: 12.2749
|train_refonly| = 16, |test| = 13
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 0: Loss: 12.7575, RMSE: 12.7575
|train_with_noise| = 51, |test| = 13
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 1: Loss: 7.3964, RMSE: 7.3964
|train_refonly| = 16, |test| = 13
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 1: Loss: 7.3819, RMSE: 7.3819
|train_with_noise| = 51, |test| = 13
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 2: Loss: 7.3329, RMSE: 7.3329
|train_refonly| = 16, |test| = 13
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 2: Loss: 7.5393, RMSE: 7.5393
|train_with_noise| = 51, |test| = 13
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 3: Loss: 18.0763, RMSE: 18.0763
|train_refonly| = 17, |test| = 13
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 3: Loss: 17.8730, RMSE: 17.8730
|train_with_noise| = 52, |test| = 12
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 4: Loss: 



Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 0: Loss: 12.0331, RMSE: 12.0331
|train_refonly| = 22, |test| = 13
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 0: Loss: 11.9458, RMSE: 11.9458
|train_with_noise| = 51, |test| = 13
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 1: Loss: 7.6352, RMSE: 7.6352
|train_refonly| = 18, |test| = 13
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 1: Loss: 7.5002, RMSE: 7.5002
|train_with_noise| = 51, |test| = 13
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 2: Loss: 7.0186, RMSE: 7.0186
|train_refonly| = 15, |test| = 13
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 2: Loss: 7.0108, RMSE: 7.0108
|train_with_noise| = 51, |test| = 13
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 3: Loss: 17.9191, RMSE: 17.9191
|train_refonly| = 20, |test| = 13
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 3: Loss: 19.6736, RMSE: 19.6736
|train_with_noise| = 52, |test| = 12
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 4: Loss: 



Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 0: Loss: 12.4093, RMSE: 12.4093
|train_refonly| = 23, |test| = 13
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 0: Loss: 12.9142, RMSE: 12.9142
|train_with_noise| = 51, |test| = 13
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 1: Loss: 8.1645, RMSE: 8.1645
|train_refonly| = 21, |test| = 13
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 1: Loss: 7.2003, RMSE: 7.2003
|train_with_noise| = 51, |test| = 13
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 2: Loss: 7.1085, RMSE: 7.1085
|train_refonly| = 20, |test| = 13
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 2: Loss: 7.7964, RMSE: 7.7964
|train_with_noise| = 51, |test| = 13
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 3: Loss: 17.0026, RMSE: 17.0026
|train_refonly| = 25, |test| = 13
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 3: Loss: 16.9287, RMSE: 16.9287
|train_with_noise| = 52, |test| = 12
Training model...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 4: Loss: 