In [1]:
import sys
sys.path.append('..')
import statistics
from collections import defaultdict
%reload_ext autoreload

%autoreload 2
import numpy as np
from models import lstm
import torch
from collections import defaultdict
import scipy.stats as st
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler


In [2]:
# FILES TO CHANGE: NPY_INPUT_DIR is the directory every np.load in this notebook
# reads from; NPY_OUTPUT_DIR is the prefix used when results are saved.
# NOTE(review): the original note mentioned sequence_nlp_harvey.zip while both
# paths point at "maria" -- confirm which dataset is intended.
NPY_INPUT_DIR = '../data/extracted_features/combined_ML/maria/'
NPY_OUTPUT_DIR = '../data/results/maria/'

# Number of train/evaluate repetitions averaged by every test_100 definition.
NUM_TESTS = 100

# Suffix 0 (plus BIDIRECTIONAL and IMAGE_HIDDEN): read by the LSTM test_100.
# NOTE(review): the LSTM test_100 assigns its own local HIDDEN_DIM0 = 364, so the
# value below is not the one the model is built with.
HIDDEN_DIM0 = 183
NUM_LAYERS0 = 1
BIDIRECTIONAL = False
LEARNING_RATE0 =  0.011095843396210794
IMAGE_HIDDEN= 200
DECAY_FACTOR0 = 0.18991100364301827
EPOCH0 = 4

# Suffix 1: read by the baseline MLP test_100 in the "with histories" section.
HIDDEN_DIM1 = 364
NUM_LAYERS1 = 1
ACTIVATION1 = torch.relu
LEARNING_RATE1 =  0.015103711963967328
MOMENTUM1= 0.9
DECAY_FACTOR1 =  0.3093059390000923
EPOCH1 = 2

# Suffix 2: read by the baseline MLP test_100 in the "without histories" section.
HIDDEN_DIM2 = 197
NUM_LAYERS2 = 2
ACTIVATION2 = torch.relu
LEARNING_RATE2 =  0.030727178086089976
MOMENTUM2= 0.9
DECAY_FACTOR2 =  0.7632100704986595
EPOCH2 = 2

# LSTM -------------------------------------------------------------------*

In [3]:

def plot_cis(binNames, binRatios, priors):
    '''
    Plot the mean per-bin test accuracy over several runs with 95% confidence intervals.

    Collect the three return values of get_accuracy_graph (bin names, per-bin
    accuracy rates, priors) in three lists, one entry per run, and pass them here.

    Parameters
    ----------
    binNames : sequence
        binNames[0] supplies the x-tick label of every bin (only the first run is used).
    binRatios : sequence of sequences
        binRatios[run][k] is the accuracy of bin k in that run.
    priors : sequence
        priors[0] is drawn as the "Naive Accuracy" line (only the first run is used).

    Notes
    -----
    The error bars are the half-width of a two-sided 95% Student-t interval for
    the mean, t(0.975, n - 1) * s / sqrt(n), where s is the sample standard
    deviation over the n runs of a bin. A bin observed in fewer than two runs
    has no spread estimate and gets a zero-length bar instead of raising.
    '''
    naive = np.array(priors[0])

    # Regroup the per-run lists into one list of accuracies per bin (bins are numbered from 1).
    per_bin = defaultdict(list)
    for run_ratios in binRatios:
        for idx, ratio in enumerate(run_ratios):
            per_bin[idx + 1].append(ratio)

    labels = list(binNames[0])

    keys, means, cis = [], [], []
    for bin_id, ratios in per_bin.items():
        keys.append(bin_id)
        means.append(np.mean(ratios))
        n_runs = len(ratios)
        if n_runs < 2:
            # statistics.stdev needs at least two data points.
            cis.append(0.0)
        else:
            sem = statistics.stdev(ratios) / np.sqrt(n_runs)
            cis.append(float(st.t.ppf(0.975, n_runs - 1) * sem))

    plt.figure()  # initiate accuracy plot
    plt.plot(keys, means, label="Mean Accuracy by Bin")  # plot accuracy by bin
    plt.plot(keys, naive, label="Naive Accuracy")
    plt.errorbar(keys, means, yerr=cis)
    plt.xticks(keys, labels)
    plt.suptitle('Test classification accuracy rate by user history length (CI .95)')
    plt.xlabel('User history length (lowest to highest), sorted into bins (ascending order)')
    plt.ylabel('Accuracy rate')
    plt.legend()  # without this the two line labels above are never shown
    plt.show()
    return


def test_100(X_train, y_train, X_test, y_test, X_img_train, X_img_test):
    running_acc = 0.0
    running_auc = 0.0
    running_f1 = 0.0
    running_precision = 0.0
    running_recall = 0.0
    graph_results = []
    bin_names = []
    priors_list = []
    
    HIDDEN_DIM0 = 364

    for i in range(NUM_TESTS):
        model = lstm.LSTM(input_dim = 400, hidden_dim = HIDDEN_DIM0, img_hidden_dim = IMAGE_HIDDEN,
                          num_layers = NUM_LAYERS0, bidirectional = BIDIRECTIONAL,
                          learning_rate = LEARNING_RATE0,decay_factor = DECAY_FACTOR0)
        model.learn(X_train, X_img_train, y_train, epochs=EPOCH0)
        
        # evaluate
        running_acc += model.get_accuracy(X_test, X_img_test, y_test)
        running_auc += model.get_auc(X_test, X_img_test, y_test)
        running_f1 += model.get_f1(X_test, X_img_test, y_test)
        running_precision += model.get_precision(X_test, X_img_test, y_test)
        running_recall += model.get_recall(X_test, X_img_test, y_test)
        bins, accRates, priors = model.get_accuracy_graph(X_test, X_img_test, y_test)
        bin_names.append(bins)
        graph_results.append(accRates)
        priors_list.append(priors)
    
    return running_acc/NUM_TESTS, running_auc/NUM_TESTS, running_f1/NUM_TESTS, \
           running_precision/NUM_TESTS, running_recall/NUM_TESTS, \
           bin_names, graph_results, priors_list

In [None]:
# load data from files
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code -- only point NPY_INPUT_DIR at archives you trust.
X_seq_img_train = np.load(NPY_INPUT_DIR + 'images_lstm_train.npz', allow_pickle=True)
X_seq_img_test = np.load(NPY_INPUT_DIR + 'images_lstm_test.npz', allow_pickle=True)

X_seq_tfidf_train = np.load(NPY_INPUT_DIR + 'X_seq_tfidf_train.npz')
X_seq_tfidf_test = np.load(NPY_INPUT_DIR + 'X_seq_tfidf_test.npz')

X_seq_glove_train = np.load(NPY_INPUT_DIR + 'X_seq_glove_train.npz')
X_seq_glove_test = np.load(NPY_INPUT_DIR + 'X_seq_glove_test.npz')

y_train = torch.Tensor(np.load(NPY_INPUT_DIR + 'y_train.npy'))
y_test = torch.Tensor(np.load(NPY_INPUT_DIR + 'y_test.npy'))


def _iter_histories(archive, n_users):
    """Yield the first `n_users` members of an npz archive, in `archive.files` order.

    Assumes the i-th archive member belongs to the i-th label -- TODO confirm
    against the code that wrote the archives.
    """
    for i in range(n_users):
        yield archive[archive.files[i]]


def _scale_image_histories(histories, fitted_scaler):
    """Return, per user, a list with one scaled tensor per tweet in the history."""
    return [[torch.Tensor(fitted_scaler.transform(tweet)) for tweet in hist]
            for hist in histories]


def _concat_lang_features(tfidf_archive, glove_archive, n_users):
    """Return, per user, the tf-idf and GloVe sequences joined along axis 1.

    Member names are taken from the tf-idf archive and used to index both
    archives, so both must use the same member names.
    """
    names = (tfidf_archive.files[i] for i in range(n_users))
    return [np.concatenate((tfidf_archive[name], glove_archive[name]), axis=1)
            for name in names]


# build lists with features concatenated
# img feat
# Each history is presumably an array of objects, one 2d matrix of image features
# per tweet (per the original note) -- TODO confirm. Members are read from the
# archive once and reused for both fitting and transforming, because every
# indexing of an npz archive reads the member from disk again.
img_histories_train = list(_iter_histories(X_seq_img_train, len(y_train)))
X_raw_img_train = [tweet for hist in img_histories_train for tweet in hist]

img_scaler = StandardScaler()
X_img_fit_train = np.concatenate(X_raw_img_train) # a stack of all image feat
img_scaler.fit(X_img_fit_train)

# now build list of list of normalized 2d matrices representing all img in a tweet;
# the test split is transformed with the scaler fitted on the training split
X_img_train = _scale_image_histories(img_histories_train, img_scaler)
X_img_test = _scale_image_histories(_iter_histories(X_seq_img_test, len(y_test)), img_scaler)

# lang feat
X_raw_train = _concat_lang_features(X_seq_tfidf_train, X_seq_glove_train, len(y_train))

scaler = StandardScaler()
X_fit_train = np.concatenate(X_raw_train)
scaler.fit(X_fit_train)

X_train = [torch.Tensor(scaler.transform(X_i)) for X_i in X_raw_train]
X_test = [torch.Tensor(scaler.transform(X_i))
          for X_i in _concat_lang_features(X_seq_tfidf_test, X_seq_glove_test, len(y_test))]


In [5]:
%%capture
# The %%capture magic on the first line of this cell mutes all output it produces.

# Runs the LSTM experiment. test_100 must be the six-argument LSTM definition:
# the name is redefined twice further down in this notebook, so re-run the LSTM
# definition cell first if one of the later definitions has been executed.
acc, auc, f1, precision, recall, bin_names, graph_results, priors_list = test_100(X_train,y_train,X_test,y_test, X_img_train, X_img_test)

KeyboardInterrupt: 

In [6]:
# Needs the previous cell to have run to completion; otherwise acc is undefined.
# Prints the accuracy and AUC averaged over the NUM_TESTS runs (f1, precision and
# recall are returned by test_100 as well but are not displayed here).
print(acc)
print(auc)
# Mean per-bin accuracy with error bars, drawn against the naive-accuracy line.
plot_cis(bin_names, graph_results, priors_list)

NameError: name 'acc' is not defined

# BASELINE -------------------------------------------------------------------*

In [None]:
from collections import defaultdict
from models import baseline_mlp
import numpy as np
import torch
from sklearn.metrics import roc_auc_score
import statistics
import matplotlib.pyplot as plt

# The GloVe and tf-idf baseline features are both read from the shared input directory.
GLOVE_DIR = NPY_INPUT_DIR
TFIDF_DIR = NPY_INPUT_DIR

# with histories

In [None]:
def test_100(X_train, y_train, X_test, y_test, X_hist_len_test):
    running_acc = 0.0
    running_auc = 0.0
    running_f1 = 0.0
    running_precision = 0.0
    running_recall = 0.0
    graph_results = []
    bin_names = []
    priors_list = []
    
    for i in range(NUM_TESTS):
        # make model
        baseline = baseline_mlp.MLP(X_train.shape[1], hidden_dim = HIDDEN_DIM1, num_layers= NUM_LAYERS1,
                                    activation_function = ACTIVATION1, learning_rate =  LEARNING_RATE1,
                                    momentum = MOMENTUM1, decay_factor = DECAY_FACTOR1)
        
        # train
        baseline.learn(torch.Tensor(X_train), torch.Tensor(y_train), epochs = EPOCH1)
        
        # evaluate
        running_acc += baseline.get_accuracy(torch.Tensor(X_test), torch.Tensor(y_test))
        running_auc += baseline.get_auc(torch.Tensor(X_test), torch.Tensor(y_test))
        running_f1 += baseline.get_f1(torch.Tensor(X_test), torch.Tensor(y_test))
        running_precision += baseline.get_precision(torch.Tensor(X_test), torch.Tensor(y_test))
        running_recall += baseline.get_recall(torch.Tensor(X_test), torch.Tensor(y_test))
        bins, accRates, priors = baseline.get_accuracy_graph(torch.Tensor(X_test), torch.Tensor(y_test), X_hist_len_test)
        bin_names.append(bins)
        graph_results.append(accRates)
        priors_list.append(priors)
    
    return running_acc/NUM_TESTS, running_auc/NUM_TESTS, running_f1/NUM_TESTS, \
           running_precision/NUM_TESTS, running_recall/NUM_TESTS, \
           bin_names, graph_results, priors_list

In [None]:
# retrieve test data history sequence lengths from LSTM data
X_hist_len_test = np.load(NPY_INPUT_DIR + 'full_hist_lens_test.npy')

# class labels
# NOTE: y_train / y_test (and X_train / X_test below) reuse names that the LSTM
# section binds to different objects, so results depend on cell execution order.
y_train = np.load(GLOVE_DIR + 'y_train.npy')
y_test= np.load(GLOVE_DIR + 'y_test.npy')

# load features
# img
X_histories_train_img = np.load(NPY_INPUT_DIR + 'images_meanvecs_train.npy')
X_histories_test_img = np.load(NPY_INPUT_DIR + 'images_meanvecs_test.npy')

X_labeled_train_img = np.load(NPY_INPUT_DIR + 'images_nohistories_train.npy')
X_labeled_test_img = np.load(NPY_INPUT_DIR + 'images_nohistories_test.npy')

# lang
X_labeled_train_glove = np.load(GLOVE_DIR + 'X_labeled_train.npy')
X_histories_train_glove = np.load(GLOVE_DIR + 'X_histories_train.npy')
X_labeled_test_glove = np.load(GLOVE_DIR + 'X_labeled_test.npy')
X_histories_test_glove = np.load(GLOVE_DIR + 'X_histories_test.npy')



X_labeled_train_tfidf = np.load(TFIDF_DIR + 'trainTweets.npy')
X_histories_train_tfidf = np.load(TFIDF_DIR + 'trainHistories.npy')
X_labeled_test_tfidf = np.load(TFIDF_DIR + 'testTweets.npy')
X_histories_test_tfidf = np.load(TFIDF_DIR + 'testHistories.npy')


# One scaler per feature set: each is fitted on the training split only and then
# applied to the matching test split.
scaler1 = StandardScaler()  # labeled-tweet GloVe
scaler2 = StandardScaler()  # history GloVe
scaler3 = StandardScaler()  # labeled-tweet tf-idf
scaler4 = StandardScaler()  # history tf-idf

scaler5 = StandardScaler()  # history image features
scaler6 = StandardScaler()  # labeled-tweet image features


X_histories_train_img_norm = scaler5.fit_transform(X_histories_train_img)
X_histories_test_img_norm  = scaler5.transform(X_histories_test_img)
# scaler6 (not a re-fitted scaler5) so that scaler5 keeps the statistics of the
# history image features it was fitted on.
X_labeled_train_img_norm = scaler6.fit_transform(X_labeled_train_img)
X_labeled_test_img_norm  = scaler6.transform(X_labeled_test_img)

X_labeled_train_glove_norm    = scaler1.fit_transform(X_labeled_train_glove)
X_histories_train_glove_norm  = scaler2.fit_transform(X_histories_train_glove)
X_labeled_test_glove_norm     = scaler1.transform(X_labeled_test_glove)
X_histories_test_glove_norm  = scaler2.transform(X_histories_test_glove)

X_labeled_train_tfidf_norm    = scaler3.fit_transform(X_labeled_train_tfidf)
X_histories_train_tfidf_norm  = scaler4.fit_transform(X_histories_train_tfidf)
X_labeled_test_tfidf_norm     = scaler3.transform(X_labeled_test_tfidf)
X_histories_test_tfidf_norm   = scaler4.transform(X_histories_test_tfidf)

# merge data
# NOTE(review): the "with histories" matrices below contain only the GloVe and
# tf-idf features; neither X_labeled_*_img_norm nor X_histories_*_img_norm is
# included here. Confirm whether leaving the image features out is intentional.

X_train = np.concatenate((X_labeled_train_glove_norm, X_labeled_train_tfidf_norm,
                          X_histories_train_glove_norm, X_histories_train_tfidf_norm),
                         axis = 1)
X_test = np.concatenate((X_labeled_test_glove_norm, X_labeled_test_tfidf_norm,
                         X_histories_test_glove_norm, X_histories_test_tfidf_norm),
                        axis = 1)


In [None]:
%%capture
# The %%capture magic on the first line of this cell mutes all output it produces.

# Runs the baseline MLP on the "with histories" features. test_100 must be the
# definition from the "with histories" section: the name is defined three times
# in this notebook, so the most recently executed definition is the one called.
acc, auc, f1, precision, recall, bin_names, graph_results, priors_list= test_100(X_train,y_train,X_test,y_test, X_hist_len_test)

In [None]:
# Prints the accuracy and AUC averaged over the NUM_TESTS baseline runs (f1,
# precision and recall are returned by test_100 as well but are not displayed here).
print(acc)
print(auc)
# Mean per-bin accuracy with error bars, drawn against the naive-accuracy line.
plot_cis(bin_names, graph_results, priors_list)

# without histories

In [None]:
def test_100(X_train, y_train, X_test, y_test, X_hist_len_test):
    running_acc = 0.0
    running_auc = 0.0
    running_f1 = 0.0
    running_precision = 0.0
    running_recall = 0.0
    graph_results = []
    bin_names = []
    priors_list = []
    all_results = []
    
    for i in range(NUM_TESTS):
        # make model
        baseline = baseline_mlp.MLP(X_train.shape[1], hidden_dim = HIDDEN_DIM2, num_layers= NUM_LAYERS2,
                                    activation_function = ACTIVATION2, learning_rate =  LEARNING_RATE2,
                                    momentum = MOMENTUM2, decay_factor = DECAY_FACTOR2)
        
        # train
        baseline.learn(torch.Tensor(X_train), torch.Tensor(y_train), epochs = EPOCH2)
        
        # evaluate
        acc = baseline.get_accuracy(torch.Tensor(X_test), torch.Tensor(y_test))
        auc = baseline.get_auc(torch.Tensor(X_test), torch.Tensor(y_test))
        f1 = baseline.get_f1(torch.Tensor(X_test), torch.Tensor(y_test))
        precision = baseline.get_precision(torch.Tensor(X_test), torch.Tensor(y_test))
        recall = baseline.get_recall(torch.Tensor(X_test), torch.Tensor(y_test))
        running_acc += acc
        running_auc += auc
        running_f1 += f1
        running_precision += precision
        running_recall += recall
        bins, accRates, priors = baseline.get_accuracy_graph(torch.Tensor(X_test), torch.Tensor(y_test), X_hist_len_test)
        bin_names.append(bins)
        graph_results.append(accRates)
        priors_list.append(priors)
    
    results = np.hstack(all_results)
    np.save(NPY_OUTPUT_DIR + '.npy', results)
    
    return running_acc/NUM_TESTS, running_auc/NUM_TESTS, running_f1/NUM_TESTS, \
           running_precision/NUM_TESTS, running_recall/NUM_TESTS, \
           bin_names, graph_results, priors_list

In [None]:
# merge data: "without histories" feature matrices, built from the labeled-tweet
# features only (GloVe, tf-idf and image), joined along axis 1.
# NOTE: this rebinds X_train / X_test, replacing the "with histories" matrices.

X_train = np.concatenate((X_labeled_train_glove_norm, X_labeled_train_tfidf_norm,
                      X_labeled_train_img_norm),
                     axis = 1)
X_test = np.concatenate((X_labeled_test_glove_norm, X_labeled_test_tfidf_norm,
                     X_labeled_test_img_norm),
                    axis = 1)

In [None]:
%%capture
# The %%capture magic on the first line of this cell mutes all output it produces.

# Runs the baseline MLP on the "without histories" features. test_100 must be the
# definition from the "without histories" section: the name is defined three times
# in this notebook, so the most recently executed definition is the one called.
acc, auc, f1, precision, recall, bin_names, graph_results, priors_list = test_100(X_train,y_train,X_test,y_test, X_hist_len_test)

In [None]:
# Prints the accuracy and AUC averaged over the NUM_TESTS baseline runs (f1,
# precision and recall are returned by test_100 as well but are not displayed here).
print(acc)
print(auc)
# Mean per-bin accuracy with error bars, drawn against the naive-accuracy line.
plot_cis(bin_names, graph_results, priors_list)



