In [1]:
%reload_ext autoreload

%autoreload 2
import numpy as np
from models import lstm
import torch
from sklearn.preprocessing import StandardScaler


In [2]:
# FILES TO CHANGE: set this to the directory that holds your k-fold data
# (i.e. wherever the contents of your sequence_nlp_harvey.zip, or the
# equivalent archive for another event, are located).
# NOTE(review): this is an absolute path on one developer's machine; every
# other user has to edit it before running the notebook.

NPY_INPUT_DIR = '/Users/ianmagnusson/IITUDND/data/extracted_features/combined_NLP/maria/kfold/'

In [3]:
def run_experiment(num_hidden, learning_rate, data_dir):
    """Train an LSTM on one train/test split and return its test accuracy.

    For each split the TF-IDF and GloVe feature archives are loaded, the two
    feature arrays of every sequence are concatenated column-wise, and the
    result is standardised with a ``StandardScaler`` fitted on the training
    rows only (so no test statistics leak into training).

    Args:
        num_hidden: forwarded to ``lstm.LSTM`` as ``hidden_dim``.
        learning_rate: forwarded to ``lstm.LSTM`` as ``learning_rate``.
        data_dir: path prefix of the split. File names are appended to it
            directly, so it must end with a path separator. It has to contain
            ``X_seq_tfidf_{train,test}.npz``, ``X_seq_glove_{train,test}.npz``
            and ``y_{train,test}.npy``.

    Returns:
        The value returned by ``model.get_accuracy(X_test, y_test)``.

    Raises:
        ValueError: if an archive holds fewer sequences than there are labels
            for that split.
    """

    def _load_split(split, num_labels):
        """Return one column-wise concatenated feature array per labelled sequence."""
        tfidf_path = data_dir + 'X_seq_tfidf_' + split + '.npz'
        glove_path = data_dir + 'X_seq_glove_' + split + '.npz'
        # np.load on an .npz returns a lazily-read archive that keeps the file
        # open; the context managers close both handles once every array has
        # been materialised.
        with np.load(tfidf_path) as tfidf_archive, np.load(glove_path) as glove_archive:
            keys = tfidf_archive.files
            if len(keys) < num_labels:
                raise ValueError(
                    '%s holds %d sequences but %d labels were loaded'
                    % (tfidf_path, len(keys), num_labels))
            # Label i is paired with the i-th entry of the TF-IDF archive, and
            # the same key selects the matching GloVe array. Entries beyond
            # the number of labels are ignored.
            return [
                np.concatenate((tfidf_archive[key], glove_archive[key]), axis=1)
                for key in keys[:num_labels]
            ]

    # load data from files

    y_train = torch.Tensor(np.load(data_dir + 'y_train.npy'))
    y_test = torch.Tensor(np.load(data_dir + 'y_test.npy'))

    X_raw_train = _load_split('train', len(y_train))
    X_raw_test = _load_split('test', len(y_test))

    # Fit the scaler on all training rows stacked together, then apply the
    # same transform to every train and test sequence.
    scaler = StandardScaler()
    scaler.fit(np.concatenate(X_raw_train))

    X_train = [torch.Tensor(scaler.transform(X_i)) for X_i in X_raw_train]
    X_test = [torch.Tensor(scaler.transform(X_i)) for X_i in X_raw_test]

    # build and train model

    # Take the input width from the data instead of hard-coding it, so the
    # model keeps matching the features if the TF-IDF or GloVe size changes.
    input_dim = X_raw_train[0].shape[1]
    model = lstm.LSTM(input_dim=input_dim, hidden_dim=num_hidden, learning_rate=learning_rate)

    model.learn(X_train, y_train)

    # evaluate
    return model.get_accuracy(X_test, y_test)

def cross_validate(num_hidden, learning_rate, data_dir, folds = 10):
    """Run ``run_experiment`` on every fold and return the mean accuracy.

    Fold ``i`` is read from the sub-directory ``<data_dir>/<i>/``.

    Args:
        num_hidden: forwarded unchanged to ``run_experiment``.
        learning_rate: forwarded unchanged to ``run_experiment``.
        data_dir: directory containing one numbered sub-directory per fold
            (``0`` .. ``folds - 1``). A trailing separator is optional.
        folds: number of folds to run; must be at least 1.

    Returns:
        The arithmetic mean of the per-fold accuracies.

    Raises:
        ValueError: if ``folds`` is smaller than 1.
    """
    # Imported locally so the function does not depend on the imports cell.
    import os

    if folds < 1:
        raise ValueError('folds must be at least 1, got ' + repr(folds))

    accuracy_sum = 0.0
    for i in range(folds):
        # The empty last component makes join() end the path with a separator,
        # which run_experiment needs because it appends file names directly.
        fold_dir = os.path.join(data_dir, str(i), '')
        print('running experiment', i)
        acc = run_experiment(num_hidden, learning_rate, fold_dir)
        print('result', acc)
        accuracy_sum += acc

    return accuracy_sum / folds


In [None]:
# Cross-validated grid search over learning rate x hidden size.
# results[i, j] holds the mean cross-validation accuracy obtained with
# learning_rates[i] and hidden_dims[j].
learning_rates = [0.002, 0.0019, 0.0018, 0.0017, 0.0016, 0.0015, 0.0014,
                  0.0013, 0.0012, 0.0011, 0.001, 0.0009, 0.0008, 0.0007, 0.0006]
hidden_dims = [100,300,500]
results = np.zeros((len(learning_rates), len(hidden_dims)))
tests = 1
for i, l in enumerate(learning_rates):
    for j, h in enumerate(hidden_dims):
        print('starting test', tests,'hidden', h, 'learning', l)
        # Keyword arguments pin each value to the cross_validate parameter it
        # is meant for.
        results[i,j] = cross_validate(num_hidden=h, learning_rate=l, data_dir=NPY_INPUT_DIR)
        tests += 1
        print('test outcome', results[i,j])
        print('************************************************************')
        # Checkpoint after every configuration so an interrupted search keeps
        # the finished entries. Entries not reached yet are still 0.0.
        np.save('maria_LSTM_results.npy', results)

np.save('maria_LSTM_results.npy', results) # to save results
print(results)


starting test 1 hidden 100 learning 0.002
running experiment 0
epoch: 0 learning rate: [0.002]
[1,   200] loss: 0.657
[1,   400] loss: 0.614
[1,   600] loss: 0.562
[1,   800] loss: 0.503
[1,  1000] loss: 0.483
[1,  1200] loss: 0.519
[1,  1400] loss: 0.506
[1,  1600] loss: 0.536
[1,  1800] loss: 0.501
[1,  2000] loss: 0.490
[1,  2200] loss: 0.456
[1,  2400] loss: 0.533
[1,  2600] loss: 0.446
[1,  2800] loss: 0.541
epoch: 1 learning rate: [0.001]
[2,   200] loss: 0.423
[2,   400] loss: 0.389
[2,   600] loss: 0.391
[2,   800] loss: 0.375
[2,  1000] loss: 0.423
[2,  1200] loss: 0.375
[2,  1400] loss: 0.356
[2,  1600] loss: 0.367
[2,  1800] loss: 0.375
[2,  2000] loss: 0.393
[2,  2200] loss: 0.367
[2,  2400] loss: 0.365
[2,  2600] loss: 0.416
