In [None]:
import gensim.models.word2vec as w2v
import pickle
import multiprocessing
import re
import ast
import nltk
import time

import pandas as pd
import numpy as np

import lasagne
import theano
import theano.tensor as T

from IPython.display import clear_output

In [None]:
# Load the pickled tweet table (presumably a pandas DataFrame - it is indexed by column name below).
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open('twitter_df.pickle', 'rb') as dfile:
    raw_data = pickle.load(dfile)

# Every 'tags' entry is parsed as a Python literal and only its first element is
# kept as the label. The Series gets a fresh positional index.
labels = pd.Series([ast.literal_eval(tag_repr)[0] for tag_repr in raw_data['tags']])

# From here on raw_data refers only to the 'tokenized_text' column.
raw_data = raw_data['tokenized_text']

In [None]:
# Pre-trained word2vec model used both to embed the tweet tokens and to embed the label words.
t2v = w2v.Word2Vec.load('tweet2vec.w2v')

# Flat list of every sentence of every row (not used again in the cells shown here).
sentences = [sentence for row in raw_data for sentence in row]

# Replace each token by its word2vec vector; a token missing from the model raises KeyError.
data = raw_data.apply(lambda row: [[t2v.wv[word] for word in sentence] for sentence in row])
# Only the FIRST sentence of each row is kept as that row's feature sequence.
# NOTE(review): sequences of different lengths make this a ragged array, which recent
# NumPy versions refuse to build implicitly - confirm the NumPy version in use.
data = np.array([np.array(batch[0]) for batch in data])

# Rank the labels by frequency (ties broken by label, descending) and keep the top 10%.
labs, counts = np.unique(labels, return_counts=True)
counts, labs = zip(*sorted(zip(counts, labs), reverse=True))
split = round(0.1*len(labs))
keep = labs[:split]
remove = labs[split:]

# Set membership instead of scanning the `keep` tuple once per label.
kept_labels = set(keep)
labels = pd.Series([lab if lab in kept_labels else None for lab in labels])

# Build the mask straight from the labels and stack only the kept label vectors.
# This avoids an intermediate ndarray that mixes vectors with None.
mask = np.array([lab is not None for lab in labels])
y = np.vstack([t2v.wv[lab] for lab in labels[mask]])
data = data[mask]

In [None]:
# Size the split from the FILTERED feature array. `labels` still has one entry per
# original row (dropped rows are None), so using its length would push the border
# too far and could leave the test set empty.
data_size = data.shape[0]
assert data_size == y.shape[0], "data and y are misaligned after label filtering"

# First 80% of the rows for training, remaining 20% for testing (no shuffling).
border = round(data_size*0.8)
X_train = data[:border]
X_test = data[border:]
y_train = y[:border]
y_test = y[border:]

In [5]:
# Global hyper-parameters for the Tweet2Vec encoder.
# Constants marked "(not referenced in the visible cells)" are presumably consumed
# by a training cell that is not shown here - confirm before removing them.

# Number of epochs (not referenced in the visible cells)
NUM_EPOCHS = 30
# Batch size; first dimension of both Lasagne input layers in tweet2vec
N_BATCH = 64
# Max sequence length; second dimension of both Lasagne input layers in tweet2vec
MAX_LENGTH = 145
# Dimensionality of character lookup (number of columns of the embedding table Wc)
CHAR_DIM = 150
# Initialization scale: `scale` argument of the np.random.normal draws in init_params
SCALE = 0.1
# Dimensionality of C2W hidden states (number of units of each GRU)
C2W_HDIM = 500
# Dimensionality of word vectors (number of units of the dense projections in tweet2vec)
WDIM = 500
# Number of classes (not referenced in the visible cells)
MAX_CLASSES = 6000
# Learning rate (not referenced in the visible cells)
LEARNING_RATE = 0.01
# Display frequency (not referenced in the visible cells)
DISPF = 5
# Save frequency (not referenced in the visible cells)
SAVEF = 1000
# Regularization (not referenced in the visible cells)
REGULARIZATION = 0.0001
# Reload (not referenced in the visible cells)
RELOAD_MODEL = False
# Momentum; original note "NAG" presumably means Nesterov accelerated gradient (not referenced in the visible cells)
MOMENTUM = 0.9
# Gradient clipping value passed as grad_clipping to both GRU layers
GRAD_CLIP = 5.
# Use bias: when True, init_params creates and tweet2vec uses bias vectors for the dense projections
BIAS = False
# Use schedule (not referenced in the visible cells)
SCHEDULE = True

In [6]:
from collections import OrderedDict

def _shared_normal(name, shape):
    '''Shared float32 variable drawn from a zero-mean normal with scale SCALE (global NumPy RNG).'''
    values = np.random.normal(loc=0., scale=SCALE, size=shape).astype('float32')
    return theano.shared(values, name=name)


def _shared_zeros(name, shape):
    '''Shared float32 variable filled with zeros.'''
    return theano.shared(np.zeros(shape).astype('float32'), name=name)


def init_params(n_chars):
    '''
    Initialize all params

    Creates every Theano shared variable of the Tweet2Vec encoder.

    n_chars -- number of rows of the lookup table 'Wc'.

    Returns an OrderedDict mapping parameter name -> shared variable. Each
    shared variable's `name` equals its dictionary key. Key order and the order
    of the random draws are fixed, so the result is reproducible.
    '''
    params = OrderedDict()

    # Seed the global NumPy RNG so every call yields the same initial weights.
    # This deliberately stays global: other code drawing from np.random afterwards
    # is seeded by it as well.
    np.random.seed(0)

    # lookup table
    params['Wc'] = _shared_normal('Wc', (n_chars, CHAR_DIM))

    # f-GRU
    for name in ('W_c2w_f_r', 'W_c2w_f_z', 'W_c2w_f_h'):
        params[name] = _shared_normal(name, (CHAR_DIM, C2W_HDIM))
    for name in ('b_c2w_f_r', 'b_c2w_f_z', 'b_c2w_f_h'):
        params[name] = _shared_zeros(name, (C2W_HDIM,))
    for name in ('U_c2w_f_r', 'U_c2w_f_z', 'U_c2w_f_h'):
        params[name] = _shared_normal(name, (C2W_HDIM, C2W_HDIM))
    params['hid_ini_f'] = _shared_zeros('hid_ini_f', (1, C2W_HDIM))

    # b-GRU
    for name in ('W_c2w_b_r', 'W_c2w_b_z', 'W_c2w_b_h'):
        params[name] = _shared_normal(name, (CHAR_DIM, C2W_HDIM))
    for name in ('b_c2w_b_r', 'b_c2w_b_z', 'b_c2w_b_h'):
        params[name] = _shared_zeros(name, (C2W_HDIM,))
    for name in ('U_c2w_b_r', 'U_c2w_b_z', 'U_c2w_b_h'):
        params[name] = _shared_normal(name, (C2W_HDIM, C2W_HDIM))
    params['hid_ini_b'] = _shared_zeros('hid_ini_b', (1, C2W_HDIM))

    # dense
    for name in ('W_c2w_df', 'W_c2w_db'):
        params[name] = _shared_normal(name, (C2W_HDIM, WDIM))
    if BIAS:
        # The variable name now matches its key (the two names used to be swapped).
        for name in ('b_c2w_df', 'b_c2w_db'):
            params[name] = _shared_zeros(name, (WDIM,))

    return params

def tweet2vec(tweet,mask,params,n_chars):
    '''
    Tweet2Vec

    Builds the Lasagne layer graph of the bidirectional GRU encoder.

    tweet   -- symbolic input variable fed to the (N_BATCH, MAX_LENGTH, 1) input layer
    mask    -- symbolic mask variable fed to the (N_BATCH, MAX_LENGTH) mask layer
    params  -- dict of shared variables as produced by init_params
    n_chars -- input_size of the embedding layer

    Returns the ElemwiseSumLayer that adds the forward and backward dense projections.
    '''
    L = lasagne.layers
    sigmoid = lasagne.nonlinearities.sigmoid
    tanh = lasagne.nonlinearities.tanh

    def make_gate(w_in_key, w_hid_key, bias_key, activation):
        # One GRU gate wired to pre-built shared variables.
        return L.Gate(W_in=params[w_in_key], W_hid=params[w_hid_key], W_cell=None, b=params[bias_key], nonlinearity=activation)

    def make_gru(incoming, reset, update, candidate, init_key, reverse):
        # One GRU direction over the embedded sequence, honouring the mask layer.
        return L.GRULayer(incoming, C2W_HDIM, resetgate=reset, updategate=update, hidden_update=candidate, hid_init=params[init_key], backwards=reverse, learn_init=True, gradient_steps=-1, grad_clipping=GRAD_CLIP, unroll_scan=False, precompute_input=True, mask_input=seq_mask)

    def project(incoming, w_key, b_key):
        # Linear projection to WDIM; the bias entry is only looked up when BIAS is on.
        bias = params[b_key] if BIAS else None
        return L.DenseLayer(incoming, WDIM, W=params[w_key], b=bias, nonlinearity=None)

    # Input, mask and lookup layers
    source_in = L.InputLayer(shape=(N_BATCH,MAX_LENGTH,1), input_var=tweet, name='input')
    seq_mask = L.InputLayer(shape=(N_BATCH,MAX_LENGTH), input_var=mask, name='mask')
    embedded = L.EmbeddingLayer(source_in, input_size=n_chars, output_size=CHAR_DIM, W=params['Wc'])

    # f-GRU
    forward_gru = make_gru(
        embedded,
        make_gate('W_c2w_f_r', 'U_c2w_f_r', 'b_c2w_f_r', sigmoid),
        make_gate('W_c2w_f_z', 'U_c2w_f_z', 'b_c2w_f_z', sigmoid),
        make_gate('W_c2w_f_h', 'U_c2w_f_h', 'b_c2w_f_h', tanh),
        'hid_ini_f',
        False,
    )

    # b-GRU
    backward_gru = make_gru(
        embedded,
        make_gate('W_c2w_b_r', 'U_c2w_b_r', 'b_c2w_b_r', sigmoid),
        make_gate('W_c2w_b_z', 'U_c2w_b_z', 'b_c2w_b_z', sigmoid),
        make_gate('W_c2w_b_h', 'U_c2w_b_h', 'b_c2w_b_h', tanh),
        'hid_ini_b',
        True,
    )

    # Final states: last time step of the forward pass, first of the backward pass (axis 1)
    forward_last = L.SliceLayer(forward_gru, -1, 1)
    backward_last = L.SliceLayer(backward_gru, 0, 1)

    # Project both directions to WDIM and add them
    forward_proj = project(forward_last, 'W_c2w_df', 'b_c2w_df')
    backward_proj = project(backward_last, 'W_c2w_db', 'b_c2w_db')

    return L.ElemwiseSumLayer([forward_proj, backward_proj], coeffs=1)

In [7]:
def classify(tweet, t_mask, params, n_classes, n_chars):
    '''
    Put a softmax classification layer on top of the Tweet2Vec encoder.

    tweet, t_mask -- symbolic input and mask variables forwarded to tweet2vec
    params        -- dict of shared variables forwarded to tweet2vec
    n_classes     -- number of units of the softmax layer
    n_chars       -- forwarded to tweet2vec

    Returns a pair (softmax output expression, encoder output expression).
    The softmax layer's own weights are created inside this function (no W/b is
    passed to DenseLayer) and are not added to `params`.
    '''
    # tweet embedding
    encoder = tweet2vec(tweet, t_mask, params, n_chars)

    # Dense layer for classes
    softmax_head = lasagne.layers.DenseLayer(encoder, n_classes, nonlinearity=lasagne.nonlinearities.softmax)

    class_probs = lasagne.layers.get_output(softmax_head)
    tweet_embedding = lasagne.layers.get_output(encoder)
    return class_probs, tweet_embedding

In [11]:
# Symbolic inputs: an int32 3-tensor for the tweets and a float matrix for the mask.
tweet = T.itensor3()
t_mask = T.fmatrix()

# NOTE(review): n_char sizes the lookup table 'Wc'. gensim documents `corpus_count`
# as the number of sentences seen during training, not a vocabulary/character
# count - confirm this is the intended value (a large corpus makes 'Wc' very large).
n_char = t2v.corpus_count
params = init_params(n_chars=n_char)

# One output unit per distinct label that survived the frequency filter.
n_classes = len(np.unique(labels[mask]))
predictions, embeddings = classify(
    tweet=tweet,
    t_mask=t_mask,
    params=params,
    n_classes=n_classes,
    n_chars=n_char,
)
clear_output()

In [12]:
# Compile one function for the class probabilities and one for the tweet embeddings;
# both take the same (tweet, mask) inputs.
print("Compiling theano functions...")
predict = theano.function(inputs=[tweet, t_mask], outputs=predictions)
encode = theano.function(inputs=[tweet, t_mask], outputs=embeddings)
clear_output()

Compiling theano functions...
Compiling theano functions...
Compiling theano functions...


ERROR (theano.gof.opt): Optimization failure due to: local_subtensor_merge
ERROR (theano.gof.opt): Optimization failure due to: local_subtensor_merge
ERROR (theano.gof.opt): Optimization failure due to: local_subtensor_merge
--- Logging error ---
Traceback (most recent call last):
  File "/home/daria/anaconda3/lib/python3.5/site-packages/theano/gof/opt.py", line 1484, in process_node
    replacements = lopt.transform(node)
  File "/home/daria/anaconda3/lib/python3.5/site-packages/theano/tensor/opt.py", line 2502, in local_subtensor_merge
    ushape[pos_2]))
  File "/home/daria/anaconda3/lib/python3.5/site-packages/theano/tensor/opt.py", line 2451, in merge_two_slices
    start = pre_greedy_local_optimizer(list_opt, start)
  File "/home/daria/anaconda3/lib/python3.5/site-packages/theano/gof/opt.py", line 2186, in pre_greedy_local_optimizer
    list_optimizations, out, {}, 0)
  File "/home/daria/anaconda3/lib/python3.5/site-packages/theano/gof/opt.py", line 2158, in local_recursive_funct

OSError: [Errno 12] Cannot allocate memory

Problem occurred during compilation with the command line below:
/usr/bin/g++ -shared -g -O3 -fno-math-errno -Wno-unused-label -Wno-unused-variable -Wno-write-strings -D NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION -m64 -fPIC -I/home/daria/anaconda3/lib/python3.5/site-packages/numpy/core/include -I/home/daria/anaconda3/include/python3.5m -I/home/daria/anaconda3/lib/python3.5/site-packages/theano/gof -fvisibility=hidden -o /home/daria/.theano/compiledir_Linux-3.19--generic-x86_64-with-debian-jessie-sid-x86_64-3.5.3-64/tmp1dtyk8nd/m98ae390158a9e24998ccfd00a5b92bd4.so /home/daria/.theano/compiledir_Linux-3.19--generic-x86_64-with-debian-jessie-sid-x86_64-3.5.3-64/tmp1dtyk8nd/mod.cpp -L/home/daria/anaconda3/lib -lpython3.5m
ERROR (theano.gof.cmodule): [Errno 12] Cannot allocate memory
--- Logging error ---
Traceback (most recent call last):
  File "/home/daria/anaconda3/lib/python3.5/site-packages/theano/gof/cmodule.py", line 1094, in module_from_key
    module = lnk.compile_cmodule(location)

OSError: [Errno 12] Cannot allocate memory

--- Logging error ---
Traceback (most recent call last):
  File "/home/daria/anaconda3/lib/python3.5/site-packages/theano/gof/opt.py", line 1484, in process_node
    replacements = lopt.transform(node)
  File "/home/daria/anaconda3/lib/python3.5/site-packages/theano/tensor/opt.py", line 2502, in local_subtensor_merge
    ushape[pos_2]))
  File "/home/daria/anaconda3/lib/python3.5/site-packages/theano/tensor/opt.py", line 2451, in merge_two_slices
    start = pre_greedy_local_optimizer(list_opt, start)
  File "/home/daria/anaconda3/lib/python3.5/site-packages/theano/gof/opt.py", line 2186, in pre_greedy_local_optimizer
    list_optimizations, out, {}, 0)
  File "/home/daria/anaconda3/lib/python3.5/site-packages/theano/gof/opt.py", line 2158, in local_recursive_function
    depth + 1)
  File "/home/daria/anaconda3/lib/python3.5/site-packages/theano/gof/opt.py", line 2158, in local_recursive_function
    depth + 1)
  File "/home/daria/anaconda3/lib/python3.5/site-packages/theano/gof/opt.p

OSError: [Errno 12] Cannot allocate memory