In [7]:
import numpy as np
import sys

def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``.

    ``x`` may be a scalar or a NumPy array; for arrays the function is
    applied element-wise and the result has the same shape as the input.
    """
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

def softmax(x):
    """Softmax over *all* entries of ``x``.

    The global maximum is subtracted before exponentiating so that large
    inputs do not overflow; this shift does not change the result.  The
    normalisation uses the sum over the whole array (no axis argument), so
    the returned values add up to 1 across every element of ``x``.
    """
    shifted = x - np.max(x)
    exponentials = np.exp(shifted)
    total = np.sum(exponentials)
    return exponentials / total

def save_model_parameters(model, outfile):
    """Store all weight matrices and bias vectors of ``model`` in one archive.

    The arrays are read from the attributes ``Wf, Uf, bf, Wi, Ui, bi, Wg, Ug,
    bg, Wo, Uo, bo, V, b`` and written with ``np.savez`` under those same
    names.  ``outfile`` is handed to ``np.savez`` unchanged (NumPy appends
    ``.npz`` to string paths lacking that extension); the confirmation message
    prints ``outfile`` exactly as it was given.
    """
    parameter_names = (
        "Wf", "Uf", "bf",
        "Wi", "Ui", "bi",
        "Wg", "Ug", "bg",
        "Wo", "Uo", "bo",
        "V", "b",
    )
    arrays = {name: getattr(model, name) for name in parameter_names}
    np.savez(outfile, **arrays)
    print("Saved model parameters to %s." % outfile)

def load_model_parameters(path, modelClass=None):
    """Rebuild a model from an ``.npz`` archive written by ``save_model_parameters``.

    Parameters
    ----------
    path : str
        Location of the archive; passed straight to ``np.load``.
    modelClass : callable, optional
        Called as ``modelClass(word_dim, hidden_dim=hidden_dim)`` to create the
        model.  Defaults to ``LSTMLM``; the name is looked up when the function
        is *called*, so this function can be defined before the class cell has
        been executed.

    Returns
    -------
    The freshly constructed model with its ``Wf, Uf, bf, Wi, Ui, bi, Wg, Ug,
    bg, Wo, Uo, bo, V, b`` attributes replaced by the stored arrays.
    ``hidden_dim`` and ``word_dim`` are taken from the shape of ``Wf``.
    """
    if modelClass is None:
        modelClass = LSTMLM

    parameter_names = (
        "Wf", "Uf", "bf",
        "Wi", "Ui", "bi",
        "Wg", "Ug", "bg",
        "Wo", "Uo", "bo",
        "V", "b",
    )
    # Read every array while the archive is open, then close the file handle.
    with np.load(path) as npzfile:
        stored = {name: npzfile[name] for name in parameter_names}

    hidden_dim, word_dim = stored["Wf"].shape
    print("Building model model from %s with hidden_dim=%d word_dim=%d" % (path, hidden_dim, word_dim))
    sys.stdout.flush()

    model = modelClass(word_dim, hidden_dim=hidden_dim)
    # Plain attribute assignment: a trailing comma after each assignment would
    # wrap the array in a 1-tuple and break every later matrix operation.
    for name, array in stored.items():
        setattr(model, name, array)
    return model

In [6]:
import numpy as np

class LSTMLM:
    """Word-level LSTM language model written directly in NumPy.

    A sequence is a list of integer word indices.  At every step the forget
    (``f``), input (``i``), candidate (``g``) and output (``o``) gates are
    computed from the current word and the previous hidden state, the cell
    state ``c`` and hidden state ``h`` are updated, and a softmax layer
    (``V``, ``b``) turns ``h`` into a distribution over the vocabulary.

    The methods call the notebook-level helpers ``sigmoid`` and ``softmax``.
    """

    # Attribute names of all trainable arrays, in the order bptt() returns
    # their gradients.
    _PARAM_NAMES = ("Wf", "Uf", "bf", "Wi", "Ui", "bi", "Wg", "Ug", "bg",
                    "Wo", "Uo", "bo", "V", "b")

    def __init__(self,word_dim,hidden_dim=128,bptt_truncate=-1):
        """Create a model with randomly initialised weights and zero biases.

        word_dim      -- vocabulary size (length of the one-hot input/output).
        hidden_dim    -- size of the hidden and cell state vectors.
        bptt_truncate -- stored on the instance but not used by ``bptt``,
                         which always back-propagates through the whole
                         sequence.
        """
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate

        # NOTE(review): the input matrices W* are scaled by hidden_dim and the
        # recurrent matrices U* / output matrix V by word_dim.  A fan-in based
        # scheme would use the opposite pairing; the original bounds are kept
        # so that seeded runs reproduce -- confirm which one is intended.
        w_bound = np.sqrt(1. / hidden_dim)
        u_bound = np.sqrt(1. / word_dim)

        self.Wf = np.random.uniform(-w_bound, w_bound, (hidden_dim, word_dim))
        self.Uf = np.random.uniform(-u_bound, u_bound, (hidden_dim, hidden_dim))
        self.bf = np.zeros(hidden_dim)
        self.Wi = np.random.uniform(-w_bound, w_bound, (hidden_dim, word_dim))
        self.Ui = np.random.uniform(-u_bound, u_bound, (hidden_dim, hidden_dim))
        self.bi = np.zeros(hidden_dim)
        self.Wg = np.random.uniform(-w_bound, w_bound, (hidden_dim, word_dim))
        self.Ug = np.random.uniform(-u_bound, u_bound, (hidden_dim, hidden_dim))
        self.bg = np.zeros(hidden_dim)
        self.Wo = np.random.uniform(-w_bound, w_bound, (hidden_dim, word_dim))
        self.Uo = np.random.uniform(-u_bound, u_bound, (hidden_dim, hidden_dim))
        self.bo = np.zeros(hidden_dim)
        self.V = np.random.uniform(-u_bound, u_bound, (word_dim, hidden_dim))
        self.b = np.zeros(word_dim)

    def forward_propagation(self,x):
        """Run the network over the word-index sequence ``x``.

        Returns ``(f, i, g, c, o, h, z)``.  With ``T = len(x)``: ``f, i, g, o``
        have shape ``(T, hidden_dim)``, ``z`` has shape ``(T, word_dim)`` and
        holds the softmax output of every step.  ``c`` and ``h`` have ``T + 1``
        rows; the extra last row stays zero and serves as the initial state,
        reached through index ``-1`` when ``t == 0``.
        """
        T=len(x)
        f=np.zeros((T,self.hidden_dim))
        i=np.zeros((T,self.hidden_dim))
        g=np.zeros((T,self.hidden_dim))
        c=np.zeros((T+1,self.hidden_dim))
        o=np.zeros((T,self.hidden_dim))
        h=np.zeros((T+1,self.hidden_dim))
        z=np.zeros((T,self.word_dim))

        for t in np.arange(T):
            # Multiplying W by a one-hot vector just selects one column of W,
            # so index the column directly instead of building the vector.
            word=x[t]
            h_prev=h[t-1]
            f[t]=sigmoid(self.Wf[:,word]+np.dot(self.Uf,h_prev)+self.bf)
            i[t]=sigmoid(self.Wi[:,word]+np.dot(self.Ui,h_prev)+self.bi)
            g[t]=np.tanh(self.Wg[:,word]+np.dot(self.Ug,h_prev)+self.bg)
            o[t]=sigmoid(self.Wo[:,word]+np.dot(self.Uo,h_prev)+self.bo)
            c[t]=f[t]*c[t-1]+i[t]*g[t]
            h[t]=o[t]*np.tanh(c[t])
            z[t]=softmax(np.dot(self.V,h[t])+self.b)
        return (f,i,g,c,o,h,z)

    def predict(self,x):
        """Return, for every position of ``x``, the index of the most probable next word."""
        z=self.forward_propagation(x)[-1]
        return np.argmax(z,axis=1)

    def calculate_loss(self,x,y):
        """Summed cross-entropy of the targets ``y`` given the inputs ``x``.

        The result is the sum over positions ``t`` of ``-log z[t, y[t]]``; it
        is not divided by the sequence length.
        """
        z=self.forward_propagation(x)[-1]
        loss=0.0
        # A dedicated loop variable keeps the gate array ``i`` from being shadowed.
        for t in np.arange(len(y)):
            loss-=np.log(z[t, y[t]])
        return loss

    def calculate_total_loss(self,X,Y):
        """Average per-word loss over a data set of sequences.

        Sums ``calculate_loss`` over all pairs ``(X[k], Y[k])`` and divides by
        the total number of target words.
        """
        total=0.0
        word_count=0
        for k in np.arange(len(Y)):
            total+=self.calculate_loss(X[k],Y[k])
            word_count+=len(Y[k])
        return total/word_count

    def bptt(self, x, y):
        """Gradients of ``calculate_loss(x, y)`` w.r.t. every parameter.

        Back-propagates through the whole sequence and returns the tuple
        ``(dWf, dUf, dbf, dWi, dUi, dbi, dWg, dUg, dbg, dWo, dUo, dbo, dV, db)``;
        each entry has the shape of the corresponding parameter.
        """
        T = len(y)
        f,i,g,c,o,h,z = self.forward_propagation(x)

        dWf = np.zeros(self.Wf.shape)
        dUf = np.zeros(self.Uf.shape)
        dbf = np.zeros(self.bf.shape)
        dWi = np.zeros(self.Wi.shape)
        dUi = np.zeros(self.Ui.shape)
        dbi = np.zeros(self.bi.shape)
        dWg = np.zeros(self.Wg.shape)
        dUg = np.zeros(self.Ug.shape)
        dbg = np.zeros(self.bg.shape)
        dWo = np.zeros(self.Wo.shape)
        dUo = np.zeros(self.Uo.shape)
        dbo = np.zeros(self.bo.shape)
        dV = np.zeros(self.V.shape)
        db = np.zeros(self.b.shape)

        # Gradient w.r.t. the softmax inputs: probabilities minus one-hot target.
        # ``z`` is local to this call, so it is modified in place.
        delta_z = z
        delta_z[np.arange(len(y)), y] -= 1.0

        # Gradients flowing back from step t+1 into h[t] and c[t].
        dh_next = np.zeros(self.hidden_dim)
        dc_next = np.zeros(self.hidden_dim)
        for t in np.arange(T)[::-1]:
            dV += np.outer(delta_z[t], h[t])
            db += delta_z[t]

            # h[t] influences the loss through the output layer at step t and
            # through all four gates of step t+1.
            delta_h = np.dot(self.V.T,delta_z[t])+dh_next
            tanh_c = np.tanh(c[t])
            # c[t] influences the loss through h[t] and through c[t+1] = f[t+1]*c[t] + ...
            delta_c = delta_h*o[t]*(1-tanh_c**2)+dc_next

            # Gradients w.r.t. the gate pre-activations.
            delta_i = delta_c*g[t]*i[t]*(1-i[t])
            delta_g = delta_c*i[t]*(1-g[t]**2)
            delta_f = delta_c*c[t-1]*f[t]*(1-f[t])
            delta_o = delta_h*tanh_c*o[t]*(1-o[t])

            # The input is one-hot, so only column x[t] of each W* receives gradient.
            word = x[t]
            h_prev = h[t-1]
            dWf[:,word] += delta_f
            dUf += np.outer(delta_f,h_prev)
            dbf += delta_f
            dWi[:,word] += delta_i
            dUi += np.outer(delta_i,h_prev)
            dbi += delta_i
            dWg[:,word] += delta_g
            dUg += np.outer(delta_g,h_prev)
            dbg += delta_g
            dWo[:,word] += delta_o
            dUo += np.outer(delta_o,h_prev)
            dbo += delta_o

            # Hand the gradients on to step t-1: the hidden state goes through
            # the recurrent matrices, the cell state through the forget gate.
            dh_next = (np.dot(self.Uf.T,delta_f)+np.dot(self.Ui.T,delta_i)
                       +np.dot(self.Ug.T,delta_g)+np.dot(self.Uo.T,delta_o))
            dc_next = delta_c*f[t]

        return (dWf,dUf,dbf,dWi,dUi,dbi,dWg,dUg,dbg,dWo,dUo,dbo,dV,db)

    def sgd_step(self, x, y, learning_rate):
        """Perform one SGD update on the single example ``(x, y)``.

        Every parameter array is updated in place by subtracting
        ``learning_rate`` times its gradient from ``bptt``.
        """
        gradients = self.bptt(x, y)
        for name, gradient in zip(self._PARAM_NAMES, gradients):
            parameter = getattr(self, name)
            parameter -= learning_rate * gradient

In [12]:
import sys
from datetime import datetime

def train(model, X, Y, learning_rate=0.005, nepoch=100, evaluate_loss_after=5, save_every=100):
    """Train ``model`` with per-example SGD and return the recorded loss history.

    Parameters
    ----------
    model : object providing ``calculate_total_loss(X, Y)`` and
        ``sgd_step(x, y, learning_rate)``.
    X, Y : indexable collections of input and target sequences; ``len(Y)``
        examples are visited per epoch, in order.
    learning_rate : initial step size.  It is halved whenever an evaluated
        loss is higher than the previously evaluated one.
    nepoch : number of passes over the data.
    evaluate_loss_after : the total loss is evaluated (and printed) before
        every epoch whose index is a multiple of this value.
    save_every : after every epoch whose index is a multiple of this value the
        parameters are written with ``save_model_parameters`` to
        ``'lstmlm.parameters.epoch<epoch>'``.  Pass ``None`` or ``0`` to
        disable checkpointing.

    Returns
    -------
    list of ``(num_examples_seen, loss)`` tuples, one per loss evaluation.
    """
    num_examples_seen = 0
    losses = []
    for epoch in range(nepoch):
        if epoch % evaluate_loss_after == 0:
            loss = model.calculate_total_loss(X, Y)
            losses.append((num_examples_seen, loss))
            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print("%s: Loss after num_examples_seen=%d epoch=%d: %f" % (timestamp, num_examples_seen, epoch, loss))
            # Adjust the learning rate if loss increases
            if len(losses) > 1 and losses[-1][1] > losses[-2][1]:
                learning_rate = learning_rate * 0.5
                print("Setting learning rate to %f" % learning_rate)
            sys.stdout.flush()
        # For each training example...
        for i in range(len(Y)):
            model.sgd_step(X[i], Y[i], learning_rate)
            num_examples_seen += 1
        # Periodic checkpoint (epoch 0 included, as before).
        if save_every and epoch % save_every == 0:
            save_model_parameters(model, 'lstmlm.parameters.epoch%s'%epoch)
    return losses

In [2]:
import csv
import itertools

import nltk
import numpy as np

def getSentenceData(path, vocabulary_size=8000):
    """Read a CSV of comments and turn it into next-word training sequences.

    The first column of every row is lower-cased, split into sentences with
    ``nltk.sent_tokenize`` and wrapped in ``SENTENCE_START`` / ``SENTENCE_END``
    markers.  Sentences are tokenised with ``nltk.word_tokenize``; those with
    three or fewer tokens (markers included) are dropped.  The
    ``vocabulary_size - 1`` most frequent tokens form the vocabulary, every
    other token is replaced by ``UNKNOWN_TOKEN``.

    Parameters
    ----------
    path : str
        CSV file to read (opened as UTF-8).
    vocabulary_size : int
        Maximum vocabulary size including the unknown token.

    Returns
    -------
    (X_train, y_train, index_to_word, word_to_index)
        ``X_train[k]`` is sentence ``k`` without its last token and
        ``y_train[k]`` the same sentence without its first token, both as
        lists of word indices.  The two containers are 1-D NumPy object
        arrays because sentences have different lengths.
        ``index_to_word`` is a list, ``word_to_index`` the inverse dict.
    """
    unknown_token = "UNKNOWN_TOKEN"
    sentence_start_token = "SENTENCE_START"
    sentence_end_token = "SENTENCE_END"

    def as_object_array(rows):
        # np.asarray() on lists of unequal length is rejected by current NumPy
        # releases, so fill a 1-D object array explicitly.
        out = np.empty(len(rows), dtype=object)
        for k, row in enumerate(rows):
            out[k] = row
        return out

    # Read the data and append SENTENCE_START and SENTENCE_END tokens
    print("Reading CSV file...")
    with open(path, 'r', encoding='utf-8') as f:
        reader = csv.reader(f, skipinitialspace=True)
        # Split full comments into sentences.  Blank lines are returned by the
        # csv reader as empty rows and are skipped.
        raw_sentences = itertools.chain.from_iterable(
            nltk.sent_tokenize(row[0].lower()) for row in reader if row
        )
        # Append SENTENCE_START and SENTENCE_END
        sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in raw_sentences]
    print("Parsed %d sentences." % (len(sentences)))

    # Tokenize the sentences into words
    tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
    # Filter the sentences having few words (including SENTENCE_START and SENTENCE_END)
    tokenized_sentences = [tokens for tokens in tokenized_sentences if len(tokens) > 3]

    # Count the word frequencies
    word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokenized_sentences))
    print("Found %d unique words tokens." % len(word_freq.items()))

    # Get the most common words and build index_to_word and word_to_index vectors
    vocab = word_freq.most_common(vocabulary_size-1)
    index_to_word = [entry[0] for entry in vocab]
    index_to_word.append(unknown_token)
    word_to_index = {w: idx for idx, w in enumerate(index_to_word)}

    print("Using vocabulary size %d." % vocabulary_size)
    if vocab:
        print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))

    # Replace all words not in our vocabulary with the unknown token
    tokenized_sentences = [
        [w if w in word_to_index else unknown_token for w in sent]
        for sent in tokenized_sentences
    ]

    # Example output; skipped when the input is too small to have these entries.
    if len(sentences) > 1:
        print("\nExample sentence: '%s'" % sentences[1])
    if tokenized_sentences:
        print("\nExample sentence after Pre-processing: '%s'\n" % tokenized_sentences[0])

    # Create the training data
    X_train = as_object_array([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
    y_train = as_object_array([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

    print("X_train shape: " + str(X_train.shape))
    print("y_train shape: " + str(y_train.shape))

    # Print an training data example
    example_index = 17
    if len(X_train) > example_index:
        x_example, y_example = X_train[example_index], y_train[example_index]
        print("x:\n%s\n%s" % (" ".join([index_to_word[x] for x in x_example]), x_example))
        print("\ny:\n%s\n%s" % (" ".join([index_to_word[x] for x in y_example]), y_example))

    return X_train, y_train,index_to_word, word_to_index

if __name__ == '__main__':
    # Load the comment corpus and build the training sequences together with
    # the two vocabulary lookups used by the later cells.
    X_train, y_train, index_to_word, word_to_index = getSentenceData(
        'data/reddit-comments-2015-08.csv'
    )

Reading CSV file...
Parsed 79171 sentences.
Found 65467 unique words tokens.
Using vocabulary size 8000.
The least frequent word in our vocabulary is 'documentary' and appeared 10 times.

Example sentence: 'SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END'

Example sentence after Pre-processing: '['SENTENCE_START', 'i', 'joined', 'a', 'new', 'league', 'this', 'year', 'and', 'they', 'have', 'different', 'scoring', 'rules', 'than', 'i', "'m", 'used', 'to', '.', 'SENTENCE_END']'

X_train shape: (78483,)
y_train shape: (78483,)
x:
SENTENCE_START what are n't you understanding about this ? !
[0, 51, 27, 16, 10, 857, 54, 25, 34, 69]

y:
what are n't you understanding about this ? ! SENTENCE_END
[51, 27, 16, 10, 857, 54, 25, 34, 69, 1]


In [None]:
# Create a model for the 8000-word vocabulary and train it on the prepared
# sequences, evaluating the loss every 5 epochs.
lstm = LSTMLM(8000)
train(
    lstm,
    X_train,
    y_train,
    learning_rate=0.005,
    nepoch=1001,
    evaluate_loss_after=5,
)

In [8]:
def generate_sentence(model, index_to_word, word_to_index, max_length=100):
    """Generate one sentence by repeatedly appending the model's top prediction.

    Starting from ``SENTENCE_START``, the last entry of ``model.predict`` for
    the tokens so far is appended until ``SENTENCE_END`` or ``UNKNOWN_TOKEN``
    is produced, or until the token list (start marker included) is longer
    than ``max_length``.

    Parameters
    ----------
    model : object whose ``predict(tokens)`` returns one predicted word index
        per input position.
    index_to_word : sequence mapping word index -> word.
    word_to_index : dict mapping word -> word index; must contain
        ``SENTENCE_START``, ``SENTENCE_END`` and ``UNKNOWN_TOKEN``.
    max_length : int
        Length cap for the token list, start marker included.

    Returns
    -------
    str
        The generated words joined by single spaces, without the start marker
        and without the terminating end/unknown token.  When generation stops
        because of the length cap, the last generated word is kept.
    """
    stop_tokens = {word_to_index['SENTENCE_END'], word_to_index['UNKNOWN_TOKEN']}
    new_sentence = [word_to_index['SENTENCE_START']]
    while new_sentence[-1] not in stop_tokens and len(new_sentence) <= max_length:
        next_word = model.predict(new_sentence)[-1]
        new_sentence.append(next_word)

    generated = new_sentence[1:]
    # Strip the terminator only if there is one; after hitting the length cap
    # the final token is a regular word and belongs to the sentence.
    if generated and generated[-1] in stop_tokens:
        generated = generated[:-1]
    return " ".join(index_to_word[x] for x in generated)

In [10]:
# Restore a saved checkpoint and print ten generated sentences.
# NOTE(review): train() in this notebook writes checkpoints as
# 'lstmlm.parameters.epoch<N>' (np.savez appends '.npz'), i.e. without the
# space used in the path below -- confirm which file name exists on disk.
model=load_model_parameters('lstmlm.parameters.epoch 0.npz')
for i in range(10):
    # Generate inside the loop; building the sentence once before the loop
    # would print the very same string ten times.
    sent=generate_sentence(model,index_to_word, word_to_index)
    print(sent)

Building model model from lstmlm.parameters.epoch 0.npz with hidden_dim=128 word_dim=8000
thethethethethethethethethe
thethethethethethethethethe
thethethethethethethethethe
thethethethethethethethethe
thethethethethethethethethe
thethethethethethethethethe
thethethethethethethethethe
thethethethethethethethethe
thethethethethethethethethe
thethethethethethethethethe
