In [1]:
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import json, os, re, shutil, sys, time
from importlib import reload
import collections, itertools
import unittest
from IPython.display import display, HTML

# NLTK for NLP utils and corpora
import nltk

# Pandas NumPy and TensorFlow
import pandas as pd
import numpy as np
import tensorflow as tf
assert(tf.__version__.startswith("1."))

# Helper libraries
from utils import utils_proj, vocabulary

from sklearn.model_selection import train_test_split

import lstm; reload(lstm)
import matplotlib.pyplot as plt
% matplotlib inline

  from ._conv import register_converters as _register_converters


In [2]:
names = ['timestamp','date','query','handle','message']
df = pd.read_csv('../sentiment140.csv',encoding='Latin1',names=names)
df['sentiment'] = df.index
#Instead of positive being 4 make positive 1
df.loc[df['sentiment'] == 4,'sentiment']=1


In [3]:
df.head()

Unnamed: 0,timestamp,date,query,handle,message,sentiment
0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,0
0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,0
0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,0
0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",0


In [74]:
#Just use 1000 rows while developing the code, full set takes too long
x_data = df['message'].head(100000).tolist()
y_data = df['sentiment'].head(100000).tolist()

#build the vocab
x_data = [i.split() for i in x_data]
flat_list = [item for sublist in x_data for item in sublist]
vocab = utils_proj.build_vocab(flat_list)


Vocabulary: 10,000 types


In [75]:
#Determine the length of the longest tweet
max_val=0
for i in x_data:
    if len(i) > max_val:
        max_val = len(i)
print(max_val)

35


In [76]:
#canonicalize sentences, convert to ids, pad to consistant length of 100
x_ids = []
for tweet in x_data:
    temp = utils_proj.preprocess_sentences([tweet],vocab)
    x_ids.append(
        np.pad(utils_proj.preprocess_sentences([tweet],vocab),(98-len(tweet),0),'constant').tolist()) 

In [77]:
#Split into 70% train, 15% dev, 15% test

def train_test_dev(x,y,train_pct=.8,test_pct=.1,random_state=1):
    #Split into train and test
    X_train, X_test, y_train, y_test = (
        train_test_split(x, y, test_size=round(1-train_pct,1), random_state=random_state))
    #split test into dev and test
    X_dev, X_test, y_dev, y_test = (
        train_test_split(X_test, y_test, test_size=test_pct/(1-train_pct), random_state=random_state))
    
    return np.asarray(X_train),np.asarray(X_dev),np.asarray(X_test),np.asarray(y_train),np.asarray(y_dev),np.asarray(y_test)


X_train,X_dev,X_test,y_train,y_dev,y_test = train_test_dev(x_ids,y_data,.7,.15)

In [78]:
def run_epoch(lm, session, batch_iterator,
              train=False, verbose=False,
              tick_s=10, learning_rate=None):
    assert(learning_rate is not None)
    start_time = time.time()
    tick_time = start_time  # for showing status
    total_cost = 0.0  # total cost, summed over all words
    total_batches = 0
    total_tweets = 0
    
    if train:
        train_op = lm.train_step_
        use_dropout = True
        loss = lm.train_loss_
    else:
        train_op = tf.no_op()
        use_dropout = False  # no dropout at test time
        loss = lm.loss_  # true loss, if train_loss is an approximation
        
    val_acc = []
    y_true = []
    y_preds = []
    for i, (w, y) in enumerate(batch_iterator):
        # At first batch in epoch, get a clean intitial state.
        
        y_true.append(y)
        if i == 0:
            h = session.run(lm.initial_h_, {lm.input_w_: w})
        feed_dict = {
            lm.input_w_: w,
            lm.target_y_: y,
            lm.initial_h_: h,
            lm.learning_rate_: learning_rate,
            lm.use_dropout_: use_dropout
        }
        ops = [loss, lm.final_h_, train_op,lm.accuracy_,lm.predictions_]        
        # session.run(...) the ops with the feed_dict constructed above.
        # Ensure "cost" becomes the value of "loss".
        # Hint: see "ops" for other variables that need updating in this loop.
        cost,h,_,batch_acc,preds = session.run(ops,feed_dict=feed_dict)
        
        y_pred.append(preds)
        val_acc.append(batch_acc)
        total_cost += cost
        total_batches = i + 1
        total_tweets += w.shape[0]  

        ##
        # Print average loss-so-far for epoch
        # If using train_loss_, this may be an underestimate.
        if verbose and (time.time() - tick_time >= tick_s):
            avg_cost = total_cost / total_batches
            avg_tps = total_tweets / (time.time() - start_time)
            print("[batch {:d}]: seen {:d} tweets at {:.1f} tps, cost = {:.3f}".format(
                i, total_tweets, avg_tps, avg_cost))
            tick_time = time.time()  # reset time ticker
    if Train:
        return total_cost / total_batches
    else:
        return total_cost / total_batches, val_acc,y_true,y_pred

In [79]:
def score_dataset(lm, session,ids,labels, name="Data"):
    # For scoring, we can use larger batches to speed things up.
    bi = utils_proj.rnnlm_batch_generator(ids, 100, 100,labels)
    cost,logits = run_epoch(lm, session, bi, 
                     learning_rate=0.0, train=False, 
                     verbose=False, tick_s=3600)
    print("{:s}: avg. loss: {:.03f}  (perplexity: {:.02f})".format(name, cost, np.exp(cost)))
    return cost

In [80]:
# Training parameters
max_time = 100
batch_size = 1000
learning_rate = 0.01
num_epochs = 5

# Model parameters
model_params = dict(V=vocab.size, 
                    H=200, 
                    softmax_ns=200,
                    num_layers=2,
                    num_classes=1)

TF_GRAPHDIR = "/tmp/w266/a3_graph"
summary_writer = tf.summary.FileWriter(TF_GRAPHDIR, lm.graph)
TF_SAVEDIR = "/tmp/w266/a3_model"
checkpoint_filename = os.path.join(TF_SAVEDIR, "rnnlm")
trained_filename = os.path.join(TF_SAVEDIR, "rnnlm_trained")

In [81]:
# Will print status every this many seconds
reload(lstm)
reload(utils_proj)
print_interval = 5

lm = lstm.RNNLM(**model_params)
lm.BuildCoreGraph()
lm.BuildTrainGraph()

# Explicitly add global initializer and variable saver to LM graph
with lm.graph.as_default():
    initializer = tf.global_variables_initializer()
    saver = tf.train.Saver()
    
# Clear old log directory
shutil.rmtree(TF_SAVEDIR, ignore_errors=True)
if not os.path.isdir(TF_SAVEDIR):
    os.makedirs(TF_SAVEDIR)

with tf.Session(graph=lm.graph) as session:
    # Seed RNG for repeatability
    tf.set_random_seed(42)

    session.run(initializer)
    
    
    for epoch in range(1,num_epochs+1):
        t0_epoch = time.time()
        bi = utils_proj.rnnlm_batch_generator(X_train, batch_size, max_time,y_train)

        cost,val_acc,y_true,y_pred = run_epoch(lm, session, bi,True,
                  True,1.0,learning_rate)
        
        #### END(YOUR CODE) ####
        print("[epoch {:d}] Completed in {:s}".format(epoch, utils_proj.pretty_timedelta(since=t0_epoch)))
    
        # Save a checkpointls 
        saver.save(session, checkpoint_filename, global_step=epoch)
    
    saver.save(session, trained_filename)

[batch 0]: seen 1000 tweets at 215.8 tps, cost = 0.254
[batch 1]: seen 2000 tweets at 213.8 tps, cost = 0.137
[batch 2]: seen 3000 tweets at 212.9 tps, cost = 0.091
[batch 3]: seen 4000 tweets at 212.4 tps, cost = 0.069
[batch 4]: seen 5000 tweets at 212.1 tps, cost = 0.055
[batch 5]: seen 6000 tweets at 212.0 tps, cost = 0.046
[batch 6]: seen 7000 tweets at 211.8 tps, cost = 0.039
[batch 7]: seen 8000 tweets at 211.8 tps, cost = 0.034
[batch 8]: seen 9000 tweets at 211.7 tps, cost = 0.030
[batch 9]: seen 10000 tweets at 211.6 tps, cost = 0.027
[batch 10]: seen 11000 tweets at 211.2 tps, cost = 0.025
[batch 11]: seen 12000 tweets at 211.4 tps, cost = 0.023
[batch 12]: seen 13000 tweets at 211.3 tps, cost = 0.021
[batch 13]: seen 14000 tweets at 211.3 tps, cost = 0.020
[batch 14]: seen 15000 tweets at 211.2 tps, cost = 0.018
[batch 15]: seen 16000 tweets at 211.2 tps, cost = 0.017
[batch 16]: seen 17000 tweets at 211.2 tps, cost = 0.016
[batch 17]: seen 18000 tweets at 211.2 tps, cost =

ValueError: too many values to unpack (expected 2)

In [None]:
print(trained_filename)

In [None]:
# Testingg parameters
max_time = 100
batch_size = 100
learning_rate = 0.00
num_epochs = 1

# Will print status every this many seconds
print_interval = 5

lm = lstm.RNNLM(**model_params)
lm.BuildCoreGraph()
lm.BuildTrainGraph()

with lm.graph.as_default():
    saver = tf.train.Saver()

y_pred=[]
y_true=[]
    
with tf.Session(graph=lm.graph) as session:
    saver.restore(session, trained_filename)
    # Seed RNG for repeatability
    tf.set_random_seed(42)

  
    for epoch in range(1,num_epochs+1):
        t0_epoch = time.time()
        bi = utils_proj.rnnlm_batch_generator(X_dev, batch_size, max_time,y_dev)
        print("[epoch {:d}] Starting epoch {:d}".format(epoch, epoch))
        #### YOUR CODE HERE ####
        # Run a training epoch.
        cost,val_acc,true,pred = run_epoch(lm, session, bi,False,
                  True,1.0,learning_rate)
        
        y_true.append(true)
        y_pred.append(pred)
        
        #### END(YOUR CODE) ####
        print("[epoch {:d}] Completed in {:s}".format(epoch, utils_proj.pretty_timedelta(since=t0_epoch)))
    
  

    #print("[epoch {:d}]".format(epoch), end=" ")
    #score_dataset(lm, session,X_train, y_train, name="Train set")
  #  print("[epoch {:d}]".format(epoch), end=" ")
  #  score_dataset(lm, session,X_dev, y_dev, name="Dev set")
  #  print("")
    # Save final model


In [None]:
np.mean(val_acc)

In [None]:
y_train.max()