In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

import sys
import os
import pytreebank
import nltk

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob

# SCRIPT_DIR = os.path.dirname(os.path.abspath('../scripts/scripts/'))
# sys.path.append(os.path.dirname(SCRIPT_DIR))

# from scripts.constants import PATH_TO_DATA, DATA_FILE_NAME

In [3]:
data = pytreebank.load_sst("data/SST2-Data/SST2-Data/trainDevTestTrees_PTB/trees/")

In [4]:
out_path = os.path.join("data/sst_{}.txt")

In [5]:
for cat in ['train','test','dev']:
    with open(out_path.format(cat),"w") as file:
        for item in data[cat]:
            file.write("__label__{}\t{}\n".format(
                item.to_labeled_lines()[0][0] +1,
                item.to_labeled_lines()[0][1]
            ))
    
    print("done with {}".format(file))

done with <_io.TextIOWrapper name='data/sst_train.txt' mode='w' encoding='UTF-8'>
done with <_io.TextIOWrapper name='data/sst_test.txt' mode='w' encoding='UTF-8'>
done with <_io.TextIOWrapper name='data/sst_dev.txt' mode='w' encoding='UTF-8'>


In [6]:
df_train = pd.read_csv("data/sst_train.txt",sep="\t",header=None,names=['label','text'])
df_train['label'] = df_train['label'].str.replace("__label__","")
df_train['label'] = df_train['label'].astype(int).astype("category")

df_test = pd.read_csv("data/sst_test.txt",sep="\t",header=None,names=['label','text'])
df_test['label'] = df_test['label'].str.replace("__label__","")
df_test['label'] = df_test['label'].astype(int).astype("category")

In [6]:
df_train

Unnamed: 0,label,text
0,4,The Rock is destined to be the 21st Century 's...
1,5,The gorgeously elaborate continuation of `` Th...
2,4,Singer/composer Bryan Adams contributes a slew...
3,3,You 'd think by now America would have had eno...
4,4,Yet the act is still charming here .
...,...,...
8539,1,A real snooze .
8540,2,No surprises .
8541,4,We 've seen the hippie-turned-yuppie plot befo...
8542,1,Her fans walked out muttering words like `` ho...


In [7]:
df_test

Unnamed: 0,label,text
0,3,Effective but too-tepid biopic
1,4,If you sometimes like to go to the movies to h...
2,5,"Emerges as something rare , an issue movie tha..."
3,3,The film provides some great insight into the ...
4,5,Offers that rare combination of entertainment ...
...,...,...
2205,4,An imaginative comedy/thriller .
2206,5,"( A ) rare , beautiful film ."
2207,5,( An ) hilarious romantic comedy .
2208,4,Never ( sinks ) into exploitation .


In [7]:
import tensorflow as tf

In [8]:
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"

In [9]:
import tensorflow_hub as hub
hub_layer = hub.KerasLayer(embedding, input_shape=[], dtype=tf.string, trainable=True)

In [10]:
hub_layer(df_train['text'][:10])

<tf.Tensor: shape=(10, 20), dtype=float32, numpy=
array([[-0.17569035, -1.5676816 ,  0.8543786 , -0.5981151 , -1.2136153 ,
        -0.01878982, -0.93524003,  0.23275281,  0.86174583, -0.01399387,
        -1.3430052 ,  0.01748537,  0.18257691, -0.6852263 , -0.3805004 ,
         0.15710342,  0.5811184 , -0.21914063, -0.13737044,  0.14191812],
       [ 0.6795094 , -1.8112632 ,  2.1644907 , -0.98349875, -0.68851036,
         0.9659213 , -0.631428  , -0.29652372,  2.2149396 , -1.2758911 ,
        -0.27002046, -1.2925656 ,  0.16214599, -0.720647  ,  0.9443487 ,
        -0.6805398 , -0.66745204,  0.89824945,  0.44124267, -0.76817054],
       [ 0.9598038 , -1.6044576 ,  1.3790556 , -0.8387672 , -1.428725  ,
        -0.54128206, -0.25341454, -0.5005333 ,  1.9526852 , -2.4039288 ,
        -1.2525821 ,  0.46131307, -0.9943082 , -0.7892277 , -0.81704605,
        -0.34173867,  0.6299704 , -0.61374605, -2.2921426 , -0.78803724],
       [ 0.21703832, -0.21944825,  0.6043717 , -0.3707854 , -1.8639021 

In [16]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='softmax'))  


In [17]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 20)                400020    
                                                                 
 dense_2 (Dense)             (None, 16)                336       
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 400373 (1.53 MB)
Trainable params: 400373 (1.53 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [18]:
model.compile(optimizer='adam', loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

In [19]:
train_dataset = tf.data.Dataset.from_tensor_slices((df_train['text'], df_train['label']))
test_dataset = tf.data.Dataset.from_tensor_slices((df_test['text'], df_test['label']))

# Batch the datasets
batch_size = 512
train_dataset = train_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)

In [25]:
df_train

Unnamed: 0,label,text
0,4,The Rock is destined to be the 21st Century 's...
1,5,The gorgeously elaborate continuation of `` Th...
2,4,Singer/composer Bryan Adams contributes a slew...
3,3,You 'd think by now America would have had eno...
4,4,Yet the act is still charming here .
...,...,...
8539,1,A real snooze .
8540,2,No surprises .
8541,4,We 've seen the hippie-turned-yuppie plot befo...
8542,1,Her fans walked out muttering words like `` ho...


In [20]:
history = model.fit(
    train_dataset,      
    epochs=100,                                 
    validation_data=test_dataset, 
    verbose=1                                   
)

Epoch 1/100


  return dispatch_target(*args, **kwargs)
  output, from_logits = _get_logits(


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

In [28]:
def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((df_train["text"], df_train["label"]))
    dataset = dataset.shuffle(buffer_size=len(df_train))
    dataset = dataset.batch(batch_size)
    return dataset

def test_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((df_test["text"], df_test["label"]))
    dataset = dataset.batch(batch_size)
    return dataset

embedded_text_feature_column = hub.text_embedding_column(
    key="text",
    module_spec="https://tfhub.dev/google/nnlm-en-dim128/1",
    trainable=True)

run_config = tf.estimator.RunConfig(keep_checkpoint_max=1)

estimator = tf.estimator.DNNClassifier(
    hidden_units=[250, 50],
    feature_columns=[embedded_text_feature_column],
    n_classes=5,
    config=run_config,
    optimizer=tf.compat.v1.train.AdagradOptimizer(learning_rate=0.003))

# Train the estimator
estimator.train(input_fn=train_input_fn, steps=10000)

# You can also evaluate the model on the test set:
results = estimator.evaluate(input_fn=test_input_fn)
print("Test set accuracy: {accuracy}".format(**results))


Instructions for updating:
Use tf.keras instead.


Instructions for updating:
Use tf.keras instead.


Instructions for updating:
Use tf.keras instead.


Instructions for updating:
Use tf.keras instead.


Instructions for updating:
Use tf.keras instead.


Instructions for updating:
Use tf.keras instead.


Instructions for updating:
Use tf.keras instead.


Instructions for updating:
Use tf.keras instead.






INFO:tensorflow:Using config: {'_model_dir': '/var/folders/0s/6y_9xhvn7tx64mnfljyvm01m0000gn/T/tmpfkk9dkqw', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 1, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


INFO:tensorflow:Using config: {'_model_dir': '/var/folders/0s/6y_9xhvn7tx64mnfljyvm01m0000gn/T/tmpfkk9dkqw', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 1, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


Instructions for updating:
Use tf.keras instead.


Instructions for updating:
Use tf.keras instead.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.


ValueError: features should be a dictionary of `Tensor`s. Given type: <class 'tensorflow.python.framework.ops.SymbolicTensor'>

In [21]:
def textblob_score(sentence):
    return TextBlob(sentence).sentiment.polarity
df_test['tb_score'] =  df_test['text'].apply(textblob_score)
df_test

Unnamed: 0,label,text,tb_score
0,3,Effective but too-tepid biopic,0.600
1,4,If you sometimes like to go to the movies to h...,0.500
2,5,"Emerges as something rare , an issue movie tha...",0.450
3,3,The film provides some great insight into the ...,0.275
4,5,Offers that rare combination of entertainment ...,0.200
...,...,...,...
2205,4,An imaginative comedy/thriller .,0.600
2206,5,"( A ) rare , beautiful film .",0.575
2207,5,( An ) hilarious romantic comedy .,0.250
2208,4,Never ( sinks ) into exploitation .,0.050


In [22]:
df_test['tb_label'] = pd.cut(df_test['tb_score'],bins=5,labels=[1,2,3,4,5])
df_test  = df_test.drop(['tb_score'],axis=1)

In [23]:
df_test

Unnamed: 0,label,text,tb_label
0,3,Effective but too-tepid biopic,4
1,4,If you sometimes like to go to the movies to h...,4
2,5,"Emerges as something rare , an issue movie tha...",4
3,3,The film provides some great insight into the ...,4
4,5,Offers that rare combination of entertainment ...,3
...,...,...,...
2205,4,An imaginative comedy/thriller .,4
2206,5,"( A ) rare , beautiful film .",4
2207,5,( An ) hilarious romantic comedy .,4
2208,4,Never ( sinks ) into exploitation .,3


In [25]:
from sklearn.metrics import f1_score, accuracy_score

In [30]:
def f1_acc(df,pred_column):
    f1_macro  = f1_score(df['label'],df[pred_column],average='macro')
    acc = accuracy_score(df['label'],df[pred_column])
    print("F1 Score : {} \n Accuracy : {}".format(f1_macro,acc))

In [31]:
f1_acc(df_test,"tb_label")

F1 Score : 0.2468141571266554 
 Accuracy : 0.283710407239819


In [35]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mazeltan/nltk_data...


True

In [36]:
vader = SentimentIntensityAnalyzer()


In [37]:
def vader_score(sent,vader):
    return vader.polarity_scores(sent)['compound']

In [41]:
df_test['vader_score'] =  df_test['text'].apply(lambda x : vader_score(x,vader))
df_test

Unnamed: 0,label,text,tb_label,vader_score
0,3,Effective but too-tepid biopic,4,0.2617
1,4,If you sometimes like to go to the movies to h...,4,0.8271
2,5,"Emerges as something rare , an issue movie tha...",4,0.6592
3,3,The film provides some great insight into the ...,4,0.5994
4,5,Offers that rare combination of entertainment ...,3,0.4215
...,...,...,...,...
2205,4,An imaginative comedy/thriller .,4,0.0000
2206,5,"( A ) rare , beautiful film .",4,0.5994
2207,5,( An ) hilarious romantic comedy .,4,0.7845
2208,4,Never ( sinks ) into exploitation .,3,0.0000


In [42]:
df_test['vader_label'] = pd.cut(df_test['vader_score'],bins=5,labels=[1,2,3,4,5])
df_test = df_test.drop('vader_score',axis=1)
df_test

Unnamed: 0,label,text,tb_label,vader_label
0,3,Effective but too-tepid biopic,4,4
1,4,If you sometimes like to go to the movies to h...,4,5
2,5,"Emerges as something rare , an issue movie tha...",4,5
3,3,The film provides some great insight into the ...,4,5
4,5,Offers that rare combination of entertainment ...,3,4
...,...,...,...,...
2205,4,An imaginative comedy/thriller .,4,3
2206,5,"( A ) rare , beautiful film .",4,5
2207,5,( An ) hilarious romantic comedy .,4,5
2208,4,Never ( sinks ) into exploitation .,3,3


In [44]:
f1_acc(df_test,"vader_label")

F1 Score : 0.3136923605939262 
 Accuracy : 0.3158371040723982
