In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

import sys
import os
import pytreebank
import nltk

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob

# SCRIPT_DIR = os.path.dirname(os.path.abspath('../scripts/scripts/'))
# sys.path.append(os.path.dirname(SCRIPT_DIR))

# from scripts.constants import PATH_TO_DATA, DATA_FILE_NAME

In [2]:
data = pytreebank.load_sst("data/SST2-Data/SST2-Data/trainDevTestTrees_PTB/trees/")

In [3]:
out_path = os.path.join("data/sst_{}.txt")

In [4]:
for cat in ['train','test','dev']:
    with open(out_path.format(cat),"w") as file:
        for item in data[cat]:
            file.write("__label__{}\t{}\n".format(
                item.to_labeled_lines()[0][0] +1,
                item.to_labeled_lines()[0][1]
            ))
    
    print("done with {}".format(file))

done with <_io.TextIOWrapper name='data/sst_train.txt' mode='w' encoding='UTF-8'>
done with <_io.TextIOWrapper name='data/sst_test.txt' mode='w' encoding='UTF-8'>
done with <_io.TextIOWrapper name='data/sst_dev.txt' mode='w' encoding='UTF-8'>


In [5]:
df_train = pd.read_csv("data/sst_train.txt",sep="\t",header=None,names=['label','text'])
df_train['label'] = df_train['label'].str.replace("__label__","")
df_train['label'] = df_train['label'].astype(int).astype("category")

df_test = pd.read_csv("data/sst_test.txt",sep="\t",header=None,names=['label','text'])
df_test['label'] = df_test['label'].str.replace("__label__","")
df_test['label'] = df_test['label'].astype(int).astype("category")

In [6]:
df_train

Unnamed: 0,label,text
0,4,The Rock is destined to be the 21st Century 's...
1,5,The gorgeously elaborate continuation of `` Th...
2,4,Singer/composer Bryan Adams contributes a slew...
3,3,You 'd think by now America would have had eno...
4,4,Yet the act is still charming here .
...,...,...
8539,1,A real snooze .
8540,2,No surprises .
8541,4,We 've seen the hippie-turned-yuppie plot befo...
8542,1,Her fans walked out muttering words like `` ho...


In [7]:
df_test

Unnamed: 0,label,text
0,3,Effective but too-tepid biopic
1,4,If you sometimes like to go to the movies to h...
2,5,"Emerges as something rare , an issue movie tha..."
3,3,The film provides some great insight into the ...
4,5,Offers that rare combination of entertainment ...
...,...,...
2205,4,An imaginative comedy/thriller .
2206,5,"( A ) rare , beautiful film ."
2207,5,( An ) hilarious romantic comedy .
2208,4,Never ( sinks ) into exploitation .


In [8]:
import tensorflow as tf

In [10]:
embedding = "https://tfhub.dev/google/nnlm-en-dim50/2"

In [11]:
import tensorflow_hub as hub
hub_layer = hub.KerasLayer(embedding, input_shape=[], dtype=tf.string, trainable=True)

In [13]:
hub_layer(df_train['text'][:10])

<tf.Tensor: shape=(10, 50), dtype=float32, numpy=
array([[ 1.23052120e-01, -4.67815287e-02,  2.00500950e-01,
         1.68864891e-01,  1.44037500e-01, -2.09933762e-02,
        -4.41514961e-02, -1.56816453e-01, -3.81152302e-01,
         5.77566326e-02, -1.39520928e-01,  1.62546530e-01,
        -4.70357202e-02,  3.79464775e-03, -3.93334217e-02,
        -1.05295621e-01, -3.59188944e-01,  2.13422179e-01,
         9.63375997e-03, -4.80615586e-01,  1.91493690e-01,
        -8.31796303e-02,  6.94135651e-02,  2.92505831e-01,
        -2.11289406e-01,  2.41886660e-01, -2.95723855e-01,
         1.36249736e-01,  3.62417847e-03,  1.03047252e-01,
        -1.45836189e-01,  8.97351578e-02,  7.98443481e-02,
        -1.28481850e-01, -4.08470519e-02,  2.07875982e-01,
        -1.26691535e-02,  4.92160507e-02,  1.06361024e-01,
        -1.37091309e-01, -2.90117353e-01,  1.08392805e-01,
        -2.01896980e-01, -1.26373842e-01, -4.55318838e-01,
        -2.96488553e-01, -1.38639137e-01, -1.14851207e-01,
      

In [16]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1))

In [17]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 50)                48190600  
                                                                 
 dense_1 (Dense)             (None, 16)                816       
                                                                 
 dense_2 (Dense)             (None, 1)                 17        
                                                                 
Total params: 48191433 (183.84 MB)
Trainable params: 48191433 (183.84 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [18]:
model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics=['accuracy'])

In [20]:
history = model.fit(
    df_train(512),      
    epochs=10,                                 
    validation_data=df_test(512), 
    verbose=1                                   
)

TypeError: 'DataFrame' object is not callable

In [21]:
def textblob_score(sentence):
    return TextBlob(sentence).sentiment.polarity
df_test['tb_score'] =  df_test['text'].apply(textblob_score)
df_test

Unnamed: 0,label,text,tb_score
0,3,Effective but too-tepid biopic,0.600
1,4,If you sometimes like to go to the movies to h...,0.500
2,5,"Emerges as something rare , an issue movie tha...",0.450
3,3,The film provides some great insight into the ...,0.275
4,5,Offers that rare combination of entertainment ...,0.200
...,...,...,...
2205,4,An imaginative comedy/thriller .,0.600
2206,5,"( A ) rare , beautiful film .",0.575
2207,5,( An ) hilarious romantic comedy .,0.250
2208,4,Never ( sinks ) into exploitation .,0.050


In [22]:
df_test['tb_label'] = pd.cut(df_test['tb_score'],bins=5,labels=[1,2,3,4,5])
df_test  = df_test.drop(['tb_score'],axis=1)

In [23]:
df_test

Unnamed: 0,label,text,tb_label
0,3,Effective but too-tepid biopic,4
1,4,If you sometimes like to go to the movies to h...,4
2,5,"Emerges as something rare , an issue movie tha...",4
3,3,The film provides some great insight into the ...,4
4,5,Offers that rare combination of entertainment ...,3
...,...,...,...
2205,4,An imaginative comedy/thriller .,4
2206,5,"( A ) rare , beautiful film .",4
2207,5,( An ) hilarious romantic comedy .,4
2208,4,Never ( sinks ) into exploitation .,3


In [25]:
from sklearn.metrics import f1_score, accuracy_score

In [30]:
def f1_acc(df,pred_column):
    f1_macro  = f1_score(df['label'],df[pred_column],average='macro')
    acc = accuracy_score(df['label'],df[pred_column])
    print("F1 Score : {} \n Accuracy : {}".format(f1_macro,acc))

In [31]:
f1_acc(df_test,"tb_label")

F1 Score : 0.2468141571266554 
 Accuracy : 0.283710407239819


In [35]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mazeltan/nltk_data...


True

In [36]:
vader = SentimentIntensityAnalyzer()


In [37]:
def vader_score(sent,vader):
    return vader.polarity_scores(sent)['compound']

In [41]:
df_test['vader_score'] =  df_test['text'].apply(lambda x : vader_score(x,vader))
df_test

Unnamed: 0,label,text,tb_label,vader_score
0,3,Effective but too-tepid biopic,4,0.2617
1,4,If you sometimes like to go to the movies to h...,4,0.8271
2,5,"Emerges as something rare , an issue movie tha...",4,0.6592
3,3,The film provides some great insight into the ...,4,0.5994
4,5,Offers that rare combination of entertainment ...,3,0.4215
...,...,...,...,...
2205,4,An imaginative comedy/thriller .,4,0.0000
2206,5,"( A ) rare , beautiful film .",4,0.5994
2207,5,( An ) hilarious romantic comedy .,4,0.7845
2208,4,Never ( sinks ) into exploitation .,3,0.0000


In [42]:
df_test['vader_label'] = pd.cut(df_test['vader_score'],bins=5,labels=[1,2,3,4,5])
df_test = df_test.drop('vader_score',axis=1)
df_test

Unnamed: 0,label,text,tb_label,vader_label
0,3,Effective but too-tepid biopic,4,4
1,4,If you sometimes like to go to the movies to h...,4,5
2,5,"Emerges as something rare , an issue movie tha...",4,5
3,3,The film provides some great insight into the ...,4,5
4,5,Offers that rare combination of entertainment ...,3,4
...,...,...,...,...
2205,4,An imaginative comedy/thriller .,4,3
2206,5,"( A ) rare , beautiful film .",4,5
2207,5,( An ) hilarious romantic comedy .,4,5
2208,4,Never ( sinks ) into exploitation .,3,3


In [44]:
f1_acc(df_test,"vader_label")

F1 Score : 0.3136923605939262 
 Accuracy : 0.3158371040723982
