In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

import sys
import os
import pytreebank
import nltk

import tensorflow as tf
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
import string
from nltk.corpus import stopwords

# SCRIPT_DIR = os.path.dirname(os.path.abspath('../scripts/scripts/'))
# sys.path.append(os.path.dirname(SCRIPT_DIR))

# from scripts.constants import PATH_TO_DATA, DATA_FILE_NAME

In [2]:
data = pytreebank.load_sst("data/SST2-Data/SST2-Data/trainDevTestTrees_PTB/trees/")

In [3]:
out_path = os.path.join("data/sst_{}.txt")

In [4]:
for cat in ['train','test','dev']:
    with open(out_path.format(cat),"w") as file:
        for item in data[cat]:
            file.write("__label__{}\t{}\n".format(
                item.to_labeled_lines()[0][0] +1,
                item.to_labeled_lines()[0][1]
            ))
    
    print("done with {}".format(file))

done with <_io.TextIOWrapper name='data/sst_train.txt' mode='w' encoding='UTF-8'>
done with <_io.TextIOWrapper name='data/sst_test.txt' mode='w' encoding='UTF-8'>
done with <_io.TextIOWrapper name='data/sst_dev.txt' mode='w' encoding='UTF-8'>


In [5]:
df_train = pd.read_csv("data/sst_train.txt",sep="\t",header=None,names=['label','text'])
df_train['label'] = df_train['label'].str.replace("__label__","")
df_train['label'] = df_train['label'].astype(int).astype("category")

df_test = pd.read_csv("data/sst_test.txt",sep="\t",header=None,names=['label','text'])
df_test['label'] = df_test['label'].str.replace("__label__","")
df_test['label'] = df_test['label'].astype(int).astype("category")

In [6]:
df_train

Unnamed: 0,label,text
0,4,The Rock is destined to be the 21st Century 's...
1,5,The gorgeously elaborate continuation of `` Th...
2,4,Singer/composer Bryan Adams contributes a slew...
3,3,You 'd think by now America would have had eno...
4,4,Yet the act is still charming here .
...,...,...
8539,1,A real snooze .
8540,2,No surprises .
8541,4,We 've seen the hippie-turned-yuppie plot befo...
8542,1,Her fans walked out muttering words like `` ho...


In [7]:
df_test

Unnamed: 0,label,text
0,3,Effective but too-tepid biopic
1,4,If you sometimes like to go to the movies to h...
2,5,"Emerges as something rare , an issue movie tha..."
3,3,The film provides some great insight into the ...
4,5,Offers that rare combination of entertainment ...
...,...,...
2205,4,An imaginative comedy/thriller .
2206,5,"( A ) rare , beautiful film ."
2207,5,( An ) hilarious romantic comedy .
2208,4,Never ( sinks ) into exploitation .


In [7]:
def preprocess_text(text):
    text = text.lower()
    return text

def remove_punctuation(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    filtered_text = [word for word in text.split() if word not in stop_words]
    return " ".join(filtered_text)

In [9]:
df_train['text'] = df_train['text'].apply(preprocess_text)
df_train['text'] = df_train['text'].apply(remove_punctuation)
df_train['text'] = df_train['text'].apply(remove_stopwords)

In [11]:
df_test['text'] = df_test['text'].apply(preprocess_text)
df_test['text'] = df_test['text'].apply(remove_punctuation)
df_test['text'] = df_test['text'].apply(remove_stopwords)

In [12]:
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"

In [13]:
import tensorflow_hub as hub
hub_layer = hub.KerasLayer(embedding, input_shape=[], dtype=tf.string, trainable=True)

In [14]:
hub_layer(df_train['text'][:10])

<tf.Tensor: shape=(10, 20), dtype=float32, numpy=
array([[ 4.83856171e-01,  1.25281932e-02,  6.12146020e-01,
        -3.81588101e-01, -7.84492791e-01,  6.13168329e-02,
        -9.87816751e-01, -1.88001141e-01, -1.11080989e-01,
        -1.05911863e+00, -8.73591602e-01,  9.80120182e-01,
        -4.40951884e-01, -4.85567808e-01, -1.48103905e+00,
         6.54968679e-01,  8.00504029e-01, -2.91959167e-01,
        -1.35966527e+00, -2.35040057e-02],
       [ 1.36769378e+00, -3.94448787e-01,  1.43136501e+00,
         3.77309382e-01, -6.99403107e-01,  8.43796283e-02,
        -4.31156218e-01, -4.12688642e-05,  1.15393925e+00,
        -1.58598304e+00,  9.82125551e-02, -6.24312878e-01,
        -4.65623885e-01, -9.61661115e-02, -4.09871757e-01,
        -4.81311798e-01,  2.76458502e-01, -3.26539099e-01,
        -8.79306912e-01, -9.33691740e-01],
       [ 1.25544357e+00, -7.27126539e-01,  1.90472507e+00,
        -2.11792991e-01, -1.63096476e+00, -5.84068358e-01,
        -5.90886354e-01, -3.22077453e-

In [15]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='softmax'))  


In [16]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 20)                400020    
                                                                 
 dense (Dense)               (None, 16)                336       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 400373 (1.53 MB)
Trainable params: 400373 (1.53 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [17]:
model.compile(optimizer='adam', loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

In [18]:
train_dataset = tf.data.Dataset.from_tensor_slices((df_train['text'], df_train['label']))
test_dataset = tf.data.Dataset.from_tensor_slices((df_test['text'], df_test['label']))

# Batch the datasets
batch_size = 512
train_dataset = train_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)

In [19]:
df_train

Unnamed: 0,label,text
0,4,rock destined 21st century new conan going mak...
1,5,gorgeously elaborate continuation lord rings t...
2,4,singercomposer bryan adams contributes slew so...
3,3,think america would enough plucky british ecce...
4,4,yet act still charming
...,...,...
8539,1,real snooze
8540,2,surprises
8541,4,seen hippieturnedyuppie plot enthusiastic char...
8542,1,fans walked muttering words like horrible terr...


In [20]:
history = model.fit(
    train_dataset,      
    epochs=100,                                 
    validation_data=test_dataset, 
    verbose=1                                   
)

Epoch 1/100


  return dispatch_target(*args, **kwargs)
  output, from_logits = _get_logits(


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

In [21]:
def textblob_score(sentence):
    return TextBlob(sentence).sentiment.polarity
df_test['tb_score'] =  df_test['text'].apply(textblob_score)
df_test

Unnamed: 0,label,text,tb_score
0,3,effective tootepid biopic,0.600
1,4,sometimes like go movies fun wasabi good place...,0.500
2,5,emerges something rare issue movie honest keen...,0.450
3,3,film provides great insight neurotic mindset c...,0.275
4,5,offers rare combination entertainment education,0.200
...,...,...,...
2205,4,imaginative comedythriller,0.600
2206,5,rare beautiful film,0.575
2207,5,hilarious romantic comedy,0.250
2208,4,never sinks exploitation,0.050


In [22]:
df_test['tb_label'] = pd.cut(df_test['tb_score'],bins=5,labels=[1,2,3,4,5])
df_test  = df_test.drop(['tb_score'],axis=1)

In [23]:
df_test

Unnamed: 0,label,text,tb_label
0,3,effective tootepid biopic,4
1,4,sometimes like go movies fun wasabi good place...,4
2,5,emerges something rare issue movie honest keen...,4
3,3,film provides great insight neurotic mindset c...,4
4,5,offers rare combination entertainment education,3
...,...,...,...
2205,4,imaginative comedythriller,4
2206,5,rare beautiful film,4
2207,5,hilarious romantic comedy,4
2208,4,never sinks exploitation,3


In [24]:
from sklearn.metrics import f1_score, accuracy_score

In [25]:
def f1_acc(df,pred_column):
    f1_macro  = f1_score(df['label'],df[pred_column],average='macro')
    acc = accuracy_score(df['label'],df[pred_column])
    print("F1 Score : {} \n Accuracy : {}".format(f1_macro,acc))

In [26]:
f1_acc(df_test,"tb_label")

F1 Score : 0.24931670894536953 
 Accuracy : 0.2832579185520362


In [29]:
vader = SentimentIntensityAnalyzer()


In [30]:
def vader_score(sent,vader):
    return vader.polarity_scores(sent)['compound']

In [31]:
df_test['vader_score'] =  df_test['text'].apply(lambda x : vader_score(x,vader))
df_test

Unnamed: 0,label,text,tb_label,vader_score
0,3,effective tootepid biopic,4,0.4767
1,4,sometimes like go movies fun wasabi good place...,4,0.8271
2,5,emerges something rare issue movie honest keen...,4,0.7783
3,3,film provides great insight neurotic mindset c...,4,0.5994
4,5,offers rare combination entertainment education,3,0.4215
...,...,...,...,...
2205,4,imaginative comedythriller,4,0.0000
2206,5,rare beautiful film,4,0.5994
2207,5,hilarious romantic comedy,4,0.7845
2208,4,never sinks exploitation,3,0.0000


In [32]:
df_test['vader_label'] = pd.cut(df_test['vader_score'],bins=5,labels=[1,2,3,4,5])
df_test = df_test.drop('vader_score',axis=1)
df_test

Unnamed: 0,label,text,tb_label,vader_label
0,3,effective tootepid biopic,4,4
1,4,sometimes like go movies fun wasabi good place...,4,5
2,5,emerges something rare issue movie honest keen...,4,5
3,3,film provides great insight neurotic mindset c...,4,5
4,5,offers rare combination entertainment education,3,4
...,...,...,...,...
2205,4,imaginative comedythriller,4,3
2206,5,rare beautiful film,4,5
2207,5,hilarious romantic comedy,4,5
2208,4,never sinks exploitation,3,3


In [33]:
f1_acc(df_test,"vader_label")

F1 Score : 0.30033715825124857 
 Accuracy : 0.3040723981900452
