In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

import sys
import os
import pytreebank
import nltk
import re


from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import f1_score, accuracy_score
from textblob import TextBlob
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import string

from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.python.keras.callbacks import EarlyStopping

### LOADING THE DATASET

In [8]:
data = pytreebank.load_sst("data/SST2-Data/SST2-Data/trainDevTestTrees_PTB/trees/")

In [9]:
out_path = os.path.join("data/sst_{}.txt")

In [10]:
for cat in ['train','test','dev']:
    with open(out_path.format(cat),"w") as file:
        for item in data[cat]:
            file.write("__label__{}\t{}\n".format(
                item.to_labeled_lines()[0][0] +1,
                item.to_labeled_lines()[0][1]
            ))
    
    print("done with {}".format(file))

done with <_io.TextIOWrapper name='data/sst_train.txt' mode='w' encoding='UTF-8'>
done with <_io.TextIOWrapper name='data/sst_test.txt' mode='w' encoding='UTF-8'>
done with <_io.TextIOWrapper name='data/sst_dev.txt' mode='w' encoding='UTF-8'>


In [11]:
df_train = pd.read_csv("data/sst_train.txt",sep="\t",header=None,names=['label','text'])
df_train['label'] = df_train['label'].str.replace("__label__","")
# df_train['label'] = df_train['label'].astype(int).astype("category")
df_train['label'] = (df_train['label'].astype(int) - 1).astype('category')

df_test = pd.read_csv("data/sst_test.txt",sep="\t",header=None,names=['label','text'])
df_test['label'] = df_test['label'].str.replace("__label__","")
# df_test['label'] = df_test['label'].astype(int).astype("category")
df_test['label'] = (df_test['label'].astype(int) - 1).astype('category')

In [39]:
df_train

Unnamed: 0,label,text
0,3,The Rock is destined to be the 21st Century 's...
1,4,The gorgeously elaborate continuation of `` Th...
2,3,Singer/composer Bryan Adams contributes a slew...
3,2,You 'd think by now America would have had eno...
4,3,Yet the act is still charming here .
...,...,...
8539,0,A real snooze .
8540,1,No surprises .
8541,3,We 've seen the hippie-turned-yuppie plot befo...
8542,0,Her fans walked out muttering words like `` ho...


In [40]:
df_test

Unnamed: 0,label,text
0,2,Effective but too-tepid biopic
1,3,If you sometimes like to go to the movies to h...
2,4,"Emerges as something rare , an issue movie tha..."
3,2,The film provides some great insight into the ...
4,4,Offers that rare combination of entertainment ...
...,...,...
2205,3,An imaginative comedy/thriller .
2206,4,"( A ) rare , beautiful film ."
2207,4,( An ) hilarious romantic comedy .
2208,3,Never ( sinks ) into exploitation .


### DATA PREPROCESSING

In [12]:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def preprocess_text(text):
    text = text.lower()
    return text

def remove_punctuation(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    filtered_text = [word for word in text.split() if word not in stop_words]
    return " ".join(filtered_text)

In [13]:
df_train['text'] = df_train['text'].apply(strip_html)
df_train['text'] = df_train['text'].apply(preprocess_text)
df_train['text'] = df_train['text'].apply(remove_punctuation)
df_train['text'] = df_train['text'].apply(remove_stopwords)

In [14]:
df_test['text'] = df_test['text'].apply(strip_html)
df_test['text'] = df_test['text'].apply(preprocess_text)
df_test['text'] = df_test['text'].apply(remove_punctuation)
df_test['text'] = df_test['text'].apply(remove_stopwords)

### MULTI-LAYER PERCEPTRON (MLP)

In [17]:
embedding = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1"
hub_layer = hub.KerasLayer(embedding, input_shape=[], dtype=tf.string, trainable=False)  # You can set trainable to True or False based on your needs

model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(5, activation='softmax'))  # Assuming you have 5 classes for multi-class classification

# Use SparseCategoricalCrossentropy for non-one-hot encoded labels
model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

# Assuming df_train and df_test are Pandas DataFrames
train_dataset = tf.data.Dataset.from_tensor_slices((df_train['text'], df_train['label']))
test_dataset = tf.data.Dataset.from_tensor_slices((df_test['text'], df_test['label']))

# Batch the datasets
batch_size = 32
train_dataset = train_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(
    train_dataset,      
    epochs=50,                                 
    validation_data=test_dataset, 
    verbose=1,
    callbacks=[early_stopping]
)


Epoch 1/50
  1/267 [..............................] - ETA: 51s - loss: 1.5873 - accuracy: 0.2188

  output, from_logits = _get_logits(


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50


### TEXTBLOB

In [21]:
def textblob_score(sentence):
    return TextBlob(sentence).sentiment.polarity
df_test['tb_score'] =  df_test['text'].apply(textblob_score)
df_test

Unnamed: 0,label,text,tb_score
0,3,effective tootepid biopic,0.600
1,4,sometimes like go movies fun wasabi good place...,0.500
2,5,emerges something rare issue movie honest keen...,0.450
3,3,film provides great insight neurotic mindset c...,0.275
4,5,offers rare combination entertainment education,0.200
...,...,...,...
2205,4,imaginative comedythriller,0.600
2206,5,rare beautiful film,0.575
2207,5,hilarious romantic comedy,0.250
2208,4,never sinks exploitation,0.050


In [22]:
df_test['tb_label'] = pd.cut(df_test['tb_score'],bins=5,labels=[1,2,3,4,5])
df_test  = df_test.drop(['tb_score'],axis=1)

In [23]:
df_test

Unnamed: 0,label,text,tb_label
0,3,effective tootepid biopic,4
1,4,sometimes like go movies fun wasabi good place...,4
2,5,emerges something rare issue movie honest keen...,4
3,3,film provides great insight neurotic mindset c...,4
4,5,offers rare combination entertainment education,3
...,...,...,...
2205,4,imaginative comedythriller,4
2206,5,rare beautiful film,4
2207,5,hilarious romantic comedy,4
2208,4,never sinks exploitation,3


In [24]:
from sklearn.metrics import f1_score, accuracy_score

In [25]:
def f1_acc(df,pred_column):
    f1_macro  = f1_score(df['label'],df[pred_column],average='macro')
    acc = accuracy_score(df['label'],df[pred_column])
    print("F1 Score : {} \n Accuracy : {}".format(f1_macro,acc))

In [26]:
f1_acc(df_test,"tb_label")

F1 Score : 0.24931670894536953 
 Accuracy : 0.2832579185520362


### VADER

In [29]:
vader = SentimentIntensityAnalyzer()

In [30]:
def vader_score(sent,vader):
    return vader.polarity_scores(sent)['compound']

In [31]:
df_test['vader_score'] =  df_test['text'].apply(lambda x : vader_score(x,vader))
df_test

Unnamed: 0,label,text,tb_label,vader_score
0,3,effective tootepid biopic,4,0.4767
1,4,sometimes like go movies fun wasabi good place...,4,0.8271
2,5,emerges something rare issue movie honest keen...,4,0.7783
3,3,film provides great insight neurotic mindset c...,4,0.5994
4,5,offers rare combination entertainment education,3,0.4215
...,...,...,...,...
2205,4,imaginative comedythriller,4,0.0000
2206,5,rare beautiful film,4,0.5994
2207,5,hilarious romantic comedy,4,0.7845
2208,4,never sinks exploitation,3,0.0000


In [32]:
df_test['vader_label'] = pd.cut(df_test['vader_score'],bins=5,labels=[1,2,3,4,5])
df_test = df_test.drop('vader_score',axis=1)
df_test

Unnamed: 0,label,text,tb_label,vader_label
0,3,effective tootepid biopic,4,4
1,4,sometimes like go movies fun wasabi good place...,4,5
2,5,emerges something rare issue movie honest keen...,4,5
3,3,film provides great insight neurotic mindset c...,4,5
4,5,offers rare combination entertainment education,3,4
...,...,...,...,...
2205,4,imaginative comedythriller,4,3
2206,5,rare beautiful film,4,5
2207,5,hilarious romantic comedy,4,5
2208,4,never sinks exploitation,3,3


In [33]:
f1_acc(df_test,"vader_label")

F1 Score : 0.30033715825124857 
 Accuracy : 0.3040723981900452
