In [1]:
import pandas as pd

import nltk

nltk.download("stopwords")

from nltk.corpus import stopwords

import re
from nltk.tokenize.toktok import ToktokTokenizer


import warnings

warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tiexin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the training split from the tab-separated file in the working directory.
df = pd.read_table("train.tsv")
# Sanity-check the row/column count before previewing the first rows.
print(df.shape)
df.head(5)

(156060, 4)


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [3]:
# Summary statistics for the numeric columns of the training frame.
df.describe()

Unnamed: 0,PhraseId,SentenceId,Sentiment
count,156060.0,156060.0,156060.0
mean,78030.5,4079.732744,2.063578
std,45050.785842,2502.764394,0.893832
min,1.0,1.0,0.0
25%,39015.75,1861.75,2.0
50%,78030.5,4017.0,2.0
75%,117045.25,6244.0,3.0
max,156060.0,8544.0,4.0


In [4]:
# Class balance: number of phrases per Sentiment label.
df["Sentiment"].value_counts()

Sentiment
2    79582
3    32927
1    27273
4     9206
0     7072
Name: count, dtype: int64

In [5]:
# Module-level helpers for the text-cleaning cells: a Toktok tokenizer instance
# and NLTK's English stopword list (downloaded in the first cell).
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words("english")

In [6]:
# Removing the square brackets
def remove_between_square_brackets(text):
    """Return ``text`` with every ``[...]`` span removed, brackets included.

    Args:
        text: The string to clean.

    Returns:
        A new string with each bracketed span replaced by nothing. An opening
        bracket with no closing bracket after it is left untouched.
    """
    # Raw string: the backslash-bracket escapes are meant for the regex engine.
    # In a normal string literal they are invalid Python escape sequences,
    # which newer interpreters warn about.
    return re.sub(r"\[[^]]*\]", "", text)


# Removing the noisy text
def denoise_text(text):
    """Clean a raw phrase; the only step is stripping bracketed spans."""
    return remove_between_square_brackets(text)


# Run denoise_text over every phrase, overwriting the Phrase column.
df["Phrase"] = df["Phrase"].apply(denoise_text)

In [7]:
def remove_special_characters(text):
    """Return ``text`` keeping only ASCII letters, digits and whitespace.

    Args:
        text: The string to clean.

    Returns:
        A new string with every other character deleted (not replaced by a
        space, so the characters on either side become adjacent).
    """
    # The upper-case range must end at "Z". The range "A-z" also covers the
    # ASCII characters that sit between "Z" and "a" (square brackets,
    # backslash, caret, underscore, backtick), so those were never removed.
    pattern = r"[^a-zA-Z0-9\s]"
    return re.sub(pattern, "", text)


# Run remove_special_characters over every phrase, overwriting the Phrase column.
df["Phrase"] = df["Phrase"].apply(remove_special_characters)

In [8]:
def simple_stemmer(text):
    """Stem every whitespace-separated word of ``text`` with NLTK's Porter stemmer.

    The words are split on whitespace and the stemmed results are re-joined
    with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stemmed_words = []
    for word in text.split():
        stemmed_words.append(stemmer.stem(word))
    return " ".join(stemmed_words)


# Run simple_stemmer over every phrase, overwriting the Phrase column.
df["Phrase"] = df["Phrase"].apply(simple_stemmer)

In [9]:
# Set form of NLTK's English stopword list, so membership tests are hash
# lookups instead of a scan over a list for every token.
stop = set(stopwords.words("english"))
print(stop)


def remove_stopwords(text, is_lower_case=False):
    """Return ``text`` with English stopwords removed.

    Args:
        text: Phrase to filter; it is split with the module-level ``tokenizer``.
        is_lower_case: Pass True when ``text`` is already lower-cased, so that
            tokens are compared as-is. Otherwise each token is lower-cased for
            the comparison only, and surviving tokens keep their original case.

    Returns:
        The surviving tokens joined by single spaces ("" when none survive).

    NOTE(review): in this notebook the function runs after
    remove_special_characters and simple_stemmer, so stopwords those steps
    have altered (for example contractions that lost their apostrophe) no
    longer match the list. Confirm that this ordering is intended.
    """
    tokens = (token.strip() for token in tokenizer.tokenize(text))
    if is_lower_case:
        kept_tokens = [token for token in tokens if token not in stop]
    else:
        kept_tokens = [token for token in tokens if token.lower() not in stop]
    return " ".join(kept_tokens)


# Run remove_stopwords (default is_lower_case=False) over every phrase, overwriting the Phrase column.
df["Phrase"] = df["Phrase"].apply(remove_stopwords)

{'has', "you'll", 'by', 'does', 'not', 'when', 'him', 'where', "mustn't", 'she', 'very', 'y', 'or', 'd', 'those', 'me', 'itself', 't', 'them', "shouldn't", 'had', "couldn't", 'haven', "needn't", 'so', 'mightn', 'for', "aren't", "wouldn't", "you've", 'on', 'won', 'who', 'is', 'just', 'should', 'hers', 'its', 'over', 'here', 'didn', 'hasn', "it's", 'but', 'couldn', 'theirs', 'ourselves', 'than', 'ma', 'doesn', 'did', 'if', 'these', "isn't", "mightn't", "shan't", 'will', 'which', 'be', 'down', 'shan', "she's", 'aren', 'how', 'it', 'their', 'from', 'm', 'own', 'isn', 'some', 'up', "hadn't", 'because', 'this', 'through', 'ours', "wasn't", 'too', 'themselves', 'wouldn', 'about', 'each', 'most', "weren't", "hasn't", 'why', 'being', "you're", 'yourselves', 'hadn', "should've", 'against', 'needn', 'until', 'was', 'wasn', 'my', 'and', 'then', 'doing', 'we', 'off', 'after', "you'd", 'do', 'before', 'such', 'his', 'whom', 'while', 'there', 'weren', 'any', 'few', 'of', 'nor', 'above', 'a', 'between

In [10]:
# Keep only rows whose phrase is not the empty string. The index is not reset,
# so the remaining rows keep their original labels.
df = df[df["Phrase"] != ""]

In [11]:
# Preview the frame after the cleaning cells above.
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,seri escapad demonstr adag good goos also good...,1
1,2,1,seri escapad demonstr adag good goos,2
2,3,1,seri,2
4,5,1,seri,2
5,6,1,escapad demonstr adag good goos,2


In [12]:
import re

# import shutil
import string

import zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from keras import layers

# from keras import losses
# from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [13]:
# Batching and split settings.
batch_size = 256
seed = 42

# (phrase, label) dataset over the full cleaned frame, before any split.
phrases, labels = df["Phrase"].values, df["Sentiment"].values
dataset = tf.data.Dataset.from_tensor_slices((phrases, labels))


def _to_batched_dataset(frame):
    """Wrap a frame's (Phrase, Sentiment) columns as a batched tf.data.Dataset."""
    pairs = (frame["Phrase"], frame["Sentiment"])
    return tf.data.Dataset.from_tensor_slices(pairs).batch(batch_size)


# Hold out 20% of the rows for validation; the split is seeded for repeatability.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=seed)
raw_train_ds = _to_batched_dataset(train_df)
raw_val_ds = _to_batched_dataset(val_df)

In [14]:
# max_features is passed as max_tokens below and is also the Embedding input size
# in the model cell; sequence_length is passed as output_sequence_length.
max_features = 10000
sequence_length = 250

# Text -> integer-sequence layer; it is fitted later via adapt() on the training text.
vectorize_layer = layers.TextVectorization(
    standardize="lower_and_strip_punctuation",
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

In [15]:
# Make a text-only dataset (without labels), then call adapt.
# Only the training split is passed to adapt; the validation split is not.
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

In [16]:
def vectorize_text(text, label):
    """Map a (text, label) pair to (vectorized text, label).

    A trailing axis is added to ``text`` before it is passed through the
    module-level ``vectorize_layer``; ``label`` is returned unchanged.
    """
    expanded_text = tf.expand_dims(text, -1)
    token_ids = vectorize_layer(expanded_text)
    return token_ids, label

In [17]:
# Apply vectorize_text to both splits so each element becomes (vectorized text, label).
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)

In [18]:
# Cache each vectorized dataset and prefetch with a buffer size chosen by tf.data.
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [19]:
embedding_dim = 32

# Layer stack, in order: token embedding, dropout, average over the sequence
# axis, dropout, then a 5-unit Dense layer with no activation (raw logits).
classifier_layers = [
    layers.Embedding(max_features, embedding_dim),
    layers.Dropout(0.2),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(5),
]
model = tf.keras.Sequential(classifier_layers)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          320000    
                                                                 
 dropout (Dropout)           (None, None, 32)          0         
                                                                 
 global_average_pooling1d (  (None, 32)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dropout_1 (Dropout)         (None, 32)                0         
                                                                 
 dense (Dense)               (None, 5)                 165       
                                                                 
Total params: 320165 (1.22 MB)
Trainable params: 320165 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
____________________

In [40]:
epochs = 10

# The model's last Dense layer has no activation, so the loss is told to expect logits.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Legacy Adam with an explicit learning rate (the alternative that was left
# commented out here used the optimizer's defaults).
adam = tf.keras.optimizers.legacy.Adam(learning_rate=0.0001)
model.compile(loss=loss_fn, optimizer=adam, metrics=["accuracy"])

history = model.fit(train_ds, validation_data=val_ds, epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [41]:
# Inference model that takes raw strings: vectorize -> trained classifier -> activation.
# The classifier emits one logit per sentiment class, and the loss below is
# configured with from_logits=False, so the activation must turn the logits
# into a distribution over the classes. Softmax does that. Sigmoid squashes
# each logit independently and does not produce a distribution.
# (The arg-max class is the same under either activation.)
export_model = tf.keras.Sequential(
    [vectorize_layer, model, layers.Activation("softmax")]
)

export_model.compile(
    loss=tf.losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer="adam",
    metrics=["accuracy"],
)

In [42]:
dftest = pd.read_table("test.tsv")
dftest["Phrase"] = dftest["Phrase"].astype(str)

# Same cleaning steps, in the same order, as were applied to the training phrases.
for clean_step in (
    denoise_text,
    remove_special_characters,
    simple_stemmer,
    remove_stopwords,
):
    dftest["Phrase"] = dftest["Phrase"].apply(clean_step)

# for i in range(len(dftest["Phrase"])):
#     if dftest["Phrase"][i] == "":
#         dftest["Phrase"][i] = "average"

In [45]:
# Score every cleaned test phrase; preds holds one row of class scores per phrase.
preds = export_model.predict(dftest.Phrase.values)
# Vectorized arg-max over the class axis. This replaces a per-row Python loop
# whose loop variable shadowed the result name `p`.
p = np.argmax(preds, axis=1)
# Write PhraseId / predicted Sentiment pairs without the frame index.
new_df = pd.DataFrame({"PhraseId": dftest["PhraseId"], "Sentiment": p})
new_df.to_csv("output.csv", index=False)

