In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.python.keras import models, layers, optimizers
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
import bz2
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import re
%matplotlib inline


1. Uploading Text

In [None]:
# Mount Google Drive inside the Colab runtime; the dataset cell below reads
# its CSV from underneath this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Read the review CSV from the mounted Drive folder and preview the first rows.
csv_path = "/content/gdrive/MyDrive/Rev_cleaned.csv"
data = pd.read_csv(csv_path)
data.head(3)

Unnamed: 0.1,Unnamed: 0,Score,Text
0,0,1,bought vitality canned dog food product found ...
1,1,0,product arrived labeled jumbo salted peanutsth...
2,2,1,confection century light pillowy citrus gelati...


In [None]:
# Display the (rows, columns) shape of the loaded frame.
data.shape

(568454, 3)

In [None]:
# Keep only the first 200,000 reviews for the rest of the notebook.
# The positional slice starts at 0 so the very first review is included;
# starting at 1 would leave it out and yield 199,999 rows.
data = data.iloc[:200000]

In [None]:
# Narrow the frame to the label column and the review text, then preview it.
data = data.loc[:, ['Score', 'Text']]
data.head(3)

Unnamed: 0,Score,Text
1,0,product arrived labeled jumbo salted peanutsth...
2,1,confection century light pillowy citrus gelati...
3,0,looking secret ingredient robitussin believe f...


In [None]:
# Drop every row containing a missing value. normalize_texts() below calls
# .lower() on each text entry, which a missing (NaN) value would not support.
data = data.dropna()

2. Preprocessing Text

In [None]:
import re
# Any single non-word character (punctuation, whitespace, symbols).
NON_ALPHANUM = re.compile(r'[\W]')
# Anything that is not a lowercase ASCII letter, a digit or whitespace.
# The digit range is 0-9: a range of 0-1 would delete the digits 2 through 9.
NON_ASCII = re.compile(r'[^a-z0-9\s]')


def normalize_texts(Text):
    """Lower-case and strip each text down to ASCII letters, digits and spaces.

    For every string the steps are:
      1. convert to lower case;
      2. replace each non-word character with a single space;
      3. delete every remaining character that is not a-z, 0-9 or whitespace
         (this removes underscores and non-ASCII letters).

    Args:
        Text: iterable of strings (e.g. a pandas Series of reviews).

    Returns:
        A list with one normalized string per input element, in input order.
    """
    return [
        NON_ASCII.sub(r'', NON_ALPHANUM.sub(r' ', text.lower()))
        for text in Text
    ]
# Normalized copy of the review texts (a plain list, one string per row).
# NOTE(review): the only later mention of data_texts in the visible cells is
# a commented-out line; the split below is built from data['Text'] instead.
# Confirm whether the normalized text was meant to feed the model.
data_texts = normalize_texts(data['Text'])


3. Train/Validation Split

In [None]:
# Model inputs are the review strings; targets are the 'Score' column.
# NOTE(review): this reads the raw 'Text' column, not the data_texts list
# produced by normalize_texts() — confirm that is intended.
X, y = data['Text'], data['Score']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical between runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=57643892,
)

In [None]:
# Vocabulary size cap: passed to Tokenizer(num_words=...) and used as the
# Embedding input dimension in the model builders.
# Note: the next cell assigns the same value again.
MAX_FEATURES = 12000

In [None]:
# Vocabulary size cap shared by the tokenizer and the embedding layers.
MAX_FEATURES = 12000

# Build the vocabulary from the training split only, then encode both splits
# as lists of integer token ids. The right-hand tuple is evaluated before the
# names are rebound, so both encodings start from the raw text.
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(train_texts)
train_texts, test_texts = (
    tokenizer.texts_to_sequences(split) for split in (train_texts, val_texts)
)

4. Padding Sequences

In [None]:
# Every sequence is brought to the length of the longest training sequence so
# the model receives fixed-width input. pad_sequences' default settings decide
# on which side padding and truncation happen.
MAX_LENGTH = max(map(len, train_texts))
test_texts = pad_sequences(test_texts, maxlen=MAX_LENGTH)
train_texts = pad_sequences(train_texts, maxlen=MAX_LENGTH)

5. Convolutional Neural Network Model


In [None]:
# Scratch inspection: evaluating the bare attribute only echoes the class it
# resolves to. No visible cell uses a Normalization layer.
tensorflow.keras.layers.experimental.preprocessing.Normalization

keras.layers.preprocessing.normalization.Normalization

In [None]:
# Bind the conventional `tf` alias (used by the model-building and GPU-check
# cells below) and print the installed TensorFlow version for reference.
import tensorflow as tf
print(tf.__version__)

2.8.2


In [None]:
#pip install --upgrade tensorflow

In [None]:
def build_model():
    """Build and compile the 1-D convolutional binary text classifier.

    Architecture: token-id input of length MAX_LENGTH -> 64-dim embedding over
    MAX_FEATURES ids -> Conv1D(64, 3) + batch norm + max pool(3)
    -> Conv1D(64, 5) + batch norm + max pool(5) -> Conv1D(64, 5)
    -> global max pooling -> Dense(100, relu) -> one sigmoid unit.

    Returns:
        A compiled Keras model (optimizer 'rmsprop', binary cross-entropy
        loss, binary accuracy metric).
    """
    # All layers come from the public tf.keras namespace so a single Keras
    # implementation is used throughout; combining layers from the private
    # tensorflow.python.keras package with tf.keras layers in one model is
    # fragile.
    keras_layers = tf.keras.layers

    sequences = keras_layers.Input(shape=(MAX_LENGTH,))
    embedded = keras_layers.Embedding(MAX_FEATURES, 64)(sequences)

    x = keras_layers.Conv1D(64, 3, activation='relu')(embedded)
    x = keras_layers.BatchNormalization()(x)
    x = keras_layers.MaxPool1D(3)(x)

    x = keras_layers.Conv1D(64, 5, activation='relu')(x)
    x = keras_layers.BatchNormalization()(x)
    x = keras_layers.MaxPool1D(5)(x)

    x = keras_layers.Conv1D(64, 5, activation='relu')(x)
    # Global max pooling already returns a (batch, channels) tensor, so no
    # Flatten layer is needed before the dense head.
    x = keras_layers.GlobalMaxPool1D()(x)

    x = keras_layers.Dense(100, activation='relu')(x)
    predictions = keras_layers.Dense(1, activation='sigmoid')(x)

    model = tf.keras.Model(inputs=sequences, outputs=predictions)
    model.compile(
        optimizer='rmsprop',
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )
    return model


model = build_model()

In [None]:
# Train the CNN for two epochs, scoring the held-out split after each epoch.
model.fit(
    x=train_texts,
    y=train_labels,
    validation_data=(test_texts, val_labels),
    epochs=2,
    batch_size=128,
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f9a34c889d0>

In [None]:
# Score the CNN on the held-out split. Accuracy and F1 use predictions
# thresholded at 0.5; ROC AUC uses the raw sigmoid outputs.
preds = model.predict(test_texts)
hard_preds = 1 * (preds > 0.5)
for template, score in (
    ('Accuracy score: {:0.4}', accuracy_score(val_labels, hard_preds)),
    ('F1 score: {:0.4}', f1_score(val_labels, hard_preds)),
    ('ROC AUC score: {:0.4}', roc_auc_score(val_labels, preds)),
):
    print(template.format(score))

Accuracy score: 0.8759
F1 score: 0.9213
ROC AUC score: 0.9118


6. Recurrent Neural Network Model

In [None]:
def build_rnn_model():
    """Build and compile the stacked-GRU binary text classifier.

    Architecture: token-id input of length MAX_LENGTH -> 64-dim embedding over
    MAX_FEATURES ids -> GRU(128, returning the full sequence) -> GRU(128)
    -> Dense(32, relu) -> Dense(100, relu) -> one sigmoid unit.

    Returns:
        A compiled Keras model (optimizer 'rmsprop', binary cross-entropy
        loss, binary accuracy metric).
    """
    # tf.keras.layers.GRU replaces the GPU-only CuDNNGRU layer: with its
    # default arguments it selects the cuDNN kernel when a GPU is available
    # and otherwise falls back to the generic implementation, so the model
    # can also be built and run on CPU.
    keras_layers = tf.keras.layers

    sequences = keras_layers.Input(shape=(MAX_LENGTH,))
    embedded = keras_layers.Embedding(MAX_FEATURES, 64)(sequences)

    # The first GRU emits one vector per time step so the second can consume it.
    x = keras_layers.GRU(128, return_sequences=True)(embedded)
    x = keras_layers.GRU(128)(x)

    x = keras_layers.Dense(32, activation='relu')(x)
    x = keras_layers.Dense(100, activation='relu')(x)
    predictions = keras_layers.Dense(1, activation='sigmoid')(x)

    model = tf.keras.Model(inputs=sequences, outputs=predictions)
    model.compile(
        optimizer='rmsprop',
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )
    return model


rnn_model = build_rnn_model()

In [None]:
# True when TensorFlow can see at least one GPU device.
# tf.config.list_physical_devices is the supported replacement for the
# deprecated tf.test.is_gpu_available().
bool(tf.config.list_physical_devices('GPU'))

True

In [None]:
# Train the GRU model for two epochs, scoring the held-out split after each epoch.
rnn_model.fit(
    x=train_texts,
    y=train_labels,
    validation_data=(test_texts, val_labels),
    epochs=2,
    batch_size=128,
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f95bc261a90>

In [None]:
# Score the GRU model on the held-out split. Accuracy and F1 use predictions
# thresholded at 0.5; ROC AUC uses the raw sigmoid outputs.
preds = rnn_model.predict(test_texts)
hard_preds = 1 * (preds > 0.5)
for template, score in (
    ('Accuracy score: {:0.4}', accuracy_score(val_labels, hard_preds)),
    ('F1 score: {:0.4}', f1_score(val_labels, hard_preds)),
    ('ROC AUC score: {:0.4}', roc_auc_score(val_labels, preds)),
):
    print(template.format(score))

Accuracy score: 0.8897
F1 score: 0.9295
ROC AUC score: 0.9327
