In [7]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
from keras import layers
from sklearn.feature_extraction.text import CountVectorizer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense


In [2]:
FILENAME = 'fake reviews dataset.csv'

# Read the raw reviews, then derive a numeric target column from `label`:
# 1 where the label equals 'OR', 0 for every other value.
df = pd.read_csv(FILENAME)
df['labels'] = df['label'].eq('OR').astype('int64')
df

Unnamed: 0,category,rating,label,text_,labels
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor...",0
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I...",0
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...,0
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i...",0
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...,0
...,...,...,...,...,...
40427,Clothing_Shoes_and_Jewelry_5,4.0,OR,I had read some reviews saying that this bra r...,1
40428,Clothing_Shoes_and_Jewelry_5,5.0,CG,I wasn't sure exactly what it would be. It is ...,0
40429,Clothing_Shoes_and_Jewelry_5,2.0,OR,"You can wear the hood by itself, wear it with ...",1
40430,Clothing_Shoes_and_Jewelry_5,1.0,CG,I liked nothing about this dress. The only rea...,0


In [3]:
# Bag-of-words features: one column per vocabulary term, one row per review.
vectorizer = CountVectorizer()

corpus = df['text_']
X = vectorizer.fit_transform(corpus)

# NOTE(review): toarray() materialises the full dense rows-by-vocabulary matrix
# in memory; confirm that is affordable for this dataset.
# NOTE(review): the vocabulary is fitted on every row, before any train/test
# split happens -- confirm that is intended.
X_data = X.toarray()

In [4]:
TRAIN_SIZE = 0.8  # fraction of rows assigned to the training split

# Derive the cut point from TRAIN_SIZE so the constant above is the only knob
# (the ratio was previously repeated here as a literal).
TRAIN_IDX = int(TRAIN_SIZE * X_data.shape[0])

# Sequential split: the first TRAIN_IDX rows train, the remainder test.
# NOTE(review): rows are not shuffled, and the frame preview suggests they are
# grouped by category -- confirm a positional split is what is wanted.
X_train = X_data[:TRAIN_IDX]
X_test = X_data[TRAIN_IDX:]

y_train = df.labels[:TRAIN_IDX]
# Labels for the held-out rows, aligned with X_test.
y_test = df.labels[TRAIN_IDX:]

In [5]:
def data_generator(X: list, y: list, num_sequences_per_batch: int) -> "Iterator[tuple[np.ndarray, np.ndarray]]":
    '''
    Build an endless generator of (features, labels) mini-batches.

    Background on generators:
    https://wiki.python.org/moin/Generators
    https://realpython.com/introduction-to-python-generators/

    The data is walked front to back in consecutive, non-overlapping slices
    of ``num_sequences_per_batch`` items. When the end is reached the walk
    restarts from the beginning, so the generator never finishes by itself.
    A trailing slice that is shorter than a full batch is skipped, which
    means every yielded batch has exactly ``num_sequences_per_batch`` items.
    Labels are passed through unchanged (no one-hot encoding is applied).

    Parameters:
        X: sliceable sequence of samples (anything supporting ``len`` and
           ``X[a:b]``).
        y: sliceable sequence of labels, sliced with the same offsets as X.
        num_sequences_per_batch: batch size; must be a positive integer no
           larger than ``len(X)``.

    Returns:
        A generator yielding ``(np.array(X_slice), np.array(y_slice))`` pairs.

    Raises:
        ValueError: if the batch size is not positive, or if X holds fewer
            samples than one batch. Without this check the loop below would
            spin forever without ever yielding.
    '''
    num_samples = len(X)

    # Validate eagerly (at call time, not at the first next()) so a bad
    # configuration fails immediately instead of hanging the consumer.
    if num_sequences_per_batch <= 0:
        raise ValueError(
            f"num_sequences_per_batch must be positive, got {num_sequences_per_batch}"
        )
    if num_samples < num_sequences_per_batch:
        raise ValueError(
            f"need at least {num_sequences_per_batch} samples to form one batch, "
            f"got {num_samples}"
        )

    def _batches():
        # Offsets past this bound would start a batch that cannot be filled.
        stop = num_samples - num_sequences_per_batch + 1
        while True:
            for offset in range(0, stop, num_sequences_per_batch):
                end = offset + num_sequences_per_batch
                yield np.array(X[offset:end]), np.array(y[offset:end])

    return _batches()


In [6]:
num_sequences_per_batch = 128  # batch size used for the sanity check below

train_generator = data_generator(
    X=list(X_train),
    y=y_train,
    num_sequences_per_batch=num_sequences_per_batch,
)

# Pull a single batch out of the generator with next() and show the shape of
# its features array followed by the shape of its labels array.
sample = next(train_generator)

print(*(part.shape for part in sample), sep='\n')

(128, 41099)
(128,)


In [10]:
def train_model(data_generator, X, y, save_path, num_sequences_per_batch=128, num_epochs=1):
    """Build, train and save a single-layer SimpleRNN binary classifier.

    Parameters:
        data_generator: callable invoked as
            ``data_generator(X, y, num_sequences_per_batch)``; its result is
            handed to ``model.fit`` as the training input.
        X: indexable collection of samples; ``len(X[0])`` is taken as the
            number of features per sample.
        y: labels, forwarded untouched to ``data_generator``.
        save_path: destination passed to ``model.save``.
        num_sequences_per_batch: batch size (default 128).
        num_epochs: number of training epochs (default 1).

    Returns:
        The object returned by ``model.fit``.

    Raises:
        ValueError: if the batch size is not positive or X does not contain
            enough samples for one full batch (``steps_per_epoch`` would be 0).
    """
    # Check the configuration before building anything so mistakes surface
    # immediately rather than after model construction.
    if num_sequences_per_batch <= 0:
        raise ValueError(
            f"num_sequences_per_batch must be positive, got {num_sequences_per_batch}"
        )
    steps_per_epoch = len(X) // num_sequences_per_batch
    if steps_per_epoch == 0:
        raise ValueError(
            f"need at least {num_sequences_per_batch} samples for one training step, "
            f"got {len(X)}"
        )

    # Take the feature count from the data actually passed in, not from a
    # notebook-level global, so the function works for any X.
    num_features = len(X[0])

    # NOTE(review): with input_shape=(num_features, 1) the recurrent layer sees
    # one time step per feature column; for a large vocabulary that is a very
    # long sequence and presumably explains the multi-hour ETA in the training
    # log -- confirm this is the intended architecture.
    model = Sequential()
    model.add(SimpleRNN(128, input_shape=(num_features, 1)))
    model.add(Dense(1, activation='sigmoid'))

    loss_fn = 'binary_crossentropy'
    model.compile(loss=loss_fn, optimizer='adam', metrics=['accuracy'])

    train_generator = data_generator(X, y, num_sequences_per_batch)

    history = model.fit(
        x=train_generator,
        steps_per_epoch=steps_per_epoch,
        epochs=num_epochs
    )

    model.save(save_path)

    # Hand the fit result back to the caller instead of discarding it.
    return history

In [11]:
# Train the RNN on the training split for two epochs and save it as 'Simple_RNN'.
train_model(
    data_generator=data_generator,
    X=list(X_train),
    y=y_train,
    save_path='Simple_RNN',
    num_epochs=2,
)

Epoch 1/2
  3/252 [..............................] - ETA: 3:31:34 - loss: 0.6947 - accuracy: 0.4948

KeyboardInterrupt: 