In [1]:
#Create .csv File of Amazon Review Data
#@author Chance Simmons
#@version September 2018
import csv
import os
import json

# Output csv files written by sort_data: one file of review text and one
# file of 0/1 labels for each of the training, test and validation splits.
TRAINING = 'Data/training.csv'
TEST = 'Data/test.csv'
TRAINING_LABEL = 'Data/train_label.csv'
TEST_LABEL = 'Data/test_label.csv'
VAL = 'Data/val.csv'
VAL_LABEL = 'Data/val_label.csv'
# Input file; sort_data reads it line by line and parses each line as JSON.
JSON = 'Data/reviews.json'
# Cumulative per-label thresholds checked by sort_data, in this order:
# while a label's count is below MIN_DATA its reviews go to training,
# then below MAX_DATA to test, then below MAX_VAL to validation.
# With these values that is 23000 training, 23000 test and 10000
# validation reviews for each of the two labels.
MIN_DATA = 23000
MAX_DATA = 46000
MAX_VAL = 56000

def write_data(rating,text,text_file,label_file):
    """
    Appends one review to the text csv file and its rating to the label
    csv file, as one single-column row in each file.

    Both files are opened in append mode, so existing rows are kept.

    Parameters:
        rating: the label value written as a row of label_file
        text: the review text written as a row of text_file
        text_file(str): path of the csv file that receives the text
        label_file(str): path of the csv file that receives the rating
    """
    # newline='' leaves line endings to the csv module
    with open(text_file, "a", newline='') as text_out, \
            open(label_file, 'a', newline='') as label_out:
        csv.writer(text_out).writerow([text])
        csv.writer(label_out).writerow([rating])
        
def sort_data():
    """
    Splits the reviews in the JSON file into training, test and validation
    csv files, with the same quotas for negative and positive reviews.

    Every non-blank line of JSON is parsed as one JSON object. A review
    whose 'overall' rating (converted to int) is 1, 2 or 3 is labelled 0
    (negative); any other rating is labelled 1 (positive). A running count
    is kept per label: while the count is below MIN_DATA the review goes to
    the training files, otherwise while below MAX_DATA to the test files,
    otherwise while below MAX_VAL to the validation files. Reviews past
    the last threshold are not written.

    Reading stops as soon as both labels have reached the last threshold,
    so the remainder of the file is not parsed.

    Raises:
        KeyError: if a parsed review has no 'overall' field, or has no
            'reviewText' field when it is about to be written
        ValueError: if a non-blank line is not valid JSON
    """
    # (cumulative per-label limit, text file, label file), checked in order
    splits = (
        (MIN_DATA, TRAINING, TRAINING_LABEL),
        (MAX_DATA, TEST, TEST_LABEL),
        (MAX_VAL, VAL, VAL_LABEL),
    )
    # A review is only written while its label's count is below some limit,
    # so the largest limit is the point where a label is full.
    capacity = max(limit for limit, _, _ in splits)
    totals = {0: 0, 1: 0}

    with open(JSON, 'r') as json_file:
        for element in json_file:
            # tolerate blank lines (e.g. trailing whitespace at end of file)
            if not element.strip():
                continue

            # parse once per line instead of once per field access
            review = json.loads(element)
            label = 0 if int(review['overall']) in (1, 2, 3) else 1

            for limit, text_file, label_file in splits:
                if totals[label] < limit:
                    write_data(label, review['reviewText'],
                               text_file, label_file)
                    totals[label] += 1
                    break

            # nothing more can be written: skip the rest of the file
            if min(totals.values()) >= capacity:
                break
                    
def main():
    """
    Entry point: calls sort_data to build the csv files from the JSON file
    """
    sort_data()

# Run the split immediately when this code is executed
main()

In [None]:
#Tokenizes the training and test data with Tensorflow
#@author Chance Simmons
#@version October 2018
import tensorflow as tf
import numpy as np
import csv
from tensorflow.python.keras.preprocessing.text import text_to_word_sequence
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

# Passed to Tokenizer as num_words in main()
MAX_NUM = 10000
# Passed to pad_sequences as maxlen in main()
LINE_LENGTH = 250
# Module-level accumulator: get_data appends every row it reads here, and
# main() fits the tokenizer on it after all three splits have been loaded.
complete_data = []
# Input csv files holding the review text of each split
TRAIN_DATA = "Data/training.csv"
TEST_DATA = "Data/test.csv"
VAL_DATA = "Data/val.csv"
# Output csv files that receive the padded token sequences of each split
TRAIN_TOKEN = "Data/train_token.csv"
TEST_TOKEN = "Data/test_token.csv"
VAL_TOKEN = "Data/val_token.csv"

def get_data(file):
    """
    Reads the review text from a csv file and returns it as a list of
    strings, one per csv row.

    Every string is also appended to the module-level complete_data list,
    which main() uses to fit the tokenizer.

    Parameters:
        file(str): the file to read data from

    Return:
        data(str[]): the text of each csv row, in file order; the fields of
            a row are joined with a single space, and an empty row yields
            an empty string so row positions are preserved
    """
    data = []
    # newline='' leaves line endings to the csv module, so newlines embedded
    # in quoted review text are read back intact
    with open(file, 'r', newline='') as csv_file:
        for row in csv.reader(csv_file):
            # Use the field text itself. str(row) would produce the list
            # repr ("['some text']"), leaking quote characters and escape
            # sequences into the text handed to the tokenizer.
            text = ' '.join(row)
            data.append(text)
            complete_data.append(text)
    return data

def write_data(file,data):
    """
    Appends the rows of the tokenized data to a csv file.

    The file is opened in append mode, so existing rows are kept.

    Parameters:
        file(str): path of the csv file to append to
        data: iterable of rows; each row is written as one csv line
    """
    # newline='' leaves line endings to the csv module
    with open(file, 'a', newline='') as token_file:
        csv.writer(token_file).writerows(data)
            
def main():
    """
    Tokenizes the training, test and validation text and appends the padded
    sequences to the matching token csv files.

    All three splits are loaded first, the tokenizer is fitted on everything
    collected in complete_data, and then each split is converted to
    sequences, post-padded with maxlen LINE_LENGTH and written out.
    """
    split_files = (
        (TRAIN_DATA, TRAIN_TOKEN),
        (TEST_DATA, TEST_TOKEN),
        (VAL_DATA, VAL_TOKEN),
    )

    # Load every split before fitting: get_data also fills complete_data,
    # which must hold all three splits when the tokenizer is fitted.
    loaded = [(get_data(source), target) for source, target in split_files]

    tokenizer = Tokenizer(num_words=MAX_NUM, lower=True, split=" ")
    tokenizer.fit_on_texts(complete_data)

    # Convert all splits first, then write them in train/test/val order.
    converted = []
    for texts, target in loaded:
        sequences = tokenizer.texts_to_sequences(texts)
        padded = pad_sequences(sequences, maxlen=LINE_LENGTH,
                               padding='post')
        converted.append((target, padded))

    for target, padded in converted:
        write_data(target, padded)

main()