In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

import json
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer 

import torch
import torch.nn as nn
import torch.optim as optim

In [49]:
# Reads a JSON-lines file (one JSON document per line) and returns the parsed entries as a list.
def read_partial_json_file(filename, max_entries=0, encoding='utf-8'):
    """Parse up to `max_entries` JSON documents from a JSON-lines file.

    Args:
        filename: Path of the file to read; each non-blank line must hold one JSON document.
        max_entries: Maximum number of entries to return. 0 means "read the whole file";
            a negative value yields an empty list.
        encoding: Text encoding used to open the file.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    json_array = []
    read_everything = max_entries == 0
    with open(filename, 'r', encoding=encoding) as file:
        for line in file:
            if not read_everything and len(json_array) >= max_entries:
                break
            # Blank / whitespace-only lines (e.g. a trailing newline at the end of the
            # file) are not JSON documents; skip them instead of letting json.loads raise.
            if not line.strip():
                continue
            json_array.append(json.loads(line))
    return json_array


def add_missing_keys(json_array):
    """Fill in default values for review fields that are absent from an entry.

    Each object in `json_array` is modified in place. A message is printed for
    every key that had to be added.

    Defaults:
        stars  -> 3
        useful, funny, cool -> 0
        text   -> '' (an empty string, so string operations such as `.lower()`
                  keep working on the filled-in value)

    Args:
        json_array: Iterable of dict-like review objects.

    Returns:
        The same `json_array` object that was passed in.
    """
    # Insertion order matches the order in which missing keys are added and reported.
    defaults = {'stars': 3, 'useful': 0, 'funny': 0, 'cool': 0, 'text': ''}
    for obj in json_array:
        for key, fallback in defaults.items():
            if key not in obj:
                obj[key] = fallback
                print("Key {} not found in json".format(key))
    return json_array


# Strips the given keys out of every object in the json array.
def remove_keys(json_array, keys_to_remove):
    """Delete each of `keys_to_remove` from every object, ignoring keys that are absent.

    The objects are modified in place and the same `json_array` is returned.
    """
    for record in json_array:
        for unwanted in keys_to_remove:
            if unwanted in record:
                del record[unwanted]
    return json_array


def ConvertJSONFileToDataFrame(filename, max_entries=1000, encoding='utf-8'):
    """Load reviews from a JSON-lines file into a DataFrame without the identifier columns.

    Args:
        filename: Path of the JSON-lines file to read.
        max_entries: Maximum number of entries to load, forwarded to
            read_partial_json_file (where 0 means "read everything").
        encoding: Text encoding used to open the file.

    Returns:
        A pandas DataFrame with one row per loaded entry and the columns
        'business_id', 'user_id', 'date' and 'review_id' removed when present.
    """
    # load in the json array
    json_array = read_partial_json_file(filename, max_entries, encoding)
    # add in the missing keys with placeholder defaults
    # TODO: a heuristic for choosing these fill-in values will have to be made.
    json_array = add_missing_keys(json_array)
    df = pd.DataFrame(json_array)
    columns_to_remove = ['business_id', 'user_id', 'date', 'review_id']
    # errors='ignore': an empty file, or entries lacking some of these identifier
    # columns, must not abort the load with a KeyError.
    return df.drop(columns=columns_to_remove, errors='ignore')


In [50]:
# Load at most the first 1000 entries of the review file into a DataFrame.
filename = 'yelp_academic_dataset_review.json'
dataset = ConvertJSONFileToDataFrame(filename, 1000)

# Fetch the NLTK resources used below: the stopword lists read by stopwords.words()
# and 'punkt' (presumably the tokenizer data word_tokenize relies on -- confirm
# against the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')

# NOTE(review): `stem` is created here but no code visible in this section calls it.
stem = SnowballStemmer("english")
# English stopword list; stemText drops any token found in it.
stopWords = stopwords.words('english')

def stemText(text):
    """Tokenize `text` and drop stopwords, returning the remaining tokens joined by single spaces.

    Tokens are compared to the module-level `stopWords` exactly as written
    (case-sensitive), so callers should lowercase the text first.

    NOTE(review): despite the name, no stemming is applied here -- the
    SnowballStemmer instance `stem` is never called. Confirm whether stemming
    was intended before relying on this function for it.
    """
    # Build the lookup once per call: one pass over stopWords replaces a full
    # list scan for every token, while still honouring later edits to stopWords.
    stop_lookup = set(stopWords)
    return " ".join(token for token in word_tokenize(text) if token not in stop_lookup)

#Data preprocessing: lowercase each review, then tokenize and drop stopwords
X = dataset['text'].map(lambda x: stemText(x.lower()))
#convert star count to categories starting from 0
translation = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4}
labels = ['1', '2', '3', '4', '5']
# replace() returns a new Series, so the 'stars' column of `dataset` is left untouched
y = dataset['stars'].replace(translation)

# 80% train, then the remaining 20% is split evenly into test and validation (10% each)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=117)
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=0.5, random_state=312)

# Learn the vocabulary from the training split only: fitting on all of X would let
# words that occur only in the validation/test reviews leak into the feature space.
vectorizer = CountVectorizer(lowercase=True)
vectorizer.fit(X_train)

X_train_vec = vectorizer.transform(X_train)
X_valid_vec = vectorizer.transform(X_valid)
X_test_vec = vectorizer.transform(X_test)

X_train

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ckmfo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ckmfo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


234    best iced latte beignet tried . live music als...
827    classic burger fries . overall pleased experie...
565    best place go gyros ! quality food amazing . a...
661    replaced dive watch battery less 5 minutes rea...
781    hands favorite brewery world ! ! husband stl e...
                             ...                        
599    listened reviews -- kind disappointed . layout...
337    worst service advisors ! used good kelly team ...
80     couple friends stopped late night milkshakes f...
112    westfall replaced roof april . could pleased e...
494    went show seeing university city area . across...
Name: text, Length: 800, dtype: object

In [51]:
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
# The CUDA path is untested: the author's laptop has no GPU.
torchDevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [52]:
class NeuralNet(nn.Module):
    """Feed-forward classifier over embedded token indices, producing 5 class scores.

    Each sample's token indices are embedded, averaged into a single
    `embed_size` vector, and passed through three ReLU + dropout hidden layers
    followed by a final linear layer with 5 outputs (one per star category).

    Args:
        vocabSize: Vocabulary size; multiplied by `maxWordCt` to size the embedding table.
        embed_size: Width of each embedding vector.
        layer1size: Width of the first hidden layer.
        layer2size: Width of the second hidden layer.
        layer3size: Width of the third hidden layer.
        dropout: Drop probability used after every hidden layer.
        maxWordCt: Multiplier applied to `vocabSize` for the embedding table size.
    """

    def __init__(self, vocabSize, embed_size, layer1size, layer2size, layer3size, dropout, maxWordCt):
        super().__init__()

        # NOTE(review): the table has maxWordCt * vocabSize rows. That is kept as-is
        # so every index accepted before is still accepted, but it looks larger than
        # needed if inputs are plain vocabulary indices -- confirm the intended encoding.
        self.embedding = nn.Embedding(maxWordCt * vocabSize, embed_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.feed = nn.Linear(embed_size, layer1size)
        self.full1 = nn.Linear(layer1size, layer2size)
        self.full2 = nn.Linear(layer2size, layer3size)
        self.full3 = nn.Linear(layer3size, 5)  # one raw score per star category

    def forward(self, text):
        """Return raw class scores of shape (batch, 5) for a batch of index tensors.

        Args:
            text: Integer index tensor whose first dimension is the batch; any
                further dimensions are treated as the tokens of each sample.
        """
        embedded = self.embedding(text)
        # self.feed accepts exactly embed_size features, so flattening
        # (batch, tokens, embed_size) into (batch, tokens * embed_size) only works
        # for a single token per sample. Averaging over the token axis gives the
        # same result in that case and supports any number of tokens.
        x = embedded.reshape(embedded.shape[0], -1, embedded.shape[-1]).mean(dim=1)
        x = self.relu(self.feed(x))
        x = self.dropout(x)
        x = self.relu(self.full1(x))
        x = self.dropout(x)
        x = self.relu(self.full2(x))
        x = self.dropout(x)
        result = self.full3(x)
        return result


In [None]:
# Build the classifier; the vocabulary size is the number of columns in the vectorized training matrix.
model = NeuralNet(
    vocabSize=X_train_vec.shape[1],
    embed_size=300,
    layer1size=128,
    layer2size=64,
    layer3size=32,
    dropout=0.5,
    maxWordCt=100,
)
