In [149]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

import json

import torch
import torch.nn as nn

import numpy as np

from torch.utils.data import Dataset
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader


In [150]:
# reads in the json file, only to the max entries and returns them as json_array, if max entries is set to 0 then it reads the full thing
def read_partial_json_file(filename, max_entries=0, encoding='utf-8'):
    json_array = []
    with open(filename, 'r', encoding=encoding) as file:
        if max_entries == 0:
            for line in file:
                json_array.append(json.loads(line))
        else:
            for _ in range(max_entries):
                line = file.readline()
                if not line:
                    break
                json_array.append(json.loads(line))
    return json_array


def add_missing_keys(json_array):
    for obj in json_array:
        for key in ['stars', 'useful', 'funny', 'cool', 'text']:
            if key in obj:
                if key == 'stars':
                    obj[key] = obj[key]/5
            if key not in obj:
                obj[key] = 0
                if key == 'stars':
                    obj[key] = 3/5
                print("Key {} not found in json".format(key))
    return json_array


# removes specified keys from json array
def remove_keys(json_array, keys_to_remove):
    for obj in json_array:
        for key in keys_to_remove:
            obj.pop(key, None)
    return json_array


def ConvertJSONFileToDataFrame(filename, max_entries=1000, encoding='utf-8'):
    #load in the json array
    json_array = read_partial_json_file(filename, max_entries, encoding)
    #add in the missing keys, will set to 0 for now but a heuristic for this will have to be made.
    json_array = add_missing_keys(json_array)
    df = pd.DataFrame(json_array)
    ColumnsToRemove = ['business_id', 'user_id', 'date', 'review_id']
    df = df.drop(columns=ColumnsToRemove)
    return df


filename = 'yelp_academic_dataset_review.json'
df = ConvertJSONFileToDataFrame(filename)


In [151]:
# Shuffle the data and then split it, keeping 20% aside for testing
filename = 'yelp_academic_dataset_review.json'
dataset = ConvertJSONFileToDataFrame(filename, 10000)

X_train, X_test, y_train, y_test = train_test_split(dataset['text'], dataset['stars'], test_size=0.2)

vectorizer = CountVectorizer(lowercase=True)
vectorizer.fit(dataset['text'])
X_train_vec = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)
print(X_train_vec.shape)
print(X_test_vec.shape)

(8000, 24532)
(2000, 24532)


In [152]:
class Network(torch.nn.Module):
    def __init__(self, vocab_size, hidden_units, num_classes):
        super().__init__()
        #First fully connected layer
        self.fc1 = torch.nn.Linear(vocab_size, hidden_units)
        #Second fully connected layer
        self.fc2 = torch.nn.Linear(hidden_units, num_classes)
        #Final output of sigmoid function      
        self.output = torch.nn.Sigmoid()
        self.dropout = nn.Dropout(0.5)


    def forward(self, x):
        fc1 = self.fc1(x)
        fc1 = self.dropout(fc1)
        fc2 = self.fc2(fc1)
        fc2 = self.dropout(fc2)
        output = self.output(fc2)
        return output[:, -1]


In [153]:
X_train_tensor = torch.from_numpy(X_train_vec.todense()).float()
X_test_tensor = torch.from_numpy(X_test_vec.todense()).float()
Y_train_tensor = torch.from_numpy(np.array(y_train))
Y_test_tensor = torch.from_numpy(np.array(y_test))

trainData = TensorDataset(X_train_tensor, Y_train_tensor)
print(trainData[0])
trainLoader = DataLoader(trainData, batch_size=16, shuffle=True)

(tensor([0., 0., 0.,  ..., 0., 0., 0.]), tensor(0.8000, dtype=torch.float64))


In [154]:
m2 = Network(X_train_vec.shape[1], 128, 5)
optimizer = torch.optim.SGD(m2.parameters(), lr=0.001)  #Initialize loss function
loss_fun = nn.BCELoss(reduction='mean')

m2.train()

for i in range(5):
    for x_batch, y_batch in trainLoader:
        m2.train()
        y_pred = m2(x_batch)
        loss = loss_fun(y_pred, y_batch.float())
        # print("pred: {} batch: {}".format(y_pred, y_batch.float()))
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

print('After {} epoch training loss is {}'.format(i, loss.item()))

print(X_test)
print(y_test)
print(m2(X_test_tensor))


After 4 epoch training loss is 0.5483723878860474
6539    THE GOOD: great food for a decent price.  We g...
8069    The tea is not that good...but the cheese crea...
747     Friday, July 22 we decided to give this eatery...
2430    If this spot plans to make it they need to hav...
873     Had not been for years.\n\nWe went for lunch. ...
                              ...                        
5887    We sat on the patio on one of the most beautif...
8119    The man who worked on me did a wonderful job. ...
5562    I come here all the time never had a problem. ...
7681    Excellent little old school bakery. Maybe the ...
5864    I took my kiddos to incredible pizza today.  W...
Name: text, Length: 2000, dtype: object
6539    0.4
8069    0.6
747     0.6
2430    0.4
873     0.4
       ... 
5887    1.0
8119    1.0
5562    0.8
7681    1.0
5864    0.4
Name: stars, Length: 2000, dtype: float64
tensor([0.7183, 0.7450, 0.7901,  ..., 0.7087, 0.5000, 0.6308],
       grad_fn=<SelectBackward0>)
