In [209]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

import json

import torch
import torch.nn as nn

import numpy as np

from torch.utils.data import Dataset
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

In [210]:
# reads in the json file, only to the max entries and returns them as json_array, if max entries is set to 0 then it reads the full thing
def read_partial_json_file(filename, max_entries=0, encoding='utf-8'):
    json_array = []
    with open(filename, 'r', encoding=encoding) as file:
        if max_entries == 0:
            for line in file:
                json_array.append(json.loads(line))
        else:
            for _ in range(max_entries):
                line = file.readline()
                if not line:
                    break
                json_array.append(json.loads(line))
    return json_array

def add_missing_keys(json_array):
    for obj in json_array:
        for key in ['stars', 'useful', 'funny', 'cool', 'text']:
            if key not in obj:
                obj[key] = 0
                if key == 'stars':
                    obj[key] = 3
                print("Key {} not found in json".format(key))
    return json_array

# removes specified keys from json array
def remove_keys(json_array, keys_to_remove):
    for obj in json_array:
        for key in keys_to_remove:
            obj.pop(key, None)
    return json_array

def ConvertJSONFileToDataFrame(filename, max_entries=1000, encoding='utf-8'):
    #load in the json array
    json_array = read_partial_json_file(filename, max_entries, encoding)
    #add in the missing keys, will set to 0 for now but a heuristic for this will have to be made.
    json_array = add_missing_keys(json_array)
    df = pd.DataFrame(json_array)
    ColumnsToRemove = ['business_id', 'user_id', 'date', 'review_id']
    df = df.drop(columns=ColumnsToRemove)
    return df

In [211]:

filename = 'yelp_academic_dataset_review.json'
df = ConvertJSONFileToDataFrame(filename)
#df.head(10)

# X = df[['useful', 'funny', 'cool', 'text']]
# y = df[['stars']]

X = df['text']
y = df['stars']

X_im, X_valid, y_im, y_valid = train_test_split(X, y, test_size=0.2, random_state=117)
X_train, X_test, y_train, y_test = train_test_split(X_im, y_im, test_size=0.2, random_state=312)

vectorizer = CountVectorizer(lowercase=False)
vectorizer.fit(df['text'])

X_train_vec = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)

print(X_train_vec.shape)
print(X_test_vec.shape)

X_train.head(10)

# json_array = read_partial_json_file(filename, 1000, 'utf-8')
# json_array = add_missing_keys(json_array)
# json_array = remove_keys(json_array, ['business_id', 'user_id', 'date', 'review_id'])
# 
# print(json_array[0])

X_train_tensor = torch.from_numpy(X_train_vec.todense()).float()
X_test_tensor = torch.from_numpy(X_test_vec.todense()).float()
Y_train_tensor = torch.from_numpy(np.array(y_train))
Y_test_tensor = torch.from_numpy(np.array(y_test))

print(X_train_tensor.shape)
print(Y_train_tensor.shape)

(640, 9701)
(160, 9701)
torch.Size([640, 9701])
torch.Size([640])


In [212]:
#I don't have a GPU on my laptop so this is untestable
if torch.cuda.is_available():
    torchDevice = torch.device('cuda')
else:
    torchDevice = torch.device('cpu')

In [213]:
class NeuralNet(nn.Module):
    def __init__(self, vocabSize, embed_size, layer1size, layer2size, layer3size, dropout, maxWordCt):
        super().__init__()
        
        self.embedding = nn.Embedding(maxWordCt * vocabSize, embed_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.feed = nn.Linear(embed_size, layer1size)
        self.full1 = nn.Linear(layer1size, layer2size)
        self.full2 = nn.Linear(layer2size, layer3size)
        self.full3 = nn.Linear(layer3size, 5) #the output is just projected star count
        
        #First fully connected layer
        self.fc1 = torch.nn.Linear(vocabSize,layer1size)
        #Second fully connected layer
        self.fc2 = torch.nn.Linear(layer1size,5)
        #Final output of sigmoid function      
        self.output = torch.nn.Sigmoid()
    
    def forward(self, text):
        #return self.linear(text)
        fc1 = self.fc1(text)
        fc2 = self.fc2(fc1)
        output = self.output(fc2)
        return output[:, -1]
        
        embedded = self.embedding(text)
        x = embedded.view(embedded.shape[0], -1)
        x = self.relu(self.feed(x))
        x = self.dropout(x)
        x = self.relu(self.full1(x))
        x = self.dropout(x)
        x = self.relu(self.full2(x))
        x = self.dropout(x)
        result = self.full3(x)
        return result
        

In [214]:
class Network(torch.nn.Module):    
    def __init__(self,vocab_size,hidden_units,num_classes): 
      super().__init__()
      #First fully connected layer
      self.fc1 = torch.nn.Linear(vocab_size,hidden_units)
      #Second fully connected layer
      self.fc2 = torch.nn.Linear(hidden_units,num_classes)
      #Final output of sigmoid function      
      self.output = torch.nn.Sigmoid()
    def forward(self,x):
      fc1 = self.fc1(x)
      fc2 = self.fc2(fc1)
      output = self.output(fc2)
      return output[:, -1]

In [215]:
model = NeuralNet(X_train_vec.shape[1], 300, 128, 64, 32, dropout=0.5, maxWordCt=100)

#print(X_train_tensor)

pred = model(X_train_tensor)
print(pred)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (640x9701 and 300x128)

In [None]:
trainData = TensorDataset(X_train_tensor, Y_train_tensor)
print(trainData[0])

In [None]:
trainLoader = DataLoader(trainData, batch_size=16, shuffle=True)

In [None]:
#Initialize optimizer
optimizer =torch.optim.SGD(model.parameters(), lr=0.001)#Initialize loss function
loss_fun = nn.BCELoss() 

model.train()

for i in range(5):   
    for x_batch,y_batch in trainLoader:       
        model.train()
        y_pred = model(x_batch)
        loss = loss_fun(y_pred,y_batch.float())
        print(loss)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
print('After {} epoch training loss is {}'.format(i,loss.item()))

In [None]:
m2 = Network(X_train_vec.shape[1], 3, 5)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)  #Initialize loss function
loss_fun = nn.BCELoss()

model.train()

for i in range(5):
    for x_batch, y_batch in trainLoader:
        model.train()
        y_pred = model(x_batch)
        loss = loss_fun(y_pred, y_batch.float())
        print(loss)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

print('After {} epoch training loss is {}'.format(i, loss.item()))
