In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

import json
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

import torch
import torch.nn as nn
import torch.optim as optim



In [2]:
# reads in the json file, only to the max entries and returns them as json_array, if max entries is set to 0 then it reads the full thing
def read_partial_json_file(filename, max_entries=0, encoding='utf-8'):
    json_array = []
    with open(filename, 'r', encoding=encoding) as file:
        if max_entries == 0:
            for line in file:
                json_array.append(json.loads(line))
        else:
            for _ in range(max_entries):
                line = file.readline()
                if not line:
                    break
                json_array.append(json.loads(line))
    return json_array


def add_missing_keys(json_array):
    for obj in json_array:
        for key in ['stars', 'useful', 'funny', 'cool', 'text']:
            if key not in obj:
                obj[key] = 0
                if key == 'stars':
                    obj[key] = 3
                print("Key {} not found in json".format(key))
    return json_array


# removes specified keys from json array
def remove_keys(json_array, keys_to_remove):
    for obj in json_array:
        for key in keys_to_remove:
            obj.pop(key, None)
    return json_array


def ConvertJSONFileToDataFrame(filename, max_entries=1000, encoding='utf-8'):
    #load in the json array
    json_array = read_partial_json_file(filename, max_entries, encoding)
    #add in the missing keys, will set to 0 for now but a heuristic for this will have to be made.
    json_array = add_missing_keys(json_array)
    df = pd.DataFrame(json_array)
    ColumnsToRemove = ['business_id', 'user_id', 'date', 'review_id']
    df = df.drop(columns=ColumnsToRemove)
    return df


In [3]:
filename = 'yelp_academic_dataset_review.json'
dataset = ConvertJSONFileToDataFrame(filename, 5000)

# dataset.drop(dataset[(dataset.stars > 1) & (dataset.stars < 5)].index, inplace=True)

nltk.download('stopwords')
nltk.download('punkt')

stem = SnowballStemmer("english")
stopWords = stopwords.words('english')

def stemText(text):
    return " ".join([i for i in word_tokenize(text) if not i in stopWords])

#Data preprocessing: convert text to lowercase
X = dataset['text'].map(lambda x: stemText(x.lower()))
#convert star count to categories starting from 0
translation = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4}
labels = ['1', '2', '3', '4', '5']
y = dataset['stars'].copy()
y.replace(translation, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=117)
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=0.5, random_state=312)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ckmfo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ckmfo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
vec = CountVectorizer()
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

In [5]:
class NNClassifier(nn.Module):
    def __init__(self, inputSize, layer2size, layer3size):
        super(NNClassifier, self).__init__()
        self.fc1 = nn.Linear(inputSize, layer2size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(layer2size, layer3size)
        self.fc3 = nn.Linear(layer3size, 5)
        self.dropout = nn.Dropout(0.3)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.dropout(x)
        x = self.fc3(x)
        x = self.softmax(x)
        return x

In [6]:
X_train_tensor = torch.tensor(X_train_vec.toarray(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.long)
X_test_tensor = torch.tensor(X_test_vec.toarray(), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.long)

In [7]:
model = NNClassifier(X_train_vec.shape[1], 128, 64)

In [8]:
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import classification_report

#choo choo
trainDataset = TensorDataset(X_train_tensor, y_train_tensor)
trainLoader = DataLoader(trainDataset, batch_size=64, shuffle=True)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

trainingEpochs = 20
for i in range(trainingEpochs):
    sumLoss = 0.0
    for text, stars in trainLoader:
        optimizer.zero_grad()
        result = model(text)
        loss = criterion(result, stars)
        loss.backward()
        optimizer.step()
        sumLoss += loss.item()
    print("Loss: {}".format(sumLoss))

outputs = model(X_test_tensor)
wasteTensors, predicted = torch.max(outputs, 1)
y_test_np = y_test_tensor.numpy()
predicted_np = predicted.numpy()
print(classification_report(y_test_np, predicted_np))

Loss: 94.533775806427
Loss: 81.22799265384674
Loss: 72.84832310676575
Loss: 67.0091964006424
Loss: 63.14890170097351
Loss: 60.82495927810669
Loss: 59.6997926235199
Loss: 59.12156391143799
Loss: 58.8633993268013
Loss: 58.732957541942596
Loss: 58.5247318148613
Loss: 58.428956389427185
Loss: 58.38627713918686
Loss: 58.35429239273071
Loss: 58.3017703294754
Loss: 58.302753031253815
Loss: 58.26812690496445
Loss: 58.20780748128891
Loss: 58.155430018901825
Loss: 58.12505620718002
              precision    recall  f1-score   support

           0       0.68      0.72      0.70        47
           1       0.41      0.29      0.34        41
           2       0.36      0.34      0.35        59
           3       0.40      0.45      0.43       119
           4       0.73      0.72      0.72       234

    accuracy                           0.58       500
   macro avg       0.52      0.51      0.51       500
weighted avg       0.58      0.58      0.57       500


In [9]:
X_valid_vec = vec.transform(X_valid)
X_valid_tensor = torch.tensor(X_valid_vec.toarray(), dtype=torch.float32)
y_valid_tensor = torch.tensor(y_valid.to_numpy(), dtype=torch.long)


outputs = model(X_valid_tensor)
wasteTensors, predicted = torch.max(outputs, 1)
y_valid_np = y_valid_tensor.numpy()
predicted_np = predicted.numpy()
print(classification_report(y_valid_np, predicted_np))

              precision    recall  f1-score   support

           0       0.65      0.69      0.67        51
           1       0.29      0.21      0.24        52
           2       0.25      0.20      0.22        71
           3       0.26      0.33      0.29       101
           4       0.70      0.69      0.69       225

    accuracy                           0.50       500
   macro avg       0.43      0.42      0.42       500
weighted avg       0.50      0.50      0.50       500


In [10]:
class NNRegressor(nn.Module):
    def __init__(self, inputSize, layer2size, layer3size):
        super(NNRegressor, self).__init__()
        self.fc1 = nn.Linear(inputSize, layer2size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(layer2size, layer3size)
        self.fc3 = nn.Linear(layer3size, 1)
        self.dropout = nn.Dropout(0.3)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.dropout(x)
        x = self.fc3(x)
        x = self.softmax(x)
        return x

In [11]:
y = dataset['cool']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=117)
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=0.5, random_state=312)

y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32).unsqueeze(1)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.float32).unsqueeze(1)
y_valid_tensor = torch.tensor(y_valid.to_numpy(), dtype=torch.float32).unsqueeze(1)

In [12]:
coolModel = NNRegressor(X_train_vec.shape[1], 128, 64)

In [13]:
from sklearn.metrics import r2_score

trainDataset = TensorDataset(X_train_tensor, y_train_tensor)
trainLoader = DataLoader(trainDataset, batch_size=64, shuffle=True)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


for i in range(trainingEpochs):
    sumLoss = 0.0
    for text, coolness in trainLoader:
        optimizer.zero_grad()
        result = coolModel(text)
        loss = criterion(result, coolness)
        loss.backward()
        optimizer.step()
        sumLoss += loss.item()
    print("Loss: {}".format(sumLoss))

Loss: 86.9375
Loss: 86.59375
Loss: 86.640625
Loss: 88.34375
Loss: 86.640625
Loss: 88.5
Loss: 86.71875
Loss: 88.28125
Loss: 88.90625
Loss: 86.609375
Loss: 86.9375
Loss: 86.609375
Loss: 86.59375
Loss: 86.578125
Loss: 86.765625
Loss: 86.578125
Loss: 88.203125
Loss: 86.59375
Loss: 86.578125
Loss: 86.578125


In [14]:
outputs = coolModel(X_test_tensor)
wasteTensors, predicted = torch.max(outputs, 1)
y_test_np = y_test_tensor.numpy()
predicted_np = predicted.numpy()
print(classification_report(y_test_np, predicted_np))

              precision    recall  f1-score   support

         0.0       0.80      1.00      0.89       401
         1.0       0.00      0.00      0.00        65
         2.0       0.00      0.00      0.00        22
         3.0       0.00      0.00      0.00         5
         4.0       0.00      0.00      0.00         3
         5.0       0.00      0.00      0.00         2
        10.0       0.00      0.00      0.00         1
        11.0       0.00      0.00      0.00         1

    accuracy                           0.80       500
   macro avg       0.10      0.12      0.11       500
weighted avg       0.64      0.80      0.71       500


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
