# Resturant Review Sentiment - Neural Network
### Matthew Newton
* It would be interesting to see how well a neural network can fit to this dataset.
* The dataset is structured but also has text as an unstructured datatype.
* PyTorch is used to implement the neural network with TfidfVectorizer to create the text features as in the linear model, a word embedding such as Glove or Word2Vec would be more effective but more computationally expensive.

In [51]:
import pandas as pd
import pickle
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [52]:
df_review = pd.read_pickle("./cleaned_data/reviews_cleaned_nltk.pickle")

In [53]:
# Split training data into train data, cross validation and test data
df_review = df_review.dropna()
df_review = df_review[:50000] # For testing use subset of total dataset
features = ['text', 'title', 'type', 'priceInterval', 'date', 'review_length', 'rest_rating']
X_train, X_cv, y_train, y_cv = train_test_split(df_review[features], df_review['rating'], test_size = 0.30, random_state = 0)
X_cv, X_test, y_cv, y_test = train_test_split(X_cv[features], y_cv, test_size = 0.50, random_state = 0)

In [54]:
# Preprocess text features using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000) 
X_train_text = X_train['text'] + ' ' + X_train['title']  # Combine review and title

# Fit and transform the review + review title combined text
X_train_tfidf = vectorizer.fit_transform(X_train_text).toarray()

# Encode categorical features (Type)
le = LabelEncoder()
X_train['type'] = le.fit_transform(X_train['type'])
# Define a custom transformation function for handling unseen labels
def safe_transform(label_encoder, series):
    return series.apply(lambda x: label_encoder.transform([x])[0] if x in label_encoder.classes_ else -1)

# Normalize numerical features (Price, Date, Review Length, Restaurant Rating)
scaler = StandardScaler()
X_train[['priceInterval', 'date', 'review_length', 'rest_rating']] = scaler.fit_transform(
    X_train[['priceInterval', 'date', 'review_length', 'rest_rating']]
)

# Concatenate all features (TF-IDF text features, numerical and categorical features)
X_train_combined = np.hstack((
    X_train_tfidf,  # Text features
    X_train[['type', 'priceInterval', 'date', 'review_length', 'rest_rating']].values  # Other features
))

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_combined, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values - 1, dtype=torch.long) 


In [55]:
# Build the Neural Network Model
class ReviewNN(nn.Module):
    def __init__(self, input_dim, output_dim=5):
        super(ReviewNN, self).__init__()
#         self.fc1 = nn.Linear(input_dim, 256)
#         self.fc2 = nn.Linear(256, 128)
#         self.fc3 = nn.Linear(128, 64)
#         self.fc4 = nn.Linear(64, output_dim)
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 32)
        self.fc5 = nn.Linear(32, 32)
        self.fc4 = nn.Linear(32, output_dim)
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.softmax = nn.Softmax(dim=output_dim)
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.relu(self.fc3(x))
        x = self.dropout(x)
        x = self.relu(self.fc5(x))
        x = self.dropout(x)
        x = self.relu(self.fc5(x))
        x = self.fc4(x)
        #x = self.softmax
        return x

In [56]:
# Initialize and train the model
input_dim = X_train_combined.shape[1]  # Number of input features
model = ReviewNN(input_dim=input_dim)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.001) #

In [57]:
# Training loop
def train_model(model, X_train_tensor, y_train_tensor, epochs=10, batch_size=64):
    model.train()
    for epoch in range(epochs):
        permutation = torch.randperm(X_train_tensor.size()[0])
        
        for i in range(0, X_train_tensor.size()[0], batch_size):
            indices = permutation[i:i + batch_size]
            batch_X, batch_y = X_train_tensor[indices], y_train_tensor[indices]
            
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
        
        print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}')

train_model(model, X_train_tensor, y_train_tensor, epochs=50)

Epoch 1/50, Loss: 1.2747
Epoch 2/50, Loss: 1.2514
Epoch 3/50, Loss: 1.1312
Epoch 4/50, Loss: 1.0853
Epoch 5/50, Loss: 1.0632
Epoch 6/50, Loss: 0.9618
Epoch 7/50, Loss: 0.7473
Epoch 8/50, Loss: 0.8003
Epoch 9/50, Loss: 0.9647
Epoch 10/50, Loss: 0.9323
Epoch 11/50, Loss: 0.7705
Epoch 12/50, Loss: 0.7622
Epoch 13/50, Loss: 0.6575
Epoch 14/50, Loss: 0.7578
Epoch 15/50, Loss: 0.5959
Epoch 16/50, Loss: 0.7738
Epoch 17/50, Loss: 0.9130
Epoch 18/50, Loss: 0.7581
Epoch 19/50, Loss: 0.7252
Epoch 20/50, Loss: 0.9411
Epoch 21/50, Loss: 0.7806
Epoch 22/50, Loss: 0.7006
Epoch 23/50, Loss: 0.8247
Epoch 24/50, Loss: 0.9252
Epoch 25/50, Loss: 0.8686
Epoch 26/50, Loss: 0.7419
Epoch 27/50, Loss: 0.6852
Epoch 28/50, Loss: 0.8023
Epoch 29/50, Loss: 0.9289
Epoch 30/50, Loss: 0.7991
Epoch 31/50, Loss: 0.7831
Epoch 32/50, Loss: 0.7609
Epoch 33/50, Loss: 0.7393
Epoch 34/50, Loss: 0.7334
Epoch 35/50, Loss: 0.8517
Epoch 36/50, Loss: 0.8256
Epoch 37/50, Loss: 0.8891
Epoch 38/50, Loss: 0.7836
Epoch 39/50, Loss: 0.

In [58]:
# Preprocess validation and test data using the same vectorizer, label encoder, and scaler
X_cv_text = X_cv['text'] + ' ' + X_cv['title']
X_cv_tfidf = vectorizer.transform(X_cv_text).toarray()
X_test_text = X_test['text'] + ' ' + X_test['title']
X_test_tfidf = vectorizer.transform(X_test_text).toarray()

# Encode restaurant type using the same LabelEncoder
X_cv['type'] = safe_transform(le, X_cv['type'])
X_test['type'] = safe_transform(le, X_test['type'])

# Scale numerical features using the same StandardScaler
X_cv[['priceInterval', 'date', 'review_length', 'rest_rating']] = scaler.transform(
    X_cv[['priceInterval', 'date', 'review_length', 'rest_rating']]
)
X_test[['priceInterval', 'date', 'review_length', 'rest_rating']] = scaler.transform(
    X_test[['priceInterval', 'date', 'review_length', 'rest_rating']]
)

# Combine all features for validation and test sets
X_cv_combined = np.hstack((
    X_cv_tfidf,  # Text features
    X_cv[['type', 'priceInterval', 'date', 'review_length', 'rest_rating']].values 
))

X_test_combined = np.hstack((
    X_test_tfidf,  # Text features
    X_test[['type', 'priceInterval', 'date', 'review_length', 'rest_rating']].values  
))

# Convert to PyTorch tensors
X_cv_tensor = torch.tensor(X_cv_combined, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_combined, dtype=torch.float32)
y_cv_tensor = torch.tensor(y_cv.values - 1, dtype=torch.long) 
y_test_tensor = torch.tensor(y_test.values - 1, dtype=torch.long)


In [59]:
def evaluate_model(model, X_tensor, y_tensor):
    model.eval()  
    with torch.no_grad():  
        outputs = model(X_tensor)
        _, predictions = torch.max(outputs, dim=1) 

    # Calculate accuracy
    correct = (predictions == y_tensor).sum().item()
    total = y_tensor.size(0)
    accuracy = correct / total
    print(f'Accuracy: {accuracy * 100:.2f}%')
    
# Evaluate on training set
evaluate_model(model, X_train_tensor, y_train_tensor)

# Evaluate on validation set
evaluate_model(model, X_cv_tensor, y_cv_tensor)

# Evaluate on test set
evaluate_model(model, X_test_tensor, y_test_tensor)


Accuracy: 72.06%
Accuracy: 66.53%
Accuracy: 66.36%


In [60]:
model.eval()

with torch.no_grad():
    outputs = model(X_cv_tensor)  
    _, predictions = torch.max(outputs, dim=1)  

# Convert predictions and true labels to NumPy arrays
y_cv_true = y_cv_tensor.numpy() 
y_cv_pred = predictions.numpy() 

# Evaluation metrics
print("Accuracy:", accuracy_score(y_cv_true, y_cv_pred))
print("\nClassification Report:\n", classification_report(y_cv_true, y_cv_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_cv_true, y_cv_pred))

Accuracy: 0.6653333333333333

Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.80      0.70       522
           1       0.44      0.02      0.04       438
           2       0.48      0.43      0.45       803
           3       0.54      0.46      0.50      2063
           4       0.75      0.89      0.82      3674

    accuracy                           0.67      7500
   macro avg       0.57      0.52      0.50      7500
weighted avg       0.64      0.67      0.64      7500


Confusion Matrix:
 [[ 417    2   75   26    2]
 [ 194    8  187   42    7]
 [  53    6  342  354   48]
 [   4    2   89  947 1021]
 [   4    0   19  375 3276]]


In [61]:
model.eval()

with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predictions = torch.max(outputs, dim=1) 

# Convert predictions and true labels to NumPy arrays
y_test_true = y_test_tensor.numpy()  
y_test_pred = predictions.numpy()  

# Evaluation metrics
print("Accuracy:", accuracy_score(y_test_true, y_test_pred))
print("\nClassification Report:\n", classification_report(y_test_true, y_test_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test_true, y_test_pred))

Accuracy: 0.6636

Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.82      0.70       537
           1       0.58      0.04      0.07       472
           2       0.54      0.44      0.48       847
           3       0.52      0.47      0.49      1988
           4       0.76      0.88      0.81      3656

    accuracy                           0.66      7500
   macro avg       0.60      0.53      0.51      7500
weighted avg       0.65      0.66      0.64      7500


Confusion Matrix:
 [[ 442    7   57   29    2]
 [ 216   18  179   54    5]
 [  62    6  372  364   43]
 [   3    0   72  931  982]
 [   0    0   14  428 3214]]
