In [1]:
%%capture
from google.colab import drive
drive.mount("/content/drive")
%cd /content/drive/MyDrive/nyu-stuff/2023-fall/DS-UA\ 301\ Advanced\ DS/final_project

%pip install -r requirements.txt
%pip install wandb -Uq

In [2]:
import pandas
import numpy
import json5
import functools
import json
from matplotlib import pyplot
from typing import Optional, Any, Coroutine, Union, Dict, List
import nltk
from nltk.corpus import stopwords
import string
import contractions
import torch
import wandb
import sklearn
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import datetime
from sklearn.model_selection import train_test_split
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))

nltk.download("averaged_perceptron_tagger")
nltk.download("universal_tagset")
nltk.download("stopwords")
nltk.download("punkt")

def get_best_torch_device():
    if torch.cuda.is_available():  # CUDA
        return "cuda"
    elif torch.backends.mps.is_available():  # Apple Silicon / GPU
        return "mps"
    else:  # Fallback to CPU
        return "cpu"

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
class FairDataset(torch.utils.data.Dataset):
    def __init__(self, df_input):
        df_x = df_input[
            [
                c
                for c in df_input.columns
                if c not in ["review_text", "rating", "length"]
            ]
        ]
        df_y = df_input["rating"]
        self.features = torch.tensor(df_x.values, dtype=torch.float32).to(device)
        self.labels = torch.tensor(df_y.values, dtype=torch.float32).to(device)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

def spawn_layers(
    n_layers,
    input_size,
    output_size,
    is_batch_norm=True,
    activation_func=nn.ReLU,
    output_func=nn.LogSoftmax,
):
    layers = []
    prev_hidden = input_size

    for i in range(n_layers):
        hidden_size = int((prev_hidden + output_size) / 2)

        layers += [
            nn.Linear(in_features=prev_hidden, out_features=hidden_size),
            nn.BatchNorm1d(hidden_size) if is_batch_norm else nn.Identity(),
            activation_func(),
        ]

        prev_hidden = hidden_size

    return layers + [output_func(dim=1)]

# Spawns a model we trained
def spawn_model(props):


    chkpt = props["chkpt"]

    params = chkpt["params"]

    batch_size = params["batch_size"]
    n_layers = params["n_layers"]
    learning_rate = params["learning_rate"]
    n_epochs = params["n_epochs"]
    is_batch_norm = params["is_batch_norm"]
    input_size = chkpt["input_size"]
    output_size = chkpt["output_size"]

    optimizer_func = (
        torch.optim.Adam if params["optimizer"] == "adam" else torch.optim.SGD
    )
    activation_func = (
        nn.LeakyReLU if params["activation_func"] == "leaky_relu" else nn.ReLU
    )

    model = nn.Sequential(
        *spawn_layers(
            n_layers,
            input_size,
            output_size,
            activation_func=activation_func,
            is_batch_norm=is_batch_norm,
        )
    )

    model.load_state_dict(chkpt["model_state_dict"])
    model.to(get_best_torch_device())
    model.eval()

    return model

def time_now():
    return datetime.datetime.now()


def get_elapsed_time_ms(start_time):
    now = time_now()

    if not isinstance(start_time, datetime.datetime):
        start_time = time_now()

    diff = now - start_time

    return float(diff.total_seconds()) * 1000.0

In [4]:
# Load BART model
from transformers import pipeline

classifier = pipeline(
    "zero-shot-classification", model="facebook/bart-large-mnli", device_map="auto"
)

In [5]:
# Load our neural network

trained_models = torch.load(
    "data/sweep_result_loss_mp_fast_final_5.pickle",
    map_location=torch.device(get_best_torch_device())
  )

trained_models_sorted = dict(sorted(trained_models.items()))
keys = list(trained_models_sorted.keys())
props_best = trained_models[keys[0]]

best_model = spawn_model(props_best)
tags = list(props_best["chkpt"]["input_columns"])

In [6]:
@functools.lru_cache
def predict(prompt):
  # Power up BART model
  predict_tag_scores = classifier(prompt,tags)
  labels, scores = predict_tag_scores["labels"], predict_tag_scores["scores"]

  # Run our neural network to make predictions
  sorted_input = [scores[labels.index(c)] for c in tags]

  df_input = pandas.DataFrame([sorted_input], columns=tags)

  output = best_model(torch.tensor(df_input.values,dtype=torch.float32).to(get_best_torch_device()))
  _, predicted = torch.max(output, 1)

  return predicted.detach().cpu().numpy()[0], labels, scores

def run_model(prompt):

  prompt = prompt.strip().replace("\n", "")

  start_time = time_now()

  score, labels, scores = predict(prompt)

  printmd(f"# Our prediction: {score} star(s)")
  printmd("## Elapsed time: %.2f s" % (get_elapsed_time_ms(start_time) / 1000))


  df_tags = pandas.DataFrame(data=[[prompt, *scores]], columns=["review_text", *labels])

  # Display Pandas DataFrame on BART moodel results
  print("Here are the tag scores from the bart-large-mnli model:")
  display(df_tags)

In [7]:
# TODO: Modify Input here

# A 4 star review on a used phone
run_model("""
    Good phone! No scratches or dents
    but the OS on the phone is a bit laggy at times and network is just okay
""")

# Our prediction: 4 star(s)

## Elapsed time: 1.28 s

Here are the tag scores from the bart-large-mnli model:


Unnamed: 0,review_text,above average,commendable,good,satisfactory,satisfying,okay,impressive,acceptable,top-notch,...,outstanding,neutral,amazing,mediocre,unsatisfactory,defective,poor,disappointing,bad,worst
0,Good phone! No scratches or dents but the O...,0.173942,0.160625,0.151689,0.102326,0.092989,0.079534,0.057405,0.040864,0.037394,...,0.00835,0.005307,0.002537,0.002246,0.001227,0.000954,0.000866,0.000859,0.00061,0.000303


In [8]:
# A 5 star review on gaming
run_model("""
  RAZER defines what it means to build a gaming laptop!
  Good framerates for league of legends,
  and RTX 4080 can definitely push Cyperpunk 2077 to next level on visuals.
""")

# Our prediction: 5 star(s)

## Elapsed time: 0.60 s

Here are the tag scores from the bart-large-mnli model:


Unnamed: 0,review_text,top-notch,outstanding,impressive,excellent,fantastic,amazing,commendable,good,satisfying,...,below expectations,defective,average,bad,unsatisfactory,poor,mediocre,disappointing,so-so,worst
0,RAZER defines what it means to build a gaming ...,0.203616,0.133576,0.122572,0.111534,0.093344,0.083704,0.082852,0.077779,0.051602,...,0.00053,0.000339,0.00025,0.000249,0.000235,0.000225,0.000213,0.0002,0.000149,0.00014


In [9]:
# A 2 star review on FOOD
run_model("""
  I haven't used the ham base.
  It is loaded with MSG.
  I did not realize this when I ordered it and can not return it because it is a food item
""")

# Our prediction: 2 star(s)

## Elapsed time: 0.59 s

Here are the tag scores from the bart-large-mnli model:


Unnamed: 0,review_text,defective,bad,unsatisfactory,below expectations,disappointing,poor,so-so,impressive,mediocre,...,amazing,satisfying,okay,satisfactory,commendable,average,outstanding,excellent,top-notch,fantastic
0,I haven't used the ham base. It is loaded wi...,0.295748,0.191148,0.149395,0.077134,0.075553,0.064219,0.044539,0.01698,0.015696,...,0.004527,0.00425,0.003816,0.003407,0.003112,0.002699,0.002465,0.00182,0.001662,0.001516


In [10]:
# A 5 star review on FOOD
run_model("""
  I have bought several of the Vitality canned dog food products and have found them all to be of good quality.
  The product looks more like a stew than a processed meat and it smells better.
  My Labrador is finicky and she appreciates this product better than most.
""")

# Our prediction: 4 star(s)

## Elapsed time: 0.83 s

Here are the tag scores from the bart-large-mnli model:


Unnamed: 0,review_text,good,satisfactory,commendable,acceptable,satisfying,impressive,top-notch,above average,excellent,...,below expectations,average,mediocre,bad,poor,so-so,unsatisfactory,disappointing,defective,worst
0,I have bought several of the Vitality canned d...,0.235447,0.154767,0.129312,0.087489,0.086191,0.079323,0.046365,0.042602,0.0387,...,0.004838,0.002504,0.001848,0.001804,0.001546,0.001435,0.001166,0.000844,0.000823,0.000288


In [11]:
# Run again! Our implementation has caching built-in so second time will be extremely quick
run_model("""
  I have bought several of the Vitality canned dog food products and have found them all to be of good quality.
  The product looks more like a stew than a processed meat and it smells better.
  My Labrador is finicky and she appreciates this product better than most.
""")

# Our prediction: 4 star(s)

## Elapsed time: 0.00 s

Here are the tag scores from the bart-large-mnli model:


Unnamed: 0,review_text,good,satisfactory,commendable,acceptable,satisfying,impressive,top-notch,above average,excellent,...,below expectations,average,mediocre,bad,poor,so-so,unsatisfactory,disappointing,defective,worst
0,I have bought several of the Vitality canned d...,0.235447,0.154767,0.129312,0.087489,0.086191,0.079323,0.046365,0.042602,0.0387,...,0.004838,0.002504,0.001848,0.001804,0.001546,0.001435,0.001166,0.000844,0.000823,0.000288
