In [6]:
from google.colab import drive

drive.mount("/content/drive")

%cd /content/drive/MyDrive/nyu-stuff/2023-fall/DS-UA\ 301\ Advanced\ DS/final_project

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/nyu-stuff/2023-fall/DS-UA 301 Advanced DS/final_project


In [7]:
%%capture
%pip install -r requirements.txt

In [8]:
import pandas
import numpy
import json5
import functools
import json
from matplotlib import pyplot
from typing import Optional, Any, Coroutine, Union, Dict, List
import nltk
from nltk.corpus import stopwords
import string
import contractions
import torch
import random

nltk.download("averaged_perceptron_tagger")
nltk.download("universal_tagset")
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
from transformers import pipeline

classifier = pipeline(
    "zero-shot-classification", model="facebook/bart-large-mnli", device_map="auto"
)

In [10]:
class FairDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.data = df["review_text"].tolist()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [11]:
sequence_to_classify = "one fine day I will see the world"
candidate_labels = ["travel", "cooking", "dancing"]
classifier(sequence_to_classify, candidate_labels)

{'sequence': 'one fine day I will see the world',
 'labels': ['travel', 'dancing', 'cooking'],
 'scores': [0.9946518540382385, 0.003005197737365961, 0.0023429866414517164]}

In [12]:
df_comments = pandas.read_csv("data/balanced_data.csv")

dataset = FairDataset(df_comments)
h_batch_size = 32

loader = torch.utils.data.DataLoader(dataset, batch_size=h_batch_size)
# n_batches = len(batches)

In [13]:
# Let's do word frequency
freqs = {}

puncs = set(string.punctuation)
stopwords = set(nltk.corpus.stopwords.words("english"))
bad_tokens = {*stopwords, *puncs}

word_freq_by_rating = {}

for n_group, rating in enumerate([1,2,3,4,5]):
  subset = df_comments[df_comments["rating"] == rating]

  freqs = {}

  # Go through each comment in the dataset
  for idx, comment in enumerate(subset["review_text"]):
    print(f"\rPROGRESS: {idx + 1 + n_group * len(subset)} / {len(df_comments)}", end="")

    word_tokens = nltk.word_tokenize(comment)
    filtered_sentence = [w for w in word_tokens if w not in bad_tokens]

    # Go through each word in a comment
    for word in filtered_sentence:
      if word not in freqs:
          freqs[word] = 1
      else:
          freqs[word] += 1

  # Sort words by their frequency in decreasing order
  sorted_freqs = dict(sorted(freqs.items(), key=lambda item: item[1], reverse=True))

  # Don't worry about invidual characters
  filtered_freqs = {k: v for k, v in sorted_freqs.items() if len(k) > 1}

  # Do semantic analysis on the words
  pos_dict = dict(nltk.pos_tag(list(filtered_freqs.keys()), tagset="universal"))

  # Extract Emotional words
  emotion_word_freqs = {
    k: v for k, v in filtered_freqs.items() if pos_dict[k] in ["ADJ", "ADV"]
  }

  word_freq_by_rating[rating] = emotion_word_freqs

PROGRESS: 50000 / 50000

In [14]:
# Trim the lists
trimmed = {}

for k,v in word_freq_by_rating.items():

  random_keys = list(v.keys())

  trimed_inter = {key : v[key] for key in list(v.keys())[:25]}

  trimmed[k] = trimed_inter

In [15]:
trimmed

{1: {'like': 2171,
  'even': 1690,
  'back': 1669,
  'fit': 1533,
  'good': 1388,
  'cheap': 1169,
  'well': 1085,
  'really': 1013,
  'first': 995,
  'plastic': 893,
  'never': 893,
  'better': 866,
  'tried': 862,
  'much': 843,
  'great': 816,
  'worth': 728,
  'Not': 706,
  'still': 689,
  'also': 688,
  'new': 678,
  'bad': 670,
  'ordered': 621,
  'little': 591,
  'So': 567,
  'different': 549},
 2: {'good': 2193,
  'back': 2035,
  'really': 1780,
  'fit': 1755,
  'well': 1652,
  'great': 1479,
  'much': 1401,
  'quality': 1335,
  'even': 1297,
  'better': 1295,
  'first': 1097,
  'hard': 1082,
  'also': 1043,
  'little': 1007,
  'still': 942,
  'nice': 924,
  'look': 884,
  'device': 855,
  'right': 784,
  'want': 739,
  'pretty': 729,
  'button': 714,
  'new': 707,
  'enough': 704,
  'tried': 704},
 3: {'good': 2680,
  'well': 2020,
  'really': 1846,
  'great': 1613,
  'back': 1599,
  'little': 1548,
  'much': 1515,
  'fit': 1496,
  'better': 1308,
  'nice': 1225,
  'cover': 11

In [16]:
import itertools

synthetic_classes = list(
    set(
        itertools.chain(*[list(v.keys()) for k,v in trimmed.items()])
        )
    )

In [17]:
len(synthetic_classes)

49

In [18]:
classes = random.sample(synthetic_classes,30)

In [19]:
classes

['still',
 'look',
 'really',
 'want',
 'best',
 'plastic',
 'tried',
 'far',
 'long',
 'cover',
 'better',
 'little',
 'small',
 'good',
 'works',
 'quality',
 'hard',
 'also',
 'So',
 'first',
 'button',
 'back',
 'right',
 'Not',
 'enough',
 'different',
 'easy',
 'perfectly',
 'recommend',
 'bad']

In [20]:
one_star_keywords = ["disappointing", "poor", "defective", "worst", "bad"]

# Keywords for Two-Star Reviews
two_star_keywords = ["average", "mediocre", "okay", "below expectations", "unsatisfactory"]

# Keywords for Three-Star Reviews
three_star_keywords = ["satisfactory", "average", "neutral", "acceptable", "so-so"]

# Keywords for Four-Star Reviews
four_star_keywords = ["good", "satisfying", "impressive", "above average", "commendable"]

# Keywords for Five-Star Reviews
five_star_keywords = ["excellent", "amazing", "outstanding", "top-notch", "fantastic"]

classes = list(set(one_star_keywords + two_star_keywords + three_star_keywords + four_star_keywords + five_star_keywords))

In [21]:
classes

['worst',
 'okay',
 'average',
 'top-notch',
 'satisfactory',
 'below expectations',
 'unsatisfactory',
 'bad',
 'so-so',
 'neutral',
 'satisfying',
 'amazing',
 'impressive',
 'fantastic',
 'poor',
 'good',
 'outstanding',
 'mediocre',
 'commendable',
 'acceptable',
 'disappointing',
 'excellent',
 'above average',
 'defective']

In [28]:
review_texts = df_comments["review_text"].tolist()
label_rows = []

n_entries = len(dataset)
idx = 0
diff = 0

torch.cuda.empty_cache()

for r in classifier(dataset, classes, batch_size=h_batch_size):

    print(f"\rPROGRESS: {idx + 1} / {n_entries}", end="")
    row = [r["sequence"]]

    for c in classes:
        row.append(r["scores"][r["labels"].index(c)])

    label_rows.append(row)
    idx += 1

    # Purge cache once in a while
    if idx - diff > h_batch_size * 8:
        torch.cuda.empty_cache()
        diff = idx

    if idx % 2000 == 0:
        df_labels = pandas.DataFrame(label_rows, columns=["review_text"] + classes)
        df_labels.to_csv(f"data/label_temp/labels_new_{idx}.csv", index=False)



PROGRESS: 50000 / 50000

In [29]:
df_labels = pandas.DataFrame(label_rows, columns=["review_text"] + classes)
df_labels.to_csv("data/labels_v2.csv", index=False)

print("\nAll Done.")


All Done.


In [30]:
df_merged = pandas.concat(
    [df_comments, df_labels.drop(columns=["review_text"])], axis=1
)

In [31]:
df_merged.to_csv("data/balanced_with_labels_v2.csv", index=False)