In [None]:
! pip install implicit
! pip install transformers



# Clean Data

In [None]:
import pandas as pd, numpy as np
import gzip
import os
import json
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import files
import random
from collections import defaultdict

from IPython.display import display, HTML
# Inject CSS so the notebook container and the output area use the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("<style>.output_result { max-width:100% !important; }</style>"))

from google.colab import data_table
# Turn on Colab's DataFrame formatter for frames displayed as cell output.
data_table.enable_dataframe_formatter()


In [None]:
def parse(path):
  """Lazily yield one decoded JSON value per line of a gzip-compressed file.

  Args:
    path: Location of a gzip file holding one JSON document per line.

  Yields:
    The result of ``json.loads`` for every non-blank line, in file order.

  Raises:
    OSError: If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager releases the file handle when the generator is
  # exhausted, closed, or garbage-collected after an early `break` in the caller.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # A blank line (typically a trailing newline) carries no record.
      if not l.strip():
        continue
      yield json.loads(l)

def getDF(path, sorted_indices):
  """Load only the records at the requested 0-based positions into a DataFrame.

  The file is streamed through ``parse`` and reading stops as soon as every
  requested position has been collected.

  Args:
    path: File handed to ``parse``; one record is produced per position.
    sorted_indices: Positions of the records to keep. Ascending, duplicate-free
      input is the expected form; unsorted or repeated values are normalised.

  Returns:
    A DataFrame with one row per collected record, indexed by the record's
    position in the file. Empty when no positions are requested or matched.
  """
  # Normalising makes the single forward pass below valid for any input order
  # and prevents a duplicate from stalling the cursor.
  targets = sorted(set(sorted_indices))
  rows = {}
  if not targets:
    # Nothing requested: avoid indexing into an empty list and skip the file scan.
    return pd.DataFrame.from_dict(rows, orient='index')

  cursor = 0  # position in `targets` of the next wanted record
  for i, record in enumerate(parse(path)):
    if i == targets[cursor]:
      rows[i] = record
      cursor += 1
      # Stop reading once every requested row has been collected.
      if cursor == len(targets):
        break

    # Progress indicator: prints the number of millions of records scanned so far.
    if (i + 1) % 1000000 == 0:
      print((i + 1) // 1000000)
  return pd.DataFrame.from_dict(rows, orient='index')


In [None]:
# Sampling configuration for the review dump.
N_TOTAL_REVIEWS = 41130001   # presumably the number of records in kcore_5.json.gz — TODO confirm
N_SAMPLED_REVIEWS = 2000000  # how many record positions to draw
SAMPLE_SEED = 42             # fixed seed so Restart & Run All selects the same rows

# A dedicated seeded generator keeps the draw reproducible without altering
# the global `random` state used elsewhere.
random_indices = set(random.Random(SAMPLE_SEED).sample(range(N_TOTAL_REVIEWS), N_SAMPLED_REVIEWS))
# Ascending order is what the single-pass reader expects.
sorted_indices = sorted(random_indices)

In [None]:
# Load the sampled review rows. `getDF` is the reader defined in this notebook;
# the previously referenced `getDF4` is not defined anywhere and raised NameError
# on a fresh kernel.
df = getDF('data/kcore_5.json.gz', sorted_indices)

# Count the occurrences of each 'asin'
asin_counts = df['asin'].value_counts()

# Filter the counts to find asins that appear at least 5 times
asins_to_keep = asin_counts[asin_counts >= 5].index

# Filter the original DataFrame to only include those asins
df = df[df['asin'].isin(asins_to_keep)]

# A bare expression in the middle of a cell is discarded, so print the count instead.
print('unique reviewers after filtering:', df['reviewerID'].nunique())

# These two fields are dropped from the working frame.
df = df.drop(columns=['reviewTime', 'helpful'])
df

In [None]:
import ast

# Parsed metadata records keyed by their line number in the file.
data = {}

with gzip.open('data/metadata.json.gz', 'rb') as file:
    for i, line in enumerate(file):
        # Each line is evaluated as a Python literal (presumably a dict — verify
        # against the dump). literal_eval accepts literals only, so file content
        # cannot execute arbitrary code the way eval() would allow.
        data[i] = ast.literal_eval(line.decode('utf-8'))

md = pd.DataFrame.from_dict(data, orient='index')
# Remove the metadata fields that are not carried into the merged frame.
mddrop = md.drop(columns=['salesRank', 'description', 'related', 'brand'])
mddrop

In [None]:
# Left join: every review row is kept; rows whose 'asin' has no metadata match
# get NaN in the metadata columns.
merged_df = pd.merge(df, mddrop, on='asin', how='left')
# A missing review text is NaN (a float), on which len() raises TypeError;
# treat it as an empty text of length 0 instead.
merged_df['review_length'] = merged_df['reviewText'].fillna('').str.len()
df = merged_df

In [None]:
# Show the (rows, columns) dimensions of the current frame as the cell output.
df.shape

(3060220, 12)

In [None]:
import ast

def parse_cat_to_set(categories_string):
  """Flatten a product's category paths into a set of unique category labels.

  Args:
    categories_string: One of
      * the string form of a list of lists (e.g. a value read back from CSV),
      * an already-parsed list/tuple of category paths (e.g. a value that was
        never serialised to text), or
      * a null value (None / NaN).

  Returns:
    A set with every label found in any path; empty for null or blank input.

  Raises:
    ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when a string
      is not a valid Python literal.
  """
  if isinstance(categories_string, (list, tuple)):
    # Already parsed: pd.isnull() on a list returns an array (ambiguous in `if`)
    # and literal_eval() rejects non-strings, so handle this case first.
    categories = categories_string
  elif pd.isnull(categories_string):
    # If the value is null, return an empty set
    return set()
  elif isinstance(categories_string, str) and not categories_string.strip():
    # A blank string holds no categories
    return set()
  else:
    # Convert the string to a list of lists
    categories = ast.literal_eval(categories_string)

  unique_elements = set()

  # Collect the labels of every category path
  for category in categories:
    if isinstance(category, str):
      # A bare label: add it whole instead of splitting it into characters
      unique_elements.add(category)
    else:
      unique_elements.update(category)

  return unique_elements


In [None]:
# Build the 'cat' column by converting each 'categories' entry with parse_cat_to_set.
df['cat'] = df['categories'].map(parse_cat_to_set)

In [None]:
# Remove the raw categories, the review-length helper and the reviewer name.
df = df.drop(['categories', 'review_length', 'reviewerName'], axis=1)

## Sentiment Analysis

In [None]:
from transformers import pipeline
# Text-classification pipeline that reports a score for every label.
# NOTE(review): device=0 presumably selects the first GPU — confirm a GPU runtime is attached.
classifier = pipeline(
    task="text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    return_all_scores=True,  # downstream code indexes into the full per-label list
    truncation=True,
    device=0,
)

config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



In [None]:
def get_unexpectedness(text):
  """Score a single text with the global ``classifier``.

  Args:
    text: The text to classify.

  Returns:
    The 'score' field of the entry at position 6 in the classifier's result
    for this text (presumably the model's "surprise" label — TODO confirm
    against the model's label order).
  """
  # The classifier wraps its answer in an outer list; take the first (only) item.
  per_label_scores = classifier(text)[0]
  return per_label_scores[6]['score']

def get_batch_unexpectedness(text_arr):
  """Score many texts with the global ``classifier`` in a single call.

  Args:
    text_arr: Sequence of texts to classify.

  Returns:
    A list with, for each input text in order, the 'score' field of the entry
    at position 6 in that text's result (presumably the model's "surprise"
    label — TODO confirm against the model's label order).
  """
  scores = []
  for per_label_scores in classifier(text_arr):
    scores.append(per_label_scores[6]['score'])
  return scores

In [None]:
# Keep entries that are exactly of type str; anything else (e.g. NaN for a
# missing review) is replaced by the empty string.
text_lst = [t if type(t) is str else "" for t in df['reviewText']]

In [None]:
# Score every cleaned review text in one call and store the result as a new column.
# NOTE(review): assumes text_lst is in the same row order as df — it is built from df['reviewText'].
df['unexpectedness'] = get_batch_unexpectedness(text_lst)

In [None]:
# Write the frame to CSV without the index column.
# NOTE(review): the set values in 'cat' are presumably written as their text form and will need re-parsing on load — confirm.
df.to_csv('data/cleaned_df.csv',index=False)