In [4]:
import os

import numpy as np
import torch
import torch.nn as nn
import pandas as pd
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
import seaborn as sns

import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer("english")
from nltk import FreqDist
import spacy

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import itertools
from nltk.tokenize import sent_tokenize
import re
import textstat
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

2023-04-09 10:35:38.152307: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to /Users/kevinxu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kevinxu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/kevinxu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/kevinxu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
# Load the raw review dump (tab-separated, compressed file); rows pandas
# cannot parse are skipped instead of raising.
path = 'rawdata/PC.tsv.gz'
raw_df = pd.read_csv(
    path,
    sep='\t',
    on_bad_lines='skip',
)

# 1 Preprocessing

In [22]:
# 1.1 Keep only reviews with at least MIN_TOTAL_VOTES total votes.
# NOTE(review): this comment used to say ">= 50" while the code filtered at 40;
# the value the code actually used (40) is kept -- confirm the intended cutoff.
MIN_TOTAL_VOTES = 40
df = raw_df[raw_df['total_votes'] >= MIN_TOTAL_VOTES]

In [23]:
# (rows, columns) remaining after the total_votes filter
df.shape

(38814, 16)

In [24]:
# List the column names of the filtered frame
df.columns.tolist()

['marketplace',
 'customer_id',
 'review_id',
 'product_id',
 'product_parent',
 'product_title',
 'product_category',
 'star_rating',
 'helpful_votes',
 'total_votes',
 'vine',
 'verified_purchase',
 'review_headline',
 'review_body',
 'review_date',
 'year']

In [25]:
# Inspect the dtype pandas inferred for each column
df.dtypes

marketplace           object
customer_id            int64
review_id             object
product_id            object
product_parent         int64
product_title         object
product_category      object
star_rating            int64
helpful_votes          int64
total_votes            int64
vine                  object
verified_purchase     object
review_headline       object
review_body           object
review_date           object
year                 float64
dtype: object

In [26]:
# 1.2 Removing nulls (duplicate reviews are dropped in the next step, once the
# combined `review` column exists)
# Per-column null counts; note this value is not shown because it is not the
# last expression of the cell.
df.isna().sum()
# Drop every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')



In [27]:
# 1.3 Preparing columns
# review: headline and body joined by a single space into one text column;
# the two source columns are then removed and rows with an identical review
# text are de-duplicated.
df = (
    df.assign(review=df['review_headline'] + " " + df['review_body'])
      .drop(columns=['review_headline', 'review_body'])
      .drop_duplicates(subset=['review'])
)

In [28]:
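# review_date type
# NOTE: the conversion below is disabled. It reads 'unixReviewTime', which is not
# among this dataset's columns (see the column list above), so review_date is
# left as an object column.
# df['review_date'] =  pd.to_datetime(df['unixReviewTime'],unit='s')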

In [29]:
# vote ratio => label source
# Fraction of all votes that were "helpful"; the two raw count columns are
# dropped once the ratio has been derived.
df = (
    df.assign(vote_ratio=df['helpful_votes'].div(df['total_votes']))
      .drop(columns=['helpful_votes', 'total_votes'])
)

In [30]:
# Binary target: 1 when vote_ratio is at least `threshold`, otherwise 0.
threshold = 0.8
df['helpfulness'] = df['vote_ratio'].ge(threshold).astype('int64')

In [31]:
# verified_purchase flag -> 1 for 'Y', 0 for anything else
df['verified_purchase'] = df['verified_purchase'].eq('Y').astype('int64')

In [32]:
# vine flag -> 1 for 'Y', 0 for anything else
df['vine'] = df['vine'].eq('Y').astype('int64')

In [33]:
# sentence tokenize: number of sentences NLTK's sent_tokenize finds in each review
df['sentence_count'] = df['review'].map(lambda text: len(sent_tokenize(text)))

In [42]:
# word count / stop-word removal
punct_remover = str.maketrans('','', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')
## remove num with word, punctuation, stopword
# def extract_words(sentence):
#     sentence_nodigits = re.sub("\d+", "", sentence)
#     words = re.sub("[^\w]", " ",  sentence_nodigits).split()
#     words_punct = [w.translate(punct_remover) for w in words]
#     # words_stop = [w for w in words_punct if w not in stopwords.words('english')]
#     # words_len = [word for word in words_stop if len(word) > 2]
#     return words_len

# df['words'] = [extract_words(x) for x in df['review']]
# df['word_count'] = [len(x) for x in df['words']]

# Build the stop-word set ONCE. The previous version evaluated
# stopwords.words('english') inside the comprehension, i.e. once per token of
# every review, and then did a linear search through the returned list.
english_stopwords = frozenset(stopwords.words('english'))

# `words` is the review text with stop words removed, re-joined by single spaces.
# NOTE(review): tokens come from a plain whitespace split and the comparison is
# case-sensitive, so capitalised stop words ("The", "It", "If") and tokens with
# attached punctuation are kept. Behaviour is unchanged here -- confirm whether
# a case-insensitive match is wanted.
df['words'] = df['review'].apply(
    lambda x: ' '.join([item for item in x.split() if item not in english_stopwords])
)
df['words'].head(5)

414     Cannot used headphones If plan use iPad mini h...
939     Shorted 2 days use Worked 2 days center phone ...
1112    Same performance $5 cable I've never seen many...
1399    1) It sturdy enough used keyring It works 2 dr...
1843    Fan good job cooling. The fan works fine, I'll...
Name: words, dtype: object

In [None]:
# Automated Readability Index (ARI), computed by textstat on each review text
df['ARI'] = df['review'].map(textstat.automated_readability_index)

In [None]:
## bag of words

In [None]:
# Bag-of-words vocabulary: flatten every row's `words` into one list, count the
# occurrences, and keep the most frequent entries as the token list.
# NOTE(review): `out_data` is not defined in any cell visible here (the cells
# above build `df`) -- confirm where it comes from before a fresh Run All.
# NOTE(review): the flattening presumes each `words` value is a list of tokens;
# a plain string would be split into single characters -- verify.
vocabulary = [token for row_words in out_data['words'].tolist() for token in row_words]
vocab = Counter(vocabulary)
commom_word_threshold = 5000
tokens = [word for word, _count in vocab.most_common(commom_word_threshold)]

In [None]:
# Count vectorizer restricted to the pre-computed `tokens` vocabulary.
# `max_features` was dropped: scikit-learn ignores it whenever an explicit
# `vocabulary` is supplied, so the hard-coded 5000 had no effect and only
# duplicated the cutoff already applied when `tokens` was built.
vectorizer = CountVectorizer(vocabulary=tokens)

In [None]:
# Turn each row's `words` value into plain text: take its str() form and strip
# the characters [ ] ' and , in a single pass. This replaces four separate
# row-wise DataFrame.apply passes that each removed one of those characters;
# the resulting strings are the same.
out_data['words_text'] = out_data['words'].map(
    lambda words: re.sub(r"[\[\]',]", "", str(words))
)

In [None]:
# Disable column-width truncation so the full text is shown, then preview the
# first cleaned review.
pd.options.display.max_colwidth = None
out_data['words_text'].head(1)

In [None]:
# Run the count vectorizer over the cleaned review text to get the feature matrix
train_features = vectorizer.fit_transform(out_data['words_text'])

In [None]:
# Dimensions of the vectorizer output
train_features.shape

In [221]:
# Sanity check: among rows whose cleaned text is non-empty, is any `words_text`
# value null?
a = out_data.loc[out_data['words_text'].str.len() > 0]
a['words_text'].isna().values.any()

False

In [222]:
# data after cleaning we need: only rows whose cleaned text is non-empty
output_data = out_data[out_data['words_text'].str.len() > 0]
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs('data', exist_ok=True)
output_data.to_csv('data/out.csv')

In [198]:
# reference
# Sthanu Ramakrishnan https://webpages.charlotte.edu/sramak11/6156/6156_Group_Project.html#topic=0&lambda=1&term=
# https://cs229.stanford.edu/proj2014/Jordan%20Rodak,%20Minna%20Xiao,%20Steven%20Longoria,%20Predicting%20Helpfulness%20Ratings%20of%20Amazon%20Product%20Reviews.pdf
# https://stackoverflow.com/questions/29244286/how-to-flatten-a-2d-list-to-1d-without-using-numpy
# https://towardsdatascience.com/predicting-the-helpfulness-of-peer-written-product-reviews-ef7a0dfea2c3