# Amazon reviews classification [sklearn]
Binary classification of product's review 'helpfulness' (quality).
Reference: <https://t-lanigan.github.io/amazon-review-classifier/>

## Import libraries

In [1]:
import string

import giskard
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from giskard import wrap_dataset, wrap_model
from nltk.stem.snowball import SnowballStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

## Notebook-level settings

In [2]:
# Disable chained assignment warning.
pd.options.mode.chained_assignment = None

## Define constants

In [3]:
# Constants.
RANDOM_SEED = 0
TEST_RATIO = 0.2

TARGET_THRESHOLD = 0.5
TARGET_NAME = "isHelpful"

# Paths.
DATASET_URL = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Apps_for_Android_5.json.gz'

## Load and preprocess Dataset

In [4]:
def download_data(**kwargs: dict) -> pd.DataFrame:
    """Download the dataset using URL."""
    print(f"Downloading dataset from {DATASET_URL}")

    _df = pd.read_json(DATASET_URL, compression="gzip", lines=True, **kwargs)

    print(f"Dataset was loaded!")

    return _df

In [5]:
def preprocess_data(_df: pd.DataFrame) -> pd.DataFrame:
    """Perform data-preprocessing steps."""
    print(f"Start data preprocessing...")

    # Select columns.
    _df = _df[["reviewText", "helpful"]]

    # Extract numbers of helpful and total votes.
    _df['helpful_ratings'] = _df.helpful.apply(lambda x: x[0])
    _df['total_ratings'] = _df.helpful.apply(lambda x: x[1])

    # Filter unreasonable comments.
    _df = _df[_df.total_ratings > 10]

    # Create target column.
    _df[TARGET_NAME] = np.where((_df.helpful_ratings / _df.total_ratings) > TARGET_THRESHOLD, 1, 0)

    # Delete columns we don't need anymore.
    _df.drop(columns=["helpful", 'helpful_ratings', 'total_ratings'], inplace=True)

    print("Data preprocessing finished!")

    return _df

In [6]:
reviews_df = download_data()
reviews_df = preprocess_data(reviews_df)
reviews_df

Downloading dataset from http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Apps_for_Android_5.json.gz
Dataset was loaded!
Start data preprocessing...
Data preprocessing finished!


Unnamed: 0,reviewText,isHelpful
33,"I did the 7-day trial and was impressed, but a...",1
94,"This app is great! It has a daily verse, a da...",1
284,This app simply latches on to Google maps and ...,0
288,"Unable to move to sd card, uninstalled. Cannot...",0
296,With all of the idiocy in almost every free ap...,1
...,...,...
752842,"Yes, it is! Love the sound effects. game grows...",1
752843,amazing. 1track but hard enough so it doesn't ...,1
752846,"I'd give it -5 stars if I could. I mean, I lov...",1
752914,This game is one of the most realistic dinosau...,1


## Train-test split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(reviews_df[["reviewText"]], reviews_df[TARGET_NAME],
                                                    test_size=TEST_RATIO, random_state=RANDOM_SEED)

## Wrap test dataset

In [8]:
test_dataset = pd.concat([X_test, y_test], axis=1)
wrapped_dataset = wrap_dataset(test_dataset,
                               name="reviews", target=TARGET_NAME, column_types={"reviewText": "text"})

Your 'pandas.DataFrame' dataset is successfully wrapped by Giskard's 'Dataset' wrapper class.


## Define preprocessing pipeline

In [9]:
def make_lowercase(x):
    """Lower an input string."""
    x = x.reviewText.apply(lambda row: row.lower())
    return x

def remove_punctuation(x):
    """Remove punctuation from input string."""
    x.apply(lambda row: row.translate(str.maketrans('', '', string.punctuation)))
    return x

stemmer = SnowballStemmer("english")
def tokenizer(x):
    """Define string tokenization logic."""
    x = x.split()
    stems = list()
    [stems.append(stemmer.stem(word)) for word in x]
    return stems

vectorizer = TfidfVectorizer(tokenizer=tokenizer, stop_words='english', ngram_range=(1, 1), min_df=0.01)

In [10]:
preprocessor = Pipeline(steps=[
    ("lowercase", FunctionTransformer(make_lowercase)),
    ("punctuation", FunctionTransformer(remove_punctuation)),
    ("vectorizer", vectorizer)
])

## Build estimator

In [11]:
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("estimator", LogisticRegression(random_state=RANDOM_SEED))
])

In [12]:
pipeline.fit(X_train, y_train)



In [13]:
train_metric = roc_auc_score(y_train, pipeline.predict_proba(X_train)[:, 1].T)
test_metric = roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1].T)

print(f"Train ROC-AUC score: {train_metric}")
print(f"Test ROC-AUC score: {test_metric}")

Train ROC-AUC score: 0.7602054998587102
Test ROC-AUC score: 0.7400220231290606


## Wrap model

In [14]:
wrapped_model = wrap_model(model=pipeline,
                           model_type="classification",
                           feature_names=["reviewText"],
                           name="review_helpfulness_predictor",
                           classification_threshold=0.5,
                           classification_labels=pipeline.classes_)

Your 'sklearn' model is successfully wrapped by Giskard's 'SKLearnModel' wrapper class.


## Scan model

In [15]:
results = giskard.scan(model=wrapped_model, dataset=wrapped_dataset)



Hint: "Your target variable values are numeric. It is recommended to have Human readable string as your target values to make results more understandable in Giskard."
Your model is successfully validated.


In [16]:
display(results)