# Amazon reviews classification [sklearn]
Binary classification of the 'helpfulness' (quality) of product reviews.
Reference: <https://t-lanigan.github.io/amazon-review-classifier/>

## Install necessary packages

In [None]:
!pip install nltk giskard

## Import libraries

In [None]:
import string

import giskard
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from giskard import wrap_dataset, wrap_model
from nltk.stem.snowball import SnowballStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

## Notebook-level settings

In [None]:
# Turn off pandas' chained-assignment warning for the whole notebook.
pd.set_option("mode.chained_assignment", None)

## Define constants

In [None]:
# Reproducibility and split configuration.
RANDOM_SEED = 0  # Seed passed to the train/test split and to the estimator.
TEST_RATIO = 0.2  # Fraction of the data held out as the test set.

# Target definition: a review is labelled helpful when its share of
# helpful votes is strictly greater than this threshold.
TARGET_THRESHOLD = 0.5
TARGET_NAME = "isHelpful"  # Name of the binary target column.

# Location of the gzip-compressed, line-delimited JSON reviews dump.
DATASET_URL = "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Apps_for_Android_5.json.gz"

## Load and preprocess dataset

In [None]:
def download_data(**kwargs: dict) -> pd.DataFrame:
    """Fetch the reviews dataset from ``DATASET_URL`` into a DataFrame.

    The file is read as gzip-compressed, line-delimited JSON. Any extra
    keyword arguments are forwarded unchanged to ``pd.read_json``.
    """
    print(f"Downloading dataset from {DATASET_URL}")

    reviews = pd.read_json(
        DATASET_URL,
        compression="gzip",
        lines=True,
        **kwargs,
    )

    print("Dataset was loaded!")

    return reviews

In [None]:
def preprocess_data(_df: pd.DataFrame) -> pd.DataFrame:
    """Build the modelling table: review text plus a binary helpfulness target.

    Each entry of the ``helpful`` column is indexed as a pair
    ``[helpful_votes, total_votes]``. Only reviews with more than 10 total
    votes are kept, and a review is labelled ``1`` when its share of helpful
    votes is strictly greater than ``TARGET_THRESHOLD`` (``0`` otherwise).

    The input frame is not modified. Vote counts are computed as standalone
    Series and the result is built on an explicit copy, so the function does
    not depend on pandas' chained-assignment warning being silenced.

    Args:
        _df: Raw reviews with at least the ``reviewText`` and ``helpful``
            columns.

    Returns:
        A new DataFrame with the columns ``reviewText`` and ``TARGET_NAME``,
        keeping the original index labels of the retained rows.
    """
    print("Start data preprocessing...")

    # Extract numbers of helpful and total votes.
    helpful_votes = _df["helpful"].apply(lambda votes: votes[0])
    total_votes = _df["helpful"].apply(lambda votes: votes[1])

    # Keep only reviews with enough votes for the ratio to be meaningful.
    # This also guarantees a non-zero denominator below.
    enough_votes = total_votes > 10

    # Explicit copy: the target column is added to an independent frame.
    result = _df.loc[enough_votes, ["reviewText"]].copy()

    # Create target column.
    helpful_share = helpful_votes[enough_votes] / total_votes[enough_votes]
    result[TARGET_NAME] = np.where(helpful_share > TARGET_THRESHOLD, 1, 0)

    print("Data preprocessing finished!")

    return result

In [None]:
# Download the raw reviews and immediately reduce them to text + target.
reviews_df = preprocess_data(download_data())

## Train-test split

In [None]:
# Features stay a one-column DataFrame (double brackets) because the
# preprocessing pipeline accesses the ``reviewText`` column by name.
X_train, X_test, y_train, y_test = train_test_split(
    reviews_df[["reviewText"]],
    reviews_df[TARGET_NAME],
    test_size=TEST_RATIO,
    random_state=RANDOM_SEED,
)

## Wrap test dataset

In [None]:
# Re-attach the target to the test features, then wrap the frame for giskard.
test_dataset = pd.concat([X_test, y_test], axis=1)
wrapped_dataset = wrap_dataset(
    test_dataset,
    name="reviews",
    target=TARGET_NAME,
    column_types={"reviewText": "text"},
)

## Define preprocessing pipeline

In [None]:
def make_lowercase(x):
    """Return the ``reviewText`` column of ``x`` with every string lower-cased."""
    reviews = x["reviewText"]
    return reviews.apply(lambda text: text.lower())

def remove_punctuation(x):
    """Return ``x`` with all ``string.punctuation`` characters removed.

    ``x`` is expected to support ``.apply`` over string elements (e.g. a
    pandas Series of strings). A new object is returned; ``x`` itself is
    left untouched.
    """
    # Build the deletion table once instead of once per row.
    punctuation_table = str.maketrans("", "", string.punctuation)
    # ``apply`` does not modify ``x`` in place, so its result must be returned
    # for the punctuation removal to take effect.
    return x.apply(lambda row: row.translate(punctuation_table))

# Single shared stemmer instance, created once and reused for every document.
stemmer = SnowballStemmer("english")


def tokenizer(x):
    """Split ``x`` on whitespace and return the list of stemmed tokens."""
    return [stemmer.stem(word) for word in x.split()]

# TF-IDF over stemmed unigrams, with English stop words removed and terms
# that occur in fewer than 1% of documents discarded.
vectorizer = TfidfVectorizer(
    tokenizer=tokenizer,
    stop_words="english",
    ngram_range=(1, 1),
    min_df=0.01,
)

In [None]:
# Text preprocessing chain: DataFrame -> lower-cased text -> punctuation step
# -> TF-IDF features.
preprocessor = Pipeline(
    steps=[
        ("lowercase", FunctionTransformer(make_lowercase)),
        ("punctuation", FunctionTransformer(remove_punctuation)),
        ("vectorizer", vectorizer),
    ]
)

## Build estimator

In [None]:
# End-to-end model: text preprocessing followed by a logistic regression
# seeded for reproducibility.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("estimator", LogisticRegression(random_state=RANDOM_SEED)),
    ]
)

In [None]:
# Fit the preprocessing steps and the estimator on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# Score each split with ROC-AUC using the second column of ``predict_proba``
# (the probability assigned to label 1).
train_metric = roc_auc_score(y_train, pipeline.predict_proba(X_train)[:, 1])
test_metric = roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1])

print(f"Train ROC-AUC score: {train_metric}")
print(f"Test ROC-AUC score: {test_metric}")

## Wrap model

In [None]:
# Wrap the fitted pipeline for giskard, taking the class labels from the
# fitted pipeline itself.
wrapped_model = wrap_model(
    model=pipeline,
    model_type="classification",
    feature_names=["reviewText"],
    name="review_helpfulness_predictor",
    classification_threshold=0.5,
    classification_labels=pipeline.classes_,
)

## Scan model

In [None]:
# Run the giskard scan of the wrapped model against the wrapped test dataset.
results = giskard.scan(model=wrapped_model, dataset=wrapped_dataset)

In [None]:
# Show the scan results in the notebook output.
display(results)