# Amazon reviews classification [sklearn]
* Binary classification of product reviews by their 'helpfulness' (quality).
* Reference notebook: <https://t-lanigan.github.io/amazon-review-classifier/>
* Dataset: <http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Apps_for_Android_5.json.gz>

## Import libraries

In [None]:
import os
import string

import giskard
import numpy as np
import pandas as pd
from giskard import GiskardClient
from giskard import Dataset, Model
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from giskard.client.giskard_client import GiskardError
from sklearn.feature_extraction.text import TfidfVectorizer

## Notebook-level settings

In [None]:
# Turn off pandas' chained-assignment warning for the whole notebook session.
pd.set_option("mode.chained_assignment", None)

## Define constants

In [None]:
# Reproducibility and split settings.
RANDOM_SEED = 0
TEST_RATIO = 0.2  # Passed as `test_size` to the train/test split.

# Target definition: a review is labelled 1 when its helpful/total vote ratio
# is strictly above TARGET_THRESHOLD.
TARGET_THRESHOLD = 0.5
TARGET_NAME = "isHelpful"

# Paths.
DATASET_URL = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Apps_for_Android_5.json.gz'

# Giskard platform credentials.
# The URL and the access token are taken from the environment when the
# variables are set, so a real token never has to be written into the
# notebook. The fallbacks are the notebook's original literal values.
GISKARD_URL = os.environ.get("GISKARD_URL", "http://localhost:19000")
GISKARD_ACCESS_TOKEN = os.environ.get("GISKARD_ACCESS_TOKEN", "")
GISKARD_PROJECT_KEY = "amazon_review_classification"

## Load and preprocess Dataset

In [None]:
def download_data(**kwargs) -> pd.DataFrame:
    """Read the gzipped JSON-lines file at ``DATASET_URL`` into a DataFrame.

    Args:
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``pd.read_json`` (for example ``nrows``).

    Returns:
        The parsed records as a DataFrame.
    """
    print("Downloading dataset from " + DATASET_URL)
    reviews = pd.read_json(DATASET_URL, lines=True, compression="gzip", **kwargs)
    print("Dataset was loaded!")
    return reviews

In [None]:
def preprocess_data(_df: pd.DataFrame) -> pd.DataFrame:
    """Build the modelling table: review text plus a binary helpfulness target.

    Steps: strip NUL characters from ``reviewText``, unpack the two numbers
    stored in each ``helpful`` entry (helpful votes, total votes), keep only
    reviews with more than 10 total votes, and label a review ``1`` when its
    helpful/total ratio is strictly above ``TARGET_THRESHOLD`` (``0`` otherwise).

    Args:
        _df: Raw reviews with at least the columns ``reviewText`` (strings)
            and ``helpful`` (indexable pairs ``[helpful_votes, total_votes]``).

    Returns:
        A new DataFrame with the columns ``reviewText`` and ``TARGET_NAME``
        that keeps the original index labels of the retained rows. The input
        frame is not modified.
    """
    print("Start data preprocessing...")

    # Take an explicit copy of the two needed columns so every assignment below
    # writes to an independent frame rather than to a slice of another frame
    # (no SettingWithCopyWarning, no reliance on the warning being silenced).
    data = _df[["reviewText", "helpful"]].copy()

    # Remove Null-characters (x00) from the review text.
    data["reviewText"] = data["reviewText"].apply(lambda text: text.replace("\x00", ""))

    # Extract numbers of helpful and total votes.
    helpful_ratings = data["helpful"].apply(lambda votes: votes[0])
    total_ratings = data["helpful"].apply(lambda votes: votes[1])

    # Keep only reviews with more than 10 votes; this also guarantees a
    # non-zero denominator for the ratio below.
    enough_votes = total_ratings > 10
    result = data.loc[enough_votes, ["reviewText"]].copy()

    # Create target column.
    ratio = helpful_ratings[enough_votes] / total_ratings[enough_votes]
    result[TARGET_NAME] = np.where(ratio > TARGET_THRESHOLD, 1, 0).astype(int)

    print("Data preprocessing finished!")

    return result

In [None]:
# Load 20000 rows (`nrows` is forwarded to `pd.read_json`) and preprocess them in one go.
reviews_df = preprocess_data(download_data(nrows=20000))

## Train-test split

In [None]:
# Split features (review text) and target; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    reviews_df[["reviewText"]],
    reviews_df[TARGET_NAME],
    random_state=RANDOM_SEED,
    test_size=TEST_RATIO,
)

## Wrap dataset with giskard

In [None]:
# Re-attach the labels to the held-out features so the wrapped dataset
# contains the target column it is told about.
test_dataset = pd.concat([X_test, y_test], axis="columns")
wrapped_dataset = Dataset(
    test_dataset,
    name="reviews",
    target=TARGET_NAME,
    column_types={"reviewText": "text"},
)

## Define preprocessing pipeline

In [None]:
def remove_punctuation(x):
    """Strip ASCII punctuation from every entry of the ``reviewText`` column.

    Args:
        x: DataFrame with a ``reviewText`` column holding strings.

    Returns:
        A Series (same index and name as ``x.reviewText``) in which every
        character listed in ``string.punctuation`` has been deleted.
    """
    # Build the deletion table once per call instead of once per row.
    table = str.maketrans("", "", string.punctuation)
    return x.reviewText.apply(lambda text: text.translate(table))

# TF-IDF features: English stop words are removed and `min_df=0.01` drops
# terms that occur in too few documents.
vectorizer = TfidfVectorizer(
    min_df=0.01,
    stop_words="english",
)

In [None]:
# Text preprocessing: punctuation removal followed by TF-IDF vectorization.
preprocessor = Pipeline(
    [
        ("punctuation", FunctionTransformer(func=remove_punctuation)),
        ("vectorizer", vectorizer),
    ]
)

## Build estimator

In [None]:
# Full model: the text preprocessor feeding a seeded logistic regression.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("estimator", LogisticRegression(random_state=RANDOM_SEED)),
    ]
)

In [None]:
# Fit the preprocessing steps and the estimator on the training split.
pipeline.fit(X=X_train, y=y_train)

In [None]:
# `predict_proba` returns one column per class; `[:, 1]` selects the second
# column (the scores for label 1). The slice is already one-dimensional, so
# no transpose is needed before passing it to `roc_auc_score`.
train_metric = roc_auc_score(y_train, pipeline.predict_proba(X_train)[:, 1])
test_metric = roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1])

print(f"Train ROC-AUC score: {train_metric}")
print(f"Test ROC-AUC score: {test_metric}")

## Prepare prediction function

In [None]:
def prediction_function(df: pd.DataFrame) -> np.ndarray:
    """Return the fitted pipeline's class probabilities for the rows of ``df``.

    Args:
        df: Feature frame handed straight to ``pipeline.predict_proba``.

    Returns:
        Whatever ``pipeline.predict_proba`` produces for ``df``.
    """
    probabilities = pipeline.predict_proba(df)
    return probabilities

## Wrap model with giskard

In [None]:
# Wrap the prediction function as a giskard classification model over the
# single `reviewText` feature, with labels 0 and 1.
wrapped_model = Model(
    model=prediction_function,
    model_type="classification",
    name="review_helpfulness_predictor",
    feature_names=["reviewText"],
    classification_labels=[0, 1],
    classification_threshold=0.5,
)

In [None]:
# Sanity check: score the wrapped model on the wrapped test set and print its
# ROC-AUC so it can be compared with the metric computed on the bare pipeline.
wrapped_predict = wrapped_model.predict(wrapped_dataset).raw[:, 1].T
wrapped_test_metric = roc_auc_score(y_true=y_test, y_score=wrapped_predict)
print(f"Wrapped Test ROC-AUC score: {wrapped_test_metric}")

## Scan model with giskard

In [None]:
# Run the giskard scan on the wrapped model against the wrapped test dataset.
results = giskard.scan(wrapped_model, wrapped_dataset)

In [None]:
# Show the scan results in the notebook output; `display` is not imported in
# this file, so it is presumably the helper provided by the notebook runtime.
display(results)

## Upload model and dataset to the Giskard platform

In [None]:
# Define project key.
# Reuse the constant instead of repeating the literal, so the project that is
# created/fetched and the project that receives the uploads cannot drift apart.
project_key = GISKARD_PROJECT_KEY

# Init new giskard client.
client = GiskardClient(GISKARD_URL, GISKARD_ACCESS_TOKEN)

# Create or fetch a project by its key.
# NOTE(review): every GiskardError is treated as "project already exists";
# if creation failed for another reason, the `get_project` call below is the
# next point at which an error can surface.
try:
    project = client.create_project(project_key,
                                    name="AMAZON_REVIEW_CLASSIFICATION",
                                    description="Task of classifying review's helpfulness.")
except GiskardError:
    print(f"Project with key '{project_key}' already exists. Trying to get it.")
    project = client.get_project(project_key)

# Upload the model and the dataset.
model_id = wrapped_model.upload(client, project_key)
dataset_id = wrapped_dataset.upload(client, project_key)