In [None]:
import os
import sys
import gzip
import string
from urllib.request import urlretrieve

import giskard
import sklearn
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from nltk.stem.snowball import SnowballStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FunctionTransformer

from giskard import wrap_dataset, wrap_model

In [None]:
# Explicitly select scikit-learn's "default" transform-output setting for this session.
sklearn.set_config(transform_output="default")

In [None]:
# Define constants.
# Seed passed to the train/test split and to the estimator so runs are repeatable.
RANDOM_SEED = 42
# File name of the gzipped reviews dump; it is appended to `url` to build the download link.
DATASET_NAME = 'reviews_Apps_for_Android_5.json.gz'
# Base URL that `maybe_download` prepends to the file name.
url = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/'

### Load Dataset

In [None]:
last_percent_reported = None
def download_progress_hook(count, block_size, total_size):
    """Report download progress on stdout; meant to be used as a ``urlretrieve`` reporthook.

    This is mostly intended for users with slow internet connections. Every time the
    integer percentage changes, the number is printed when it is a multiple of 5 and a
    single dot is printed otherwise.

    Args:
        count: Number of blocks transferred so far.
        block_size: Size of one block in bytes.
        total_size: Total size of the file in bytes. Values <= 0 mean the size is
            unknown; nothing is reported in that case.
    """
    global last_percent_reported

    if total_size <= 0:
        # Unknown size: no meaningful percentage can be computed, and dividing by
        # zero (or by -1) would either crash or print negative progress.
        return

    # The final block is usually only partly filled, so count * block_size can
    # overshoot the real size; clamp so progress never reads above 100%.
    percent = min(100, int(count * block_size * 100 / total_size))

    if last_percent_reported != percent:
        if percent % 5 == 0:
            sys.stdout.write("%s%%" % percent)
        else:
            sys.stdout.write(".")
        sys.stdout.flush()

        last_percent_reported = percent

def maybe_download(filename, expected_bytes, force=False):
    """Fetch ``filename`` unless it is already on disk, then verify its size.

    The file is retrieved from the module-level base ``url`` joined with
    ``filename`` and stored under the same name locally.

    Args:
        filename: Name of the file, used both remotely and locally.
        expected_bytes: Exact size in bytes the local file must have.
        force: Download even when the file already exists.

    Returns:
        The local path of the verified file.

    Raises:
        Exception: If the size on disk differs from ``expected_bytes``.
    """
    needs_download = force or not os.path.exists(filename)
    if needs_download:
        print('Attempting to download:', filename)
        local_path, _headers = urlretrieve(url + filename, filename, reporthook=download_progress_hook)
        print('\nDownload Complete!')
    else:
        local_path = filename

    # Guard clause: bail out first when the size check fails.
    actual_bytes = os.stat(local_path).st_size
    if actual_bytes != expected_bytes:
        raise Exception(
            'Failed to verify ' + local_path + '. Can you get to it with a browser?')

    print('Found and verified', local_path)
    return local_path

# Fetch the dataset archive (skipped when it is already on disk) and check that it is
# exactly 95509687 bytes long.
test = maybe_download(DATASET_NAME, 95509687)

In [None]:
def parse(path):
    """Yield one record per non-blank line of a gzip-compressed text file.

    Every line is expected to hold a single Python/JSON-style literal (for the
    reviews file, a dict). Lines are parsed with ``ast.literal_eval``, which only
    accepts literals, so content coming from the downloaded file cannot execute
    arbitrary code the way ``eval`` would.

    Args:
        path: Path to the ``.gz`` file.

    Yields:
        The object described by each line, in file order.

    Raises:
        ValueError, SyntaxError: If a line is not a valid literal.
    """
    import ast  # local import keeps this cell runnable without touching the import cell

    # Text mode: literal_eval needs str, not bytes. The context manager closes the
    # handle even when the consumer stops iterating early or a line fails to parse.
    with gzip.open(path, 'rt', encoding='utf-8') as lines:
        for line in lines:
            if not line.strip():
                continue  # tolerate blank lines instead of failing on them
            yield ast.literal_eval(line)

def get_df(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Each record becomes one row; rows are labelled 0, 1, 2, ... in the order the
    records are read.

    Args:
        path: Path handed straight to ``parse``.

    Returns:
        A DataFrame with one row per record.
    """
    records_by_row = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_row, orient='index')

# Cache the parsed reviews next to the archive: parse the gzip file only when no
# pickle exists yet, otherwise load the pickle directly.
pickle_file_name = DATASET_NAME[:-len('.json.gz')] + '.pickle'
if not os.path.exists(pickle_file_name):
    data = get_df(DATASET_NAME)
    data.to_pickle(pickle_file_name)
else:
    print('Pickled file already present, loading...')
    data = pd.read_pickle(pickle_file_name)
    print('Pickle file loaded.')

In [None]:
# Preview the first rows of the raw reviews table.
data.head()

In [None]:
# Keep the review text and star rating, and unpack the `helpful` pair into its
# first element (`helpful_ratings`) and second element (`total_ratings`); the
# original pair column is dropped afterwards.
df = (
    data.loc[:, ["reviewText", "overall", "helpful"]]
    .assign(
        helpful_ratings=lambda frame: frame["helpful"].apply(lambda votes: votes[0]),
        total_ratings=lambda frame: frame["helpful"].apply(lambda votes: votes[1]),
    )
    .drop(columns="helpful")
)

In [None]:
# Show the frame after the `helpful` pair was split into two numeric columns.
df

In [None]:
# Filter unreasonable comments: keep only reviews with more than 10 votes in total.
# `.copy()` makes the filtered result an independent frame, so adding columns to it
# afterwards does not raise pandas' SettingWithCopyWarning.
df = df[df.total_ratings > 10].copy()
df.shape

In [None]:
# Create target column: 1 when the share of "helpful" votes exceeds `threshold`, else 0.
threshold = 0.7
# `assign` returns a new frame rather than writing into `df` in place. That avoids
# SettingWithCopyWarning when `df` is a filtered slice and keeps the cell safe to re-run.
df = df.assign(
    Helpful=np.where(df['helpful_ratings'] / df['total_ratings'] > threshold, 1, 0)
)

In [None]:
# Model input: the raw review text, copied so it is detached from `df`.
X = df["reviewText"].copy()
# Target: the `Helpful` label column, also copied.
y = df["Helpful"].copy()

### Train-test split

In [None]:
# Hold out 20% of the rows for testing; RANDOM_SEED fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

### Wrap test dataset

In [None]:
# Put the held-out texts and their labels side by side in one frame (one column each).
raw_dataset = pd.concat([X_test, y_test], axis=1)

In [None]:
# Wrap the held-out frame for Giskard: `Helpful` is the target column and
# `reviewText` is declared as a text column.
wrapped_dataset = wrap_dataset(raw_dataset,
                               name="reviews",
                               target="Helpful",
                               column_types={"reviewText": "text"})

### Define preprocessing pipeline

In [None]:
def make_lowercase(x):
    """Lower-case every string in ``x``.

    Args:
        x: A pandas Series of strings, or a DataFrame whose first column holds
            the strings (only that column is used).

    Returns:
        A Series with each value lower-cased.
    """
    texts = x.iloc[:, 0] if isinstance(x, pd.DataFrame) else x
    return texts.apply(lambda text: text.lower())

def remove_punctuation(x):
    """Strip every character listed in ``string.punctuation`` from each string in ``x``.

    Args:
        x: A pandas Series of strings, or a DataFrame whose first column holds
            the strings (same convention as ``make_lowercase``).

    Returns:
        A new Series with the punctuation removed; the input is left untouched.
    """
    if isinstance(x, pd.DataFrame):
        x = x.iloc[:, 0]

    # Build the deletion table once instead of once per row.
    table = str.maketrans('', '', string.punctuation)

    # Return the transformed Series. `apply` does not modify `x` in place, so
    # discarding its result would hand the text back with punctuation intact.
    return x.apply(lambda text: text.translate(table))

In [None]:
# One shared English Snowball stemmer, reused for every document.
stemmer = SnowballStemmer("english")
def tokenizer(x):
    """Split ``x`` on whitespace and stem every resulting token.

    Args:
        x: A single document as a string.

    Returns:
        A list with the stem of each token, in the original order.
    """
    # Build the list directly rather than appending inside a throwaway comprehension.
    return [stemmer.stem(word) for word in x.split()]

# TF-IDF features over unigrams, using the custom stemming `tokenizer` defined above.
# NOTE(review): min_df=0.01 presumably drops terms that occur in fewer than 1% of the
# documents — confirm against the scikit-learn documentation.
vectorizer = TfidfVectorizer(tokenizer=tokenizer, stop_words='english', ngram_range=(1, 1), min_df=0.01)

In [None]:
# Text preprocessing chain: lower-casing step, punctuation step, then TF-IDF vectorization.
preprocessor = Pipeline(steps=[
    ("lowercase", FunctionTransformer(make_lowercase)),
    ("punctuation", FunctionTransformer(remove_punctuation)),
    ("vectorizer", vectorizer)
])

### Build estimator

In [None]:
# Logistic regression classifier, seeded with RANDOM_SEED.
estimator = LogisticRegression(random_state=RANDOM_SEED)

In [None]:
# Full model: the text preprocessing chain followed by the classifier.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("estimator", estimator)
])

In [None]:
# Fit the preprocessing steps and the classifier on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# ROC AUC on the held-out split, scored with the predicted probability of class 1
# (second column of `predict_proba`).
roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1])

### Wrap model

In [None]:
# Wrap the fitted pipeline for Giskard as a classification model whose only
# feature is the "reviewText" column.
wrapped_model = wrap_model(model=pipeline,
                           model_type="classification",
                           feature_names=["reviewText"],
                           name="review_helpfulness_predictor")

In [None]:
# Run the Giskard scan on the wrapped model against the wrapped held-out dataset.
results = giskard.scan(model=wrapped_model, dataset=wrapped_dataset)

In [None]:
# Render the scan results in the notebook.
display(results)

In [None]:
# Class balance of the held-out labels.
print(y_test.value_counts())

In [None]:
# Connection settings for the Giskard server.
# The server address gets its own name so the dataset base `url` defined with the
# notebook constants is not overwritten; `maybe_download` reads that global, and
# re-running the download cell after this one would otherwise hit the wrong host.
giskard_url = "http://localhost:19000"
# Read the API token from the environment instead of hardcoding it in the notebook;
# falls back to the empty placeholder when the variable is not set.
token = os.environ.get("GISKARD_API_TOKEN", "")
# `giskard` is already imported in the import cell, so no extra import is needed here.
client = giskard.GiskardClient(giskard_url, token)

# Single definition of the project key used by every call below.
PROJECT_KEY = "amazon_review"

# Create a project (uncomment on first use), then fetch it.
# your_project = client.create_project(PROJECT_KEY, "AMAZON_REVIEW", "AMAZON_REVIEW_CLASSIFICATION")
your_project = client.get_project(PROJECT_KEY)

# Upload your model and dataset
model_id = wrapped_model.upload(client, PROJECT_KEY)
dataset_id = wrapped_dataset.upload(client, PROJECT_KEY)

In [None]:
# Number of held-out reviews.
len(X_test)