In [1]:
import os
import sys
import gzip
import string
from urllib.request import urlretrieve

import giskard
import sklearn
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from nltk.stem.snowball import SnowballStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FunctionTransformer

from giskard import wrap_dataset, wrap_model

In [2]:
sklearn.set_config(transform_output="default")

In [3]:
# Define constants.
RANDOM_SEED = 42
DATASET_NAME = 'reviews_Apps_for_Android_5.json.gz'
url = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/'

### Load Dataset

In [4]:
last_percent_reported = None
def download_progress_hook(count, block_size, total_size):
    """A hook to report the progress of a download. This is mostly intended for users with
    slow internet connections. Reports every 1% change in download progress.
    """
    global last_percent_reported
    percent = int(count * block_size * 100 / total_size)

    if last_percent_reported != percent:
        if percent % 5 == 0:
            sys.stdout.write("%s%%" % percent)
            sys.stdout.flush()
        else:
            sys.stdout.write(".")
            sys.stdout.flush()

        last_percent_reported = percent

def maybe_download(filename, expected_bytes, force=False):
    """Download a file if not present, and make sure it's the right size."""
    if force or not os.path.exists(filename):
        print('Attempting to download:', filename)
        filename, _ = urlretrieve(url + filename, filename, reporthook=download_progress_hook)
        print('\nDownload Complete!')

    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')

    return filename

test = maybe_download(DATASET_NAME, 95509687)

Found and verified reviews_Apps_for_Android_5.json.gz


In [5]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def get_df(path):
    i = 0
    data = dict()

    for d in parse(path):
        data[i] = d
        i += 1

    return pd.DataFrame.from_dict(data, orient='index')

# Load pickle if exists, extract and pickle if it doesn't.
pickle_file_name = (DATASET_NAME[:-8] + '.pickle')
if os.path.exists(pickle_file_name):
    print ('Pickled file already present, loading...')
    data = pd.read_pickle(pickle_file_name)
    print ('Pickle file loaded.')
else:
    data = get_df(DATASET_NAME)
    data.to_pickle(pickle_file_name)

Pickled file already present, loading...
Pickle file loaded.


In [6]:
data.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1N4O8VOJZTDVB,B004A9SDD8,Annette Yancey,"[1, 1]","Loves the song, so he really couldn't wait to ...",3.0,Really cute,1383350400,"11 2, 2013"
1,A2HQWU6HUKIEC7,B004A9SDD8,"Audiobook lover ""Kathy""","[0, 0]","Oh, how my little grandson loves this app. He'...",5.0,2-year-old loves it,1323043200,"12 5, 2011"
2,A1SXASF6GYG96I,B004A9SDD8,Barbara Gibbs,"[0, 0]",I found this at a perfect time since my daught...,5.0,Fun game,1337558400,"05 21, 2012"
3,A2B54P9ZDYH167,B004A9SDD8,"Brooke Greenstreet ""Babylove""","[3, 4]",My 1 year old goes back to this game over and ...,5.0,We love our Monkeys!,1354752000,"12 6, 2012"
4,AFOFZDTX5UC6D,B004A9SDD8,C. Galindo,"[1, 1]",There are three different versions of the song...,5.0,This is my granddaughters favorite app on my K...,1391212800,"02 1, 2014"


In [7]:
# Select the columns.
df = data.loc[:, ["reviewText", "overall", "helpful"]].copy()

# Split numerator and denominator.
df['helpful_ratings'] = df['helpful'].apply(lambda x: x[0])
df['total_ratings'] = df['helpful'].apply(lambda x: x[1])

# Delete un-needed 'helpful' category.
del df['helpful']

In [8]:
df

Unnamed: 0,reviewText,overall,helpful_ratings,total_ratings
0,"Loves the song, so he really couldn't wait to ...",3.0,1,1
1,"Oh, how my little grandson loves this app. He'...",5.0,0,0
2,I found this at a perfect time since my daught...,5.0,0,0
3,My 1 year old goes back to this game over and ...,5.0,3,4
4,There are three different versions of the song...,5.0,1,1
...,...,...,...,...
752932,I love it!!!!!!!!!! really keeps your attenti...,5.0,0,0
752933,Okay but there are so many free ones that are ...,3.0,0,0
752934,Another great jewels game that just keeps you ...,4.0,0,0
752935,I find this the best jewels star ever. There s...,5.0,0,0


In [31]:
# Filter unreasonable comments.
df = df[(df.total_ratings > 10)]
df.shape

(47932, 5)

In [65]:
# Create target column.
threshold = 0.7
df.loc[:, 'Helpful'] = np.where(df.loc[:, 'helpful_ratings'] \
                                 / df.loc[:, 'total_ratings'] > threshold, 1, 0)

In [66]:
X = df["reviewText"].copy()
y = df["Helpful"].copy()

### Train-test split

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

### Wrap test dataset

In [68]:
raw_dataset = pd.concat([X_test, y_test], axis=1)

In [69]:
wrapped_dataset = wrap_dataset(raw_dataset,
                               name="reviews",
                               target="Helpful",
                               column_types={"reviewText": "text"})

Your 'pandas.DataFrame' dataset is successfully wrapped by Giskard's 'Dataset' wrapper class.


### Define preprocessing pipeline

In [70]:
def make_lowercase(x):
    if isinstance(x, pd.DataFrame):
        x = x.iloc[:, 0]
    x = x.apply(lambda row: row.lower())
    return x

def remove_punctuation(x):
    def _remove_punctuation(text):
        return text.translate(str.maketrans('', '', string.punctuation))

    x.apply(lambda row: _remove_punctuation(row))
    return x

In [71]:
stemmer = SnowballStemmer("english")
def tokenizer(x):
    x = x.split()
    stems = list()
    [stems.append(stemmer.stem(word)) for word in x]
    return stems

vectorizer = TfidfVectorizer(tokenizer=tokenizer, stop_words='english', ngram_range=(1, 1), min_df=0.01)

In [72]:
preprocessor = Pipeline(steps=[
    ("lowercase", FunctionTransformer(make_lowercase)),
    ("punctuation", FunctionTransformer(remove_punctuation)),
    ("vectorizer", vectorizer)
])

### Build estimator

In [73]:
estimator = LogisticRegression(random_state=RANDOM_SEED)

In [74]:
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("estimator", estimator)
])

In [75]:
pipeline.fit(X_train, y_train)



In [76]:
roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1].T)

0.7078508740933849

### Wrap model

In [77]:
wrapped_model = wrap_model(model=pipeline,
                           model_type="classification",
                           feature_names=["reviewText"],
                           name="review_helpfulness_predictor")

Your 'sklearn' model is successfully wrapped by Giskard's 'SKLearnModel' wrapper class.


In [81]:
results = giskard.scan(model=wrapped_model, dataset=wrapped_dataset)

Hint: "Your target variable values are numeric. It is recommended to have Human readable string as your target values to make results more understandable in Giskard."




Your model is successfully validated.


In [79]:
display(results)

In [80]:
print(y_test.value_counts())

1    6762
0    2825
Name: Helpful, dtype: int64


In [30]:
from giskard import GiskardClient

# Create a project
url = "http://localhost:19000"
token = "eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsInRva2VuX3R5cGUiOiJBUEkiLCJhdXRoIjoiUk9MRV9BRE1JTiIsImV4cCI6MTY5MDI5Mjk4OH0.UwKaFWP_Jr3Zm7q2cotrcoyiV9ren3o7TwsFnGYwkFM"
client = GiskardClient(url, token)
# your_project = client.create_project("amazon_review", "AMAZON_REVIEW", "AMAZON_REVIEW_CLASSIFICATION")
your_project = client.get_project("amazon_review")

# Upload your model and dataset
model_id = wrapped_model.upload(client, "amazon_review")
dataset_id = wrapped_dataset.upload(client, "amazon_review")

Your model is successfully validated.


TypeError: Object of type int64 is not JSON serializable

In [85]:
len(X_test)

9587