In [1]:
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

from sklearn.metrics import accuracy_score

In [2]:
data_dir = '../data/sklearn' # The folder we will use for storing data
os.makedirs(data_dir, exist_ok=True)  # ensure cache directory exists

cache_dir = os.path.join("../cache", "lstm")  # where to store cache files
os.makedirs(cache_dir, exist_ok=True)  # ensure cache directory exists

# 1. Data Prep

## 1.1 Load Data

In [42]:
train_df = pickle.load(open('../data/processed/train_df.pkl', 'rb'))
val_df = pickle.load(open('../data/processed/val_df.pkl', 'rb'))
test_df = pickle.load(open('../data/processed/test_df.pkl', 'rb'))

train_df = train_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})
val_df = val_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})
test_df = test_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})

train_df = pd.concat([train_df, val_df]).reset_index(drop = True)
val_df = None

In [43]:
test_y = test_df.label.values

## 1.2 Question to List of Words

In [69]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

import os
import re
from bs4 import BeautifulSoup

def question_to_words(question):
    nltk.download("stopwords", quiet=True)
    stemmer = PorterStemmer()
    #print(review)
    #text = BeautifulSoup(review, "html.parser").get_text() # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9]", " ", question.lower()) # Convert to lower case
    words = text.split() # Split string into words
    words = [w for w in words if w not in stopwords.words("english")] # Remove stopwords
    words = [PorterStemmer().stem(w) for w in words] # stem
    
    return words

In [70]:
import pickle

def preprocess_data(data_train, data_test, labels_train, labels_test,
                    cache_dir=cache_dir, cache_file="preprocessed_data.pkl"):
    """Convert each review to words; read from cache if available."""

    # If cache_file is not None, try to read from it first
    cache_data = None
    if cache_file is not None:
        try:
            with open(os.path.join(cache_dir, cache_file), "rb") as f:
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file:", cache_file)
        except:
            pass  # unable to read from cache, but that's okay
    
    # If cache is missing, then do the heavy lifting
    if cache_data is None:
        # Preprocess training and test data to obtain words for each review
        words_train = [question_to_words(question) for question in tqdm(data_train)]
        words_test = [question_to_words(question) for question in tqdm(data_test)]
        
        # Write to cache file for future runs
        if cache_file is not None:
            cache_data = dict(words_train=words_train, words_test=words_test,
                              labels_train=labels_train, labels_test=labels_test)
            with open(os.path.join(cache_dir, cache_file), "wb") as f:
                pickle.dump(cache_data, f)
            print("Wrote preprocessed data to cache file:", cache_file)
    else:
        # Unpack data loaded from cache file
        words_train, words_test, labels_train, labels_test = (cache_data['words_train'],
                cache_data['words_test'], cache_data['labels_train'], cache_data['labels_test'])
    
    return words_train, words_test, labels_train, labels_test

In [71]:
train_X, train_y = train_df.sentence.values, train_df.label.values
test_X, test_y = test_df.sentence.values, test_df.label.values

In [72]:
train_X, test_X, train_y, test_y = preprocess_data(train_X, test_X, train_y, test_y)

Read preprocessed data from cache file: preprocessed_data.pkl


In [73]:
train_df = test_df = None

In [74]:
# train_X.shape

## 1.3 Extract BOW features

In [75]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.externals 
import joblib
# joblib is an enhanced version of pickle that is more efficient for storing NumPy arrays

In [76]:
# with open(os.path.join(cache_dir, "bow_features.pkl"), "rb") as f:
#     cache_data = joblib.load(f)

In [77]:
def extract_BoW_features(words_train, words_test, vocabulary_size=5000,
                         cache_dir=cache_dir, cache_file="bow_features.pkl"):
    """Extract Bag-of-Words for a given set of documents, already preprocessed into words."""
    
    # If cache_file is not None, try to read from it first
    cache_data = None
    if cache_file is not None:
        try:
            with open(os.path.join(cache_dir, cache_file), "rb") as f:
                cache_data = joblib.load(f)
            print("Read features from cache file:", cache_file)
        except:
            pass  # unable to read from cache, but that's okay
    
    # If cache is missing, then do the heavy lifting
    if cache_data is None:
        # Fit a vectorizer to training documents and use it to transform them
        # NOTE: Training documents have already been preprocessed and tokenized into words;
        #       pass in dummy functions to skip those steps, e.g. preprocessor=lambda x: x
        vectorizer = CountVectorizer(max_features=vocabulary_size,
                preprocessor=lambda x: x, tokenizer=lambda x: x)  # already preprocessed
        features_train = vectorizer.fit_transform(words_train).toarray()

        # Apply the same vectorizer to transform the test documents (ignore unknown words)
        features_test = vectorizer.transform(words_test).toarray()
        
        # NOTE: Remember to convert the features using .toarray() for a compact representation
        
        # Write to cache file for future runs (store vocabulary as well)
        if cache_file is not None:
            vocabulary = vectorizer.vocabulary_
            cache_data = dict(features_train=features_train, features_test=features_test,
                             vocabulary=vocabulary)
            with open(os.path.join(cache_dir, cache_file), "wb") as f:
                joblib.dump(cache_data, f)
            print("Wrote features to cache file:", cache_file)
    else:
        # Unpack data loaded from cache file
        features_train, features_test, vocabulary = (cache_data['features_train'],
                cache_data['features_test'], cache_data['vocabulary'])
    
    # Return both the extracted features as well as the vocabulary
    return features_train, features_test, vocabulary

In [78]:
# Extract Bag of Words features for both training and test datasets
train_X, test_X, vocabulary = extract_BoW_features(train_X, test_X)

Read features from cache file: bow_features.pkl


- **Test set**

In [79]:
pd.DataFrame(test_X).to_csv(os.path.join(data_dir, 'test_sklearn.csv'), header=False, index=False)
test_X = vocabulary = None

- **Validation Set**

In [18]:
val_idx = np.random.choice(range(len(train_y)), size=10000, replace=False)

val_X = pd.DataFrame(train_X[val_idx])
val_y = pd.DataFrame(train_y[val_idx])

val_X.shape, val_y.shape

((10000, 5000), (10000, 1))

In [19]:
pd.concat([val_y, val_X], axis=1).to_csv(os.path.join(data_dir, 'val_sklearn.csv'), header=False, index=False)
val_y = val_X = None

- **Train Set**

In [25]:
train_idx = [i for i in range(len(train_y)) if i not in val_idx]

train_X = pd.DataFrame(train_X[train_idx])
train_y = pd.DataFrame(train_y[train_idx])

train_X.shape, train_y.shape

In [None]:
pd.concat([train_y, train_X], axis=1).to_csv(os.path.join(data_dir, 'train_sklearn.csv'), header=False, index=False)

In [24]:
test_X = train_X = val_X = train_y = val_y = None

## 1.4 Upload to S3

In [5]:
import sagemaker

session = sagemaker.Session() # Store the current SageMaker session
bucket = sagemaker.Session().default_bucket()

prefix = 'sagemaker/stackoverflow-question-quality'

test_location = session.upload_data(os.path.join(data_dir, 'test_sklearn.csv'), key_prefix=prefix)
val_location = session.upload_data(os.path.join(data_dir, 'val_sklearn.csv'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(data_dir, 'train_sklearn.csv'), key_prefix=prefix)

In [7]:
# test_location = session.upload_data(os.path.join(data_dir, 'test_sklearn.csv'), key_prefix=prefix)

In [8]:
# t = pd.read_csv(data_dir + '/test_sklearn.csv', header = None)
# t.head()

In [11]:
# t[5].unique()

In [12]:
# t.shape

In [13]:
# t = pd.read_csv(data_dir + '/val_sklearn.csv', header = None)
# t.head()

# 2. Model Training

In [28]:
sagemaker.__version__

'2.48.1'

In [14]:
# pip install -U sagemaker

In [15]:
import sagemaker
from sagemaker import image_uris
from sagemaker.inputs import TrainingInput
from sagemaker import get_execution_role

role = get_execution_role()
container = image_uris.retrieve(framework='xgboost',region='us-east-1',version='1.2-1')

In [16]:
xgb = sagemaker.estimator.Estimator(container, # The location of the container we wish to use
                                    role,                                    # What is our current IAM Role
                                    train_instance_count=1,                  # How many compute instances
                                    train_instance_type='ml.m4.xlarge',      # What kind of compute instances
                                    output_path='s3://{}/{}/sklearn_output'.format(session.default_bucket(), prefix),
                                    sagemaker_session=session)

xgb.set_hyperparameters(max_depth=11,
                        eta=0.2622829413333343,
                        gamma=1.0375026465214707,
                        min_child_weight=3,
                        subsample=0.8544514211613925,
                        #silent=0,
                        objective='multi:softmax',
                        num_class=3,
                        early_stopping_rounds=10,
                        num_round=500)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [17]:
train_input = TrainingInput("s3://{}/{}/{}".format(bucket, prefix, 'train_sklearn.csv'), content_type='csv')
validation_input = TrainingInput("s3://{}/{}/{}".format(bucket, prefix, 'val_sklearn.csv'), content_type='csv')

In [25]:
"s3://{}/{}/{}".format(bucket, prefix, 'train_sklearn.csv')# validation_input

's3://sagemaker-us-east-1-997893341280/sagemaker/stackoverflow-question-quality/train_sklearn.csv'

In [18]:
xgb.fit({'train': train_input, 'validation': validation_input})

2021-07-09 04:12:06 Starting - Starting the training job...
2021-07-09 04:12:29 Starting - Launching requested ML instancesProfilerReport-1625803925: InProgress
...
2021-07-09 04:13:04 Starting - Preparing the instances for training............
2021-07-09 04:14:50 Downloading - Downloading input data...
2021-07-09 04:15:30 Training - Downloading the training image...
2021-07-09 04:15:57 Training - Training image download completed. Training in progress..[34m[2021-07-09 04:16:00.002 ip-10-2-133-222.ec2.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value multi:softmax to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root

# 3. Model Evaluation

In [34]:
# "s3://{}/{}/{}".format(bucket, prefix, 'val_sklearn.csv')
test_location = 's3://sagemaker-us-east-1-997893341280/sagemaker/stackoverflow-question-quality/test_sklearn.csv'

In [35]:
xgb_transformer = xgb.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')

In [36]:
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')

.......................................[34m[2021-07-09:07:54:16:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-07-09:07:54:16:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-07-09:07:54:16:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }

  server {
    listen 8080 deferred;
    client_max_body_size 0;

    keepalive_timeout 3;

    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn;
 

In [37]:
# test_location

In [38]:
# xgb_transformer.wait()
xgb_transformer.output_path

's3://sagemaker-us-east-1-997893341280/sagemaker-xgboost-2021-07-09-07-47-55-844'

In [88]:
# !aws s3 cp --recursive $xgb_transformer.output_path $data_dir

'../data/sklearn'

In [39]:
predictions = pd.read_csv(os.path.join(xgb_transformer.output_path, 'test_sklearn.csv.out'), header=None)
predictions = [round(num) for num in predictions.squeeze().values]

In [44]:
accuracy_score(test_y, predictions)

0.7146666666666667

In [45]:
test_y

array([1, 1, 2, ..., 1, 0, 0])

In [91]:
# !$xgb_transformer.output_path
# bucket

In [92]:
# !aws s3 cp
# data_dir

# 4. Hyperparameter Tuning

In [95]:
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

In [96]:
xgb = sagemaker.estimator.Estimator(container, # The location of the container we wish to use
                                    role,                                    # What is our current IAM Role
                                    train_instance_count=1,                  # How many compute instances
                                    train_instance_type='ml.m4.xlarge',      # What kind of compute instances
                                    output_path='s3://{}/{}/sklearn_output'.format(session.default_bucket(), prefix),
                                    sagemaker_session=session)

xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        #silent=0,
                        objective='multi:softmax',
                        num_class=3,
                        early_stopping_rounds=10,
                        num_round=500)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [97]:
xgb_hyperparameter_tuner = HyperparameterTuner(estimator = xgb, # The estimator object to use as the basis for the training jobs.
                                               objective_metric_name = 'validation:merror', # The metric used to compare trained models.
                                               objective_type = 'Minimize', # Whether we wish to minimize or maximize the metric.
                                               max_jobs = 6, # The total number of models to train
                                               max_parallel_jobs = 3, # The number of models to train in parallel
                                               hyperparameter_ranges = {
                                                    'max_depth': IntegerParameter(3, 12),
                                                    'eta'      : ContinuousParameter(0.05, 0.5),
                                                    'min_child_weight': IntegerParameter(2, 8),
                                                    'subsample': ContinuousParameter(0.5, 0.9),
                                                    'gamma': ContinuousParameter(0, 10),
                                               })

In [98]:
train_input = TrainingInput("s3://{}/{}/{}".format(bucket, prefix, 'train_sklearn.csv'), content_type='csv')
validation_input = TrainingInput("s3://{}/{}/{}".format(bucket, prefix, 'val_sklearn.csv'), content_type='csv')

In [99]:
xgb_hyperparameter_tuner.fit({'train': train_input, 'validation': validation_input})

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [12]:
test_location = 's3://sagemaker-us-east-1-997893341280/sagemaker/stackoverflow-question-quality/test_sklearn.csv'
# best_training_job_location = 's3://sagemaker-us-east-1-997893341280/sagemaker/stackoverflow-question-quality/sklearn_output/sagemaker-xgboost-210708-0851-006-1c702236/output'

In [8]:
xgb_attached = sagemaker.estimator.Estimator.attach('sagemaker-xgboost-2021-07-08-14-02-27-286')


2021-07-08 15:51:00 Starting - Preparing the instances for training
2021-07-08 15:51:00 Downloading - Downloading input data
2021-07-08 15:51:00 Training - Training image download completed. Training in progress.
2021-07-08 15:51:00 Uploading - Uploading generated training model
2021-07-08 15:51:00 Completed - Training job completed


In [9]:
xgb_transformer = xgb_attached.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')

In [10]:
# test_input = TrainingInput("s3://{}/{}/{}".format(bucket, prefix, 'test_sklearn.csv'), content_type='csv')

In [None]:
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')

...............

In [2]:
# xgb_transformer.wait()

In [None]:
# pickle.dump(xgb_hyperparameter_tuner, open('../data/sklearn/xgb_hyperparameter_tuner.pkl', 'wb'))

In [None]:
# !aws s3 cp --recursive $xgb_transformer.output_path $data_dir

In [None]:
predictions = pd.read_csv(os.path.join(data_dir, 'test_sklearn.csv.out'), header=None)
predictions = [round(num) for num in predictions.squeeze().values]

In [None]:
accuracy_score(test_y, predictions)