In [2]:
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

from sklearn.metrics import accuracy_score

In [3]:
data_dir = '../data/lstm' # The folder we will use for storing data
os.makedirs(data_dir, exist_ok=True)  # ensure cache directory exists

cache_dir = os.path.join("../cache", "lstm")  # where to store cache files
os.makedirs(cache_dir, exist_ok=True)  # ensure cache directory exists

# 1. Data Prep

## 1.1 Loading Data

In [26]:
train_df = pickle.load(open('../data/processed/train_df.pkl', 'rb'))
val_df = pickle.load(open('../data/processed/val_df.pkl', 'rb'))
test_df = pickle.load(open('../data/processed/test_df.pkl', 'rb'))

train_df = train_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})
val_df = val_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})
test_df = test_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})

train_df = pd.concat([train_df, val_df]).reset_index(drop = True)
val_df = None

## 1.2 Question to List of Words

In [27]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

import os
import re
from bs4 import BeautifulSoup

def question_to_words(question):
    nltk.download("stopwords", quiet=True)
    stemmer = PorterStemmer()
    #print(review)
    #text = BeautifulSoup(review, "html.parser").get_text() # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9]", " ", question.lower()) # Convert to lower case
    words = text.split() # Split string into words
    words = [w for w in words if w not in stopwords.words("english")] # Remove stopwords
    words = [PorterStemmer().stem(w) for w in words] # stem
    
    return words

In [28]:
import pickle

def preprocess_data(data_train, data_test, labels_train, labels_test,
                    cache_dir=cache_dir, cache_file="preprocessed_data.pkl"):
    """Convert each review to words; read from cache if available."""

    # If cache_file is not None, try to read from it first
    cache_data = None
    if cache_file is not None:
        try:
            with open(os.path.join(cache_dir, cache_file), "rb") as f:
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file:", cache_file)
        except:
            pass  # unable to read from cache, but that's okay
    
    # If cache is missing, then do the heavy lifting
    if cache_data is None:
        # Preprocess training and test data to obtain words for each review
        words_train = [question_to_words(question) for question in tqdm(data_train)]
        words_test = [question_to_words(question) for question in tqdm(data_test)]
        
        # Write to cache file for future runs
        if cache_file is not None:
            cache_data = dict(words_train=words_train, words_test=words_test,
                              labels_train=labels_train, labels_test=labels_test)
            with open(os.path.join(cache_dir, cache_file), "wb") as f:
                pickle.dump(cache_data, f)
            print("Wrote preprocessed data to cache file:", cache_file)
    else:
        # Unpack data loaded from cache file
        words_train, words_test, labels_train, labels_test = (cache_data['words_train'],
                cache_data['words_test'], cache_data['labels_train'], cache_data['labels_test'])
    
    return words_train, words_test, labels_train, labels_test

In [29]:
train_X, train_y = train_df.sentence.values, train_df.label.values
test_X, test_y = test_df.sentence.values, test_df.label.values

In [30]:
train_X, test_X, train_y, test_y = preprocess_data(train_X, test_X, train_y, test_y)

Read preprocessed data from cache file: preprocessed_data.pkl


In [31]:
train_df = test_df = None

## 1.3 List of words to word vector

In [32]:
import numpy as np

def build_dict(data, vocab_size = 5000):
    """Construct and return a dictionary mapping each of the most frequently appearing words to a unique integer."""
    word_count = {} # A dict storing the words that appear in the reviews along with how often they occur
    for sent in tqdm(data):
        for word in sent:
            if word in word_count.keys():
                word_count[word] += 1
            else:
                word_count[word] = 1
    
    sorted_words = [pair[0] for pair in sorted(word_count.items(), key = lambda x: x[1], reverse = True)]
    
    word_dict = {} # This is what we are building, a dictionary that translates words into integers
    for idx, word in enumerate(sorted_words[:vocab_size - 2]): # The -2 is so that we save room for the 'no word'
        word_dict[word] = idx + 2                              # 'infrequent' labels
        
    return word_dict

In [33]:
try:
    word_dict = pickle.load(open(os.path.join(data_dir, 'word_dict.pkl'), 'rb'))
    print('Loaded word_dict from local cache!')
except:
    word_dict = build_dict(train_X)
    with open(os.path.join(data_dir, 'word_dict.pkl'), "wb") as f:
        pickle.dump(word_dict, f)
    print('Constructed word_dict and saved to local cache!')

Loaded word_dict from local cache!


In [25]:
# word_dict = pickle.load(open(os.path.join(data_dir, 'word_dict.pkl'), 'rb'))

## 1.4 Transform the Data

In [34]:
def convert_and_pad(word_dict, sentence, pad=2000):
    NOWORD = 0 # We will use 0 to represent the 'no word' category
    INFREQ = 1 # and we use 1 to represent the infrequent words, i.e., words not appearing in word_dict
    
    working_sentence = [NOWORD] * pad
    
    for word_index, word in enumerate(sentence[:pad]):
        if word in word_dict:
            working_sentence[word_index] = word_dict[word]
        else:
            working_sentence[word_index] = INFREQ
            
    return working_sentence, min(len(sentence), pad)

def convert_and_pad_data(word_dict, data, pad=2000):
    result = []
    lengths = []
    
    for sentence in tqdm(data):
        converted, leng = convert_and_pad(word_dict, sentence, pad)
        result.append(converted)
        lengths.append(leng)
        
    return np.array(result), np.array(lengths)

In [None]:
train_X, train_X_len = convert_and_pad_data(word_dict, train_X)

In [None]:
train_df = test_df = None
word_dict = None
test_X = test_y = None

## 1.5 Upload to S3

In [None]:
pd.concat([pd.DataFrame(train_y), pd.DataFrame(train_X_len), pd.DataFrame(train_X)], axis=1).to_csv(os.path.join(data_dir, 'lstm_train.csv'), header=False, index=False)

In [48]:
import sagemaker

sagemaker_session = sagemaker.Session()

bucket = sagemaker_session.default_bucket()
prefix = 'sagemaker/stackoverflow-question-quality'

role = sagemaker.get_execution_role()

In [6]:
input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)

In [7]:
data_dir

'../data/lstm'

# 2. Training

In [8]:
from sagemaker.pytorch import PyTorch

estimator = PyTorch(entry_point="train.py",
                    source_dir="../src_lstm",
                    role=role,
                    framework_version='0.4.0',
                    train_instance_count=1,
                    train_instance_type='ml.p2.xlarge',
                    py_version='py3', # Joey: not sure if this is expected here
                    hyperparameters={
                        'epochs': 10,
                        'hidden_dim': 200
                    })

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [None]:
estimator.fit({'training': input_data})

2021-07-09 04:13:13 Starting - Starting the training job...
2021-07-09 04:13:38 Starting - Launching requested ML instancesProfilerReport-1625803993: InProgress
......
2021-07-09 04:14:38 Starting - Preparing the instances for training............
2021-07-09 04:16:38 Downloading - Downloading input data...
2021-07-09 04:17:10 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-07-09 04:17:11,985 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-07-09 04:17:12,012 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-07-09 04:17:12,670 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-07-09 04:17:12,990 sagemaker-containers INFO     Module train does not provide a setup.py. [0m
[34mGenerating se

In [None]:
train_X = None

In [14]:
# from sagemaker.pytorch import PyTorch

# estimator = PyTorch(entry_point="train.py",
#                     model_uri = 's3://sagemaker-us-east-1-997893341280/sagemaker-pytorch-2021-07-09-04-13-13-248/output/model.tar.gz')

# 3. Testing

In [16]:
from sagemaker.estimator import Estimator
# lstm_estimator = Estimator.attach()
my_training_job_name = 'sagemaker-pytorch-2021-07-09-04-13-13-248'
lstm_estimator = PyTorch.attach(my_training_job_name)


2021-07-10 00:25:48 Starting - Preparing the instances for training
2021-07-10 00:25:48 Downloading - Downloading input data
2021-07-10 00:25:48 Training - Training image download completed. Training in progress.
2021-07-10 00:25:48 Uploading - Uploading generated training model
2021-07-10 00:25:48 Completed - Training job completed


In [17]:
lstm_transformer = lstm_estimator.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')

In [None]:
# train_X = train_X_len = None

In [8]:
try:
    test_X = pickle.load(open('../data/lstm/test_X.pkl', 'rb'))
    print(' => Loaded text_X from local pickle!')
except:
    print(' => Pickle file not found... Creating text_X!')
    test_X, test_X_len = convert_and_pad_data(word_dict, test_X)
    test_X = pd.concat([pd.DataFrame(test_X_len), pd.DataFrame(test_X)], axis=1)
    pickle.dump(test_X, open('../data/lstm/test_X.pkl', 'wb'))
    
# pd.concat([pd.DataFrame(test_X_len), pd.DataFrame(test_X)], axis=1).to_csv(os.path.join(data_dir, 'lstm_test.csv'), header=False, index=False)

 => Loaded text_X from local pickle!


- **Batch-transform Method**

In [19]:
# session = sagemaker.Session() # Store the current SageMaker session
# bucket = sagemaker.Session().default_bucket()

# test_location = session.upload_data(os.path.join(data_dir, 'lstm_test.csv'), key_prefix=prefix)

In [21]:
test_location = 's3://sagemaker-us-east-1-997893341280/sagemaker/stackoverflow-question-quality/lstm_test.csv'

In [None]:
lstm_transformer.transform(test_location, content_type='text/csv', split_type='Line')

............................[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m[2021-07-10 02:10:55 +0000] [17] [INFO] Starting gunicorn 19.9.0[0m
[34m[2021-07-10 02:10:55 +0000] [17] [INFO] Listening at: unix:/tmp/gunicorn.sock (17)[0m
[34m[2021-07-10 02:10:55 +0000] [17] [INFO] Using worker: gevent[0m
[34m[2021-07-10 02:10:55 +0000] [24] [INFO] Booting worker with pid: 24[0m
[34m[2021-07-10 02:10:56 +0000] [28] [INFO] Booting worker with pid: 28[0m
[34m[2021-07-10 02:10:56 +0000] [29] [INFO] Booting worker with pid: 29[0m
[34m[2021-07-10 02:10:56 +0000] [30] [INFO] Booting worker with pid: 30[0m
[34mProcessing /opt/ml/code[0m
[34mCollecting pandas (from -r requirements.txt (line 1))[0m
[34m  Downloading https://files.pythonhosted.org/packages/74/24/0cdbf8907e1e3bc5a8da03345c23cbed7044330bb8f73bb12e711a640a00/pandas-0.24.2-cp35-cp35m-manylinux1_x86_64.whl (10.0MB)[0m
[34mCollecting 

- **Deploy Method**

In [9]:
predictor = lstm_estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

# We split the data into chunks and send each chunk seperately, accumulating the results.

def predict(data, rows=512):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = np.array([])
    for array in split_array:
        predictions = np.append(predictions, predictor.predict(array))
    
    return predictions

---------------------------------*

UnexpectedStatusException: Error hosting endpoint sagemaker-pytorch-2021-07-10-01-34-03-915: Failed. Reason:  The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint..

In [None]:
predictions = predict(test_X.values)
predictions = [round(num) for num in predictions]

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(test_y, predictions)

# 4. Experiment Zone

In [8]:
import numpy as np
import torch.nn as nn
import torch
import pickle
import os

In [12]:
word_dict = pickle.load(open(os.path.join(data_dir, 'word_dict.pkl'), 'rb'))
list(word_dict.items())[:5]

[('would', 2), ('gt', 3), ('lt', 4), ('use', 5), ('string', 6)]

In [1]:
class LSTMClassifier(nn.Module):
    """
    This is the simple RNN model we will be using to perform Sentiment Analysis.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, class_num):
        """
        Initialize the model by settingg up the various layers.
        """
        super(LSTMClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.dense = nn.Linear(in_features=hidden_dim, out_features=class_num)
        self.sig = nn.Sigmoid()
        
        self.word_dict = None

    def forward(self, x):
        """
        Perform a forward pass of our model on some input.
        """
        x = x.t()
        lengths = x[0,:]
        reviews = x[1:,:]
        embeds = self.embedding(reviews)
        lstm_out, _ = self.lstm(embeds)
        out = self.dense(lstm_out)
        out = out[lengths - 1, range(len(lengths))]
        return self.sig(out.squeeze())

In [2]:
clf = LSTMClassifier(32, 256, 5000, 3)

In [17]:
t = torch.tensor([[5, 1, 2, 7, 9, 4],
           [4, 3, 10, 4, 5, 0]])
result = clf(t)

In [18]:
result

tensor([[0.4901, 0.4918, 0.4821],
        [0.4997, 0.4738, 0.4671]], grad_fn=<SigmoidBackward>)

In [20]:
batch_y = torch.tensor([1, 0]).long()

In [19]:
loss_fn = torch.nn.CrossEntropyLoss()

In [22]:
loss = loss_fn(result, batch_y)

In [23]:
loss.data.item() / 

1.0870113372802734

In [25]:
lstm_train = pd.read_csv('../data/lstm/lstm_train.csv', header = None)
lstm_train.shape

(45000, 2002)