In [4]:
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

from sklearn.metrics import accuracy_score

# 1. Data Prep

## 1.1 Loading Data

In [15]:
train_df = pickle.load(open('../data/processed/train_df.pkl', 'rb'))
val_df = pickle.load(open('../data/processed/val_df.pkl', 'rb'))
test_df = pickle.load(open('../data/processed/test_df.pkl', 'rb'))

train_df = train_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})
val_df = val_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})
test_df = test_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})

train_df = pd.concat([train_df, val_df]).reset_index(drop = True)
val_df = None

In [16]:
data_dir = '../data/lstm' # The folder we will use for storing data
os.makedirs(data_dir, exist_ok=True)  # ensure cache directory exists

cache_dir = os.path.join("../cache", "lstm")  # where to store cache files
os.makedirs(cache_dir, exist_ok=True)  # ensure cache directory exists

## 1.2 Question to List of Words

In [18]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

import os
import re
from bs4 import BeautifulSoup

def question_to_words(question):
    nltk.download("stopwords", quiet=True)
    stemmer = PorterStemmer()
    #print(review)
    #text = BeautifulSoup(review, "html.parser").get_text() # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9]", " ", question.lower()) # Convert to lower case
    words = text.split() # Split string into words
    words = [w for w in words if w not in stopwords.words("english")] # Remove stopwords
    words = [PorterStemmer().stem(w) for w in words] # stem
    
    return words

In [19]:
import pickle

def preprocess_data(data_train, data_test, labels_train, labels_test,
                    cache_dir=cache_dir, cache_file="preprocessed_data.pkl"):
    """Convert each review to words; read from cache if available."""

    # If cache_file is not None, try to read from it first
    cache_data = None
    if cache_file is not None:
        try:
            with open(os.path.join(cache_dir, cache_file), "rb") as f:
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file:", cache_file)
        except:
            pass  # unable to read from cache, but that's okay
    
    # If cache is missing, then do the heavy lifting
    if cache_data is None:
        # Preprocess training and test data to obtain words for each review
        words_train = [question_to_words(question) for question in tqdm(data_train)]
        words_test = [question_to_words(question) for question in tqdm(data_test)]
        
        # Write to cache file for future runs
        if cache_file is not None:
            cache_data = dict(words_train=words_train, words_test=words_test,
                              labels_train=labels_train, labels_test=labels_test)
            with open(os.path.join(cache_dir, cache_file), "wb") as f:
                pickle.dump(cache_data, f)
            print("Wrote preprocessed data to cache file:", cache_file)
    else:
        # Unpack data loaded from cache file
        words_train, words_test, labels_train, labels_test = (cache_data['words_train'],
                cache_data['words_test'], cache_data['labels_train'], cache_data['labels_test'])
    
    return words_train, words_test, labels_train, labels_test

In [20]:
train_X, train_y = train_df.sentence.values, train_df.label.values
test_X, test_y = test_df.sentence.values, test_df.label.values

In [21]:
train_X, test_X, train_y, test_y = preprocess_data(train_X, test_X, train_y, test_y)

100%|██████████| 45000/45000 [12:24<00:00, 60.41it/s] 
100%|██████████| 15000/15000 [09:29<00:00, 26.32it/s] 


Wrote preprocessed data to cache file: preprocessed_data.pkl


In [22]:
train_df = test_df = None

## 1.3 List of words to word vector

In [23]:
import numpy as np

def build_dict(data, vocab_size = 5000):
    """Construct and return a dictionary mapping each of the most frequently appearing words to a unique integer."""
    word_count = {} # A dict storing the words that appear in the reviews along with how often they occur
    for sent in tqdm(data):
        for word in sent:
            if word in word_count.keys():
                word_count[word] += 1
            else:
                word_count[word] = 1
    
    sorted_words = [pair[0] for pair in sorted(word_count.items(), key = lambda x: x[1], reverse = True)]
    
    word_dict = {} # This is what we are building, a dictionary that translates words into integers
    for idx, word in enumerate(sorted_words[:vocab_size - 2]): # The -2 is so that we save room for the 'no word'
        word_dict[word] = idx + 2                              # 'infrequent' labels
        
    return word_dict

In [24]:
try:
    word_dict = pickle.load(open(os.path.join(data_dir, 'word_dict.pkl'), 'rb'))
    print('Loaded word_dict from local cache!')
except:
    word_dict = build_dict(train_X)
    with open(os.path.join(data_dir, 'word_dict.pkl'), "wb") as f:
        pickle.dump(word_dict, f)
    print('Constructed word_dict and saved to local cache!')

100%|██████████| 45000/45000 [00:01<00:00, 31432.89it/s]

Constructed word_dict and saved to local cache!





In [25]:
# word_dict = pickle.load(open(os.path.join(data_dir, 'word_dict.pkl'), 'rb'))

## 1.4 Transform the Data

In [None]:
def convert_and_pad(word_dict, sentence, pad=2000):
    NOWORD = 0 # We will use 0 to represent the 'no word' category
    INFREQ = 1 # and we use 1 to represent the infrequent words, i.e., words not appearing in word_dict
    
    working_sentence = [NOWORD] * pad
    
    for word_index, word in enumerate(sentence[:pad]):
        if word in word_dict:
            working_sentence[word_index] = word_dict[word]
        else:
            working_sentence[word_index] = INFREQ
            
    return working_sentence, min(len(sentence), pad)

def convert_and_pad_data(word_dict, data, pad=2000):
    result = []
    lengths = []
    
    for sentence in tqdm(data):
        converted, leng = convert_and_pad(word_dict, sentence, pad)
        result.append(converted)
        lengths.append(leng)
        
    return np.array(result), np.array(lengths)

In [None]:
train_X, train_X_len = convert_and_pad_data(word_dict, train_X)

In [None]:
train_df = test_df = None
word_dict = None
test_X = test_y = None

## 1.5 Upload to S3

In [None]:
pd.concat([pd.DataFrame(train_y), pd.DataFrame(train_X_len), pd.DataFrame(train_X)], axis=1).to_csv(os.path.join(data_dir, 'lstm_train.csv'), header=False, index=False)

In [6]:
import sagemaker

sagemaker_session = sagemaker.Session()

bucket = sagemaker_session.default_bucket()
prefix = 'sagemaker/stackoverflow-question-quality'

role = sagemaker.get_execution_role()

In [30]:
input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)

# 2. Training

In [33]:
from sagemaker.pytorch import PyTorch

estimator = PyTorch(entry_point="train.py",
                    source_dir="../src_lstm",
                    role=role,
                    framework_version='0.4.0',
                    train_instance_count=1,
                    train_instance_type='ml.m5.large',
                    py_version='py3', # Joey: not sure if this is expected here
                    hyperparameters={
                        'epochs': 10,
                        'hidden_dim': 200
                    })

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [None]:
estimator.fit({'training': input_data})

2021-07-08 01:37:22 Starting - Starting the training job...
2021-07-08 01:37:45 Starting - Launching requested ML instancesProfilerReport-1625708242: InProgress
...
2021-07-08 01:38:20 Starting - Preparing the instances for training.........
2021-07-08 01:39:52 Downloading - Downloading input data...
2021-07-08 01:40:20 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-07-08 01:40:21,664 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-07-08 01:40:21,666 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-07-08 01:40:21,679 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-07-08 01:40:22,300 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-07-08 01:4

In [None]:
train_X = None

# 3. Testing