In [1]:
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 1. Benchmark Model

In [23]:
train_df = pickle.load(open('../data/train_df.pkl', 'rb'))
val_df = pickle.load(open('../data/val_df.pkl', 'rb'))
test_df = pickle.load(open('../data/test_df.pkl', 'rb'))

In [3]:
X_train = train_df[['title_length', 'body_length']]
X_val = val_df[['title_length', 'body_length']]
X_test = test_df[['title_length', 'body_length']]

y_train = train_df.cleaned_y.values
y_val = val_df.cleaned_y.values
y_test = test_df.cleaned_y.values

In [4]:
all_rows = []
for c in tqdm([0.001, 0.01, 0.1, 1, 10, 100, 1000]):
    clf = LogisticRegression(random_state = 0).fit(X_train, y_train)
    train_acc = accuracy_score(y_train, clf.predict(X_train))
    val_acc = accuracy_score(y_val, clf.predict(X_val))
    all_rows.append({
        'c': c,
        'train_acc': round(train_acc, 2),
        'val_acc': round(val_acc, 2)
    })

100%|██████████| 7/7 [00:04<00:00,  1.73it/s]


In [5]:
pd.DataFrame(all_rows)

Unnamed: 0,c,train_acc,val_acc
0,0.001,0.39,0.39
1,0.01,0.39,0.39
2,0.1,0.39,0.39
3,1.0,0.39,0.39
4,10.0,0.39,0.39
5,100.0,0.39,0.39
6,1000.0,0.39,0.39


# 2. Bert Model

In [6]:
# need torch 1.3.1 for elastic inference
!pip install torch==1.3.1
!pip install transformers

Collecting torch==1.3.1
  Downloading torch-1.3.1-cp36-cp36m-manylinux1_x86_64.whl (734.6 MB)
[K     |████████████████████████████████| 734.6 MB 7.4 kB/s  eta 0:00:01     |███████                         | 161.0 MB 85.8 MB/s eta 0:00:07     |█████████████████▉              | 408.7 MB 82.0 MB/s eta 0:00:04��█▎            | 441.7 MB 51.1 MB/s eta 0:00:06     |███████████████████████████▌    | 630.6 MB 69.8 MB/s eta 0:00:02
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 1.4.0
    Uninstalling torch-1.4.0:
      Successfully uninstalled torch-1.4.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastai 1.0.61 requires nvidia-ml-py3, which is not installed.[0m
Successfully installed torch-1.3.1
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install -

In [7]:
from transformers import BertTokenizer, BertModel
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import XLNetModel, XLNetTokenizer, XLNetForSequenceClassification

## 2.1 Setup

In [8]:
import os
import numpy as np
import pandas as pd
import sagemaker

sagemaker_session = sagemaker.Session()

bucket = sagemaker_session.default_bucket()
prefix = "sagemaker/stackoverflow-question-quality"

role = sagemaker.get_execution_role()

## 2.2 Data Prep

In [24]:
train_df = pickle.load(open('../data/train_df.pkl', 'rb'))
val_df = pickle.load(open('../data/val_df.pkl', 'rb'))
test_df = pickle.load(open('../data/test_df.pkl', 'rb'))

In [25]:
train_df.head(2)

Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y,cleaned_y,body_notag,all_text,title_length,body_length
26373,47236918,click edit button any record will fetch first ...,I have a problem\r\nWhen I click the Edit butt...,<php><html><mysql><sql>,2017-11-11 10:31:22,LQ_EDIT,1,have problem click edit button any record will...,click edit button any record will fetch first ...,12,90
18718,43043808,to filter username harmful characters,<p>i really need help from you guys !!\nsome o...,<php><html>,2017-03-27 10:21:57,LQ_CLOSE,0,really need help you guys of friends scamming ...,to filter username harmful characters really n...,5,26


In [28]:
train_df = train_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})
val_df = val_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})
test_df = test_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})

In [35]:
train_df[['label', 'sentence']].to_csv('../data/train_s3.csv', index = False, header = True)
val_df[['label', 'sentence']].to_csv('../data/val_s3.csv', index = False, header = True)
test_df[['label', 'sentence']].to_csv('../data/test_s3.csv', index = False, header = True)

In [36]:
inputs_train = sagemaker_session.upload_data("../data/train_s3.csv", bucket=bucket, key_prefix=prefix)
inputs_val = sagemaker_session.upload_data("../data/val_s3.csv", bucket=bucket, key_prefix=prefix)
inputs_test = sagemaker_session.upload_data("../data/test_s3.csv", bucket=bucket, key_prefix=prefix)

## 2.3 Training

In [37]:
import torch
torch.cuda.is_available()

False

In [None]:
# from sagemaker.pytorch import PyTorch

# # place to save model artifact
# output_path = f"s3://{bucket}/{prefix}"

# estimator = PyTorch(
#     entry_point="train.py",
#     source_dir="../src/",
#     role=role,
#     framework_version="1.3.1",
#     py_version="py3",
#     instance_count=2,  # this script only support distributed training for GPU instances.
#     instance_type="ml.p3.2xlarge",
#     output_path=output_path,
#     hyperparameters={
#         "epochs": 2,
#         "num_labels": 2,
#         "backend": "gloo",
#     },
#     disable_profiler=True, # disable debugger
# )
# estimator.fit({"training": inputs_train, "testing": inputs_test})

In [None]:
from sagemaker.huggingface import HuggingFace


# hyperparameters, which are passed into the training job
hyperparameters={'epochs': 1,
                 'per_device_train_batch_size': 32,
                 'model_name_or_path': 'distilbert-base-uncased'
                 }

# create the Estimator
huggingface_estimator = HuggingFace(
        entry_point='train.py',
        source_dir='./scripts',
        instance_type='ml.p3.2xlarge',
        instance_count=1,
        role=role,
        transformers_version='4.4',
        #pytorch_version='1.6',
        py_version='py36',
        hyperparameters = hyperparameters
)

In [73]:
huggingface_estimator.fit(
  {'train': bucket + prefix,
   'test': 's3://sagemaker-us-east-1-558105141721/samples/datasets/imdb/test'}
)

In [None]:
bucket + prefix

# 3. Logistic Regression Model

## 3.1 Loading Data

In [71]:
train_df = pickle.load(open('../data/train_df.pkl', 'rb'))
val_df = pickle.load(open('../data/val_df.pkl', 'rb'))
test_df = pickle.load(open('../data/test_df.pkl', 'rb'))

train_df = train_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})
val_df = val_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})
test_df = test_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})

In [72]:
train_df = pd.concat([train_df, val_df]).reset_index(drop = True)
val_df = None

In [73]:
# embedding_dim
# vocab_size

## 3.2 Convert sentences to list of words

In [74]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

import os
import re
from bs4 import BeautifulSoup

def question_to_words(question):
    nltk.download("stopwords", quiet=True)
    stemmer = PorterStemmer()
    #print(review)
    #text = BeautifulSoup(review, "html.parser").get_text() # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9]", " ", question.lower()) # Convert to lower case
    words = text.split() # Split string into words
    words = [w for w in words if w not in stopwords.words("english")] # Remove stopwords
    words = [PorterStemmer().stem(w) for w in words] # stem
    
    return words

In [75]:
# question_to_words(train_df.sentence.values[0])

In [76]:
train_df.sentence.values[0]

'click edit button any record will fetch first data please inform thank have problem click edit button any record will fetch first data please inform thank have problem click edit button any record will fetch first data please inform thank have problem click edit button any record will fetch first data please inform thank query query mysqli_query conn select customer order cu_id desc die mysqli_error echo echo cu id cu name email phone coun id card id update delete row mysqli_fetch_assoc query echo row cu_id row cu_name row email row phone row coun_id row card_id edit delete echo echo echo mysqli_close conn'

In [77]:
cache_dir = os.path.join("../cache", "stackoverflow-question-quality")  # where to store cache files
os.makedirs(cache_dir, exist_ok=True)  # ensure cache directory exists

In [78]:
import pickle

def preprocess_data(data_train, data_test, labels_train, labels_test,
                    cache_dir=cache_dir, cache_file="preprocessed_data.pkl"):
    """Convert each review to words; read from cache if available."""

    # If cache_file is not None, try to read from it first
    cache_data = None
    if cache_file is not None:
        try:
            with open(os.path.join(cache_dir, cache_file), "rb") as f:
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file:", cache_file)
        except:
            pass  # unable to read from cache, but that's okay
    
    # If cache is missing, then do the heavy lifting
    if cache_data is None:
        # Preprocess training and test data to obtain words for each review
        words_train = [question_to_words(question) for question in tqdm(data_train)]
        words_test = [question_to_words(question) for question in tqdm(data_test)]
        
        # Write to cache file for future runs
        if cache_file is not None:
            cache_data = dict(words_train=words_train, words_test=words_test,
                              labels_train=labels_train, labels_test=labels_test)
            with open(os.path.join(cache_dir, cache_file), "wb") as f:
                pickle.dump(cache_data, f)
            print("Wrote preprocessed data to cache file:", cache_file)
    else:
        # Unpack data loaded from cache file
        words_train, words_test, labels_train, labels_test = (cache_data['words_train'],
                cache_data['words_test'], cache_data['labels_train'], cache_data['labels_test'])
    
    return words_train, words_test, labels_train, labels_test

In [79]:
train_X, train_y = train_df.sentence.values, train_df.label.values
test_X, test_y = test_df.sentence.values, test_df.label.values

In [None]:
train_X, test_X, train_y, test_y = preprocess_data(train_X, test_X, train_y, test_y)

 35%|███▍      | 15713/45000 [04:21<07:53, 61.91it/s]

In [12]:
train_df = test_df = None

## 3.3 List of words to word vector

In [13]:
import numpy as np

def build_dict(data, vocab_size = 5000):
    """Construct and return a dictionary mapping each of the most frequently appearing words to a unique integer."""
    
    # TODO: Determine how often each word appears in `data`. Note that `data` is a list of sentences and that a
    #       sentence is a list of words.
    
    word_count = {} # A dict storing the words that appear in the reviews along with how often they occur
    for sent in tqdm(data):
        for word in sent:
            if word in word_count.keys():
                word_count[word] += 1
            else:
                word_count[word] = 1
    
    sorted_words = [pair[0] for pair in sorted(word_count.items(), key = lambda x: x[1], reverse = True)]
    
    word_dict = {} # This is what we are building, a dictionary that translates words into integers
    for idx, word in enumerate(sorted_words[:vocab_size - 2]): # The -2 is so that we save room for the 'no word'
        word_dict[word] = idx + 2                              # 'infrequent' labels
        
    return word_dict

In [14]:
word_dict = build_dict(train_X)

100%|██████████| 45000/45000 [00:01<00:00, 31406.77it/s]


In [15]:
data_dir = '../data/lstm' # The folder we will use for storing data
os.makedirs(data_dir, exist_ok=True)  # ensure cache directory exists

In [16]:
with open(os.path.join(data_dir, 'word_dict.pkl'), "wb") as f:
    pickle.dump(word_dict, f)

## 3.4 Transform the data

In [17]:
def convert_and_pad(word_dict, sentence, pad=2000):
    NOWORD = 0 # We will use 0 to represent the 'no word' category
    INFREQ = 1 # and we use 1 to represent the infrequent words, i.e., words not appearing in word_dict
    
    working_sentence = [NOWORD] * pad
    
    for word_index, word in enumerate(sentence[:pad]):
        if word in word_dict:
            working_sentence[word_index] = word_dict[word]
        else:
            working_sentence[word_index] = INFREQ
            
    return working_sentence, min(len(sentence), pad)

def convert_and_pad_data(word_dict, data, pad=2000):
    result = []
    lengths = []
    
    for sentence in tqdm(data):
        converted, leng = convert_and_pad(word_dict, sentence, pad)
        result.append(converted)
        lengths.append(leng)
        
    return np.array(result), np.array(lengths)

In [18]:
train_X, train_X_len = convert_and_pad_data(word_dict, train_X)
# test_X, test_X_len = convert_and_pad_data(word_dict, test_X)

100%|██████████| 45000/45000 [00:02<00:00, 21472.49it/s]


In [19]:
# test_X = test_X_len = None

In [28]:
train_df = test_df = None
word_dict = None
test_X = test_y = None

In [29]:
# pd.DataFrame(train_y)

## 3.5 Upload data to S3

In [31]:
train_X_df = pd.DataFrame(train_X)

In [32]:
train_X_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,132,549,66,456,498,89,19,70,395,62,...,0,0,0,0,0,0,0,0,0,0
1,420,287,1,295,431,39,41,1025,1375,1,...,0,0,0,0,0,0,0,0,0,0
2,1,245,135,42,19,1225,83,39,423,1,...,0,0,0,0,0,0,0,0,0,0
3,4892,11,767,968,965,20,1,443,441,11,...,0,0,0,0,0,0,0,0,0,0
4,79,195,705,204,863,14,186,106,18,376,...,0,0,0,0,0,0,0,0,0,0


In [88]:
pd.concat([pd.DataFrame(train_y), pd.DataFrame(train_X_len), pd.DataFrame(train_X)], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)
# pd.concat([pd.DataFrame(test_y), pd.DataFrame(test_X_len), pd.DataFrame(test_X)], axis=1).to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)

In [35]:
import sagemaker

sagemaker_session = sagemaker.Session()

bucket = sagemaker_session.default_bucket()
prefix = 'sagemaker/stackoverflow-question-quality'

role = sagemaker.get_execution_role()

In [36]:
input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)

## 3.6 Training

In [54]:
from sagemaker.pytorch import PyTorch

estimator = PyTorch(entry_point="train.py",
                    source_dir="../src_lstm",
                    role=role,
                    framework_version='0.4.0',
                    train_instance_count=1,
                    train_instance_type='ml.p2.xlarge',
                    py_version='py3', # Joey: not sure if this is expected here
                    hyperparameters={
                        'epochs': 20,
                        'hidden_dim': 200,
                    })

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [None]:
estimator.fit({'training': input_data})

2021-07-07 02:58:00 Starting - Starting the training job...
2021-07-07 02:58:24 Starting - Launching requested ML instancesProfilerReport-1625626679: InProgress
......
2021-07-07 02:59:24 Starting - Preparing the instances for training............
2021-07-07 03:01:25 Downloading - Downloading input data...
2021-07-07 03:01:45 Training - Downloading the training image...
2021-07-07 03:02:25 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-07-07 03:02:22,463 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-07-07 03:02:22,489 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-07-07 03:02:28,719 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-07-07 03:02:28,968 sagemaker-containers INFO     M

## 3.7 Deploy

In [None]:
predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

-

## 3.8 Testing

In [None]:
# word_dict = pickle.load(open('../data/lstm/word_dict.pkl', "rb"))

train_df = pickle.load(open('../data/train_df.pkl', 'rb'))
val_df = pickle.load(open('../data/val_df.pkl', 'rb'))
test_df = pickle.load(open('../data/test_df.pkl', 'rb'))

train_df = train_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})
val_df = val_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})
test_df = test_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})

train_df = pd.concat([train_df, val_df]).reset_index(drop = True)
val_df = None


test_X, test_y = test_df.sentence.values, test_df.label.values
train_X, train_y = train_df.sentence.values, train_df.label.values

word_dict = build_dict(train_X)

train_X, test_X, train_y, test_y = preprocess_data(train_X, test_X, train_y, test_y)


test_X, test_X_len = convert_and_pad_data(word_dict, test_X)

 25%|██▍       | 11159/45000 [00:01<00:04, 8020.04it/s]

In [91]:
train_X = train_y = None
test_df = train_df = None
word_dict = None

In [97]:
# test_X = pd.concat([pd.DataFrame(test_X_len), pd.DataFrame(test_X)], axis=1)
# np.array(test_X)

In [None]:
def predict(data, rows=512):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = np.array([])
    for array in split_array:
        predictions = np.append(predictions, predictor.predict(array))
    
    return predictions

In [None]:
predictions = predict(test_X.values)
predictions = [round(num) for num in predictions]

In [None]:
len(predictions), len(test_y)

In [67]:
from sklearn.metrics import accuracy_score
accuracy_score(test_y, predictions)

ValueError: Found input variables with inconsistent numbers of samples: [15000, 45000]