# Deciding whether to escalate a customer support issue

## Part 1: Load and examine the data

In [1]:
# Location of the raw dataset: s3://<data_bucket>/<subfolder>/<dataset>
data_bucket = "machliba"   # S3 bucket name
subfolder = "ch04"         # key prefix inside the bucket
dataset = "inbound.csv"    # CSV file read in the load step

In [2]:
import pandas as pd                               
import boto3
import sagemaker
import s3fs
from sklearn.model_selection import train_test_split
import json
from slugify import slugify
import csv
from time import sleep

# Execution role reported by SageMaker; it is passed to the Estimator in the training step.
role = sagemaker.get_execution_role()
# S3 filesystem handle (anon=False, i.e. not an anonymous connection) used later
# to write the processed training and validation files.
s3 = s3fs.S3FileSystem(anon=False)

In [3]:
%%time
# Read the raw tweets straight from S3 and preview the first rows.
source_uri = f's3://{data_bucket}/{subfolder}/{dataset}'
df = pd.read_csv(source_uri)
display(df.head())

Unnamed: 0,tweet_id,author_id,created_at,in_reply_to,text,escalate
0,2,115712,Tue Oct 31 22:11:45 +0000 2017,sprintcare,@sprintcare and how do you propose we do that,False
1,3,115712,Tue Oct 31 22:08:27 +0000 2017,sprintcare,@sprintcare I have sent several private messag...,True
2,5,115712,Tue Oct 31 21:49:35 +0000 2017,sprintcare,@sprintcare I did.,False
3,16,115713,Tue Oct 31 20:00:43 +0000 2017,sprintcare,@sprintcare Since I signed up with you....Sinc...,False
4,22,115716,Tue Oct 31 22:16:48 +0000 2017,Ask_Spectrum,@Ask_Spectrum Would you like me to email you a...,False


CPU times: user 1.73 s, sys: 253 ms, total: 1.99 s
Wall time: 3.51 s


In [4]:
# Size of the dataset and how the escalate / don't-escalate classes are balanced.
row_count = len(df)
escalate_counts = df['escalate'].value_counts()
print(f'Number of rows in dataset: {row_count}')
print(escalate_counts)

Number of rows in dataset: 520793
False    417800
True     102993
Name: escalate, dtype: int64


## Part 2: Get the data into the right shape

In [5]:
# Hold out 20% of the rows for validation. The label column stays inside each
# frame, so only `df` itself needs to be split; passing df['escalate'] as a
# second array merely produced two outputs that were thrown away. With the same
# random_state the rows assigned to each split are unchanged.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=0)
print(f'{train_df.shape[0]} rows in training data')
print(f'{val_df.shape[0]} rows in validation data')

416634 rows in training data
104159 rows in validation data


In [6]:
%%time

def preprocess(df):
    """Turn every row of `df` into one labelled text line.

    Each row is handed to `transform_instance` as a plain Python list of its
    values (in column order). The resulting strings are returned as a
    single-column DataFrame (column 0), one string per input row.
    """
    labelled_lines = [transform_instance(row) for row in df.values.tolist()]
    return pd.DataFrame(labelled_lines)

def transform_instance(row):
    """Build one "__label__N <text>" line from a row given as a list of values.

    Position 4 of `row` is the tweet text and position 5 is the escalate flag
    (column order of the frame loaded above). The text is passed through
    slugify with a space separator; the line is prefixed with "__label__1"
    when the escalate flag equals True and "__label__0" otherwise.
    """
    text = slugify(row[4], separator=' ')
    # Equality with True (rather than truthiness) is kept on purpose so that
    # only a genuine True flag yields the positive label.
    if row[5] == True:
        label = "__label__1"
    else:
        label = "__label__0"
    return f'{label} {text}'

# Convert the validation split into one "__label__N <text>" line per tweet and preview it.
transformed_validation_rows = preprocess(val_df)
display(transformed_validation_rows.head())

Unnamed: 0,0
0,__label__1 115990 no joke this is one of the w...
1,__label__0 amazonhelp primeira camada https t ...
2,__label__1 microsofthelps my mistake
3,__label__1 770932 americanair they notoriously...
4,__label__1 amazonhelp neither man seems to kno...


CPU times: user 4.68 s, sys: 25.4 ms, total: 4.71 s
Wall time: 4.71 s


In [7]:
s3_validation_data = f's3://{data_bucket}/{subfolder}/processed/validation.csv'

# Serialise the labelled lines as plain text (no header, no index, no quoting)
# and upload the bytes to S3.
validation_csv_bytes = transformed_validation_rows.to_csv(
    header=False,
    index=False,
    quoting=csv.QUOTE_NONE,
    sep='|',
    escapechar='^',
).encode()
with s3.open(s3_validation_data, 'wb') as s3_file:
    s3_file.write(validation_csv_bytes)

In [8]:
%%time
# Apply the same preprocessing to the training split and preview it.
transformed_train_rows = preprocess(train_df)
display(transformed_train_rows.head())

s3_train_data = f's3://{data_bucket}/{subfolder}/processed/train.csv'

# Serialise the labelled lines as plain text (no header, no index, no quoting)
# and upload the bytes to S3.
train_csv_bytes = transformed_train_rows.to_csv(
    header=False,
    index=False,
    quoting=csv.QUOTE_NONE,
    sep='|',
    escapechar='^',
).encode()
with s3.open(s3_train_data, 'wb') as s3_file:
    s3_file.write(train_csv_bytes)

Unnamed: 0,0
0,__label__0 amazonhelp et en plus se faire engu...
1,__label__1 morrisons 641226 standard reply mor...
2,__label__1 idea cares 1936 116590 this is gr8 ...
3,__label__0 askamex yes i did weeks ago and no ...
4,__label__0 amazonhelp i don t want your stupid...


CPU times: user 19.9 s, sys: 201 ms, total: 20.1 s
Wall time: 20.8 s


## Part 3: Create training and validation datasets

In [9]:
%%time

# Both channels point at a processed text file in S3 and share the same settings.
channel_options = dict(
    distribution='FullyReplicated',
    content_type='text/plain',
    s3_data_type='S3Prefix',
)
train_data = sagemaker.session.s3_input(s3_train_data, **channel_options)
validation_data = sagemaker.session.s3_input(s3_validation_data, **channel_options)

CPU times: user 14 µs, sys: 1 µs, total: 15 µs
Wall time: 16.7 µs


## Part 4: Train the model

In [10]:
# S3 prefix handed to the Estimator as its output_path.
s3_output_location = f's3://{data_bucket}/{subfolder}/output'

sess = sagemaker.Session()

# Look up the BlazingText image for the region of the current boto3 session.
region = boto3.Session().region_name
container = sagemaker.amazon.amazon_estimator.get_image_uri(region, "blazingtext", "latest")

estimator = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.c4.4xlarge',
    train_max_run=600,
    output_path=s3_output_location,
    sagemaker_session=sess,
)

# BlazingText settings: supervised (text classification) mode with early stopping.
hyperparameters = {
    "mode": "supervised",
    "epochs": 10,
    "vector_dim": 10,
    "early_stopping": True,
    "patience": 4,
    "min_epochs": 5,
    "word_ngrams": 2,
}
estimator.set_hyperparameters(**hyperparameters)

# Train using the two channels prepared in Part 3.
channels = {'train': train_data, 'validation': validation_data}
estimator.fit(channels)

INFO:sagemaker:Creating training-job with name: blazingtext-2019-04-28-10-09-29-359


2019-04-28 10:09:29 Starting - Starting the training job...
2019-04-28 10:09:30 Starting - Launching requested ML instances......
2019-04-28 10:10:40 Starting - Preparing the instances for training...
2019-04-28 10:11:25 Downloading - Downloading input data
2019-04-28 10:11:25 Training - Downloading the training image..
[31mArguments: train[0m
[31m[04/28/2019 10:11:35 INFO 140365849884480] nvidia-smi took: 0.0251679420471 secs to identify 0 gpus[0m
[31m[04/28/2019 10:11:35 INFO 140365849884480] Running single machine CPU BlazingText training using supervised mode.[0m
[31m[04/28/2019 10:11:35 INFO 140365849884480] Processing /opt/ml/input/data/train/train.csv . File size: 40 MB[0m
[31m[04/28/2019 10:11:35 INFO 140365849884480] Processing /opt/ml/input/data/validation/validation.csv . File size: 10 MB[0m
[31mRead 7M words[0m
[31mNumber of words:  23687[0m
[31mLoading validation data from /opt/ml/input/data/validation/validation.csv[0m
[31mLoaded validation data.[0m
[31

## Part 5: Host the Model

In [11]:
endpoint_name = 'customer-support-slugify'

# Best-effort cleanup: delete an endpoint left over from a previous run so the
# deploy step can reuse the name. A failure here (typically because no endpoint
# with this name exists yet) must not stop the notebook, but it is reported
# instead of being silently swallowed. Catching Exception rather than using a
# bare `except` also lets KeyboardInterrupt through during the wait below.
try:
    sess.delete_endpoint(endpoint_name)
except Exception as err:
    print(f'No existing endpoint was deleted: {err}')
else:
    print('Warning: Existing endpoint deleted to make way for your new endpoint.')
    # Give the deletion time to complete before the name is reused.
    sleep(30)

INFO:sagemaker:Deleting endpoint with name: customer-support-slugify




In [13]:
# Host the trained model behind a real-time endpoint with the name chosen above.
text_classifier = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
    endpoint_name=endpoint_name,
)

INFO:sagemaker:Creating model with name: blazingtext-2019-04-28-10-16-05-464
INFO:sagemaker:Creating endpoint with name customer-support-slugify


----------------------------------------------------------------------------------------!

## Part 6: Test the Model

In [19]:
# Tweet to classify. The original cell assigned two tweets in a row, so the
# first one was overwritten before it was ever used; it is kept here as an
# alternative to try:
#   "I am very happy to be finished the chapter"
tweet = "I was frustrated by my competing priorities"

# Apply the same slugify normalisation that was used on the training text.
tokenized_tweet = [slugify(tweet, separator=' ')]
payload = {"instances" : tokenized_tweet}
response = text_classifier.predict(json.dumps(payload))
# The endpoint answers with UTF-8 encoded JSON: predicted label(s) and probability.
escalate = json.loads(response.decode("utf-8"))
escalate

[{'prob': [0.993664026260376], 'label': ['__label__1']}]

## Part 7: Remove the Endpoint (optional)

The cell below is commented out so that the endpoint still exists after "Run All". Uncomment its last line and run the cell to delete the endpoint when you no longer need it.

In [None]:
# Remove the Endpoint (optional)
# Leave the line below commented out if you want the endpoint to exist after "run all"; uncomment it to delete the endpoint.

# sess.delete_endpoint(text_classifier.endpoint)