**Natural Language Processing**

**OBJECTIVE:** Decide if customer complaints should be escalated or not using BlazingText Algorithm

In [1]:
# Importing the require libraries 

import pandas as pd                               
import boto3
import sagemaker
import s3fs
from sklearn.model_selection import train_test_split
import json
import nltk
import csv
from time import sleep

nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Data Setup

In [2]:
data_bucket = "just-abdul-aws" #Defining bucket name
subfolder = "NLP"
dataset = "twitter_data.csv"

In [3]:
role = sagemaker.get_execution_role()
sess = sagemaker.Session()
s3 = s3fs.S3FileSystem(anon=False)
sm = boto3.Session().client('sagemaker')

Loading the data

In [4]:
df = pd.read_csv(f's3://{data_bucket}/{subfolder}/{dataset}')

In [5]:
df.head()

Unnamed: 0,tweet_id,author_id,created_at,in_reply_to,text,escalate
0,2,115712,Tue Oct 31 22:11:45 +0000 2017,sprintcare,@sprintcare and how do you propose we do that,False
1,3,115712,Tue Oct 31 22:08:27 +0000 2017,sprintcare,@sprintcare I have sent several private messag...,True
2,5,115712,Tue Oct 31 21:49:35 +0000 2017,sprintcare,@sprintcare I did.,False
3,16,115713,Tue Oct 31 20:00:43 +0000 2017,sprintcare,@sprintcare Since I signed up with you....Sinc...,False
4,22,115716,Tue Oct 31 22:16:48 +0000 2017,Ask_Spectrum,@Ask_Spectrum Would you like me to email you a...,False


In [6]:
df.shape

(520793, 6)

In [7]:
df['escalate'].value_counts()

False    417800
True     102993
Name: escalate, dtype: int64

Preprocessing

In [8]:
X = df
Y = df['escalate']

In [9]:
# Splitting the data set into training and validation set

train_df, val_df, _, _ = train_test_split(X, Y, test_size=0.4, random_state=0)

In [10]:
# Function that converts the data to BlazingText Format

def preprocess(df):
    all_rows = df.values.tolist()
    transformed_rows = list(map(transform_instance, all_rows))
    transformed_df = pd.DataFrame(transformed_rows)
    return transformed_df

def transform_instance(row):
    cur_row = []
    label = "__label__1" if row[5] == True else "__label__0" # Prefix 0 or 1 from sentiment
    cur_row.append(label)
    cur_row.extend(nltk.word_tokenize(row[4].lower()))
    return ' '.join(cur_row)

transformed_validation_rows = preprocess(val_df)

In [11]:
transformed_validation_rows.head()

Unnamed: 0,0
0,__label__1 @ 115990 no joke ... this is one of...
1,__label__0 @ amazonhelp primeira camada ... ht...
2,__label__1 @ microsofthelps my mistake
3,__label__1 @ 770932 @ americanair they notorio...
4,__label__1 @ amazonhelp neither man seems to k...


Storing the transformed data into amazon s3

In [12]:
s3_validation_data = f's3://{data_bucket}/{subfolder}/processed/validation.csv'

data = transformed_validation_rows.to_csv(header=False, index=False, quoting=csv.QUOTE_NONE, sep='|', escapechar='^').encode()

with s3.open(s3_validation_data, 'wb') as f:
    f.write(data)

In [13]:
# Preprocessing the training data

transformed_training_rows = preprocess(train_df)

In [14]:
# Storing the training data into amazon S3
s3_training_data = f's3://{data_bucket}/{subfolder}/processed/training.csv'

training_data = transformed_training_rows.to_csv(header=False, index=False, quoting=csv.QUOTE_NONE, sep='|', escapechar='^').encode()

with s3.open(s3_training_data, 'wb') as f:
    f.write(training_data)

Preparing the CSV data for SageMaker

In [16]:
train_data = sagemaker.TrainingInput(s3_training_data, distribution='FullyReplicated', 
                        content_type='text/plain', s3_data_type='S3Prefix')
validation_data = sagemaker.TrainingInput(s3_validation_data, distribution='FullyReplicated', 
                             content_type='text/plain', s3_data_type='S3Prefix')

Training the Model

In [17]:
s3_output_location = f's3://{data_bucket}/{subfolder}/output'

sess = sagemaker.Session()
from sagemaker.amazon.amazon_estimator import get_image_uri
container = get_image_uri(boto3.Session().region_name, 'blazingtext', 'latest')

estimator = sagemaker.estimator.Estimator(
                            container,
                            role, 
                            train_instance_count=1, 
                            train_instance_type='ml.m4.xlarge',
                            train_max_run = 600,
                            output_path=s3_output_location,
                            sagemaker_session=sess)

estimator.set_hyperparameters(
                            mode="supervised",
                            epochs=10,
                            vector_dim=10,
                            early_stopping=True,
                            patience=4,
                            min_epochs=5,
                            word_ngrams=2)

estimator.fit({'train': train_data, 'validation': validation_data})

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: latest.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_run has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


2021-06-27 19:02:00 Starting - Starting the training job...
2021-06-27 19:02:23 Starting - Launching requested ML instancesProfilerReport-1624820520: InProgress
......
2021-06-27 19:03:23 Starting - Preparing the instances for training......
2021-06-27 19:04:30 Downloading - Downloading input data...
2021-06-27 19:04:44 Training - Downloading the training image..[34mArguments: train[0m
[34m[06/27/2021 19:05:06 INFO 139740146722176] nvidia-smi took: 0.02530217170715332 secs to identify 0 gpus[0m
[34m[06/27/2021 19:05:06 INFO 139740146722176] Running single machine CPU BlazingText training using supervised mode.[0m
[34mNumber of CPU sockets found in instance is  1[0m
[34m[06/27/2021 19:05:06 INFO 139740146722176] Processing /opt/ml/input/data/train/training.csv . File size: 32.84108638763428 MB[0m
[34m[06/27/2021 19:05:06 INFO 139740146722176] Processing /opt/ml/input/data/validation/validation.csv . File size: 21.904098510742188 MB[0m
[34mRead 6M words[0m
[34mNumber of wo

Hosting the Model

In [18]:
text_classifier = estimator.deploy(
                        initial_instance_count = 1,
                        instance_type = 'ml.m4.xlarge',
                        endpoint_name="NLP-Text-Classifier")

-------------!

Testing the Model

In [26]:
# Convert the payload into JSON format for the model

from sagemaker.serializers import JSONSerializer

text_classifier.serializer = JSONSerializer()

In [146]:
tweet1 = "I don't know why they wouldn't fix it!"

tokenized_tweet = [' '.join(nltk.word_tokenize(tweet1))]
payload = {"instances" : tokenized_tweet}
response = text_classifier.predict(payload)
escalate = pd.read_json(response)
escalate

Unnamed: 0,label,prob
0,[__label__0],[0.9982776641845701]


In [147]:
# Defining function for result

def Result():
    for item in escalate['label']:
        i = str(item).replace('[','').replace(']','') # Removes the square brackets
        if i == "'__label__0'":
            print ('Do not Escalate!')
        else:
            print ("Escalate!")   

In [152]:
tweet2 = "Excellent service!"

tokenized_tweet = [' '.join(nltk.word_tokenize(tweet2))]
payload = {"instances" : tokenized_tweet}
response = text_classifier.predict(payload)
escalate = pd.read_json(response)
Result()

Do not Escalate!


In [153]:
tweet3 = "I'm not angry!"

tokenized_tweet = [' '.join(nltk.word_tokenize(tweet3))]
payload = {"instances" : tokenized_tweet}
response = text_classifier.predict(payload)
escalate = pd.read_json(response)
Result()

Escalate!


In [159]:
tweet4 = "nonsence, slow, annoying, lack, worst!"

tokenized_tweet = [' '.join(nltk.word_tokenize(tweet4))]
payload = {"instances" : tokenized_tweet}
response = text_classifier.predict(payload)
escalate = pd.read_json(response)
Result()

Escalate!


# Clean Up

In [None]:
sess.delete_endpoint(text_classifier.endpoint_name)