In [6]:
# Import libraries
import os
import boto3
import pandas as pd

In [8]:
# Constants
# Root of the local project workspace. The leading "~" is expanded to the
# current user's home directory here, so the value is a concrete path and
# consumers do not each have to perform tilde expansion themselves.
# The trailing slash is kept because relative paths are appended to this
# value by plain string concatenation.
_HOME_DIR = os.path.expanduser("~/Workspace/yought-dev/")

In [22]:
# Initialise Amazon Comprehend client
# A boto3 session is constructed with no explicit arguments, and the
# Comprehend client is then obtained from that session.
s = boto3.Session()
comprehend_client = s.client(service_name="comprehend")

In [23]:
# Load data
# Read the "raw-data" worksheet of the workbook located under _HOME_DIR.
fpath = f"{_HOME_DIR}Data/text-categorisation-data.xlsx"
data = pd.read_excel(io=fpath, sheet_name="raw-data")
# Preview the first five rows
data.head(n=5)

Unnamed: 0,id,rating,comments,comment category
0,1,Good,AMAZING! best scanning experience ever!,
1,2,Good,"A bit slow but its working, also got the day o...",
2,3,Good,,
3,4,Good,,
4,5,Good,,


In [16]:
# Test detect sentiment
# Two hand-written sentences are sent to detect_sentiment one at a time and
# the response returned for each is printed as-is.
t_l = [
    "This is my favourite TV Show!",
    "I hate Friends! Worst TV show!",
]
for sentence in t_l:
    result = comprehend_client.detect_sentiment(LanguageCode="en", Text=sentence)
    print(result)

{'Sentiment': 'POSITIVE', 'SentimentScore': {'Positive': 0.9993520379066467, 'Negative': 6.146515079308301e-05, 'Neutral': 0.0005709004472009838, 'Mixed': 1.5626157619408332e-05}, 'ResponseMetadata': {'RequestId': 'ef73ae75-1b27-4c1c-aad0-ea6f4399c1bf', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'ef73ae75-1b27-4c1c-aad0-ea6f4399c1bf', 'content-type': 'application/x-amz-json-1.1', 'content-length': '166', 'date': 'Wed, 16 Jun 2021 11:12:40 GMT'}, 'RetryAttempts': 0}}
{'Sentiment': 'NEGATIVE', 'SentimentScore': {'Positive': 0.0003344400611240417, 'Negative': 0.9991570711135864, 'Neutral': 0.0004947020788677037, 'Mixed': 1.3743381714448333e-05}, 'ResponseMetadata': {'RequestId': '22767518-6ea3-4fbc-814e-a3c64b86c4ba', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '22767518-6ea3-4fbc-814e-a3c64b86c4ba', 'content-type': 'application/x-amz-json-1.1', 'content-length': '166', 'date': 'Wed, 16 Jun 2021 11:12:40 GMT'}, 'RetryAttempts': 0}}


In [24]:
# Sample comments
# Keep only the rows that actually have a comment, restricted to the three
# columns used by the cells below, and draw 10 of them at random.
data_sample = data.loc[
    data["comments"].notna(),
    ['id', 'rating', 'comments'],
].sample(
    n=10,
    random_state=42,  # fixed seed so a fresh Restart & Run All picks the same rows
)
data_sample

Unnamed: 0,id,rating,comments
19859,19860,Good,Good
23359,23360,Good,Very smooth experience!
79,80,Bad,Took ages to do.
12136,12137,Good,My address has changed and is recorded on the ...
40512,40513,Good,No it was great
19638,19639,Good,You already have my license scanned
29125,29126,Good,Correct address is on rear of card
40230,40231,Bad,Took a long time to position correctly for it ...
2045,2046,Good,It was very annoying and tricky that it was us...
32897,32898,Good,Like it


In [28]:
# Test sentiment detection on comments
# For every sampled row, print the comment text, its rating label and the
# 'Sentiment' field of the detect_sentiment response.
sentiment_template = "Text:\t\t{}\nLabel:\t\t{}\nSentiment:\t{}\n"
for comment, rating in zip(data_sample["comments"], data_sample["rating"]):
    result = comprehend_client.detect_sentiment(LanguageCode="en", Text=comment)
    print(sentiment_template.format(comment, rating, result['Sentiment']))

Text:		Good
Label:		Good
Sentiment:	POSITIVE

Text:		Very smooth experience!
Label:		Good
Sentiment:	POSITIVE

Text:		Took ages to do.
Label:		Bad
Sentiment:	NEUTRAL

Text:		My address has changed and is recorded on the back of my licence. Do you wish to scan that?
Label:		Good
Sentiment:	NEUTRAL

Text:		No it was great
Label:		Good
Sentiment:	NEGATIVE

Text:		You already have my license scanned
Label:		Good
Sentiment:	NEUTRAL

Text:		Correct address is on rear of card
Label:		Good
Sentiment:	NEUTRAL

Text:		Took a long time to position correctly for it to be scanned
Label:		Bad
Sentiment:	NEGATIVE

Text:		It was very annoying and tricky that it was using the selfie camera... it should just use the regular camera to make it easier
Label:		Good
Sentiment:	NEGATIVE

Text:		Like it
Label:		Good
Sentiment:	POSITIVE



In [30]:
# Test detect-key-phrases
# For every sampled row, print the comment text, its rating label and the
# 'Text' of each entry under 'KeyPhrases' in the detect_key_phrases response.
key_phrase_template = "Text:\t\t{}\nLabel:\t\t{}\nKey phrases:\t{}\n"
for comment, rating in zip(data_sample["comments"], data_sample["rating"]):
    result = comprehend_client.detect_key_phrases(LanguageCode="en", Text=comment)
    kp_l = [phrase['Text'] for phrase in result['KeyPhrases']]
    print(key_phrase_template.format(comment, rating, kp_l))

Text:		Good
Label:		Good
Key phrases:	['Good']

Text:		Very smooth experience!
Label:		Good
Key phrases:	['smooth experience']

Text:		Took ages to do.
Label:		Bad
Key phrases:	['ages']

Text:		My address has changed and is recorded on the back of my licence. Do you wish to scan that?
Label:		Good
Key phrases:	['My address', 'the back', 'my licence']

Text:		No it was great
Label:		Good
Key phrases:	['No it']

Text:		You already have my license scanned
Label:		Good
Key phrases:	['my license']

Text:		Correct address is on rear of card
Label:		Good
Key phrases:	['Correct address', 'rear']

Text:		Took a long time to position correctly for it to be scanned
Label:		Bad
Key phrases:	['a long time', 'position']

Text:		It was very annoying and tricky that it was using the selfie camera... it should just use the regular camera to make it easier
Label:		Good
Key phrases:	['the selfie camera', 'the regular camera']

Text:		Like it
Label:		Good
Key phrases:	[]

