# Initial Load Config

In [1]:
import pandas as pd
import boto3
from io import BytesIO
import gzip
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from datetime import timedelta, datetime, date

# Region every AWS client in this notebook is pinned to.
AWS_REGION = 'us-east-1'
# Descriptions are cut to this many characters before being sent to Comprehend.
# NOTE(review): Comprehend documents its per-document limit in bytes, so text with
# many multi-byte characters could still exceed it — confirm against the service docs.
MAX_SENTENCE_LENGTH_IN_CHARS = 4500

# Pass the region explicitly so the notebook does not depend on an ambient
# default region (a missing default would raise NoRegionError on a fresh machine).
s3 = boto3.client('s3', region_name=AWS_REGION)
comprehend_client = boto3.client('comprehend', region_name=AWS_REGION)

In [2]:
# Location of the raw complaints export in S3.
bucket = "sura-text-mining-poc"
key = "raw/complaints/complaints.csv"

# Download the whole object into memory and wrap the bytes in a file-like buffer
# that pandas can parse.
s3_response = s3.get_object(Bucket=bucket, Key=key)
obj = BytesIO(s3_response['Body'].read())

df_source = pd.read_csv(obj)

# Exploratory Analysis

In [3]:
# Preview the first rows to check which columns the CSV was parsed into.
df_source.head()

Unnamed: 0,Ticket #,Customer Complaint,Date,Time,Received Via,City,State,Zip code,Status,Filing on Behalf of Someone,Description
0,250635,Comcast Cable Internet Speeds,4/22/2015,3:53:50 PM,Internet,Abingdon,Maryland,21009,Closed,No,I have been contacting Comcast Internet Techni...
1,223441,Payment disappear - service got disconnected,4/8/2015,10:22:56 AM,Internet,Acworth,Georgia,30102,Closed,No,Back in January 2015 I made 2 payments: One fo...
2,242732,Speed and Service,4/18/2015,9:55:47 AM,Internet,Acworth,Georgia,30101,Closed,Yes,Our home is located at in Acworth Georgia 3010...
3,277946,Comcast Imposed a New Usage Cap of 300GB that ...,5/7/2015,11:59:35 AM,Internet,Acworth,Georgia,30101,Open,Yes,Comcast in the Atlanta area has just put into ...
4,307175,Comcast not working and no service to boot,5/26/2015,1:25:26 PM,Internet,Acworth,Georgia,30101,Solved,No,I have been a customer of Comcast of some sort...


# Calculating a score for open tickets using Amazon Comprehend

### Select Columns

In [4]:
# Keep only the ticket identifier, its title, the free-text description and the status.
df = df_source.loc[:, ["Ticket #", "Customer Complaint", "Description", "Status"]]

### Keep only open tickets

In [5]:
# Boolean mask that is True for tickets whose status is exactly "Open".
opened_tickets_indexes = df["Status"] == "Open"
# Take an explicit copy: later cells add columns to `df`, and writing into a
# filtered slice would trigger pandas' SettingWithCopyWarning.
df = df.loc[opened_tickets_indexes].copy()

### Truncate text to the Amazon Comprehend size limit

In [6]:
# Keep at most MAX_SENTENCE_LENGTH_IN_CHARS leading characters of every description.
df["TextToBeAnalyzed"] = df["Description"].str.slice(stop=MAX_SENTENCE_LENGTH_IN_CHARS)

### Add QualityScore Column

In [7]:
# Placeholder score for every ticket; the scoring loop overwrites it per batch.
df.loc[:, "QualityScore"] = 0.0

### Split the DataFrame into batches that fit the Amazon Comprehend limit

In [8]:
# Maximum number of documents sent per request; kept below the 25-document cap
# that batch_detect_syntax accepts.
COMPREHEND_BATCH_SIZE = 23

(rows, _) = df.shape
# Fixed-size consecutive slices guarantee that no batch exceeds the cap.
# The previous `np.array_split(df, rows / 23)` truncated the section count, so
# fewer than 23 rows raised ValueError and e.g. 45 rows produced one batch of 45.
# An empty frame yields an empty list of batches.
splitted_dataframe = [
    df.iloc[start:start + COMPREHEND_BATCH_SIZE]
    for start in range(0, rows, COMPREHEND_BATCH_SIZE)
]

### Get scores from Amazon Comprehend

In [9]:
def select_score_list(syntax_tokens):
    """Return the part-of-speech confidence score of each token, in order.

    Args:
        syntax_tokens: iterable of token dicts, each holding a nested
            ``token['PartOfSpeech']['Score']`` value.

    Returns:
        A list with one score per token.
    """
    return [token['PartOfSpeech']['Score'] for token in syntax_tokens]

def calculate_score_from_comprehend_response(response):
    """Convert a ``batch_detect_syntax`` response into per-document quality scores.

    The score of a document is the mean part-of-speech confidence of its
    tokens, expressed as a percentage rounded to one decimal place.

    Args:
        response: dict with a ``"ResultList"`` entry; each result holds a
            ``"SyntaxTokens"`` list and, when present, the ``"Index"`` of the
            document in the submitted batch.

    Returns:
        A ``pd.Series`` named ``"QualityScore"``. When results carry an
        ``"Index"`` the series is labelled by it, so documents that produced no
        result are simply absent and become NaN when assigned to a frame that
        is indexed 0..n-1. Documents without tokens score NaN. An empty
        ``"ResultList"`` yields an empty series.

    Raises:
        KeyError: if ``response`` has no ``"ResultList"`` entry.
    """
    result_list = response["ResultList"]
    if not result_list:
        return pd.Series(dtype="float64", name="QualityScore")

    comprehend_result = pd.DataFrame(result_list)

    # Label each score with the position of its document in the request so a
    # partially failed batch cannot shift scores onto the wrong rows.
    if "Index" in comprehend_result.columns:
        comprehend_result = comprehend_result.set_index("Index").rename_axis(None)

    token_scores = comprehend_result["SyntaxTokens"].apply(select_score_list)
    # A document with no tokens has no meaningful mean; report NaN explicitly
    # rather than relying on np.mean([]) and its RuntimeWarning.
    quality_score = token_scores.apply(
        lambda scores: round(np.mean(scores) * 100, 1) if len(scores) else np.nan
    )

    return quality_score.rename("QualityScore")

In [10]:
# Score every batch with Comprehend and collect the per-batch frames.
# Frames are gathered in a list and concatenated once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all previous rows each time.
scored_frames = []

for dataframe in splitted_dataframe:
    # Renumber the batch 0..n-1 so the scores returned for the batch can be
    # assigned back to its rows.
    dataframe_selected = dataframe.reset_index(drop=True)
    text_list = dataframe_selected["TextToBeAnalyzed"].tolist()

    response = comprehend_client.batch_detect_syntax(TextList=text_list, LanguageCode="en")
    dataframe_selected["QualityScore"] = calculate_score_from_comprehend_response(response)

    dataframe_selected = dataframe_selected[["Ticket #", "Customer Complaint", "QualityScore"]]
    scored_frames.append(dataframe_selected)

if scored_frames:
    df_result = pd.concat(scored_frames, ignore_index=True)
else:
    # No batches to score: keep the expected columns so the sort and the
    # result cells below still work.
    df_result = pd.DataFrame(columns=["Ticket #", "Customer Complaint", "QualityScore"])

# Highest-scoring tickets first.
df_result = df_result.sort_values(by=["QualityScore"], ascending=False)

In [11]:
# Quick look at the highest-scoring tickets (df_result is sorted by QualityScore, descending).
df_result.head()

Unnamed: 0,Ticket #,Customer Complaint,QualityScore
102,291175,Data Caps,99.6
345,303296,Data Caps,99.5
269,310714,Data limit with Comcast,99.3
9,343346,Slow Internet Speed,99.3
186,322200,Comcast monopoly hurting my business,99.3


# Results

### Top 10 Quality Scored Texts

In [12]:
# df_result is already sorted by QualityScore in descending order, so the first 10 rows are the top 10.
df_result.head(10)

Unnamed: 0,Ticket #,Customer Complaint,QualityScore
102,291175,Data Caps,99.6
345,303296,Data Caps,99.5
269,310714,Data limit with Comcast,99.3
9,343346,Slow Internet Speed,99.3
186,322200,Comcast monopoly hurting my business,99.3
308,374393,Comcast pricing practices,99.1
90,309213,"Billed for modem rental for several years, Com...",98.9
320,316257,Comcast Data Usage Meter,98.9
350,312977,Comcast Fraudulent Charges,98.8
267,307604,Comcast has a monopoly on our internet,98.8
