# Initial Load Config

In [53]:
import pandas as pd
import boto3
import numpy as np
import re
from io import BytesIO
import time
from datetime import datetime

# Character limit for a single text; not referenced by any cell visible in this notebook.
# NOTE(review): presumably meant to cap the text sent to Comprehend - confirm, then apply or remove.
MAX_SENTENCE_LENGTH_IN_CHARS = 4500
# ARN of the Comprehend document-classifier endpoint passed to classify_document.
ENDPOINT_ARN = "arn:aws:comprehend:us-east-1:193024568733:document-classifier-endpoint/complaint-training-tmp"

# AWS clients created without an explicit region or credentials.
# NOTE(review): the endpoint ARN is in us-east-1 and no region_name is given here,
# so this presumably relies on the default region being us-east-1 - confirm.
s3 = boto3.client('s3')
comprehend_client = boto3.client('comprehend')

In [123]:
# Location of the raw complaints export in S3.
bucket = "sura-text-mining-poc"
key = "raw/complaints/complaints.csv"

# Download the whole object into memory and wrap the bytes in a file-like buffer.
s3_response = s3.get_object(Bucket=bucket, Key=key)
obj = BytesIO(s3_response['Body'].read())

# Parse the CSV into the untouched source frame that later cells derive from.
df_source = pd.read_csv(obj)

# Exploratory Analysis

In [52]:
# Show the last row of the first 1100 rows (a spot check deep into the file).
df_source.head(1100).tail(1)

Unnamed: 0,Ticket #,Customer Complaint,Date,Time,Received Via,City,State,Zip code,Status,Filing on Behalf of Someone,Description
1099,327394,Fraudulent billing practice,6/7/2015,1:35:54 PM,Internet,Lancaster,Pennsylvania,17602,Solved,No,I have internet service with Comcast. I purcha...


### Select columns

In [40]:
# Work on an independent copy restricted to the columns used below.
selected_columns = ['Ticket #', 'Customer Complaint', 'Description']
df = df_source[selected_columns].copy()

In [41]:
# Preview the first rows of the column-reduced frame.
df.head()

Unnamed: 0,Ticket #,Customer Complaint,Description
0,250635,Comcast Cable Internet Speeds,I have been contacting Comcast Internet Techni...
1,223441,Payment disappear - service got disconnected,Back in January 2015 I made 2 payments: One fo...
2,242732,Speed and Service,Our home is located at in Acworth Georgia 3010...
3,277946,Comcast Imposed a New Usage Cap of 300GB that ...,Comcast in the Atlanta area has just put into ...
4,307175,Comcast not working and no service to boot,I have been a customer of Comcast of some sort...


### Remove punctuation and convert to lowercase

In [42]:
# Build the text sent to the classifier from the complaint title:
# strip the punctuation characters , . ! ? - and lowercase the result.
# The vectorized .str accessors leave missing values as NaN, whereas calling
# re.sub / str.lower through map raises TypeError on a float NaN.
# regex=True is passed explicitly because the pattern is a character class.
df['TextToBeAnalyzed'] = (
    df['Customer Complaint']
    .str.replace(r'[,\.!?-]', '', regex=True)
    .str.lower()
)

In [43]:
# Preview the first rows, now including the TextToBeAnalyzed column.
df.head()

Unnamed: 0,Ticket #,Customer Complaint,Description,TextToBeAnalyzed
0,250635,Comcast Cable Internet Speeds,I have been contacting Comcast Internet Techni...,comcast cable internet speeds
1,223441,Payment disappear - service got disconnected,Back in January 2015 I made 2 payments: One fo...,payment disappear service got disconnected
2,242732,Speed and Service,Our home is located at in Acworth Georgia 3010...,speed and service
3,277946,Comcast Imposed a New Usage Cap of 300GB that ...,Comcast in the Atlanta area has just put into ...,comcast imposed a new usage cap of 300gb that ...
4,307175,Comcast not working and no service to boot,I have been a customer of Comcast of some sort...,comcast not working and no service to boot


### Add group column

In [24]:
# Add the label column as empty strings; it is filled in by the classification cell below.
df['Group Class'] = ''

### Select only the first 1,500 rows for the test

In [35]:
# Keep only the first 1500 rows (as an independent copy) so the test run stays small.
df = df.head(1500).copy()

### Get Group Class

In [36]:
def get_group_class(example_text):
    """Classify a text with the Comprehend document-classifier endpoint.

    Sends ``example_text`` to ``comprehend_client.classify_document`` using
    ``ENDPOINT_ARN`` and returns the ``Name`` of the entry in the response's
    ``Classes`` list that has the highest ``Score``.

    Args:
        example_text: Text to classify.

    Returns:
        The name of the top-scoring class, or ``"General"`` when the request
        fails or the response carries no classes.
    """
    # Function-scope import: logging is not among the notebook's top-level imports.
    import logging

    logger = logging.getLogger(__name__)

    try:
        response = comprehend_client.classify_document(
            Text=example_text,
            EndpointArn=ENDPOINT_ARN
        )
    except Exception as error:
        # Best-effort fallback is kept, but unlike a bare ``except:`` this lets
        # KeyboardInterrupt / SystemExit propagate so a long ``.apply`` run can
        # be interrupted, and the failure is reported instead of hidden.
        logger.warning("classify_document failed, using 'General': %r", error)
        return "General"

    classes = response.get("Classes") or []
    if not classes:
        # max() on an empty sequence raises ValueError; use the same fallback.
        logger.warning("classify_document returned no classes, using 'General'")
        return "General"

    top_class = max(classes, key=lambda candidate: candidate['Score'])
    return top_class["Name"]

In [30]:
# Classify every row (one endpoint call per row) and report the wall-clock duration.
start_time = datetime.now()
group_classes = df["TextToBeAnalyzed"].apply(get_group_class)
df["Group Class"] = group_classes
elapsed = datetime.now() - start_time
print(elapsed)

0:13:41.989973


In [31]:
# Keep only the columns needed to review the classification results.
# Built from `df` (the frame that just received "Group Class"): no earlier cell
# shown here defines `df_result`, so selecting from itself raises NameError on a
# fresh kernel. `.copy()` keeps the result independent of `df`.
df_result = df[[
    "Ticket #", "Customer Complaint", "Group Class"
]].copy()

In [34]:
# Rows labelled "General" - this is also the fallback label returned when the
# classification call raises, so these rows may include failed requests.
is_general = df_result["Group Class"] == "General"
df_result[is_general]

(21, 3)

# Results

In [None]:
# Display the first 100 rows of the working frame, including the Group Class column.
df.head(100)