# Initial Load Config

In [1]:
import pandas as pd
import boto3
from io import BytesIO

import nltk
from nltk.corpus import stopwords

# Region constant for this notebook.
# NOTE(review): this value is not passed to the session or client below —
# confirm whether the profile's own region setting is what is intended.
AWS_REGION = "us-east-2"

# S3 client built from the named local AWS profile.
session = boto3.Session(profile_name="kranio")
s3 = session.client(service_name="s3")

# Download the NLTK stop-word corpus into /tmp and add that directory to
# NLTK's search path so the corpus reader can locate it.
nltk.download("stopwords", download_dir="/tmp")
nltk.data.path.append("/tmp")
ESW = stopwords.words("english")

[nltk_data] Downloading package stopwords to /tmp...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Location of the newline-delimited JSON object to analyse.
bucket = "kranio-datalake"
key = "internal/webinar/leoCamilo/enriched/complaints-standard/cd377ca5c3594fba93bca95e25545b6a"

# Read the whole S3 object body into an in-memory buffer.
obj = BytesIO(s3.get_object(Bucket=bucket, Key=key)["Body"].read())

# Each line of the object is parsed as one JSON record.
df_source = pd.read_json(obj, lines=True)

# Exploratory Analysis

In [76]:
# English stop words plus extra noise tokens to drop from the word cloud.
# NOTE(review): "i'm" and "don't" can never match, because apostrophes are
# replaced by spaces before the stop-word filter runs; they are kept so the
# list contents stay unchanged.
stop_words = list(ESW) + ["comcast", "xfinity", "i", "i'm", "don't", "xfintity", "am", "pm", "pt", "gb", "fcc", "br"]

# Frozen set gives constant-time membership tests in the per-token filter
# (the list form is scanned linearly for every token).
stop_word_lookup = frozenset(stop_words)

# Work on a copy of the first 1000 rows so df_source is left untouched.
df_result = df_source.iloc[:1000].copy()

# Normalise each description into a space-separated string of kept tokens:
#   1. missing descriptions become "" instead of raising when tokens are iterated;
#   2. lower-case;
#   3. blank out possessive "'s", every non-word character, ASCII digits and
#      underscores ("'s" must come first so "it's" does not leave a stray "s");
#   4. split on whitespace (never yields empty tokens);
#   5. drop stop words and re-join with single spaces.
df_result["WordCloud"] = (
    df_result["Description"]
    .fillna("")
    .str.lower()
    .str.replace(r"'s|\W|[0-9_]", " ", regex=True)
    .str.split()
    .apply(lambda tokens: " ".join(token for token in tokens if token not in stop_word_lookup))
)

In [77]:
# Concatenate every cleaned description into one long space-separated string.
all_description = " ".join(df_result["WordCloud"].tolist())

# One row per word occurrence, each carrying a Count of 1.
all_description_df = pd.DataFrame({"Words": all_description.split()}).assign(Count=1)

In [78]:
# Count the rows for each word, keep only words with a count above 100,
# and order them from highest to lowest count.
all_description_group_df = (
    all_description_df
    .groupby(["Words"], as_index=False)
    .count()
    .loc[lambda counts: counts["Count"] > 100]
    .sort_values(by=["Count"], ascending=False)
)

In [79]:
# Preview the first five rows of the word-count table.
all_description_group_df.head()

Unnamed: 0,Words,Count
4283,notice,2346
5688,service,2301
1225,complaint,2104
5372,response,1878
3364,internet,1756


In [80]:
# Independent copy holding only the ticket id, cleaned text and sentiment label.
df_tmp = df_result.loc[:, ["Ticket #", "WordCloud", "Sentiment"]].copy()
df_tmp.head()

Unnamed: 0,Ticket #,WordCloud,Sentiment
0,250635,contacting internet technical support last mon...,NEGATIVE
1,223441,back january made payments one january service...,NEGATIVE
2,242732,home located acworth georgia signed one year c...,NEGATIVE
3,277946,atlanta area put effect unprecendented usage c...,NEGATIVE
4,307175,customer sort years never issues like past yea...,NEGATIVE


In [81]:
# Build a long table with one row per (ticket, word) occurrence.
#
# explode() replaces the earlier apply(pd.Series) + stack() + merge sequence:
#   * apply(pd.Series) creates one Series per row and a wide, NaN-padded
#     frame, which is slow and memory-hungry;
#   * merging back on "Ticket #" multiplies rows whenever a ticket number
#     appears more than once, whereas explode() keeps Sentiment attached to
#     its own source row.
df_tmp_words = (
    df_tmp
    # Fresh positional index so exploding never aligns on repeated labels.
    .reset_index(drop=True)
    .assign(Words=lambda frame: frame["WordCloud"].str.split())
    .explode("Words")
    # Rows whose word list was empty explode to a single NaN; drop them.
    .dropna(subset=["Words"])
    .loc[:, ["Ticket #", "Sentiment", "Words"]]
    .reset_index(drop=True)
)

In [82]:
# (rows, columns) of the per-word table.
df_tmp_words.shape

(138547, 3)

In [83]:
# Independent copy of at most the first 500 rows of the word-count table.
df_tmp2 = all_description_group_df.head(500).copy()
df_tmp2.head()

Unnamed: 0,Words,Count
4283,notice,2346
5688,service,2301
1225,complaint,2104
5372,response,1878
3364,internet,1756


In [86]:
# Size of the inner join between per-word rows and the selected word counts.
pd.merge(df_tmp_words, df_tmp2, on="Words").shape

(88389, 4)