## Taking a look at the Enron labeled data

Code borrowed from:
https://github.com/shoreason/enron-topic-modeling/blob/master/enron_lda.ipynb

In [1]:
import numpy as np
import pandas as pd
import vocab as vocabulary
import collections
import utils
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score
from wordcloud import WordCloud ,STOPWORDS
from collections import defaultdict

In [65]:
import glob
import fileinput
import shutil
import os,sys,inspect
import time
import markdown
import json
import requests
import warnings
import re


In [52]:
# !pip3 install langdetect
from langdetect import detect

In [3]:
# Resolve a directory from the file reported for the current frame, then put its
# parent at the front of sys.path so the `googleapi` import below is searched
# for there first.
# NOTE(review): inside a notebook kernel the frame's "file" is not a real path,
# so current_dir presumably ends up being the working directory — confirm.
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir)
# NOTE(review): `googleapi` looks like a project-local module (it is read for
# its GOOGLEAPI attribute later in this notebook) — confirm its location.
import googleapi

In [44]:
PerspectiveAPI = googleapi.GOOGLEAPI

In [5]:
datadir = "/data/SuperMod/emails.csv"

In [6]:
enrondata = pd.read_csv(datadir)

In [7]:
enrondata.head()

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


## Parse email

In [8]:
def parse_raw_message(raw_message):
    """Extract sender, recipient and body text from one raw email string.

    The message is scanned line by line. A line without ':' is treated as body
    text. A line with ':' is treated as a ``key: value`` header and is kept only
    when its lower-cased key is 'from' or 'to'.

    Known limitation (unchanged from the original): body lines that contain a
    ':' are not added to the body.

    Args:
        raw_message: Full raw text of a single email.

    Returns:
        dict that may contain the keys 'from', 'to' and 'body'. 'body' is
        present only when at least one line without ':' was seen; its value is
        the stripped body lines concatenated with no separator.
    """
    keys_to_extract = ('from', 'to')
    email = {}
    body_parts = []
    for line in raw_message.split('\n'):
        # Split on the FIRST colon only, so values that themselves contain a
        # colon are kept whole instead of being cut at the second colon.
        key, separator, value = line.partition(':')
        if not separator:
            body_parts.append(line.strip())
            continue
        key = key.lower()
        # The first occurrence wins: the real headers come first, and later
        # "To:" / "From:" lines from quoted or forwarded text must not
        # overwrite them.
        if key in keys_to_extract and key not in email:
            email[key] = value.strip()
    if body_parts:
        email['body'] = ''.join(body_parts)
    return email

In [9]:
def parse_into_emails(messages):
    """Parse every raw message and pivot the results into column lists.

    Args:
        messages: Iterable of raw email strings.

    Returns:
        dict with the keys 'body', 'to' and 'from_', each holding one list
        entry per input message ('' where the field was not found).
    """
    parsed = list(map(parse_raw_message, messages))
    columns = {}
    columns['body'] = map_to_list(parsed, 'body')
    columns['to'] = map_to_list(parsed, 'to')
    columns['from_'] = map_to_list(parsed, 'from')
    return columns

In [10]:
def map_to_list(emails, key):
    """Collect ``key`` from every parsed email, using '' where it is absent.

    Args:
        emails: Sequence of dicts as produced by the email parser.
        key: Field name to pull out of each dict.

    Returns:
        list with one entry per email, in the same order as ``emails``.
    """
    return [email[key] if key in email else '' for email in emails]

In [11]:
email_df = pd.DataFrame(parse_into_emails(enrondata.message))
print(email_df.head())

                                                body                    from_  \
0                               Here is our forecast  phillip.allen@enron.com   
1  Traveling to have a business meeting takes the...  phillip.allen@enron.com   
2                     test successful.  way to go!!!  phillip.allen@enron.com   
3  Randy,Can you send me a schedule of the salary...  phillip.allen@enron.com   
4                                                     phillip.allen@enron.com   

                        to  
0     tim.belden@enron.com  
1  john.lavorato@enron.com  
2   leah.arsdall@enron.com  
3    randall.gay@enron.com  
4     greg.piper@enron.com  


## Add Perspective score: toxicity only

In [29]:
email_df.body

0                                      Here is our forecast
1         Traveling to have a business meeting takes the...
2                            test successful.  way to go!!!
3         Randy,Can you send me a schedule of the salary...
4                                                          
5         Greg,How about either next Tuesday or Thursday...
6         Phillip Allen (pallen@enron.com)Mike Grigsby (...
7                                                          
8         I don't think these are required by the ISP2. ...
9         ---------------------- Forwarded by Phillip K ...
10        Mr. Buckner,For delivered gas behind San Diego...
11        Lucy,Open them and save in the rentroll folder...
12        ---------------------- Forwarded by Phillip K ...
13        ---------------------- Forwarded by Phillip K ...
14        Dave,Here are the names of the west desk membe...
15                          Paula,35 million is finePhillip
16        ---------------------- Forward

In [12]:
# obtained from https://www.kaggle.com/tarunpaparaju/jigsaw-competition-google-perspective-api
# Attribute ("test") names the client below recognises: requesting any other
# name triggers a warning, and looking one up on a Comment raises ValueError.
allowed = [
    "TOXICITY", "SEVERE_TOXICITY", "TOXICITY_FAST",
    "ATTACK_ON_AUTHOR", "ATTACK_ON_COMMENTER",
    "INCOHERENT", "INFLAMMATORY", "OBSCENE",
    "OFF_TOPIC", "UNSUBSTANTIAL", "LIKELY_TO_REJECT",
]

class Perspective(object):
    """Small client for the Perspective Comment Analyzer ``comments:analyze`` endpoint."""

    base_url = "https://commentanalyzer.googleapis.com/v1alpha1"

    # Per-attribute options that may be supplied for each requested attribute.
    _attribute_options = ("scoreType", "scoreThreshold")

    def __init__(self, key):
        """Store the API key that is sent as the ``key`` query parameter."""
        self.key = key

    def score(self, text, tests=["TOXICITY"], context=None, languages=None, do_not_store=False, token=None, text_type=None):
        """Request attribute scores for ``text`` and wrap the response.

        Args:
            text: Text to analyse. Anything beyond 3000 characters is dropped
                (with a warning) before the request is sent.
            tests: Attribute name, list of attribute names, or dict mapping
                attribute name -> options dict. Option keys must be one of
                'scoreType' / 'scoreThreshold'. The list default is never
                mutated here (only rebound), so sharing it across calls is safe.
            context: Accepted for interface compatibility; not used.
            languages: Optional iterable of language codes to send.
            do_not_store: When truthy, sent as ``doNotStore`` in the request.
            token: Opaque value stored on the returned Comment.
            text_type: 'html' or 'md' to pre-process ``text``; None to send as is.

        Returns:
            Comment holding one Attribute (with its Spans) per requested test.

        Raises:
            ValueError: invalid ``tests``, ``text_type`` or attribute option.
            PerspectiveAPIException: the API response contains an ``error``.
        """
        # --- validation / normalisation -----------------------------------
        if isinstance(tests, str):
            tests = [tests]
        if not isinstance(tests, (list, dict)):
            raise ValueError("Invalid list/dictionary provided for tests")
        if isinstance(tests, list):
            tests = {test: {} for test in tests}
        if text_type:
            # NOTE(review): remove_html is not defined in the visible notebook;
            # this branch raises NameError unless it is provided elsewhere.
            if text_type.lower() == "html":
                text = remove_html(text)
            elif text_type.lower() == "md":
                text = remove_html(text, md=True)
            else:
                raise ValueError("{0} is not a valid text_type. Valid options are 'html' or 'md'".format(str(text_type)))

        for test, options in tests.items():
            if test not in allowed:
                warnings.warn("{0} might not be accepted as a valid test.".format(str(test)))
            for key in options.keys():
                # Fixed spelling: the API option is "scoreThreshold"; the old
                # whitelist ("scoreThreshhold") rejected the valid name.
                if key not in self._attribute_options:
                    raise ValueError("{0} is not a valid sub-property for {1}".format(key, test))

        # The API will only grade text less than 3k characters long
        if len(text) > 3000:
            # TODO: allow disassembly/reassembly of >3000char comments
            warnings.warn("Perspective only allows 3000 character strings. Only the first 3000 characters will be sent for processing")
            text = text[:3000]

        new_langs = []
        if languages:
            # NOTE(review): validate_language is not defined in the visible
            # notebook; passing ``languages`` raises NameError unless it is
            # provided elsewhere.
            for language in languages:
                language = language.lower()
                if validate_language(language):
                    new_langs.append(language)

        # --- request --------------------------------------------------------
        url = Perspective.base_url + "/comments:analyze"
        querystring = {"key": self.key}
        payload_data = {"comment": {"text": text}, "requestedAttributes": dict(tests)}
        # Only send "languages" when there is something to send; the previous
        # `!= None` check was always true and always attached an empty list.
        if new_langs:
            payload_data["languages"] = new_langs
        if do_not_store:
            payload_data["doNotStore"] = do_not_store
        response = requests.post(url,
                                 data=json.dumps(payload_data),
                                 headers={'content-type': "application/json"},
                                 params=querystring)
        data = response.json()
        if "error" in data:
            raise PerspectiveAPIException(data["error"]["message"])

        # --- response wrapping ----------------------------------------------
        comment = Comment(text, [], token)
        attribute_scores = data["attributeScores"]
        for test in tests:
            result = attribute_scores[test]
            summary = result["summaryScore"]
            attribute = Attribute(test, [], summary["value"], summary["type"])
            # Tolerate responses that carry no per-span scores for an attribute.
            for span in result.get("spanScores", []):
                attribute.spans.append(
                    Span(span["begin"], span["end"],
                         span["score"]["value"], span["score"]["type"], comment))
            comment.attributes.append(attribute)
        return comment

class Comment(object):
    """Analysed text together with the attribute results returned for it."""

    def __init__(self, text, attributes, token):
        """Store the analysed text, its list of attribute results and a caller token."""
        self.text = text
        self.attributes = attributes
        self.token = token

    def __getitem__(self, key):
        """Return the attribute whose name matches ``key`` (case-insensitive).

        Raises:
            ValueError: ``key`` is not a known attribute name, or no attribute
                with that name is stored on this comment.
        """
        if key.upper() not in allowed:
            raise ValueError("value {0} does not exist".format(key))
        for attr in self.attributes:
            if attr.name.lower() == key.lower():
                return attr
        raise ValueError("value {0} not found".format(key))

    def __str__(self):
        return self.text

    def __repr__(self):
        """Show the mean attribute score followed by the text."""
        # Guard the empty case: averaging over zero attributes used to raise
        # ZeroDivisionError, which breaks repr() in debuggers and notebooks.
        if not self.attributes:
            return "<(n/a) {0}>".format(self.text)
        mean_score = sum(attr.score for attr in self.attributes) / len(self.attributes)
        return "<({0}) {1}>".format(str(mean_score), self.text)

    def __iter__(self):
        return iter(self.attributes)

    def __len__(self):
        """Length of the analysed text (not the number of attributes)."""
        return len(self.text)

class Attribute(object):
    """Result for one requested attribute: a summary score plus its spans."""

    def __init__(self, name, spans, score, score_type):
        """Keep the attribute name, its span list and the summary score/type."""
        self.name, self.spans = name, spans
        self.score, self.score_type = score, score_type

    def __getitem__(self, index):
        """Index (or slice) directly into the stored spans."""
        stored_spans = self.spans
        return stored_spans[index]

    def __iter__(self):
        """Iterate over the stored spans in order."""
        span_iterator = iter(self.spans)
        return span_iterator

class Span(object):
    """Score for the slice ``comment.text[begin:end]`` of a comment."""

    def __init__(self, begin, end, score, score_type, comment):
        """Keep the slice bounds, the score/type and the owning comment."""
        self.begin, self.end = begin, end
        self.score, self.score_type = score, score_type
        self.comment = comment

    def __str__(self):
        """Return the slice of the comment text this span covers."""
        full_text = self.comment.text
        return full_text[self.begin:self.end]

    def __repr__(self):
        """Show the span score followed by the covered text."""
        excerpt = self.comment.text[self.begin:self.end]
        return "<({0}) {1}>".format(self.score, excerpt)

class PerspectiveAPIException(Exception):
    """Raised with the API's error message when a response contains an ``error`` entry."""

In [45]:

client = Perspective(PerspectiveAPI)

In [38]:

emails = email_df['body']


In [39]:
emails[:5]

0                                 Here is our forecast
1    Traveling to have a business meeting takes the...
2                       test successful.  way to go!!!
3    Randy,Can you send me a schedule of the salary...
4                                                     
Name: body, dtype: object

In [46]:
# Demo: score the non-empty emails among the first ten and print each result.
toxicity_scores = []


start = time.time()
print("                         EXAMPLE WORKING OF PERSPECTIVE API                          ")
print("                         ----------------------------------                          ")
print("")
for i, email in enumerate(emails[:10]):
    if email == '':
        continue

    current = time.time()
    # limit API calls to 1 per second: wait until second (i + 1) since `start`.
    # Clamp at zero, because time.sleep() raises ValueError for a negative
    # argument, which happened whenever the loop had fallen behind schedule.
    time.sleep(max(0, (i + 1) - (current - start)))
    toxicity = client.score(email, tests=["TOXICITY"])

    toxicity_scores.append(toxicity["TOXICITY"].score)

    # Only echo the first few results so a longer slice does not flood the output.
    if i <= 50:
        print("email :\n" + email)
        print("")
        print("TOXICITY SCORE : " + str(toxicity["TOXICITY"].score) )

        print(("*********************************************************************"+\
               "***********************").replace('*', '-'))
        print("")

                         EXAMPLE WORKING OF PERSPECTIVE API                          
                         ----------------------------------                          

email :
Here is our forecast

TOXICITY SCORE : 0.01873601
--------------------------------------------------------------------------------------------

email :
Traveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.As far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not.  Too often the presenter speaks and the others are quiet just waiting for their turn.   The meetings might be better if held in a round table discussion format.My suggestion

### add score to all emails

In [73]:
def _is_english(text):
    """Return True when ``text`` contains ASCII letters and langdetect labels it 'en'."""
    if text == '' or not re.search('[a-zA-Z]+', text):
        return False
    # Imported lazily so the cheap checks above work without langdetect loaded.
    from langdetect.lang_detect_exception import LangDetectException
    try:
        return detect(text) == 'en'
    except LangDetectException:
        # langdetect can still fail on text that passed the letter check
        # (for example when nothing usable is left after its own cleaning);
        # treat that as "not English" instead of aborting the whole run.
        return False


def get_perspective_score(email_list, min_interval=1.0):
    """Return one Perspective TOXICITY score per email.

    Perspective only works with English here, so empty strings, strings
    without letters and non-English emails are not sent to the API and get a
    score of 0.

    Args:
        email_list: Iterable of email body strings.
        min_interval: Minimum number of seconds between two consecutive API
            calls (default 1.0, i.e. at most one call per second).

    Returns:
        list of scores in the same order as ``email_list``; skipped emails are 0.
    """
    toxicity_scores = []
    last_call = None  # monotonic timestamp of the most recent API call

    for i, email in enumerate(email_list):
        if not _is_english(email):
            toxicity_scores.append(0)
        else:
            # Throttle relative to the previous call rather than to the loop
            # index: the old index-based schedule passed a negative value to
            # time.sleep() (ValueError) once the loop fell behind, and made
            # the loop wait for emails that were skipped.
            if last_call is not None:
                wait = min_interval - (time.monotonic() - last_call)
                if wait > 0:
                    time.sleep(wait)
            last_call = time.monotonic()
            toxicity = client.score(email, tests=["TOXICITY"])

            toxicity_scores.append(toxicity["TOXICITY"].score)
        print(i)

    return toxicity_scores


In [70]:
print (re.search('[a-zA-Z]+','2000-1969=31'))

None


In [64]:
emails[194:197]

194    Are there behind closed doors discussions bein...
195                                         2000-1969=31
196    Mary,I spoke to Gary about the foundation work...
Name: body, dtype: object

In [71]:
# for i, email in enumerate(emails):
#     print(i)
#     if email != '':
#         detect(email) != 'en'

In [74]:

# toxicity_scores_all = get_perspective_score(emails)

0
1
2
3
4
5
6
7
8
9
10
11




12




13
14
15
16
17
18
19
20
21
22
23
24




25




26




27
28
29
30
31
32
33
34
35
36
37
38
39
40




41
42
43
44
45
46
47
48
49
50




51
52
53
54
55
56




57
58
59
60




61
62




63
64
65
66
67
68
69
70
71




72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115




116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155




156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222


KeyboardInterrupt: 

In [None]:
## total count
## 517398

In [41]:
# print(email_df.iloc[13,0])

In [42]:
# print(enrondata.iloc[13,1])

In [43]:
# email_df.body.iloc