## Anonymize data for MIT

In [None]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("anonymization")

import pandas as pd
import numpy as np
import os
import multiprocessing as mp
from multiprocessing.pool import Pool
import re
from anonymization.Anonymization import Anonymization, AnonymizerChain
from anonymization.anonymizers import EmailAnonymizer, NamedEntitiesAnonymizer, PhoneNumberAnonymizer, UriAnonymizer, DateAnonymizer, MacAddressAnonymizer, CreditCardAnonymizer, IbanAnonymizer, SignatureAnonymizer, NumberAnonymizer
from collections import defaultdict
import os

### Load anonymization model

In [2]:
class TextMask:
    """Thin wrapper around an ``AnonymizerChain`` built on ``Anonymization('en_US')``.

    Every entry of ``anonymizers`` is registered on the chain through
    ``add_anonymizers``, one at a time and in list order. The configured
    chain is kept on ``self.anon``.
    """

    def __init__(self, anonymizers: list):
        chain = AnonymizerChain(Anonymization('en_US'))
        for anonymizer in anonymizers:
            chain.add_anonymizers(anonymizer)
        self.anon = chain

    def anonymize(self, text: str):
        """Return the result of the underlying chain's ``anonymize`` for ``text``."""
        return self.anon.anonymize(text)
    
# Anonymizers handed to TextMask, in registration order.
# NamedEntitiesAnonymizer is the only entry that is called here, with
# 'en_core_web_lg' (presumably the spaCy model to load — confirm).
anonymizers = [
    EmailAnonymizer,
    PhoneNumberAnonymizer,
    UriAnonymizer,
    DateAnonymizer,
    CreditCardAnonymizer,
    IbanAnonymizer,
    NamedEntitiesAnonymizer('en_core_web_lg'),
    NumberAnonymizer,
]
# Module-level anonymizer used by anonymize_error_tolerate.
anon = TextMask(anonymizers)

### Read raw 2way SMS data

In [65]:
# Folder whose entries are read one by one with pd.read_parquet in the loading loop.
MAIN_FOLDER_DIR = '../lm_project/2way_sms/2way_sms_data_for_mit/raw_sms_data'
# os.listdir returns entries in arbitrary order; sorting keeps the file index
# printed by the loading loop stable between runs, so an interrupted run can
# be resumed from a known position.
MESSAGE_FILE_PATHS = [
    os.path.join(MAIN_FOLDER_DIR, file_name)
    for file_name in sorted(os.listdir(MAIN_FOLDER_DIR))
]
print(len(MESSAGE_FILE_PATHS))

49


In [56]:
# Maps a value of the `type` column to its category, 'Service' or 'Claims'.
# Built per category so the membership of each group is visible at a glance;
# key order matches the order in which the types are listed below.
ASSIT_CAT_MAP_DICT = {
    type_name: category
    for category, type_names in (
        ('Service', ('crc', 'safeco-gold')),
        ('Claims', (
            'claims',
            'claims-safeco',
            'no-fault-liberty',
            'no-fault-safeco',
            'property-liberty',
            'property-safeco',
            'casualty-liberty',
            'casualty-safeco',
            'salvage-liberty',
            'salvage-safeco',
        )),
    )
    for type_name in type_names
}

In [85]:
def anonymize_error_tolerate(text: str):
    """Anonymize ``text`` with the module-level ``anon``, tolerating failures.

    Returns:
        ``anon.anonymize(text)``, or ``text`` unchanged if that call raises.

    NOTE(review): the fallback hands back the *un-anonymized* input, so a
    failure lets raw text through to the output. Each failure is logged
    (exception type only, never the message text) so that affected runs can
    be noticed and audited instead of passing silently.
    """
    try:
        return anon.anonymize(text)
    except Exception as exc:
        import logging  # local import: only needed on the failure path

        logging.getLogger(__name__).warning(
            "anonymization failed (%s); returning input unchanged",
            type(exc).__name__,
        )
        return text
    
def filter_non_system(messageList_repName: str, messageList_body: str):
    """Anonymize a message body unless it is exempt.

    The body is returned untouched when the sender is 'system' or
    'Billing Bot', or when splitting it on single spaces yields fewer than two
    pieces. Every other body goes through ``anonymize_error_tolerate``. Any
    exception raised along the way (for instance a body without a ``split``
    method) also results in the body being returned as-is.

    NOTE(review): single-token bodies skip anonymization entirely, so a
    message consisting only of a name, number or e-mail address is kept
    verbatim — confirm this is intended.
    """
    try:
        needs_masking = (
            messageList_repName not in ('system', 'Billing Bot')
            and len(messageList_body.split(' ')) >= 2
        )
        if needs_masking:
            return anonymize_error_tolerate(messageList_body)
    except Exception:
        # Best effort: on any failure fall through and hand back the body unchanged.
        return messageList_body
    return messageList_body

In [None]:
# Load every raw file, keep only the rows whose type maps to 'Service', and
# add an anonymized copy of each message body.
# Per-file results are collected in a list and concatenated once at the end:
# calling pd.concat inside the loop re-copies the whole accumulated frame on
# every iteration. If the loop is interrupted, service_frames still holds the
# files processed so far.
service_frames = []
for num, path in enumerate(MESSAGE_FILE_PATHS):
    df_message_sub = pd.read_parquet(path)
    df_message_sub['Ass_Cat'] = df_message_sub['type'].map(ASSIT_CAT_MAP_DICT)
    service_sub_df = df_message_sub[df_message_sub['Ass_Cat'] == 'Service'].reset_index(drop=True)
    print(num, service_sub_df.shape)
    # Walk the two columns in lockstep rather than using a row-wise
    # DataFrame.apply(axis=1), which builds a Series object for every row.
    service_sub_df['anonymized_messageList.body'] = [
        filter_non_system(rep_name, body)
        for rep_name, body in zip(
            service_sub_df['messageList.repName'],
            service_sub_df['messageList.body'],
        )
    ]
    service_frames.append(service_sub_df)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# files were found.
service_df = pd.concat(service_frames, ignore_index=True) if service_frames else pd.DataFrame()

0 (146937, 40)
1 (190339, 40)
2 (231085, 40)
3 (169262, 40)
4 (192863, 40)
5 (137698, 40)
6 (140805, 40)
7 (141856, 40)
8 (177537, 40)
9 (224511, 40)
10 (196045, 40)
11 (149447, 40)
12 (158088, 40)
13 (209185, 40)
14 (153254, 40)
15 (42746, 40)
16 (183698, 40)
17 (129523, 40)
18 (142767, 40)


In [97]:
# Counts per `type` value in df_message_sub, i.e. in whichever raw file the
# loading loop read last (all types, not only the ones mapped to 'Service').
df_message_sub['type'].value_counts()

claims                  218489
crc                     202037
claims-safeco           160441
campaign-qns            131635
policy-billing-liber     84207
ivr-claims               37921
ivr-safeco               32180
property-liberty         30833
property-safeco          23109
salvage-liberty          20075
salvage-safeco           13213
cf-service               10496
ivr-service              10420
ivr-billing               8918
no-fault-liberty          4523
casualty-liberty          3455
grs-pal-auto-gl-libe      3116
casualty-safeco           2007
no-fault-safeco           1794
immc-auto                  608
grs-pal-auto-gl-helm       319
grs-pal-property-equ       115
safeco-gold                 53
rt-liberty                  18
sales-quote-bot             12
weather-alert                6
Name: type, dtype: int64

In [None]:
# Persist the whole Service frame as a single parquet file.
# NOTE(review): if this runs before the cell that overwrites
# 'messageList.body' with the anonymized text, the saved file still contains
# the raw bodies next to the anonymized column — confirm this file is not shared.
service_anonymized_path = '../lm_project/2way_sms/2way_sms_data_for_mit/anonymized_sms_data/service_anonymized.parquet'
# Make sure the output folder exists so the write cannot fail on a missing
# directory after the long anonymization run.
os.makedirs(os.path.dirname(service_anonymized_path), exist_ok=True)
service_df.to_parquet(service_anonymized_path)

In [175]:
# Report the (rows, columns) shape of the accumulated Service frame.
print(service_df.shape)

(14524269, 41)


In [94]:
# Report the (rows, columns) shape of the accumulated Service frame.
# NOTE(review): this line carried a stray SAS-style expression,
# dhms('31JUL2021'd, 0, 0, 0); — presumably the datetime cutoff used when this
# batch was extracted; confirm or remove.
print(service_df.shape)

(8226846, 41)


In [176]:
# Message counts for the two senders whose bodies filter_non_system leaves
# un-anonymized.
automated_senders = ['system', 'Billing Bot']
rep_names = service_df['messageList.repName']
rep_names[rep_names.isin(automated_senders)].value_counts()

system         3120122
Billing Bot    1888070
Name: messageList.repName, dtype: int64

system         1567364
Billing Bot    1312937
Name: messageList.repName, dtype: int64

In [69]:
# Preview the first two rows of the Service frame.
service_df.head(2)

Unnamed: 0,_id,channelType,claimNumber,closedDtm,closedType,consumerName,consumerPhone,consumerType,coverageType,createdDtm,...,schemaVersion,status,taskSid,topic,type,underwritingCompany,unreadDtm,unreadStatus,Ass_Cat,anonymized_messageList.body
0,6336675b556c21006827d2b3,sms,,2022-09-30 03:50:47.982,manual,,6063078032,,crc,2022-09-30 03:49:47.709,...,1.0,closed,,Rate,crc,,2022-09-30 03:49:47.709,0.0,Service,Hi how do I reinstate my policy and get my ins...
1,6336675b556c21006827d2b3,sms,,2022-09-30 03:50:47.982,manual,,6063078032,,crc,2022-09-30 03:49:47.709,...,1.0,closed,,Rate,crc,,2022-09-30 03:49:47.709,0.0,Service,"By texting us, you agree Liberty Mutual can te..."


In [54]:
# Anonymized bodies of all rows sharing one `_id`, ordered by 'messageList_idx'.
rows_for_id = service_df.loc[service_df['_id'] == '6336675b556c21006827d2b3']
rows_for_id.sort_values(by='messageList_idx')['anonymized_messageList.body']

0    Hi how do I reinstate my policy and get my ins...
1    By texting us, you agree Liberty Mutual can te...
2    Hi there! To help get you to the right place, ...
3                                  Ask about my policy
4    Could you try that again? Type 1-5 from the op...
5                                     [number_removed]
6    We can certainly review your policy for potent...
Name: anonymized_messageList.body, dtype: object

In [181]:
# Spot-check 20 random rows: print the message id, then the raw body followed
# by its anonymized counterpart.
spot_check = service_df.sample(20)
for _, record in spot_check.iterrows():
    message_id = record['messageList.messageId']
    raw_body = record['messageList.body']
    masked_body = record['anonymized_messageList.body']
    print(message_id, '\n')
    print(raw_body, '\n', masked_body, '\n')

f7b31ed0-9be5-11ec-b929-93cd64c8d7ac 

We can certainly review your policy for potential savings and to confirm you have the best coverage for your needs! Please call us at 800-658-9857 and inform the agent you are interested in a policy review. They will be happy to assist you since a review cannot be completed via text. 
 We can certainly review your policy for potential savings and to confirm you have the best coverage for your needs! Please call us at 800-658-9857 and inform the agent you are interested in a policy review. They will be happy to assist you since a review cannot be completed via text. 

679f5eb0-b1a5-11eb-8a55-b5a031b90e5e 

Good morning. I apologize for our delayed response. I'm happy to see you were able to get this taken care of already. Feel free to reach back out if you ever need assistance in the future. Stay safe! Please take a moment to complete our short survey to let us know how we're doing.  https://libertymutualvoc.co1.qualtrics.com/jfe/form/SV_5cJIFsmSC8

### Save data in the format requested by MIT

In [None]:
# Overwrite the raw body with its anonymized version and remove the helper
# column in the same step: DataFrame.pop deletes the column and returns it.
service_df['messageList.body'] = service_df.pop('anonymized_messageList.body')

In [184]:
# Derive the date, year and month of each message from its creation timestamp.
# The timestamp column is parsed once and reused for all three columns;
# re-parsing the derived Python date objects for year and month repeats the
# most expensive step twice for identical results.
msg_created = pd.to_datetime(service_df['messageList.msgCreatedDtm'])
service_df['createdate'] = msg_created.dt.date
service_df['year'] = msg_created.dt.year
service_df['month'] = msg_created.dt.month
print(service_df.shape)

(14524269, 43)


In [197]:
# Print the message id and body of 5 random rows as a final visual check.
final_check = service_df.sample(5)
for _, record in final_check.iterrows():
    message_id = record['messageList.messageId']
    print(message_id, '\n')
    print(record['messageList.body'])

0a037cb0-54c3-11ed-8738-db7828e9a16a 

My name is [name_removed]
dd4850b0-3f11-11eb-b13f-cd089a5021da 

Liberty Mutual: How can I help you today? By texting us, you agree to rec txts at this #. Consent not reqd for svc. May use automated msg systems. Txt STOP to stop. Msg&Data rates apply.

4eefd5b0-f985-11ec-978b-79810d642130 

No
9eec9ca0-4c89-11ec-8383-c5e83dd3bce0 

I'm getting the confirmation mail in creating my account for first time
ad075a8c-cb23-4cf8-90e6-68a41b5bbaaf 

Hi there! My name is [name_removed] and I would be happy to assist you via Text, or you may call our office at [number_removed]-[number_removed]. Agents will be available until [date_removed] tonight.


In [130]:
# Show the timestamp columns and body of the 10 rows with the earliest 'createdate'.
preview_columns = [
    'createdDtm',
    'modifiedDtm',
    'unreadDtm',
    'messageList.msgCreatedDtm',
    'messageList.body',
]
rows_by_date = service_df.sort_values(by='createdate')
rows_by_date.filter(preview_columns).head(10)

Unnamed: 0,createdDtm,modifiedDtm,unreadDtm,messageList.msgCreatedDtm,messageList.body
3772760,2021-01-19 23:44:31.419,2021-07-31 23:50:37.307,2021-07-31 23:18:13.305,2021-07-31 10:58:57.149,Liberty Mutual: How can I help you today? By t...
576557,2021-07-31 17:10:15.033,2021-08-01 16:33:48.595,2021-08-01 16:27:38.206,2021-07-31 17:14:13.665,This is a major and urgent concern
576558,2021-07-31 17:10:15.033,2021-08-01 16:33:48.595,2021-08-01 16:27:38.206,2021-07-31 17:14:52.642,My name is [name_removed]\n[date_removed] [add...
576559,2021-07-31 17:10:15.033,2021-08-01 16:33:48.595,2021-08-01 16:27:38.206,2021-07-31 18:49:02.965,I been waiting for a while and no response
576561,2021-07-31 17:10:37.413,2021-07-31 17:11:28.287,2021-07-31 17:10:37.413,2021-07-31 17:10:37.710,Liberty Mutual: How can I help you today? By t...
576562,2021-07-31 17:10:37.413,2021-07-31 17:11:28.287,2021-07-31 17:10:37.413,2021-07-31 17:11:08.776,Are you a real person?
576563,2021-07-31 17:10:37.413,2021-07-31 17:11:28.287,2021-07-31 17:10:37.413,2021-07-31 17:11:11.419,"Hi there! To help get you to the right place, ..."
576564,2021-07-31 17:10:37.413,2021-07-31 17:11:28.287,2021-07-31 17:10:37.413,2021-07-31 17:11:27.160,4
576556,2021-07-31 17:10:15.033,2021-08-01 16:33:48.595,2021-08-01 16:27:38.206,2021-07-31 17:13:56.070,Thank you for your message! An agent will resp...
576565,2021-07-31 17:10:37.413,2021-07-31 17:11:28.287,2021-07-31 17:10:37.413,2021-07-31 17:11:28.107,Reporting a claim is now handled online at bit...


In [201]:
# Displays `day`, the loop variable of the per-day export loop in this
# notebook, i.e. the last day that loop handled.
# NOTE(review): depends on cell execution order — confirm it is still needed.
day

datetime.date(2023, 3, 15)

In [200]:
# Write one parquet file per calendar day under
# <root>/<year>-<month>/<date>.parquet, printing the shape of every year and
# month slice on the way.
# groupby partitions the frame in a single pass per level instead of running
# a full boolean scan for every year, month and day. Rows whose key is
# missing are skipped by groupby.
service_parquet_root = '../lm_project/2way_sms/2way_sms_data_for_mit/anonymized_sms_data/Service/parquet'
helper_columns = ['createdate', 'year', 'month']

for yr, year_df in service_df.groupby('year', sort=True):
    print(yr, year_df.shape)
    for mt, month_df in year_df.groupby('month', sort=True):
        print(mt, month_df.shape)
        newpath = f'{service_parquet_root}/{yr}-{mt}'
        # exist_ok avoids the separate existence check before creating the folder
        os.makedirs(newpath, exist_ok=True)
        # sort=False keeps days in order of first appearance
        for day, day_df in month_df.groupby('createdate', sort=False):
            # The helper columns only drive the partitioning and are not exported.
            day_df = day_df.drop(columns=helper_columns).reset_index(drop=True)
            day_df.to_parquet(f'{newpath}/{str(day)}.parquet')

2019 (133624, 43)
1 (8924, 43)
2 (5896, 43)
3 (7162, 43)
4 (7798, 43)
5 (7876, 43)
6 (8236, 43)
7 (11542, 43)
8 (7973, 43)
9 (14536, 43)
10 (19425, 43)
11 (15004, 43)
12 (19252, 43)
2020 (1954399, 43)
1 (39638, 43)
2 (26349, 43)
3 (65334, 43)
4 (101659, 43)
5 (81957, 43)
6 (92748, 43)
7 (142479, 43)
8 (181697, 43)
9 (238351, 43)
10 (276606, 43)
11 (252630, 43)
12 (454951, 43)
2021 (6759419, 43)
1 (652743, 43)
2 (597266, 43)
3 (672354, 43)
4 (566627, 43)
5 (544604, 43)
6 (596826, 43)
7 (567796, 43)
8 (588986, 43)
9 (511882, 43)
10 (506916, 43)
11 (499240, 43)
12 (454179, 43)
2022 (4741758, 43)
1 (492743, 43)
2 (422358, 43)
3 (508519, 43)
4 (433965, 43)
5 (403963, 43)
6 (378835, 43)
7 (334548, 43)
8 (416757, 43)
9 (362445, 43)
10 (352415, 43)
11 (312843, 43)
12 (322367, 43)
2023 (935069, 43)
1 (347021, 43)
2 (309488, 43)
3 (278560, 43)
