In [None]:
import datetime
import json
import logging
import os
import urllib
import urllib.request
from collections import OrderedDict

import numpy as np
import openpyxl
import pandas as pd
import requests
from slack_bolt import App
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError

In [None]:
# Module-level logger named after this module.
# NOTE(review): nothing in the visible cells logs through it — confirm it is still needed.
logger = logging.getLogger(__name__)

In [None]:
# Build the Slack clients: `client` uses the bot token, `uclient` the user token.
# Both tokens are read from the environment; a missing variable raises KeyError.
#
# The Slack Web API is reached over HTTPS, so prefer the proxy configured for
# HTTPS traffic and fall back to the HTTP entry. `proxy` stays None when no
# proxy is configured at all.
system_proxies = urllib.request.getproxies()
proxy = system_proxies.get('https') or system_proxies.get('http')
client = WebClient(token=os.environ['BOT_TOKEN_HK'], proxy=proxy)
uclient = WebClient(token=os.environ['USER_TOKEN_HK'], proxy=proxy)

In [None]:
# fetch the workspace member list and flatten it into a DataFrame
users_response = uclient.users_list()
users_info = users_response.get('members')
df_users_info = pd.json_normalize(users_info)

In [None]:
# pair every user id with its real_name, as a list of tuples and as a lookup table
user_id_names = list(zip(df_users_info['id'], df_users_info['real_name']))
df_user_id_names = pd.DataFrame(user_id_names, columns=['id', 'real_name'])
df_user_id_names

In [None]:
# choose the user whose DM log should be retrieved (row label in df_user_id_names)
usr_idx = 5
selected_user = df_user_id_names.loc[usr_idx]
dm_user_id = selected_user['id']
dm_user_name = selected_user['real_name']
dm_user_name

In [None]:
# open the DM conversation with the chosen user and keep its channel record
dm_info = uclient.conversations_open(users=dm_user_id)
dm = dm_info.get('channel')
print(dm_info)
print(dm['id'])

In [None]:
# Fetch the complete DM history.
# conversations_history returns one page per call; while the response reports
# has_more, the next page is requested with the cursor from response_metadata.
log_dm = uclient.conversations_history(channel=dm['id'])
log_dm_ms = log_dm['messages']  # grows below until it holds the messages of every page
list_log = []                   # messages of each additional page, one list per page
i = 0                           # number of additional pages fetched

while log_dm['has_more']:
    log_dm = uclient.conversations_history(
        channel=dm['id'],
        cursor=log_dm['response_metadata']['next_cursor'],
    )
    list_log.append(log_dm['messages'])
    i += 1
print(i)

# Append the follow-up pages to the first page's message list.
flat = [x for row in list_log for x in row]
log_dm_ms.extend(flat)
print(len(log_dm_ms))

# Normalize once over the full list. DataFrame.append (used previously, once per
# page) no longer exists in pandas 2.x, and growing a frame inside a loop copies
# it on every iteration.
df_log_dm_ms = pd.json_normalize(log_dm_ms)
df_log_dm_ms

In [None]:
# keep only the columns used downstream (missing ones are created as NaN)
# and renumber the rows from 0
reindex_col = ['type', 'ts', 'user', 'text', 'subtype', 'reply_count', 'reply_users_count', 'thread_ts']
df_log_dm_ms = df_log_dm_ms.reindex(columns=reindex_col).reset_index(drop=True)
df_log_dm_ms

In [None]:
# for every message row, take its first attached file, or a 'None' placeholder
# when the message carries no 'files' key
files = [
    log_dm_ms[row]['files'][0]
    if 'files' in log_dm_ms[row]
    else {'name': 'None', 'url_private_download': 'None'}
    for row in df_log_dm_ms.index
]

In [None]:
# build the file table: keep name and download URL, under export-friendly column names
file_column_names = {'name': 'FileName', 'url_private_download': 'FileURL'}
df_dm_files = (
    pd.json_normalize(files)
    .loc[:, ['name', 'url_private_download']]
    .rename(columns=file_column_names)
)
df_dm_files

In [None]:
# Put messages and their file columns side by side, order by timestamp, renumber,
# then drop thread_broadcast rows.
# The thread_broadcast filter may be redundant because duplicates are dropped
# later, but it is kept so that removing it cannot introduce a regression.
df_dm_log = (
    pd.concat([df_log_dm_ms, df_dm_files], axis=1)
    .sort_values(by='ts')
    .reset_index(drop=True)
    .query('subtype != "thread_broadcast"')
)
df_dm_log_mi = df_dm_log.set_index(['type', 'thread_ts', 'ts'])
df_dm_log_mi

In [None]:
# fetch the thread of every message that has replies, so replies can be tied to their parent
channel_id = dm['id']
threads = []
for row in df_dm_log.index:
    # a NaN reply_count means the message has no replies: nothing to fetch
    if np.isnan(df_dm_log['reply_count'][row]):
        continue
    thr = uclient.conversations_replies(channel=channel_id, ts=df_dm_log['thread_ts'][row])
    threads.append(thr.get('messages'))

In [None]:
# flatten the per-thread message lists into a single list of messages
reps = [message for thread in threads for message in thread]
print(len(reps))

In [None]:
# merge the top-level messages with the thread messages (which include each parent again)
if reps:
    reply_columns = ['type', 'ts', 'user', 'text', 'reply_count', 'reply_users_count', 'thread_ts']
    df_dm_rep = (
        pd.json_normalize(reps)[reply_columns]
        .sort_values('ts')
        .reset_index(drop=True)
        .assign(type='thread')
    )
    df_dm_rep_mi = df_dm_rep.set_index(['type', 'thread_ts', 'ts'])

    # a parent message appears in both frames; keep the copy from the thread frame
    df_dm_log = (
        pd.concat([df_dm_log_mi, df_dm_rep_mi])
        .reset_index()
        .drop_duplicates(subset=['text', 'ts'], keep='last')
        .reset_index(drop=True)
    )
else:
    # no threads at all: just turn the index levels back into columns
    df_dm_log = df_dm_log_mi.reset_index()

df_dm_log

In [None]:
# cast ts / thread_ts from str to float, then mark every missing value with the string 'None'
timestamp_dtypes = {'ts': float, 'thread_ts': float}
df_dm_log_astype = df_dm_log.astype(timestamp_dtypes)
df_dm_log_astype = df_dm_log_astype.fillna('None')
df_dm_log_astype

In [None]:
# Turn the UNIX timestamps into display strings:
#   * ts        -> separate `date` and `time` lists (later used as MultiIndex levels)
#   * thread_ts -> one 'HH:MM, Day, Mon DD, YYYY' string, written back into the frame
# Conversions use the local timezone of the machine running the notebook.
# Values that are not floats (the 'None' placeholders) are passed through, so
# `date`, `time` and the thread_ts column always stay aligned with the rows.
date = []
time = []
thread_labels = []
for ts_value, thread_value in zip(df_dm_log_astype['ts'], df_dm_log_astype['thread_ts']):
    if isinstance(ts_value, float):  # np.float64 is a float subclass, so it matches too
        posted_at = datetime.datetime.fromtimestamp(ts_value)
        date.append(posted_at.strftime('%a, %b %d, %Y'))
        time.append(posted_at.strftime('%H:%M'))
    else:
        date.append('None')
        time.append('None')

    if isinstance(thread_value, float):
        thread_at = datetime.datetime.fromtimestamp(thread_value)
        thread_labels.append(thread_at.strftime('%H:%M, %a, %b %d, %Y'))
    else:
        # message does not belong to a thread: keep the placeholder as is
        thread_labels.append(thread_value)

# Replace the column by name in one assignment instead of writing strings cell by
# cell through a hard-coded column position.
df_dm_log_astype['thread_ts'] = thread_labels

In [None]:
# prepend the formatted date / time columns and drop the raw ts column they replace
df_datetime = pd.DataFrame(dict(date=date, time=time))
df_dm_log_frt = pd.concat([df_datetime, df_dm_log_astype], axis=1)
df_dm_log_frt = df_dm_log_frt.drop(columns='ts')

In [None]:
# Replace the user ids in the `user` column with the matching real_name.
# The whole column is mapped and assigned at once: element-wise chained assignment
# (df.user[i] = ...) is not guaranteed to write back to the DataFrame, and scanning
# the user list for every row is needlessly quadratic.
id_to_name = dict(user_id_names)
# values without a matching id (e.g. the 'None' placeholder) are left unchanged
df_dm_log_frt['user'] = df_dm_log_frt['user'].map(lambda uid: id_to_name.get(uid, uid))

In [None]:
# Replace user ids that occur inside the message text (mentions) with the real_name.
def _replace_user_ids(text):
    """Return `text` with every known user id substituted by that user's real_name.

    Non-string values are returned unchanged. Users whose real_name is not a
    string (e.g. NaN for accounts without one) are skipped, because str.replace
    would raise TypeError for them.
    """
    if not isinstance(text, str):
        return text
    for user_id, real_name in user_id_names:
        if isinstance(real_name, str) and user_id in text:
            text = text.replace(user_id, real_name)
    return text


# Assign the whole column at once; element-wise chained assignment
# (df['text'][i] = ...) is not guaranteed to write back to the DataFrame.
df_dm_log_frt['text'] = df_dm_log_frt['text'].map(_replace_user_ids)

In [None]:
# Messages that consist only of uploaded file(s) have an empty text field;
# give them an explicit placeholder. A single .loc assignment with a boolean mask
# replaces the row loop, whose chained assignment (df.text[i] = ...) is not
# guaranteed to write back to the DataFrame.
df_dm_log_frt.loc[df_dm_log_frt['text'] == '', 'text'] = 'attached file(s) only'

In [None]:
# Display the frame to check the result of the formatting and replacement steps.
df_dm_log_frt

In [None]:
# index the log by type, thread, date and time so the export is grouped by those levels
index_levels = ['type', 'thread_ts', 'date', 'time']
df_dm_log_frt_mi = df_dm_log_frt.set_index(index_levels)
df_dm_log_frt_mi

In [None]:
# Label embedded in the output file names together with dm_user_name.
# NOTE(review): presumably identifies the account doing the export — confirm.
my_name = 'hk'
# Display the DM partner's name, which is also embedded in the output file names.
dm_user_name

In [None]:
# today's date as YYMMDD, used as a suffix of the output file names
backup_date = datetime.date.today().strftime('%y%m%d')

# DO NOT FORGET TO UPDATE THE YEAR (year of the workspace currently being backed up)
tgt_ws_year = 2022

for value in (backup_date, tgt_ws_year):
    print(value)

In [None]:
# Output locations: formatted logs go under ../DM/<year>, raw dumps under ../DM_full_log/<year>.
path_DM = '../DM/'
path_DM_full = '../DM_full_log/'
path_YEAR = str(tgt_ws_year)

# Create any directory that is missing.
# The directories are created with os.makedirs because the `%mkdir path_DM` magic
# does not expand Python variables: it creates a directory literally named
# "path_DM" (or "path_DM+path_YEAR") in the current working directory, leaving
# the real targets missing for the export cells.
for directory in (path_DM, path_DM_full, path_DM+path_YEAR, path_DM_full+path_YEAR):
    if os.path.exists(directory):
        print('directory already exists')
    else:
        os.makedirs(directory)

In [None]:
# export as an Excel workbook, which preserves the MultiIndex layout and handles text encoding
excel_path = ''.join([
    path_DM, path_YEAR, '/DM_log_', path_YEAR, '_', my_name, '_', dm_user_name, '_', backup_date, '.xlsx',
])
df_dm_log_frt_mi.to_excel(excel_path)

In [None]:
# Save the raw top-level messages (thread replies not included) as a backup.
# log_dm_ms holds the messages of every history page, whereas log_dm is only the
# most recent API response — i.e. just the last page once pagination has run.
# The file is written as UTF-8 JSON so non-ASCII text survives on any platform
# and the dump can be loaded back with json.load.
raw_log_path = path_DM_full+path_YEAR+'/Raw_dm_log_NOrep'+path_YEAR+'_'+my_name+'_'+dm_user_name+'_'+backup_date+'_.dat'
with open(raw_log_path, 'w', encoding='utf-8') as f:
    json.dump(log_dm_ms, f, ensure_ascii=False, indent=2)

In [None]:
# Save the raw thread data as a backup.
# `threads` contains the message list of every fetched thread; `thr` is only the
# last thread response and does not exist at all when no message had replies.
# The file is written as UTF-8 JSON (an empty list when there were no threads).
raw_rep_path = path_DM_full+path_YEAR+'/Raw_dm_rep_log'+path_YEAR+'_'+my_name+'_'+dm_user_name+'_'+backup_date+'_.dat'
with open(raw_rep_path, 'w', encoding='utf-8') as f:
    json.dump(threads, f, ensure_ascii=False, indent=2)