In [None]:
import datetime
import json
import logging
import os
import urllib
import urllib.request
from collections import OrderedDict

import numpy as np
import openpyxl
import pandas as pd
from slack_bolt import App
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError

In [None]:
# Logger named after this module's __name__; none of the visible cells write to it.
logger = logging.getLogger(__name__)

In [None]:
# Look up the 'http' entry of the proxies urllib detects for this machine
# (None when there is no such entry) and hand it to both Slack clients.
# NOTE: this attribute access needs the `urllib.request` submodule to be imported;
# a bare `import urllib` does not guarantee that.
proxy = urllib.request.getproxies().get('http')
# NOTE(review): if the proxy URL embeds credentials, this print leaks them into the
# saved notebook output -- confirm before sharing.
print(proxy)
# `client` authenticates with the token in BOT_TOKEN_HK, `uclient` with the one in
# USER_TOKEN_HK; a missing environment variable raises KeyError here.
client = WebClient(token=os.environ['BOT_TOKEN_HK'], proxy=proxy)
uclient = WebClient(token=os.environ['USER_TOKEN_HK'], proxy=proxy)

In [None]:
# Get info on the channels in the workspace.
# conversations.list is cursor-paginated: a single call returns only the first page,
# so keep requesting pages until Slack stops returning a next_cursor.
ws_info = client.conversations_list()
ws_info_ch = list(ws_info.get('channels'))
next_cursor = (ws_info.get('response_metadata') or {}).get('next_cursor')
while next_cursor:
    ws_info_page = client.conversations_list(cursor=next_cursor)
    ws_info_ch.extend(ws_info_page.get('channels'))
    next_cursor = (ws_info_page.get('response_metadata') or {}).get('next_cursor')

# One row per channel; nested keys are flattened to dotted column names (e.g. 'topic.value').
df_ch_info = pd.json_normalize(ws_info_ch)
print(df_ch_info)
print(df_ch_info.columns)

In [None]:
# Keep only the basic channel columns, in this order.
# A column that is absent from the API result is created and filled with NaN.
reindex_col_ch = ['id', 'name', 'num_members', 'topic.value', 'purpose.value']
df_ch_info = df_ch_info.reindex(reindex_col_ch, axis='columns')

In [None]:
# List of (channel id, channel name) tuples, one per channel row.
ch_id_name = list(zip(df_ch_info['id'], df_ch_info['name']))

In [None]:
# Two-column lookup table (id, name) used to pick the channel to back up.
df_ch_id_name = df_ch_info.loc[:, ['id', 'name']]
print(df_ch_id_name)

In [None]:
# Choose the channel to back up by its row label in df_ch_id_name.
ch_idx = 0
ch_id, ch_name = df_ch_id_name.loc[ch_idx, ['id', 'name']]
ch_id

In [None]:
# Get the channel's message log (first page).
ch_log = uclient.conversations_history(channel=ch_id)
ch_ms_log = ch_log['messages']

# conversations.history is paginated: while Slack reports more messages, follow the
# cursor and collect every page into ch_ms_log. `i` counts the extra pages fetched.
# After the loop `ch_log` is the LAST response received, not the whole history.
i = 0
while ch_log['has_more']:
    ch_log = uclient.conversations_history(
        channel=ch_id,
        cursor=ch_log['response_metadata']['next_cursor'],
    )
    ch_ms_log.extend(ch_log['messages'])
    i += 1
print(i)
print(len(ch_ms_log))

# Build the frame once from the complete list. The previous per-page
# DataFrame.append no longer exists in pandas >= 2.0 and was quadratic anyway.
df_ch_ms_log = pd.json_normalize(ch_ms_log)
df_ch_ms_log

In [None]:
# Restrict the message log to the columns used downstream (missing ones become NaN)
# and renumber the rows 0..n-1.
reindex_col = ['type', 'ts', 'user', 'text', 'subtype', 'reply_count', 'reply_users_count', 'thread_ts', 'topic']
df_ch_ms_log = df_ch_ms_log.reindex(columns=reindex_col)
df_ch_ms_log = df_ch_ms_log.reset_index(drop=True)
df_ch_ms_log

In [None]:
# One entry per message, in message order: the FIRST attached file's metadata,
# or a placeholder dict when the message has no 'files' key.
files = [
    message['files'][0] if 'files' in message
    else {'name': 'None', 'url_private_download': 'None'}
    for message in ch_ms_log
]

In [None]:
# Turn the per-message file metadata into a two-column frame (FileName, FileURL).
file_columns = {'name': 'FileName', 'url_private_download': 'FileURL'}
df_ch_files = pd.json_normalize(files)[list(file_columns)].rename(columns=file_columns)
df_ch_files

In [None]:
# Put the file columns next to the message columns (rows are aligned by position),
# order by timestamp, then drop 'thread_broadcast' messages.
# The filter may be redundant because duplicates are dropped later, but it is kept
# deliberately so that removing it cannot introduce a regression.
df_ch_log = (
    pd.concat([df_ch_ms_log, df_ch_files], axis=1)
    .sort_values(by='ts')
    .reset_index(drop=True)
    .query('subtype != "thread_broadcast"')
)

# Same data indexed by (type, thread_ts, ts) for the later merge with replies.
df_ch_log_mi = df_ch_log.set_index(['type', 'thread_ts', 'ts'])
df_ch_log_mi

In [None]:
# Fetch the thread of every parent message so that replies can be associated
# with their parent. A row is a parent when its reply_count is not NaN.
threads = []
is_parent = df_ch_log['reply_count'].notna()
for parent_thread_ts in df_ch_log.loc[is_parent, 'thread_ts']:
    thr = uclient.conversations_replies(channel=ch_id, ts=parent_thread_ts)
    threads.append(thr.get('messages'))

In [None]:
# Flatten the list of threads into one list of messages.
reps = [message for thread in threads for message in thread]
print(len(reps))

In [None]:
# Merge the channel messages (which contain no replies) with the thread messages
# (which contain each parent message plus its replies).
if reps:
    rep_columns = ['type', 'ts', 'user', 'text', 'reply_count', 'reply_users_count', 'thread_ts']
    df_ch_rep = (
        pd.json_normalize(reps)[rep_columns]
        .sort_values('ts')
        .reset_index(drop=True)
    )
    # Mark every message that came from a thread request.
    df_ch_rep['type'] = 'thread'
    df_ch_rep_mi = df_ch_rep.set_index(['type', 'thread_ts', 'ts'])

    # Parent messages now appear twice (channel log + thread). keep='last' retains
    # the thread copy because the thread rows are concatenated after the log rows.
    df_ch_log = (
        pd.concat([df_ch_log_mi, df_ch_rep_mi])
        .reset_index()
        .drop_duplicates(subset=['text', 'ts'], keep='last')
        .reset_index(drop=True)
    )
else:
    # No threads at all: just flatten the index back into columns.
    df_ch_log = df_ch_log_mi.reset_index()

df_ch_log

In [None]:
# Cast ts and thread_ts from str to float, then replace every remaining NaN
# in the frame with the string 'None'.
ts_dtypes = {'ts': float, 'thread_ts': float}
df_ch_log_astype = df_ch_log.astype(ts_dtypes)
df_ch_log_astype = df_ch_log_astype.fillna('None')
df_ch_log_astype

In [None]:
# Convert the UNIX timestamps into readable text (local time zone):
#   * `date` / `time` lists hold the formatted date and time of each row's ts,
#   * thread_ts is overwritten in place with a formatted date-time string.
date = []
time = []

# thread_ts mixes floats with the 'None' placeholder. Force object dtype so the
# formatted strings can be stored even when the column happens to be all-float.
df_ch_log_astype['thread_ts'] = df_ch_log_astype['thread_ts'].astype(object)

for i in df_ch_log_astype.index:
    # Cells are addressed by row label and column NAME. The previous iloc[i, 1]
    # used the label as a position and relied on thread_ts being column 1.
    ts_value = df_ch_log_astype.at[i, 'ts']
    if isinstance(ts_value, float):  # np.float64 is a float subclass
        ts_local = datetime.datetime.fromtimestamp(ts_value)

        # create list of date
        date.append(ts_local.strftime('%a, %b %d, %Y'))
        print('finished formatting ts to date')

        # create list of time
        time.append(ts_local.strftime('%H:%M'))
        print('finished formatting ts to time')

    # format thread_ts to datetime; rows without a thread hold the string 'None'
    thread_ts_value = df_ch_log_astype.at[i, 'thread_ts']
    if isinstance(thread_ts_value, float):
        dtime = datetime.datetime.fromtimestamp(thread_ts_value)
        df_ch_log_astype.at[i, 'thread_ts'] = dtime.strftime('%H:%M, %a, %b %d, %Y')
        print ('finished formatting thread_ts')
    else:
        print('thread_ts is not found')

In [None]:
# Prepend the formatted date/time columns to the log and drop the raw ts column.
df_datetime = pd.DataFrame({'date': date, 'time': time})
df_with_datetime = pd.concat([df_datetime, df_ch_log_astype], axis=1)
df_ch_log_frt = df_with_datetime.drop('ts', axis=1)
df_ch_log_frt

In [None]:
# Fetch the workspace member list and flatten it into one row per member.
users_response = client.users_list()
users_info = users_response.get('members')
df_users_info = pd.json_normalize(users_info)

In [None]:
# (user id, real_name) pairs, both as a list of tuples and as a DataFrame.
user_id_names = list(zip(df_users_info['id'], df_users_info['real_name']))
df_user_id_names = pd.DataFrame(user_id_names, columns=['id', 'real_name'])
df_user_id_names

In [None]:
# Replace the user id in the `user` column with that member's real_name.
# Values that are not a known user id (e.g. the 'None' placeholder) are left as is.
# The whole column is assigned at once: the previous chained form
# `df.user[i] = ...` does not write back to the frame under pandas Copy-on-Write,
# and it scanned every user for every row.
id_to_real_name = dict(user_id_names)
df_ch_log_frt['user'] = df_ch_log_frt['user'].map(
    lambda user: id_to_real_name.get(user, user)
)

In [None]:
# Replace user ids that appear inside the message text with the member's real_name.
def _replace_user_ids(text):
    """Return `text` with every known user id substituted by its real_name.

    Ids are substituted in the order of `user_id_names`. Non-string texts are
    returned unchanged, and ids whose real_name is not a string (e.g. NaN) are
    skipped, because str.replace would raise TypeError on them.
    """
    if not isinstance(text, str):
        return text
    for user_id, real_name in user_id_names:
        if isinstance(real_name, str) and user_id in text:
            text = text.replace(user_id, real_name)
    return text


# Assign the whole column: chained `df['text'][i] = ...` does not write back to
# the frame under pandas Copy-on-Write.
df_ch_log_frt['text'] = df_ch_log_frt['text'].map(_replace_user_ids)

In [None]:
# Messages that only carry an upload have an empty text field; label them.
# A single .loc assignment replaces the row loop, whose chained
# `df.text[i] = ...` does not write back to the frame under pandas Copy-on-Write.
is_empty_text = df_ch_log_frt['text'] == ''
df_ch_log_frt.loc[is_empty_text, 'text'] = 'attached file(s) only'

In [None]:
# Display the formatted log to check the result of the date formatting and
# of the user id / empty-text replacements by eye.
df_ch_log_frt

In [None]:
# Index the formatted log by (type, thread_ts, date, time); this MultiIndex is
# what gets written to the Excel file.
index_levels = ['type', 'thread_ts', 'date', 'time']
df_ch_log_frt_mi = df_ch_log_frt.set_index(index_levels)
df_ch_log_frt_mi

In [None]:
# Today's date as YYMMDD; used as a suffix of the output file names.
backup_date = datetime.date.today().strftime('%y%m%d')

# DO NOT FORGET TO MODIFY THE YEAR (year of the workspace currently being backed up)
tgt_ws_year = 2022

for value in (backup_date, ch_name):
    print(value)

In [None]:
# Output locations: formatted per-channel logs and raw ("full") logs, each with
# one sub-directory per workspace year.
path_ch = '../channel/'
path_ch_full = '../ch_full_log/'
path_YEAR = str(tgt_ws_year)

# Create whatever is missing. The previous `%mkdir path_ch` magic did not expand
# the Python variable: it created directories literally named "path_ch",
# "path_ch+path_YEAR", ... in the working directory, so the later writes had no
# target directory. os.makedirs also creates missing parents.
for out_dir in (path_ch, path_ch_full, path_ch + path_YEAR, path_ch_full + path_YEAR):
    if os.path.exists(out_dir):
        print('directory already exists')
    else:
        os.makedirs(out_dir)

In [None]:
# Export to Excel: this keeps the MultiIndex layout and leaves the text encoding
# to the writer.
excel_path = f"{path_ch}{path_YEAR}/log_ch_{path_YEAR}_{ch_name}_{backup_date}.xlsx"
df_ch_log_frt_mi.to_excel(excel_path)

In [None]:
# Save the raw channel messages (replies not included) as a safety copy.
# `ch_ms_log` is written instead of `ch_log`: after pagination `ch_log` is only the
# last page fetched, while `ch_ms_log` accumulates the messages of every page.
# UTF-8 is set explicitly so emoji / non-ASCII text cannot fail on a platform
# whose default encoding is narrower.
raw_log_path = path_ch_full+path_YEAR+'/Raw_ch_log_NOrep_'+path_YEAR+'_'+ch_name+'_'+backup_date+'.dat'
with open(raw_log_path, 'w', encoding='utf-8') as f:
    print(ch_ms_log, file=f)

In [None]:
# Save the raw thread messages (each parent message with its replies).
# `threads` is written instead of `thr`: `thr` is only the response for the LAST
# thread requested and does not exist at all when no message has replies, whereas
# `threads` holds the messages of every thread (an empty list when there are none).
# UTF-8 is set explicitly so emoji / non-ASCII text cannot fail on a platform
# whose default encoding is narrower.
raw_rep_path = path_ch_full+path_YEAR+'/Raw_ch_rep_log_'+path_YEAR+'_'+ch_name+'_'+backup_date+'.dat'
with open(raw_rep_path, 'w', encoding='utf-8') as f:
    print(threads, file=f)