In [None]:
import json
import re
import glob
import os 

# Root directory of the Slack export. users.json, channels.json and the
# per-channel message folders are all opened relative to this path.
data_path = "../data/slack/slack_export_Janelia-Software_30days"

In [None]:
# Lookup tables keyed by the user's 'id' field from users.json:
# one for the short handle ('name'), one for the profile's 'real_name'.
id2username = {}
id2realname = {}

# Decode explicitly as UTF-8 so that non-ASCII names load the same way on
# every platform instead of depending on the locale's default encoding.
with open(f"{data_path}/users.json", 'r', encoding='utf-8') as f:
    users = json.load(f)

# The loop variable is `user_id`, not `id`, so the builtin id() is not
# shadowed for every cell that runs after this one.
for user in users:
    user_id = user['id']
    id2username[user_id] = user['name']
    id2realname[user_id] = user['profile']['real_name']

print(f"{len(id2username)} users")

In [None]:
# Map each channel's 'name' to its 'id', echoing every pair as it is read.
channel2id = {}

# Decode explicitly as UTF-8 rather than relying on the platform's default
# text encoding, which is not UTF-8 everywhere.
with open(f"{data_path}/channels.json", 'r', encoding='utf-8') as f:
    channels = json.load(f)

for channel in channels:
    print(f"{channel['id']} {channel['name']}")
    channel2id[channel['name']] = channel['id']

In [None]:
# Quick sanity check of NLTK's sentence splitter on a sample that mixes an
# abbreviation ("U.S.") with statements and questions.
import nltk
# Download the 'punkt' resource before tokenizing.
# NOTE(review): newer NLTK releases appear to load the sentence tokenizer from
# a 'punkt_tab' resource instead - confirm against the installed version.
nltk.download('punkt')
msg = "Well, this is a sentence. And the U.S. is a country. But is this a question? What about if I mention the U.S.?"
# Last expression of the cell: the list of sentences is shown as the cell output.
nltk.tokenize.sent_tokenize(msg)


In [None]:
from decimal import *
from nltk import tokenize

# Message subtypes that parse_message skips (membership bookkeeping, not content).
ignored_subtypes = {'channel_join', 'channel_leave'}

def fix_text(text):
    """Undo Slack's entity escaping and collapse runs of newlines.

    Slack escapes exactly three characters in message text: ``&`` as
    ``&amp;``, ``<`` as ``&lt;`` and ``>`` as ``&gt;``. All three are
    restored here.

    Args:
        text: Message text as a string.

    Returns:
        The text with the three entities decoded and every run of
        consecutive newlines reduced to a single newline.
    """
    # Plain substring replacement: the entities are literals, no regex needed.
    text = text.replace("&lt;", "<").replace("&gt;", ">")
    # Decode '&amp;' last so that an escaped literal such as "&amp;lt;"
    # becomes "&lt;" and is not unescaped a second time into "<".
    text = text.replace("&amp;", "&")
    return re.sub("\n+", "\n", text)

def get(element, key):
    """Look up ``key`` in ``element`` without raising.

    Returns ``element[key]`` when ``element`` is truthy and contains
    ``key``; otherwise returns None (this includes ``element`` being None
    or empty).
    """
    if not element or key not in element:
        return None
    return element[key]


def extract_text(elements):
    """Flatten a list of block elements into one plain-text string.

    Each element is a mapping. If it has an 'elements' key, its children are
    flattened recursively first; then the element's own 'type' decides what
    is appended:

    * 'text'  - the 'text' value, wrapped in backticks when its 'style'
      has a truthy 'code' entry;
    * 'link'  - the 'url' value;
    * 'rich_text_preformatted' - a newline (after the children's text);
    * 'user'  - the real name from ``id2realname``, or the raw 'user_id'
      when the ID is not in that table.

    Elements of any other type contribute nothing of their own.
    """
    out = ''
    for node in elements:
        # Children come first so a container's own suffix lands after them.
        if 'elements' in node:
            out += extract_text(node['elements'])

        kind = get(node, 'type')
        if kind == 'text':
            tick = '`' if get(get(node, 'style'), 'code') else ''
            out += tick + node['text'] + tick
        elif kind == 'link':
            out += get(node, 'url')
        elif kind == 'rich_text_preformatted':
            out += "\n"
        elif kind == 'user':
            mentioned = node['user_id']
            # Unknown IDs are kept verbatim rather than dropped.
            out += id2realname.get(mentioned, mentioned)

    return out


def parse_message(message):
    """Render one exported message record as an "<author> said: <text>" line.

    Args:
        message: A mapping loaded from a channel's JSON export file.

    Returns:
        A string of the form ``"{author} said: {text}"`` terminated by a
        newline, or None when the record's 'type' is not 'message' or its
        'subtype' is listed in ``ignored_subtypes``.

    The author is resolved from ``id2realname`` first, then from the record's
    own 'user_profile' ('display_name', then 'real_name'), then 'username',
    then the raw user ID, so records that lack some of these keys no longer
    raise KeyError. User mentions in the text (``<@ID>`` or ``<@ID|label>``)
    are replaced by the real name, falling back to the bare ID for users
    missing from ``id2realname`` - the same fallback ``extract_text`` uses.
    """
    if get(message, 'type') != 'message':
        return None
    if get(message, 'subtype') in ignored_subtypes:
        return None

    msg_user = get(message, 'user')
    profile = get(message, 'user_profile')
    # `or` chain: an empty string at one level falls through to the next.
    realname = (
        id2realname.get(msg_user)
        or get(profile, 'display_name')
        or get(profile, 'real_name')
        or get(message, 'username')
        or msg_user
        or 'unknown'
    )

    if 'blocks' in message:
        text = extract_text(message['blocks'])
    else:
        text = get(message, 'text') or ''

    def mention_to_name(match):
        # Group 1 is the user ID only; an optional "|label" part is discarded.
        user_id = match.group(1)
        return id2realname.get(user_id, user_id)

    text_msg = re.sub(r"<@([^>|]+)(?:\|[^>]*)?>", mention_to_name, text)
    text_msg = fix_text(text_msg)

    return f"{realname} said: {text_msg}\n"
            
def parse_questions(msg):
    """Return the sentences of ``msg`` that end with a question mark.

    Args:
        msg: Text to scan, or None / empty string (parse_message returns
            None for records it skips).

    Returns:
        A list of sentences, in order, as produced by
        ``tokenize.sent_tokenize``; empty when ``msg`` is None or empty.
    """
    # Nothing to tokenize; also avoids handing None to the tokenizer.
    if not msg:
        return []
    return [
        sentence
        for sentence in tokenize.sent_tokenize(msg)
        if sentence.endswith("?")
    ]
    

def parse_channel(channel_name):
    """Collect the questions asked in one channel of the export.

    Every ``*.json`` file under ``{data_path}/{channel_name}/`` is read in
    sorted filename order. Each record is rendered with ``parse_message``;
    records it skips (None) are ignored. Each question found by
    ``parse_questions`` is printed and collected.

    Args:
        channel_name: A channel name present in ``channel2id``.

    Returns:
        A new list of the question sentences found, in reading order. The
        list is local to this call, so the caller can safely append its items
        to any module-level accumulator.

    Raises:
        KeyError: If ``channel_name`` is not a key of ``channel2id``.
    """
    if channel_name not in channel2id:
        raise KeyError(channel_name)

    questions = []
    # sorted(): glob order is filesystem-dependent; sorting makes runs repeatable.
    for messages_file in sorted(glob.glob(f"{data_path}/{channel_name}/*.json")):
        with open(messages_file, 'r', encoding='utf-8') as f:
            channel_messages = json.load(f)

        for message in channel_messages:
            msg = parse_message(message)
            if not msg:
                # Skipped record (wrong type or ignored subtype).
                continue
            for q in parse_questions(msg):
                print(q)
                questions.append(q)

    return questions

# Flatten the per-channel results into one list.
# `documents` is bound before the loop so the name already exists by the time
# parse_channel is first called.
documents = []
for channel_name in channel2id.keys():
    # Each item yielded by parse_channel's return value is appended individually.
    for doc in parse_channel(channel_name):
        documents.append(doc)