In [1]:
import json
import yaml
import re

from glob import glob

In [2]:
import requests
import pandas as pd

In [3]:
import clipboard

In [4]:
# Emoji lookup table read from CSV; the `code` column becomes the key and the
# `emoji` column the value.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# will not run elsewhere as-is.
df_emojis = pd.read_csv(
    '/home/alexeygrigorev/tmp/datatalksclub.github.io/scripts/emojis.csv'
)
emoji_map = {
    code: emoji
    for code, emoji in zip(df_emojis['code'], df_emojis['emoji'])
}

In [5]:
def repl_user_callback(match):
    """Replace a Slack user mention with the user's name.

    Intended as the replacement callback for ``user_pattern.sub``: group 1 of
    ``match`` is the text between ``<@`` and ``>``.

    Returns the ``'name'`` entry of the matching record in the module-level
    ``users`` dict. If the id is not in ``users``, a message is printed and
    the mention is left as it was (or replaced by its ``|label`` part when one
    is present), so one unknown user does not abort the whole conversion.
    """
    # A mention may carry a label after a pipe (<@U123|name>); only the part
    # before the pipe is the user id.
    user_id, _, label = match.group(1).partition('|')
    user = users.get(user_id)
    if user is None:
        print('cannot find user %s' % user_id)
        return label or match.group(0)
    return user['name']

# Patterns for the Slack markup that prepare_text rewrites.

# <@USERID> mention; group 1 is the text between "<@" and ">".
user_pattern = re.compile(r'<@(.+?)>')
# <URL|label> link; group 1 is the URL, group 2 the label. The URL part must
# not run across '>' or '|': with a plain `.+?` a bare <http://a> followed
# later on the same line by <http://b|label> was matched as ONE link that
# started at the first '<'.
link_pattern_text = re.compile(r'<(http[^|>\n]+)\|(.+?)>')
# <URL> link without a label; group 1 is the URL.
link_pattern = re.compile(r'<(http.+?)>')
# :code: emoji shortcode; group 1 is the code.
# NOTE(review): the optional second group only matches a ONE-character code
# (e.g. ':x:') directly after the first emoji, and the callback discards it.
# If it was meant to swallow modifiers such as ':skin-tone-2:' it needs a
# wider pattern - confirm the intent before changing it.
emoji_pattern = re.compile(r':([-+0-9_a-z]+):(:[-+0-9_a-z]:)?')

def replace_emoji_callback(match):
    """Replace an emoji shortcode with its value from ``emoji_map``.

    Intended as the replacement callback for ``emoji_pattern.sub``: group 1 of
    ``match`` is the code between the colons.

    Returns ``emoji_map[code]`` when the code is known. Otherwise prints a
    'cannot find' message and returns the matched text unchanged.
    """
    code = match.group(1)
    if code in emoji_map:
        # NOTE(review): anything captured by the pattern's optional trailing
        # group is dropped here - presumably intentional; confirm.
        return emoji_map[code]
    print('cannot find %s' % code)
    # Return the whole match, not just ':code:', so that text captured by the
    # pattern's optional trailing group is not silently deleted on a miss.
    return match.group(0)

def prepare_text(text):
    """Normalise raw Slack message text for the archive.

    Steps, in order:
      1. replace non-breaking spaces with spaces, bullet characters with '-',
         and each blank line (two consecutive newlines) with a single newline;
      2. replace user mentions via ``repl_user_callback``;
      3. replace emoji shortcodes via ``replace_emoji_callback``;
      4. rewrite ``<url|label>`` as ``[label](url)``;
      5. rewrite the remaining ``<url>`` as ``[url](url)``.

    Returns the rewritten string.
    """
    for old, new in (('\xa0', ' '), ('•', '-'), ('\n\n', '\n')):
        text = text.replace(old, new)

    # Order matters: labelled links must be rewritten before bare links,
    # because the bare-link pattern would also match "<url|label>".
    substitutions = (
        (user_pattern, repl_user_callback),
        (emoji_pattern, replace_emoji_callback),
        (link_pattern_text, r'[\2](\1)'),
        (link_pattern, r'[\1](\1)'),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)

    return text

In [6]:
def load_docs(files):
    """Load several JSON files and concatenate their contents.

    Args:
        files: iterable of paths; each file must contain a JSON array.

    Returns:
        One flat list with the elements of every array, in file order.
    """
    all_docs = []

    for path in files:
        # JSON text is UTF-8 by specification; pass the encoding explicitly
        # instead of relying on the platform default, which is not UTF-8 on
        # every system and would garble or reject non-ASCII message text.
        with open(path, encoding='utf-8') as f_in:
            all_docs.extend(json.load(f_in))

    return all_docs

In [7]:
def clean_user(d):
    """Reduce a user record to the two fields the archive needs.

    Args:
        d: user record with a ``'profile'`` dict containing
           ``'display_name'``, ``'real_name'`` and ``'image_72'``.

    Returns:
        ``{'name': ..., 'image': ...}`` where ``name`` is the display name,
        or the real name when the display name is empty, and ``image`` is the
        profile's ``'image_72'`` value.
    """
    profile = d['profile']
    display_name = profile['display_name']
    # Fall back to the real name when no display name is set.
    chosen = profile['real_name'] if len(display_name) == 0 else display_name
    return dict(name=chosen, image=profile['image_72'])

In [8]:
# Read the raw user records from the dump.
# JSON text is UTF-8 by specification; the encoding is passed explicitly so
# non-ASCII names load correctly regardless of the platform default.
with open('dump/users.json', encoding='utf-8') as f_in:
    all_users = json.load(f_in)

In [9]:
# Lookup table: user id -> cleaned {'name', 'image'} record.
users = dict((record['id'], clean_user(record)) for record in all_users)

In [10]:
# Every JSON export of the channel, in lexicographic order of the path.
all_files = glob('./dump/book-of-the-week/*.json')
all_files.sort()

In [11]:
# Display the sorted file list to check what the glob picked up.
all_files

['./dump/book-of-the-week/2020-12-08.json',
 './dump/book-of-the-week/2020-12-09.json',
 './dump/book-of-the-week/2020-12-10.json',
 './dump/book-of-the-week/2020-12-11.json',
 './dump/book-of-the-week/2020-12-12.json',
 './dump/book-of-the-week/2020-12-13.json',
 './dump/book-of-the-week/2020-12-14.json',
 './dump/book-of-the-week/2020-12-15.json',
 './dump/book-of-the-week/2020-12-16.json',
 './dump/book-of-the-week/2020-12-17.json',
 './dump/book-of-the-week/2020-12-18.json',
 './dump/book-of-the-week/2020-12-20.json',
 './dump/book-of-the-week/2020-12-22.json',
 './dump/book-of-the-week/2021-01-09.json',
 './dump/book-of-the-week/2021-01-10.json',
 './dump/book-of-the-week/2021-01-11.json',
 './dump/book-of-the-week/2021-01-12.json',
 './dump/book-of-the-week/2021-01-13.json',
 './dump/book-of-the-week/2021-01-14.json',
 './dump/book-of-the-week/2021-01-15.json',
 './dump/book-of-the-week/2021-01-17.json',
 './dump/book-of-the-week/2021-01-18.json',
 './dump/book-of-the-week/2021-0

In [12]:
# Load the contents of every export file into one flat list.
all_messages_docs = load_docs(all_files)

In [76]:
# Export files whose top-level messages go into the archive snippet.
# This list is edited by hand for each run.
question_files = [
    './dump/book-of-the-week/2021-08-15.json',
    './dump/book-of-the-week/2021-08-16.json',
    './dump/book-of-the-week/2021-08-17.json',
    './dump/book-of-the-week/2021-08-18.json',
    './dump/book-of-the-week/2021-08-19.json',
    './dump/book-of-the-week/2021-08-20.json',
]

In [77]:
# Messages from the selected files only.
question_messages_docs = load_docs(question_files)

# Messages without 'parent_user_id' are treated as top-level posts.
top_messages = [
    msg for msg in question_messages_docs if 'parent_user_id' not in msg
]
# Replies are taken from the FULL dump, not only the selected files.
thread_replies = [
    msg for msg in all_messages_docs if 'parent_user_id' in msg
]
# Index of every message in the full dump, keyed by (user, ts).
replies_idx = dict(
    ((msg['user'], msg['ts']), msg) for msg in all_messages_docs
)

In [78]:
# Drop messages whose subtype is 'thread_broadcast' or 'channel_join';
# messages with no subtype at all are kept.
top_messages = list(filter(
    lambda msg: msg.get('subtype') not in ('thread_broadcast', 'channel_join'),
    top_messages,
))

In [79]:
def _render_reply(ref):
    """Build the {'name', 'text'} entry for one reply reference.

    ``ref`` is an item of a top message's 'replies' list; the full reply is
    looked up in ``replies_idx`` by its (user, ts) pair.
    """
    message = replies_idx[(ref['user'], ref['ts'])]
    return {
        'name': users[ref['user']]['name'],
        'text': prepare_text(message['text']).strip(),
    }


# One entry per top-level message: author name, cleaned text, cleaned replies.
threads = []

for top_message in top_messages:
    # Messages posted by Slackbot are left out of the archive.
    if top_message['user'] == 'USLACKBOT':
        continue

    threads.append({
        'name': users[top_message['user']]['name'],
        'text': prepare_text(top_message['text']).strip(),
        'replies': [
            _render_reply(ref) for ref in top_message.get('replies', [])
        ],
    })

In [80]:
# Serialise the threads under a top-level 'archive' key. sort_keys=False is
# passed so keys are not reordered alphabetically.
# NOTE(review): with these settings non-ASCII characters come out as escapes
# (the printed snippet contains "\u2019"); presumably allow_unicode=True
# would keep them readable - confirm before changing.
yaml_snippet = yaml.dump({'archive': threads}, sort_keys=False)

In [81]:
# Print the generated YAML for a visual check.
print(yaml_snippet)

archive:
- name: Alexey Grigorev
  text: "Hello, everyone!\nThe book of this week is [Tuning Up](https://datatalks.club/books/20210816-tuning-up.html)\
    \ by David Sweet\n&gt; Tuning Up: From A/B testing to Bayesian optimization is\
    \ a toolbox for optimizing machine learning systems, quantitative trading strategies,\
    \ and more. You\u2019ll start with a deep dive into tests like A/B testing, and\
    \ then graduate to advanced techniques used to measure performance in highly competitive\
    \ industries like finance and social media. The tests in this unique, practical\
    \ guide will quickly reveal which approaches and features deliver real results\
    \ for your business.\n- Ask as many questions as you'd like (one question - one\
    \ thread, please)\n- The book author answers questions from Monday till Thursday\n\
    - On Friday, the author decides who wins free copies of their book"
  replies: []
- name: WingCode
  text: 'Hi David Sweet, nice to meet you!

    S

In [82]:
# Copy the YAML text to the clipboard (presumably for pasting into the
# site's data files - TODO confirm).
clipboard.copy(yaml_snippet)