In [119]:
import json
import re
import glob
import os 

data_path = "../data/slack/slack_export_Janelia-Software_30days"

In [120]:
# Lookup tables from Slack user ID to username and to real name,
# filled from the export's users.json.
id2username = {}
id2realname = {}

# Be explicit about the encoding so that loading does not depend on the
# platform's default locale encoding (real names may contain non-ASCII text).
with open(f"{data_path}/users.json", 'r', encoding='utf-8') as f:
    users = json.load(f)

# Use `user_id` rather than `id` so the builtin id() is not shadowed for the
# rest of the kernel session.
for user in users:
    user_id = user['id']
    id2username[user_id] = user['name']
    id2realname[user_id] = user['profile']['real_name']

print(f"{len(id2username)} users")

170 users


In [121]:
# Map channel name -> channel ID, listing every channel found in channels.json.
channel2id = {}

# Explicit encoding: do not rely on the platform's default locale encoding.
with open(f"{data_path}/channels.json", 'r', encoding='utf-8') as f:
    channels = json.load(f)

# The file is fully parsed above, so it no longer needs to stay open here.
for channel in channels:
    print(f"{channel['id']} {channel['name']}")
    channel2id[channel['name']] = channel['id']

C011TMUB3UP python
C011W6YDV99 random
C0128K68NE5 general
C013E4ULBFU storage
C013EB3CZM1 scientific-visualization
C01430CRBHT git-github
C0146BJ38PQ wednesday_web_workshop
C015MJGSM2S julia
C01H5PYR4TW hpc
C01J3KE45LG rust
C02CTFPCTDM cplusplus
C02HDABKNAE code-review
C02K818Q3B6 java
C031U6KUMNU how-to
C032XSC2CJC image_benchmarks
C03DJGPC69K programming_languages
C03S782CCMD architecture
C041XB9U8BX applied-deep-learning
C045UGQB4LX globus
C049U3BDYPL mastodon
C04UUTQVB61 wiki-improvement
C057Z7J7F29 easi-fish-pipeline
C02K252DJ86 chromatix


In [122]:
# Smoke test: replace each "<@USER_ID>" mention with that user's real name
# taken from id2realname. An ID that is missing from id2realname raises
# KeyError here.
re.sub("<@(.*?)>", lambda m: id2realname[m.group(1)], "<@W97623DK2> has joined <@W97623DK2>")

'Tiago Ferreira has joined Tiago Ferreira'

In [138]:
from decimal import *
from llama_index import Document

# Message subtypes that parse_message skips entirely.
ignored_subtypes = {'channel_join', 'channel_leave'}


def fix_text(text):
    """Decode Slack's escaped control characters and collapse blank lines.

    Replaces the entities "&lt;", "&gt;" and "&amp;" with "<", ">" and "&",
    then squeezes every run of consecutive newlines down to a single newline.

    Args:
        text: The message text to clean up.

    Returns:
        The cleaned-up text.
    """
    text = text.replace("&lt;", "<").replace("&gt;", ">")
    # "&amp;" has to be decoded last: a literal "&lt;" typed by a user arrives
    # as "&amp;lt;" and must come out as "&lt;", not as "<".
    text = text.replace("&amp;", "&")
    return re.sub("\n+", "\n", text)

def get(element, key):
    """Return element[key], or None when it cannot be looked up.

    None is returned when `element` is falsy (for example None or an empty
    dict) or when `key` is not contained in `element`.
    """
    if not element or key not in element:
        return None
    return element[key]

def extract_text(elements):
    """Flatten a list of Slack block elements into one plain-text string.

    For each element, the text of its nested 'elements' list (if any) is
    added first, followed by a contribution chosen by the element's 'type':

    - 'text': the element's text, wrapped in backticks when its style has
      a truthy 'code' entry
    - 'link': the element's 'url'
    - 'rich_text_preformatted': a newline
    - 'user': the real name for 'user_id' from id2realname; when the ID is
      unknown an error line is printed and the raw ID is used instead

    Elements of any other type add nothing beyond their nested elements.
    """
    text = ''
    for element in elements:
        # Depth-first: nested content comes before the element's own text
        if 'elements' in element:
            text += extract_text(element['elements'])

        el_type = get(element, 'type')
        if el_type == 'text':
            # Same marker on both sides of inline code, nothing otherwise
            tick = '`' if get(get(element, 'style'), 'code') else ''
            text += tick + element['text'] + tick
        elif el_type == 'link':
            text += get(element, 'url')
        elif el_type == 'rich_text_preformatted':
            text += "\n"
        elif el_type == 'user':
            user_id = element['user_id']
            if user_id in id2realname:
                text += id2realname[user_id]
            else:
                print(f"ERROR: no such user {user_id}")
                text += user_id

    return text

def parse_message(message):
    """Convert one exported Slack message into a (thread_id, text) pair.

    Args:
        message: One entry (a dict) from a channel's exported JSON file.

    Returns:
        A tuple ``(thread_id, text_msg)``. ``thread_id`` is the Decimal value
        of the message's 'thread_ts', or of its own 'ts' when 'thread_ts' is
        absent or empty. ``text_msg`` has the form "<name> said: <text>\\n",
        where the text also includes attachment titles/texts, file names in
        angle brackets, and a summary of reactions when those are present.
        Both values are None when the entry's type is not 'message' or its
        subtype is listed in ignored_subtypes.

    Raises:
        KeyError: If 'ts' or 'user' is missing, or if 'text' is missing on a
            message that has no 'blocks'.
    """

    def _mention_name(match):
        # Unknown IDs are reported and left as the raw ID (the same fallback
        # extract_text uses) instead of aborting the whole channel with a
        # KeyError.
        user_id = match.group(1)
        try:
            return id2realname[user_id]
        except KeyError:
            print(f"ERROR: no such user {user_id}")
            return user_id

    if get(message, 'type') != 'message':
        return None, None
    # get() yields None when there is no subtype, which is never in the set
    if get(message, 'subtype') in ignored_subtypes:
        return None, None

    ts = message['ts']
    thread_ts = get(message, 'thread_ts') or ts

    msg_user = message['user']
    try:
        realname = id2realname[msg_user]
    except KeyError:
        # Author is not in users.json: fall back to the profile embedded in
        # the message, and finally to the raw user ID, so that a missing or
        # empty profile does not raise or produce a nameless line.
        profile = get(message, 'user_profile')
        realname = get(profile, 'display_name') or get(profile, 'real_name') or msg_user

    if 'blocks' in message:
        text = extract_text(message['blocks'])
    else:
        text = message['text']

    # Resolve "<@USER_ID>" mentions first, then decode entities / blank lines
    text_msg = fix_text(re.sub("<@(.*?)>", _mention_name, text))

    if 'attachments' in message:
        for attachment in message['attachments']:
            if 'title' in attachment:
                text_msg += f"\n{fix_text(attachment['title'])}"
            if 'text' in attachment:
                text_msg += f"\n{fix_text(attachment['text'])}"

    if 'files' in message:
        for file in message['files']:
            text_msg += f"\n<{file['name']}>"

    if 'reactions' in message:
        text_msg += "\nOthers reacted to the previous message with "
        r = [f"{reaction['name']} a total of {reaction['count']} times" for reaction in message['reactions']]
        text_msg += ", and with ".join(r) + "."

    text_msg = f"{realname} said: {text_msg}\n"
    # Decimal keeps the timestamp's exact digits
    thread_id = Decimal(thread_ts)

    return thread_id, text_msg


def create_document(channel_id, ts, doc_text):
    """Print a document for inspection and wrap it in a Document.

    The returned Document carries `doc_text` as its content and records the
    channel ID and timestamp in its extra_info under 'channel' and 'ts'.
    """
    separator = "--------------------------------------------------"
    header = f"Document[channel={channel_id},ts={ts}]"
    # One line each: separator, header, then the document body
    print(separator, header, doc_text, sep="\n")

    metadata = {"channel": channel_id, "ts": ts}
    return Document(doc_text, extra_info=metadata)

# Largest gap allowed between the starting timestamps of consecutive threads
# before index_channel closes the current document and starts a new one.
DOCUMENT_PAUSE_SECS = 300

def index_channel(channel_name):
    """Build Documents from the exported messages of one channel.

    Every ``*.json`` file in the channel's export directory is parsed with
    parse_message and the resulting texts are grouped by thread ID. Threads
    are then concatenated in timestamp order; a new document is started
    whenever the start of a thread is more than DOCUMENT_PAUSE_SECS after
    the start of the previous one.

    Args:
        channel_name: Name of the channel; it must be a key of channel2id and
            the name of a directory below data_path.

    Returns:
        A list of Documents, empty when the channel has no usable messages.

    Raises:
        KeyError: If `channel_name` is not in channel2id.
        Exception: Anything raised by parse_message is re-raised after the
            offending message has been printed.
    """
    channel_id = channel2id[channel_name]

    # thread_id -> message texts, in the order the messages were read
    messages = {}

    # glob.glob() returns paths in arbitrary order. Sorting makes the read
    # order deterministic, so replies to one thread that are spread over
    # several files are always appended in file-name order.
    # NOTE(review): presumably the export's per-day file names sort
    # chronologically -- confirm.
    for messages_file in sorted(glob.glob(f"{data_path}/{channel_name}/*.json")):
        # Explicit encoding: do not rely on the platform's default
        with open(messages_file, 'r', encoding='utf-8') as f:
            file_messages = json.load(f)

        for message in file_messages:
            try:
                thread_id, text_msg = parse_message(message)
            except Exception:
                print("Error parsing", message)
                # A bare raise keeps the original traceback intact
                raise

            if thread_id and text_msg:
                messages.setdefault(thread_id, []).append(text_msg)

    documents = []
    doc_text = ""
    start_ts = None
    prev_id = Decimal(0)

    for thread_id in sorted(messages):

        # Create a new document whenever messages are separated by a longer pause
        if doc_text and thread_id - prev_id > DOCUMENT_PAUSE_SECS:
            documents.append(create_document(channel_id, start_ts, doc_text))
            doc_text = ""
            start_ts = None

        print(thread_id)
        if not start_ts:
            # The document is addressed by the timestamp of its first thread
            start_ts = str(thread_id)

        doc_text += "".join(messages[thread_id])
        prev_id = thread_id

    # Add the final document, but never an empty one: a channel without any
    # usable message would otherwise yield a Document with no text and ts=None.
    if doc_text:
        documents.append(create_document(channel_id, start_ts, doc_text))

    return documents

# Trial run on a single channel; a later cell rebuilds `documents` from all
# channels in channel2id.
documents = index_channel("general")
print(f"Loaded {len(documents)} documents")


1681844751.750139
1681844765.907739
--------------------------------------------------
Document[channel=C0128K68NE5,ts=1681844751.750139]
Mark Kittisopikul said: Is it just me or did the standard Github layout change?
Mark Kittisopikul said: I kept trying to find the green button by muscle memory, and it wasn't there.
Jody Clements said: Yes, Code and Overview buttons are new as well
Jody Clements said: Green code button moved to top right, annoyingly
Mark Kittisopikul said: and it's smaller
Jody Clements said: Obviously developers want to get to the README before the code
Mark Kittisopikul said: And the text is overflowing:
<image.png>
Jody Clements said: you should probably rebase that branch 
Mark Kittisopikul said: It's not mine
Mark Kittisopikul said: That's from here:
https://github.com/eschnett/Yggdrasil
eschnett/Yggdrasil
Collection of builder repositories for BinaryBuilder.jl
William Katz said: Definitely highlights crappy READMEs 
Davis Bennett said: i noticed something simil

In [139]:
# Verify weaviate-client is installed and the database is live and ready
# NOTE(review): the name `client` is rebound to a Slack WebClient in a later
# cell, so cells that need the Weaviate connection must run before that one.
import weaviate
client = weaviate.Client("http://localhost:8080")
assert client.is_live()
assert client.is_ready()
client.get_meta()

{'hostname': 'http://[::]:8080', 'modules': {}, 'version': '1.19.2'}

In [140]:
# !!!! Delete data in Weaviate
# Destructive: removes the "Slack_Node" class from the Weaviate schema.
# NOTE(review): presumably this is the class the vector store creates for
# class_prefix "Slack" (set in a later cell) -- confirm before relying on it.
client.schema.delete_class("Slack_Node")

In [141]:
# Create Documents from cached Slack logs, one channel after another
documents = []
for channel_name in channel2id:
    documents.extend(index_channel(channel_name))

print(f"Loaded {len(documents)} documents")

1681841967.726019
1681842140.916879
--------------------------------------------------
Document[channel=C011TMUB3UP,ts=1681841967.726019]
Davis Bennett said: looks like the guy who made `ruff` (very good linter) got some funding? https://astral.sh/
Astral: Next-gen Python tooling
Astral builds high-performance developer tools for the Python ecosystem, starting with Ruff, an extremely fast Python linter, written in Rust.
Davis Bennett said: not clear how this works as a business 
William Katz said: Here's what one of their seed investors say:
https://www.accel.com/noteworthy/our-seed-investment-in-astral-accelerating-python-development
So they are using Vercel as a comparison.
Accel - Our Seed Investment in Astral: Accelerating Python Development

1682102619.372109
--------------------------------------------------
Document[channel=C011TMUB3UP,ts=1682102619.372109]
  warn('ignoring keyword argument %r' % k)

Davis Bennett said: at the moment it's not possible to add attributes when you 

In [154]:
from llama_index import LLMPredictor, PromptHelper, ServiceContext
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from llama_index import LangchainEmbedding

# Chat model and embedding model, each wrapped in its llama_index adapter.
llm = ChatOpenAI(temperature=0.5, model_name="gpt-3.5-turbo-0301")
llm_predictor = LLMPredictor(llm=llm)
embed_model = LangchainEmbedding(OpenAIEmbeddings())

# Sizing values handed positionally to PromptHelper.
# NOTE(review): presumably all three are token counts, and max_input_size
# matches the context window of the model above -- confirm.
max_input_size = 4096
num_output = 256
max_chunk_overlap = 20
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)

# Bundle the predictor, embedding model and prompt helper; this context is
# passed to the index when it is built.
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, embed_model=embed_model, prompt_helper=prompt_helper)

In [143]:
# Calculate embedding for all of the documents and save them into Weaviate
from llama_index import GPTVectorStoreIndex
from llama_index.vector_stores import WeaviateVectorStore
from llama_index.storage.storage_context import StorageContext

# The vector store is bound to `client`, which must still be the Weaviate
# client at this point (a later cell rebinds that name).
class_prefix = "Slack"
vector_store = WeaviateVectorStore(weaviate_client=client, class_prefix=class_prefix)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# persists the vector_store into Weaviate
# Builds the index over every entry of `documents` using the service_context
# configured for this notebook.
index = GPTVectorStoreIndex.from_documents(documents, storage_context=storage_context, service_context=service_context)

# persist the docstore and index_store
# this is currently required although in theory Weaviate should be able to handle these as well
storage_context.persist(persist_dir='../storage/index/slack')

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer')).


In [144]:
from slack_sdk import WebClient
# Read the Slack API token from the environment (None when SLACK_TOKEN is unset).
slack_token = os.environ.get('SLACK_TOKEN')
# NOTE(review): this rebinds `client`, which earlier cells use for the
# Weaviate connection; re-running those cells after this one would hit the
# Slack client instead. A distinct name would avoid that.
client = WebClient(token=slack_token)
# Fail early when the API test call does not report success.
res = client.api_test()
if not res["ok"]:
    raise ValueError(f"Error initializing Slack API: {res['error']}")

In [151]:
def get_unique_nodes(nodes):
    """Keep only the first node for each source document.

    Nodes are compared by `node.ref_doc_id`; the input order is preserved and
    later nodes that refer to an already seen document are dropped.
    """
    seen_doc_ids = set()
    kept = []
    for candidate in nodes:
        doc_id = candidate.node.ref_doc_id
        if doc_id in seen_doc_ids:
            continue
        seen_doc_ids.add(doc_id)
        kept.append(candidate)
    return kept
        
def get_message_link(channel, ts):
    """Ask Slack for the permalink of the message `ts` in `channel`.

    Returns the permalink when the response reports ok, otherwise None.
    """
    reply = client.chat_getPermalink(channel=channel, message_ts=ts)
    return reply['permalink'] if reply['ok'] else None

def print_response(response, node_text=False):
    """Print an answer followed by a Slack link for each distinct source.

    The response text comes first. Then, for every source node that refers to
    a document not listed yet, the permalink built from the node's 'channel'
    and 'ts' extra_info is printed, followed by the node's text when
    `node_text` is true.
    """
    print(response.response)
    for source in get_unique_nodes(response.source_nodes):
        info = source.node.extra_info
        print(get_message_link(info['channel'], info['ts']))
        if node_text:
            print(source.node.text)
        
def query(question, n=5, node_text=False, engine=None):
    """Run a question against the index and print the answer with sources.

    Args:
        question: The question to ask.
        n: Number of most similar nodes to retrieve. Only used when `engine`
            is not given.
        node_text: When true, also print the text of each source node.
        engine: Optional query engine to use instead of the default one. This
            lets a separately configured engine be used through this helper;
            without it the helper always builds its own engine from `index`.

    Returns:
        None. The answer and its sources are printed via print_response.
    """
    if engine is None:
        engine = index.as_query_engine(similarity_top_k=n)
    res = engine.query(question)
    print_response(res, node_text)

In [157]:
from llama_index.retrievers import VectorIndexRetriever
from llama_index.vector_stores.types import VectorStoreQueryMode
from llama_index import ResponseSynthesizer
from llama_index.query_engine import RetrieverQueryEngine

# configure retriever
# Hybrid query mode, returning the 3 most similar nodes.
# NOTE(review): alpha presumably sets the balance between vector and keyword
# search in hybrid mode -- confirm against the vector store documentation.
retriever = VectorIndexRetriever(
    index,
    similarity_top_k=3,
    vector_store_query_mode=VectorStoreQueryMode.HYBRID,
    alpha=0.8,
)

# configure response synthesizer
synth = ResponseSynthesizer.from_args()

# construct query engine
# NOTE(review): nothing visible in this notebook references `query_engine`
# after this cell; the query() helper calls index.as_query_engine itself.
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=synth,
)


In [158]:
# Example question; node_text=True also prints the text of each source node.
query("Should you limit your cluster jobs or submit everything at once?", node_text=True)

It is recommended to submit everything at once, as there is a limit on how many GPUs can be used at one time and it is programmatic. The current limit is 150, but it may change without notice. If the limit is exceeded, new jobs will not start until the number of GPUs being used is below the limit. However, it is also recommended to submit jobs in the most logical work unit.
https://janelia-dev.slack.com/archives/C0128K68NE5/p1682018703146099?thread_ts=1682018703.146099&cid=C0128K68NE5
William Patton said: whats the best practice on large cluster jobs? I have a job that will take approximately 8 days using all 248 gpu_rtx nodes. Its a blockwise convolutional neural network prediction job so its embarrassingly parallel and scales almost linearly with number of gpus. Is it better to take 120 for 16 days ish or somewhere in between?
Ben Arthur said: how long does each of your jobs take?
William Patton said: It's just one job. I just need to start a bunch of workers to run prediction in par

In [160]:
# Broader example question; node_text=True also prints the text of each source node.
query("What are some interesting software packages that people are using?", node_text=True)

Some interesting software packages that people are using include tmux, screen, iTerm2, X11 forwarding, NoMachine, conda, micromamba, and GNU Guix. Other interesting projects mentioned include open_llama, RedPajama dataset, Meerkat visualization tool, and the Stanford Hazy Research group.
https://janelia-dev.slack.com/archives/C0128K68NE5/p1684244275308909?thread_ts=1684244275.308909&cid=C0128K68NE5
Davis Bennett said: besides myself, how many people use https://en.wikipedia.org/wiki/Tmux?
Tmux
tmux is an open-source terminal multiplexer for Unix-like operating systems. It allows multiple terminal sessions to be accessed simultaneously in a single window. It is useful for running more than one command-line program at the same time. It can also be used to detach processes from their controlling terminals, allowing remote sessions to remain active without being visible.
Others reacted to the previous message with + a total of 4 times.
Jody Clements said: I still use screen, too much muscl