In [1]:
import zstandard as zstd
import json
import sys
import os

## Preprocessing
This will read, filter, clean, and combine my data for chunking and ingestion for RAG.

In [2]:
# Inputs: zstd-compressed dumps of submissions and comments (relative paths).
submissions_zst_path = "../Data/climbharder_submissions.zst" # local path to the submissions dump
comments_zst_path = "../Data/climbharder_comments.zst"
# Outputs: plain-text files that receive the filtered text extracted from each dump.
output_path_submissions = "../Data/climbharder_posts.txt"
output_path_comments = "../Data/climbharder_comments.txt"


### Processing zst-compressed NDJSON files
For this step, I stream-decompress the zst file in 1 MiB chunks and split the buffered bytes into lines. For each line, I load the post (or comment) as JSON and keep it only if it belongs to the target subreddit, its score is above the karma threshold, and its text is at least 120 characters long. Then I write the text to the output file, followed by a newline. My goal is to create one large newline-separated txt file of all post/comment content.

In [3]:

def _pushshift_record_text(raw_line, subreddit_filter, score_threshold, min_length):
    """Return the cleaned text of one NDJSON record, or None if it is filtered out.

    Args:
        raw_line: One line of the decompressed dump, as bytes.
        subreddit_filter: Value the record's 'subreddit' field must equal.
        score_threshold: Records whose score is <= this value are dropped.
        min_length: Records whose cleaned text is shorter than this are dropped.

    Returns:
        The stripped text ("title selftext" for submissions, "body" for
        comments), or None when the line is blank, unparsable, not a JSON
        object, or rejected by one of the filters.
    """
    if not raw_line.strip():
        return None
    try:
        post = json.loads(raw_line)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Best-effort: skip lines that are not valid UTF-8 JSON.
        return None
    if not isinstance(post, dict):
        return None

    if post.get('subreddit') != subreddit_filter:
        return None
    # `or 0` also covers an explicit JSON null, which would otherwise make
    # the comparison raise TypeError.
    if (post.get('score') or 0) <= score_threshold:
        return None

    # Submissions carry a 'title'; anything else is treated as a comment.
    # `or ''` guards against null fields before calling .strip().
    if 'title' in post:
        title = (post.get('title') or '').strip()
        body = (post.get('selftext') or '').strip()
        full_text = f"{title} {body}".strip()
    else:
        full_text = (post.get('body') or '').strip()

    if len(full_text) < min_length:
        return None
    return full_text


def process_pushshift_zst(input_path, output_path, subreddit_filter, score_threshold, min_length=120):
    """Filter a zstd-compressed NDJSON dump and write the kept texts to a file.

    The input is decompressed as a stream in 1 MiB chunks and split on
    newlines. Each record that passes the subreddit, score and length filters
    is written to `output_path` followed by a newline. The text itself is
    written as-is, so a record containing newlines spans several output lines.

    Args:
        input_path: Path to the .zst file to read.
        output_path: Path of the UTF-8 text file to (over)write.
        subreddit_filter: Only records whose 'subreddit' equals this are kept.
        score_threshold: Only records with score strictly greater than this
            are kept; a missing or null score counts as 0.
        min_length: Minimum length of the cleaned text (default 120).

    Returns:
        None. If `input_path` does not exist, an error is printed and the
        output file is not touched.
    """
    if not os.path.isfile(input_path):
        print(f"Error: File '{input_path}' not found.")
        return

    with open(input_path, 'rb') as f, open(output_path, 'w', encoding='utf-8') as out_file:
        # NOTE(review): Pushshift-style dumps are often compressed with a long
        # window; raising the limit avoids "Frame requires too much memory"
        # errors and does not affect files with a smaller window. Confirm it
        # is needed for these particular files.
        dctx = zstd.ZstdDecompressor(max_window_size=2**31)
        stream_reader = dctx.stream_reader(f)

        def emit(raw_line):
            text = _pushshift_record_text(raw_line, subreddit_filter, score_threshold, min_length)
            if text is not None:
                out_file.write(text + '\n')

        buffer = b''
        for chunk in iter(lambda: stream_reader.read(2**20), b''):
            buffer += chunk
            lines = buffer.split(b'\n')
            buffer = lines.pop()  # Save the last partial line for the next iteration
            for line in lines:
                emit(line)

        # The stream may end without a trailing newline; the final record is
        # then still sitting in the buffer and must not be dropped.
        emit(buffer)


In [4]:
# Run the same filter over both dumps: submissions first, then comments.
for zst_path, txt_path in (
    (submissions_zst_path, output_path_submissions),
    (comments_zst_path, output_path_comments),
):
    process_pushshift_zst(
        input_path=zst_path,
        output_path=txt_path,
        subreddit_filter='climbharder',
        score_threshold=20,
    )

### Concatenation
Since it's easier (and safer) to process these two massive zst files into separate txt files using the function above, I now need to combine them into one file.

In [5]:
def concatenate_two_txt_files(source_path1, source_path2, destination_path):
    """Write two UTF-8 text files, in order, into a single destination file.

    Both sources are validated before the destination is opened, so a missing
    source does not leave behind an empty or half-written destination. If the
    first source does not end with a newline, one is inserted so its last line
    is not glued to the first line of the second source.

    Args:
        source_path1: Path of the file copied first.
        source_path2: Path of the file copied second.
        destination_path: Path of the combined file; overwritten if it exists.
            Must not be one of the source files.

    Returns:
        None. A success or error message is printed.
    """
    source_paths = [source_path1, source_path2]

    missing = [path for path in source_paths if not os.path.isfile(path)]
    if missing:
        print(f"Error: source file(s) not found: {', '.join(map(str, missing))}")
        return

    # Opening the destination with 'w' truncates it; refuse to do that to a source.
    destination_abs = os.path.abspath(destination_path)
    if any(os.path.abspath(path) == destination_abs for path in source_paths):
        print(f"Error: destination '{destination_path}' is also a source file.")
        return

    try:
        with open(destination_path, 'w', encoding='utf-8') as dest_file:
            ends_mid_line = False
            for source_path in source_paths:
                with open(source_path, 'r', encoding='utf-8') as source_file:
                    for line in source_file:
                        # Only the last line of a file can lack '\n', so this
                        # fires at most once, at a file boundary.
                        if ends_mid_line:
                            dest_file.write('\n')
                        dest_file.write(line)
                        ends_mid_line = not line.endswith('\n')
        print(f"Successfully wrote combined contents to '{destination_path}'.")
    except FileNotFoundError as e:
        # Still reachable, e.g. when the destination's directory does not exist.
        print(f"Error: {e}")

In [6]:
# Destination for the merged corpus: posts first, then comments.
output_posts_and_comments = "../Data/climbharder_posts_and_comments.txt"
concatenate_two_txt_files(
    source_path1=output_path_submissions,
    source_path2=output_path_comments,
    destination_path=output_posts_and_comments,
)

Successfully wrote combined contents to '../Data/climbharder_posts_and_comments.txt'.


### Loading and Splitting

Now I have to load and split the document. I used [this site](https://chunkviz.up.railway.app/) to find a text-splitting style and chunk size that seem to work well with my data.

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the whole combined corpus into memory as a single string.
with open("../Data/climbharder_posts_and_comments.txt", 'r', encoding='utf-8') as combined_file:
    raw_text = combined_file.read()

# 500-character chunks with a 100-character overlap between neighbours.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

# The corpus is passed as one text, so `docs` holds the chunks of that single string.
docs = splitter.create_documents([raw_text])


### Embedding
Now that I've loaded and split my text, I'll try to embed the chunks with an embedding model.

In [8]:
from langchain_ollama import OllamaEmbeddings

embed_model = OllamaEmbeddings(model="nomic-embed-text")
print("starting")

# Pull the raw chunk text out of each document.
texts = list(map(lambda chunk: chunk.page_content, docs))
print(f"texts:{texts[0]}")

# NOTE(review): this embeds every chunk, yet only the first vector is printed
# here; the vector store cell presumably embeds the chunks again — confirm
# whether the full pass is needed.
embeddings = embed_model.embed_documents(texts)

print(embeddings[0])

starting
texts:Dave MacLeod's treatment for fingery injuries contradicts everything I thought I knew http://www.onlineclimbingcoach.blogspot.ca/2012/03/injuries-problem-with-lay-off.html
[0.051051077, 0.064840145, -0.1929585, 0.0008151712, 0.023558656, 0.03602159, 0.054215994, 0.006765342, 0.021193804, -0.07227144, 0.023222871, -0.019632885, 0.008925711, -0.026930299, 0.08541349, -0.030865876, -0.05620055, -0.022050012, -0.019279892, 0.008773777, -0.046379857, -0.035400372, -0.051545817, 7.968705e-05, 0.05468656, -0.050549537, -0.028870258, 0.02118899, -0.014771791, 0.03487955, 0.06283174, 0.02908895, -0.00684938, -0.05738809, -0.034202483, -0.023635868, 0.054646183, 0.042553984, -0.042642675, 0.0053183064, 0.0091708675, 0.011177209, 0.028631823, -0.063040346, 0.026954273, -0.0585496, -0.008536334, 0.0690073, -0.052999105, 0.023176584, 0.051429834, -0.090897664, -0.014594037, -0.0065667946, 0.014116502, 0.025369633, 0.0056919595, 0.021106543, 0.047338564, -0.07656957, 0.07748395, 0.033

In [13]:
from langchain.vectorstores import Chroma

# Build the vector store from the chunks using the same embedding model;
# persist_directory points the store at ../db.
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embed_model,
    persist_directory="../db",
)

In [27]:
# Smoke-test retrieval with a sample question and print the returned documents.
sample_question = "what exercises help strengthen the wrist?"
retriever = vectorstore.as_retriever()
results = retriever.invoke(sample_question)
print(results)

[Document(metadata={}, page_content='Do wrist extensor exercises from Eric Horst seen here help as well? https://trainingforclimbing.com/training-the-wrist-stabilizers/'), Document(metadata={}, page_content='like to create a compendium of exercises, tools, etc that target wrist strength not from a wrist stability point of view but from a strength point of view. Here is a start.'), Document(metadata={}, page_content="Wrist training is the new hangboarding Lately wrist training for slopers and compression have become all the rage. I think this is mainly sparked by two podcast guests: Yves Gravelle and Dan Varian. In it they discuss how training their wrists has increased their sloper strength dramatically. Apart from these podcasts there doesn't seem to be a great deal of information on wrist training. I'd like to create a compendium of exercises, tools, etc that target wrist strength not from a wrist"), Document(metadata={}, page_content='* Function: general conditioning for the weak, s