In [None]:
from database.database import MediumArticle
from database.database import get_session
import pandas as pd
import datetime
import re
from sentence_transformers import SentenceTransformer
import torch
import os
import torch
import tensorflow as tf
from tensorboard.plugins import projector

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

session = get_session()


Using device: cuda


In [2]:
# query all articles
articles_df = pd.read_sql(session.query(MediumArticle).statement, session.bind)
articles_df["text_length"] = articles_df["full_article_text"].apply(lambda x: len(x.split()))

articles_filtered = articles_df[articles_df["date_published"] > datetime.datetime(2020, 1, 1)]
articles_filtered = articles_filtered[articles_filtered["language"] == "en"]
print(f"Number of articles published since 2020-01-01 in English: {len(articles_filtered)}")

free_articles_df = articles_filtered[articles_filtered["is_free"] == True]
paid_articles_df = articles_filtered[articles_filtered["is_free"] == False]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Number of articles published since 2020-01-01 in English: 35185


## Build Pipeline

In [6]:
def preprocess_markdown_for_embedding(markdown_text: str) -> str:
    """
    Cleans and strips markdown content, leaving behind only the semantic text
    ready for an embedding model.

    Args:
        markdown_text: The raw markdown string.

    Returns:
        A cleaned text string.
    """

    # --- 1. Initial Cleaning and Normalization ---

    # 1.1 REMOVE LINKS AND IMAGE TAGS: Remove the pattern [text](url) and ![text](url)
    text = re.sub(r'\!?\[.*?\]\s*\(.*?\)', '', markdown_text, flags=re.DOTALL)
    text = re.sub(r'Zoom image will be displayed', '', text)
    text = re.sub(r'http[s]?://miro.medium.com/v2/resize:.*?\.png', '', text)

    # 1.2 Remove Extraneous Backslashes (e.g., escaping in \- or \.)
    text = re.sub(r'\\-', '-', text)
    text = re.sub(r'\\([`*_{}\[\]()#+.!])', r'\1', text)
    
    # 1.3 Normalize Newlines: Convert multiple newlines/whitespace into a single space
    text = re.sub(r'\s+', ' ', text).strip()
    
    # --- 2. Markdown Structure Stripping ---

    # 2.1 Remove Headings (Setext style: === or --- lines)
    text = re.sub(r'\n[=-]{2,}\s*$', '', text, flags=re.MULTILINE)

    # 2.2 Remove Blockquotes/Code Fences (Markers: > and ```)
    text = re.sub(r'^\s*>\s?', '', text, flags=re.MULTILINE)
    text = re.sub(r'```[a-zA-Z]*\s*', ' ', text)
    text = re.sub(r'`', ' ', text)
    
    # 2.3 Remove List Markers (e.g., 1. or - or *)
    text = re.sub(r'^\s*\d+\.\s', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*[\-\*]\s', '', text, flags=re.MULTILINE)
    
    # 2.4 Remove Emphasis Markers (e.g., **, *, __, _)
    text = re.sub(r'(\*\*|__)', '', text) # Bold/Strong
    text = re.sub(r'(\*|_)', '', text)    # Italic/Emphasis

    # 2.5 Remove remaining HTML tags (like '<hibernate-mapping>') which are often in code
    text = re.sub(r'<[^>]+>', '', text)
    
    # --- 3. Final Text Polishing ---

    # 3.1 Normalize Whitespace again: Collapse all multiple spaces into one
    text = re.sub(r'\s+', ' ', text).strip()

    # 3.2 Lowercasing (Optional but recommended for many embedding models)
    text = text.lower()

    return text

## preprocess sample
# Process 15 free articles
sample_free_texts = free_articles_df["full_article_text"].iloc[:15]
preprocessed_free_chunks = [preprocess_markdown_for_embedding(text) for text in sample_free_texts]

# Process 15 paid articles
sample_paid_texts = paid_articles_df["full_article_text"].iloc[:15]
preprocessed_paid_chunks = [preprocess_markdown_for_embedding(text) for text in sample_paid_texts]

# Print preprocessed free articles
print("Preprocessed Free Articles:")
for i, text in enumerate(preprocessed_free_chunks, 1):
    print(f"Free Article {i}:")
    print(text)
    print("-" * 80)

# Print preprocessed paid articles
print("Preprocessed Paid Articles:")
for i, text in enumerate(preprocessed_paid_chunks, 1):
    print(f"Paid Article {i}:")
    print(text)
    print("-" * 80)

Preprocessed Free Articles:
Free Article 1:
--------------------------------------------------------------------------------
Free Article 2:
--------------------------------------------------------------------------------
Free Article 3:
when i decided to take a sabbatical from my career, i knew that i would miss conducting research. while taking a break from the day-to-day work was much needed, i still enjoyed conducting qualitative research. so, to fill the void, i decided to do my own research project. i had many options available, but the topic that personally interested me was understanding individual contributors within the tech industry at a deeper level. you see i very much considered myself dedicated to this path, and yet i never really felt there was much guidance about this path — especially ways to perform this path at a high level. from my experience, most organizations define and coach success as part of the people management path. even my managers, as wonderful, helpful,

In [4]:
model = SentenceTransformer('prdev/mini-gte') # A good balance of speed and performance

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
# Select all free articles
articles_to_embed = free_articles_df

# Preprocess the text for each article
preprocessed_texts = articles_to_embed["full_article_text"].apply(preprocess_markdown_for_embedding).tolist()

# Encode the preprocessed texts to get embeddings
embeddings_free = model.encode(preprocessed_texts, convert_to_tensor=True, device=device, batch_size=8, show_progress_bar=True)

print(f"Embeddings shape: {embeddings_free.shape}")

Batches:   0%|          | 0/2871 [00:00<?, ?it/s]

Embeddings shape: torch.Size([22965, 768])


In [17]:
# Select all free articles
articles_to_embed = paid_articles_df

# Preprocess the text for each article
preprocessed_texts = articles_to_embed["full_article_text"].apply(preprocess_markdown_for_embedding).tolist()

# Encode the preprocessed texts to get embeddings
embeddings_paid = model.encode(preprocessed_texts, convert_to_tensor=True, device=device, batch_size=20, show_progress_bar=True)

print(f"Embeddings shape: {embeddings_paid.shape}")

Batches:   0%|          | 0/611 [00:00<?, ?it/s]

Embeddings shape: torch.Size([12220, 768])


In [None]:
# Save the embeddings to a file
torch.save(embeddings_free.cpu(), 'embeddings_free.pt')
torch.save(embeddings_paid.cpu(), 'embeddings_paid.pt')

In [3]:
embeddings_free = torch.load('embeddings_free.pt').to(device)
embeddings_paid = torch.load('embeddings_paid.pt').to(device)
embeddings_free.shape, embeddings_paid.shape

(torch.Size([22965, 768]), torch.Size([12220, 768]))

### Tensorboard

In [None]:


# Combine embeddings from free and paid articles
embeddings_combined = torch.cat([embeddings_free, embeddings_paid], dim=0).cpu().numpy()

# Create labels: 'free' for free articles, 'paid' for paid articles
num_free = embeddings_free.shape[0]
num_paid = embeddings_paid.shape[0]
labels = ['free'] * num_free + ['paid'] * num_paid

print(f"Total embeddings: {embeddings_combined.shape[0]}, Labels: {len(labels)}")

# replace any newlines or tabs in titles
free_articles_df['title'] = free_articles_df['title'].str.replace('\n', ' ', regex=False).str.replace('\t', ' ', regex=False).str.strip().fillna('No Title')

# replace even more aggressively
free_articles_df['title'] = free_articles_df['title'].str.replace(r'\s+', ' ', regex=True).str.strip().fillna('No Title')
paid_articles_df['title'] = paid_articles_df['title'].str.replace(r'\s+', ' ', regex=True).str.strip().fillna('No Title')

# Create titles list
titles = free_articles_df['title'].tolist() + paid_articles_df['title'].tolist()

os.makedirs('logs', exist_ok=True)

# Write metadata to TSV file with label and title
with open('logs/metadata.tsv', 'w', encoding='utf-8') as f:
    f.write('label\ttitle\n')
    for label, title in zip(labels, titles):
        f.write(f'{label}\t{title}\n')

# Create logs directory if it doesn't exist
os.makedirs('logs', exist_ok=True)

# Create TensorFlow variable for embeddings and save checkpoint
embeddings_var = tf.Variable(embeddings_combined, name='embeddings')
checkpoint = tf.train.Checkpoint(embeddings=embeddings_var)
checkpoint.save(os.path.join('logs', 'embeddings.ckpt'))

# Configure the projector
config = projector.ProjectorConfig()
embedding_config = config.embeddings.add()
embedding_config.tensor_name = 'embeddings/.ATTRIBUTES/VARIABLE_VALUE'
embedding_config.metadata_path = 'metadata.tsv'
projector.visualize_embeddings('logs', config)

Total embeddings: 35185, Labels: 35185


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  free_articles_df['title'] = free_articles_df['title'].str.replace('\n', ' ', regex=False).str.replace('\t', ' ', regex=False).str.strip().fillna('No Title')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  free_articles_df['title'] = free_articles_df['title'].str.replace(r'\s+', ' ', regex=True).str.strip().fillna('No Title')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pan