In [None]:
from database.database import MediumArticle, URL, Comment, Author
from database.database import get_session
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime
from sqlalchemy import func
from IPython.display import display, Markdown, Latex
import re


# Obtain a session from the project's database module; the query cell below
# uses it both to build the SELECT statement and as the connection (session.bind).
session = get_session()

In [3]:
# Pull every MediumArticle row into a DataFrame and record a whitespace-token
# word count for each article body.
articles_df = pd.read_sql(session.query(MediumArticle).statement, session.bind)
articles_df["text_length"] = articles_df["full_article_text"].map(
    lambda body: len(body.split())
)

# Restrict the working set to English articles published after 2020-01-01.
articles_filtered = articles_df.loc[
    (articles_df["date_published"] > datetime.datetime(2020, 1, 1))
    & (articles_df["language"] == "en")
]
print(f"Number of articles published since 2020-01-01 in English: {len(articles_filtered)}")

# Partition the filtered set by the `is_free` flag (rows where the flag is
# neither True nor False land in neither frame).
free_articles_df = articles_filtered.loc[articles_filtered["is_free"].eq(True)]
paid_articles_df = articles_filtered.loc[articles_filtered["is_free"].eq(False)]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Number of articles published since 2020-01-01 in English: 35185


In [14]:
# Preview the first 5000 characters of the first free article's raw text.
free_articles_df["full_article_text"].iloc[0][:5000]



## Build Pipeline

In [20]:
def preprocess_markdown_for_embedding(markdown_text: str) -> str:
    """
    Strip markdown syntax from an article, leaving lowercased plain text
    ready for an embedding model.

    Processing order matters: every line-anchored cleanup (setext heading
    underlines, blockquote markers, list markers) runs while the original
    newlines are still present. Whitespace is collapsed only at the very end;
    collapsing it earlier would turn the text into one line and make the
    ``^``-anchored patterns match at most once.

    Args:
        markdown_text: The raw markdown string. ``None`` or an empty string
            yields an empty result.

    Returns:
        A single-line, lowercased string with links, images, markdown markers
        and HTML-like tags removed and all whitespace runs collapsed to one
        space.
    """
    if not markdown_text:
        return ""

    # --- 1. Links, images and escape sequences ---

    # 1.1 Remove [text](url) and ![alt](url) entirely (text and target).
    # The bracket part may contain one level of nested brackets (e.g. a linked
    # image "[![alt](src)](href)") but can never run past its own closing "]",
    # so an unrelated "[1]" earlier in the text cannot swallow the prose
    # between it and a later link.
    text = re.sub(
        r'!?\[(?:[^\[\]]|\[[^\[\]]*\])*\]\s*\([^)]*\)',
        '',
        markdown_text,
    )
    # Boilerplate caption and bare image URLs left behind by the Medium export.
    text = text.replace('Zoom image will be displayed', '')
    text = re.sub(r'https?://miro\.medium\.com/v2/resize:\S*?\.png', '', text)

    # 1.2 Drop the backslash from markdown escapes such as "\-" or "\.".
    text = re.sub(r'\\([`*_{}\[\]()#+.!-])', r'\1', text)

    # --- 2. Markdown structure stripping (newlines must still be intact) ---

    # 2.1 Setext heading underlines / horizontal rules: lines made only of = or -.
    text = re.sub(r'^[ \t]*[=-]{2,}[ \t]*$', '', text, flags=re.MULTILINE)

    # 2.2 Blockquote markers and code-fence/backtick markers (content is kept).
    text = re.sub(r'^\s*>\s?', '', text, flags=re.MULTILINE)
    text = re.sub(r'```[a-zA-Z]*\s*', ' ', text)
    text = text.replace('`', ' ')

    # 2.3 List markers at the start of a line ("1. ", "- ", "* ").
    text = re.sub(r'^\s*\d+\.\s', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*[\-\*]\s', '', text, flags=re.MULTILINE)

    # 2.4 Emphasis markers. This removes every "*" and "_" in the text, so
    # snake_case identifiers lose their underscores as well.
    text = re.sub(r'[*_]', '', text)

    # 2.5 Anything that looks like an HTML/XML tag (often found in code samples).
    text = re.sub(r'<[^>]+>', '', text)

    # --- 3. Final text polishing ---

    # 3.1 Collapse all whitespace (including newlines) into single spaces.
    text = re.sub(r'\s+', ' ', text).strip()

    # 3.2 Lowercase the result.
    return text.lower()

# Sanity check: run the preprocessing on the first free article and display the result.
# NOTE: despite its name, `preprocessed_chunks` holds one cleaned string
# (the function's return value), not a list of chunks.
sample_text = free_articles_df["full_article_text"].iloc[0]
preprocessed_chunks = preprocess_markdown_for_embedding(sample_text)
preprocessed_chunks

