# Medium Article Semantic Search by Title+Subtitle

### Load Data

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("data/medium_post_titles.csv", nrows=100)
# data source: https://www.kaggle.com/datasets/nulldata/medium-post-titles

In [3]:
df = data.copy()
df.head()

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag
0,work,"""21 Conversations"" - A fun (and easy) game for...",A (new?) Icebreaker game to get your team to s...,False
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks a...,False
2,lgbtqia,"""CISGENDER?! Is That A Disease?!""","Or, a primer in gender vocabulary for the curi...",False
3,equality,"""Call me Nat Love"" :Black Cowboys and the Fron...",,False
4,artificial-intelligence,"""Can I Train my Model on Your Computer?""",How we waste computational resources and how t...,False


In [5]:
# df['category'].value_counts()

In [6]:
df["subtitle_truncated_flag"].value_counts()

subtitle_truncated_flag
False    89
True     11
Name: count, dtype: int64

### Data Cleanup

In [7]:
df.isna().sum()

category                   0
title                      0
subtitle                   1
subtitle_truncated_flag    0
dtype: int64

In [8]:
df = df.dropna()

In [9]:

df = df[~df["subtitle_truncated_flag"]]
df.head()


Unnamed: 0,category,title,subtitle,subtitle_truncated_flag
0,work,"""21 Conversations"" - A fun (and easy) game for...",A (new?) Icebreaker game to get your team to s...,False
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks a...,False
2,lgbtqia,"""CISGENDER?! Is That A Disease?!""","Or, a primer in gender vocabulary for the curi...",False
4,artificial-intelligence,"""Can I Train my Model on Your Computer?""",How we waste computational resources and how t...,False
5,cryptocurrency,"""Cypherpunks and Wall Street"": The Security To...",Bruce Fenton presents at the World Blockchain ...,False


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 88 entries, 0 to 99
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   category                 88 non-null     object
 1   title                    88 non-null     object
 2   subtitle                 88 non-null     object
 3   subtitle_truncated_flag  88 non-null     bool  
dtypes: bool(1), object(3)
memory usage: 2.8+ KB


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6211 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   category                 6211 non-null   object
 1   title                    6211 non-null   object
 2   subtitle                 6211 non-null   object
 3   subtitle_truncated_flag  6211 non-null   bool  
dtypes: bool(1), object(3)
memory usage: 200.2+ KB


In [None]:

df['title_extended'] = df['title'] + df['subtitle']

In [None]:
# df.head()
# df['category'].nunique()
# df.shape

### Prep for Upsert

In [28]:
import pathlib
import chromadb
from chromadb.utils import embedding_functions
from more_itertools import batched

def build_chroma_collection(
    chroma_path: pathlib.Path,
    collection_name: str,
    embedding_func_name: str,
    ids: list[str],
    documents: list[str],
    metadatas: list[dict],
    distance_func_name: str = "cosine",
):
    """Create a ChromaDB collection"""

    chroma_client = chromadb.PersistentClient(chroma_path)

    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=embedding_func_name
    )

    collection = chroma_client.create_collection(
        name=collection_name,
        embedding_function=embedding_func,
        metadata={"hnsw:space": distance_func_name},
    )

    document_indices = list(range(len(documents)))

    for batch in batched(document_indices, 166):
        start_idx = batch[0]
        end_idx = batch[-1]

        collection.add(
            ids=ids[start_idx:end_idx],
            documents=documents[start_idx:end_idx],
            metadatas=metadatas[start_idx:end_idx],
        )

In [None]:

def prepare_medium_articles_data(path):
    df = pd.read_csv(path,nrows=10000)
    df = df.dropna()
    df = df[~df["subtitle_truncated_flag"]]
    df['title_extended'] = df['title'] + df['subtitle']

    
    ids = [f"article{i}" for i in range(df.shape[0])]
    documents = df["title_extended"].to_list()
    metadatas = df.drop("title_extended", axis=1).to_dict(orient="records") 

    return {"ids": ids, "documents": documents, "metadatas": metadatas}


articles_dict = prepare_medium_articles_data("data/medium_post_titles.csv")


articles_dict.keys()

In [31]:
import chromadb
from chromadb.utils import embedding_functions

DATA_PATH = "data/medium_post_titles.csv"
CHROMA_PATH = "medium_articles_embeddings_"
EMBEDDING_FUNC_NAME = "multi-qa-MiniLM-L6-cos-v1"
COLLECTION_NAME = "medium_articles"

articles_dict = prepare_medium_articles_data(DATA_PATH)


build_chroma_collection(
    CHROMA_PATH,
    COLLECTION_NAME,
    EMBEDDING_FUNC_NAME,
    articles_dict["ids"],
    articles_dict["documents"],
    articles_dict["metadatas"]
)


In [37]:
client = chromadb.PersistentClient(CHROMA_PATH)
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=EMBEDDING_FUNC_NAME
    )
collection = client.get_collection(name=COLLECTION_NAME, embedding_function=embedding_func)

query_texts=["Find me some best articles related to programming"]

best_articles = collection.query(
    query_texts = query_texts,
    n_results=5,
    include=["documents", "distances", "metadatas"]
)

best_articles["documents"][0][0]

'9 Amazing Articles on Python ProgrammingA collection of our favorite pieces from the last year'

In [35]:
collection.count()

81053

In [38]:
def display_query_results(query_texts, query_results):
    for i, query in enumerate(query_texts):
        print(f"\n🔍 Query {i + 1}: \"{query}\"\n{'-' * 60}")
        documents = query_results["documents"][i]
        distances = query_results["distances"][i]

        for j, (doc, dist) in enumerate(zip(documents, distances), start=1):
            print(f"Result {j}:")
            print(f"📄 Document: {doc}")
            print(f"📏 Distance: {dist:.4f}")
            print()

display_query_results(query_texts,  best_articles)


🔍 Query 1: "Find me some best articles related to programming"
------------------------------------------------------------
Result 1:
📄 Document: 9 Amazing Articles on Python ProgrammingA collection of our favorite pieces from the last year
📏 Distance: 0.3586

Result 2:
📄 Document: A graphical introduction to dynamic programmingA highly visual introduction to dynamic programming, and an overview of three separate problems utilizing dynamic programming
📏 Distance: 0.3992

Result 3:
📄 Document: “The Art of Computer Programming” by Donald KnuthBill Gates doesn’t think most people can finish this book. I gave it a shot.
📏 Distance: 0.4263

Result 4:
📄 Document: Top 5 Contemporary Software Engineering BooksIntro
📏 Distance: 0.4314

Result 5:
📄 Document: What programming language should I learn?Answering one of the most commonly asked questions the right way
📏 Distance: 0.4331



In [26]:
articles_dict['metadatas'][:2]

[{'category': 'work',
  'title': '"21 Conversations" - A fun (and easy) game for teams to get to know each other',
  'subtitle': 'A (new?) Icebreaker game to get your team to say all the interesting stuff',
  'subtitle_truncated_flag': False},
 {'category': 'spirituality',
  'title': '"Biblical Porn" at Mars Hill',
  'subtitle': "Author and UW lecturer Jessica Johnson talks about her new book on Mars Hill Church's and Mark Driscoll's evangelical masculinity",
  'subtitle_truncated_flag': False}]

In [45]:
test_df = df.copy()
test_df['metadata'] = test_df.apply(lambda x: {
    'title' : x['title'],
    'subtitle': x['subtitle'],
    'category': x['category']
    
}, axis=1)


In [46]:
test_df.head()


Unnamed: 0,category,title,subtitle,subtitle_truncated_flag,title_extended,metadata
0,work,"""21 Conversations"" - A fun (and easy) game for...",A (new?) Icebreaker game to get your team to s...,False,"""21 Conversations"" - A fun (and easy) game for...","{'title': '""21 Conversations"" - A fun (and eas..."
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks a...,False,"""Biblical Porn"" at Mars HillAuthor and UW lect...","{'title': '""Biblical Porn"" at Mars Hill', 'sub..."
2,lgbtqia,"""CISGENDER?! Is That A Disease?!""","Or, a primer in gender vocabulary for the curi...",False,"""CISGENDER?! Is That A Disease?!""Or, a primer ...","{'title': '""CISGENDER?! Is That A Disease?!""',..."
4,artificial-intelligence,"""Can I Train my Model on Your Computer?""",How we waste computational resources and how t...,False,"""Can I Train my Model on Your Computer?""How we...","{'title': '""Can I Train my Model on Your Compu..."
5,cryptocurrency,"""Cypherpunks and Wall Street"": The Security To...",Bruce Fenton presents at the World Blockchain ...,False,"""Cypherpunks and Wall Street"": The Security To...","{'title': '""Cypherpunks and Wall Street"": The ..."


### Query

In [41]:
query_texts = ["which city is the most beautiful"]

output = collection.query(
    query_texts = query_texts,
    n_results=5,
    include=["documents", "distances", "metadatas", "embeddings"]
)

retrieved_docs = output["documents"][0]

In [42]:
retrieved_docs

['The 5 Best Cities For Street ArtIncredible murals around the globe',
 '3 Places Where You Can Find BeautyIf you are willing to look hard enough, eventually you will see beauty in the most difficult of places.',
 'For The Simple Beauties Of Life\u200a—\u200aPhotosAutumn In The North, All The More Beautiful For Its Brevity',
 'A City That’s Better for the Blind Is Better for EveryoneComplete parity with the sighted may seem like an impossible goal, but maybe the only thing holding us back is a lack of imagination.',
 'The Poetry and History of CitiesFar from being a soulless commercial center, the city is the most intense expression of humanity around.']

In [None]:
https://python.langchain.com/docs/integrations/chat/google_generative_ai/