In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("medium_post_titles.csv", nrows=10_000) #limitation for datarows to save space

In [3]:
df.head()

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag
0,work,"""21 Conversations"" - A fun (and easy) game for...",A (new?) Icebreaker game to get your team to s...,False
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks a...,False
2,lgbtqia,"""CISGENDER?! Is That A Disease?!""","Or, a primer in gender vocabulary for the curi...",False
3,equality,"""Call me Nat Love"" :Black Cowboys and the Fron...",,False
4,artificial-intelligence,"""Can I Train my Model on Your Computer?""",How we waste computational resources and how t...,False


In [4]:
df["subtitle_truncated_flag"].value_counts()

subtitle_truncated_flag
False    6318
True     3682
Name: count, dtype: int64

In [5]:
### Medium article semantic search TITLE + SUBTITLE

In [6]:
## LOADING DATA

In [7]:
df.isna().sum()

category                     0
title                        0
subtitle                   107
subtitle_truncated_flag      0
dtype: int64

In [8]:
#cleaning from NaN values
df = df.dropna()

In [9]:
df.isna().sum()

category                   0
title                      0
subtitle                   0
subtitle_truncated_flag    0
dtype: int64

In [10]:
#only False
df = df[~df["subtitle_truncated_flag"]]

In [11]:
df["subtitle_truncated_flag"].value_counts()

subtitle_truncated_flag
False    6211
Name: count, dtype: int64

In [12]:
df.shape

(6211, 4)

In [13]:
## CLEANING DATA

In [14]:
df["title"]

0       "21 Conversations" - A fun (and easy) game for...
1                            "Biblical Porn" at Mars Hill
2                       "CISGENDER?! Is That A Disease?!"
4                "Can I Train my Model on Your Computer?"
5       "Cypherpunks and Wall Street": The Security To...
                              ...                        
9994       America Lets Too Much Young Talent Go to Waste
9996    America Loves the Idea of Family Farms. That’s...
9997    America May Need to Adopt China’s Weapons to W...
9998    America May Outsmart China in 5G With AI and B...
9999                         America Needs Bernie Sanders
Name: title, Length: 6211, dtype: object

In [15]:
df["title_extended"] = df["title"] + df["subtitle"]

In [16]:
#display unique categories
df["category"].nunique()

93

In [17]:
df.shape # <-- it says that require above 6 k vectors

(6211, 5)

In [18]:
## Prep to upsert

In [19]:
from tqdm.autonotebook import tqdm
import pinecone

  from tqdm.autonotebook import tqdm


In [20]:
pinecone.init(api_key="API_KEY", environment="gcp-starter")

In [22]:
pinecone.create_index(name="medium-data", dimension=384, pod_type="starter", metric="cosine")

In [23]:
from sentence_transformers import SentenceTransformer

In [24]:
import torch

In [25]:
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

Downloading .gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [26]:
df["values"] = df["title_extended"].map(lambda x: (model.encode(x)).tolist()) #encode return numpy array to list converts to python list

In [27]:
df.head() #values has index value (vector)

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag,title_extended,values
0,work,"""21 Conversations"" - A fun (and easy) game for...",A (new?) Icebreaker game to get your team to s...,False,"""21 Conversations"" - A fun (and easy) game for...","[-0.031074408441781998, -0.014303437434136868,..."
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks a...,False,"""Biblical Porn"" at Mars HillAuthor and UW lect...","[-0.03467036783695221, -0.01816530153155327, -..."
2,lgbtqia,"""CISGENDER?! Is That A Disease?!""","Or, a primer in gender vocabulary for the curi...",False,"""CISGENDER?! Is That A Disease?!""Or, a primer ...","[0.03740726038813591, -0.0008568025659769773, ..."
4,artificial-intelligence,"""Can I Train my Model on Your Computer?""",How we waste computational resources and how t...,False,"""Can I Train my Model on Your Computer?""How we...","[-0.013686483725905418, 0.004296026658266783, ..."
5,cryptocurrency,"""Cypherpunks and Wall Street"": The Security To...",Bruce Fenton presents at the World Blockchain ...,False,"""Cypherpunks and Wall Street"": The Security To...","[-0.03146887943148613, -0.004646676126867533, ..."


In [28]:
df["id"] = df.reset_index(drop="index").index

In [29]:
df.head() #added id column

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag,title_extended,values,id
0,work,"""21 Conversations"" - A fun (and easy) game for...",A (new?) Icebreaker game to get your team to s...,False,"""21 Conversations"" - A fun (and easy) game for...","[-0.031074408441781998, -0.014303437434136868,...",0
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks a...,False,"""Biblical Porn"" at Mars HillAuthor and UW lect...","[-0.03467036783695221, -0.01816530153155327, -...",1
2,lgbtqia,"""CISGENDER?! Is That A Disease?!""","Or, a primer in gender vocabulary for the curi...",False,"""CISGENDER?! Is That A Disease?!""Or, a primer ...","[0.03740726038813591, -0.0008568025659769773, ...",2
4,artificial-intelligence,"""Can I Train my Model on Your Computer?""",How we waste computational resources and how t...,False,"""Can I Train my Model on Your Computer?""How we...","[-0.013686483725905418, 0.004296026658266783, ...",3
5,cryptocurrency,"""Cypherpunks and Wall Street"": The Security To...",Bruce Fenton presents at the World Blockchain ...,False,"""Cypherpunks and Wall Street"": The Security To...","[-0.03146887943148613, -0.004646676126867533, ...",4


In [30]:
df["metadata"] = df.apply(lambda x: {
    'title': x['title'],
    'subtitle': x['subtitle'],
    'category': x['category']
}, axis=1)

In [31]:
df.head()

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag,title_extended,values,id,metadata
0,work,"""21 Conversations"" - A fun (and easy) game for...",A (new?) Icebreaker game to get your team to s...,False,"""21 Conversations"" - A fun (and easy) game for...","[-0.031074408441781998, -0.014303437434136868,...",0,"{'title': '""21 Conversations"" - A fun (and eas..."
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks a...,False,"""Biblical Porn"" at Mars HillAuthor and UW lect...","[-0.03467036783695221, -0.01816530153155327, -...",1,"{'title': '""Biblical Porn"" at Mars Hill', 'sub..."
2,lgbtqia,"""CISGENDER?! Is That A Disease?!""","Or, a primer in gender vocabulary for the curi...",False,"""CISGENDER?! Is That A Disease?!""Or, a primer ...","[0.03740726038813591, -0.0008568025659769773, ...",2,"{'title': '""CISGENDER?! Is That A Disease?!""',..."
4,artificial-intelligence,"""Can I Train my Model on Your Computer?""",How we waste computational resources and how t...,False,"""Can I Train my Model on Your Computer?""How we...","[-0.013686483725905418, 0.004296026658266783, ...",3,"{'title': '""Can I Train my Model on Your Compu..."
5,cryptocurrency,"""Cypherpunks and Wall Street"": The Security To...",Bruce Fenton presents at the World Blockchain ...,False,"""Cypherpunks and Wall Street"": The Security To...","[-0.03146887943148613, -0.004646676126867533, ...",4,"{'title': '""Cypherpunks and Wall Street"": The ..."


In [32]:
df_upsert = df[["id", "values", "metadata"]] # above pre-processing requried to upsert to db, we needed index (id), value vecotr (values), and metadata to filter (metdata)

In [35]:
df_upsert.loc[:, 'id'] = df_upsert['id'].map(lambda x: str(x)) #id has to be string (index)

In [33]:
index = pinecone.Index('medium-data')

In [36]:
index.upsert_from_dataframe(df_upsert)

sending upsert requests:   0%|          | 0/6211 [00:00<?, ?it/s]

{'upserted_count': 6211}

In [37]:
### QUERY to database

In [39]:
xc = index.query((model.encode('Where is my cat?')).tolist(), top_k=10, include_metadata=True) #model encode into vector

In [43]:
for result in xc['matches']:
    print(f"{round(result['score'], 2)}: {result['metadata']['title']}")

0.52: Allow Yourself To Get Another Pet And See What Happens
0.49: A Mutual Rescue Mission
0.45: A Cat Taught Me A Lesson About My Intense Need For Approval
0.41: A Letter to My Boss on Why I Couldn’t Work Today
0.38: A Tale of Two Kitties: My Experience and Tips for Moving Abroad with Pets
0.38: A Eulogy to my Dead Cat and Immortality
0.36: A Day in the Catskills with Miriam
0.36: A Trip to the Westminster Dog Show
0.35: Across The Pond
0.34: A Pet Named Anxiety


In [44]:
for result in xc['matches']:
    print(f"{round(result['score'], 2)}: {result['metadata']['subtitle']}")

0.52: While my cat could never be replaced, I could fill the hole he left in my life.
0.49: On the cats who saved me
0.45: And all I wanted was a cuddle.
0.41: My cat conspired against me.
0.38: “Didn’t they have to go into quarantine?”
0.38: A meditation on death, life, and the time in-between.
0.36: If only he could write her story
0.36: Ruth and I go to the big show!
0.35: Romance in the city.
0.34: There are only two reasons I am telling this story.


In [45]:
for result in xc['matches']:
    print(f"{round(result['score'], 2)}: {result['metadata']['title']}: {result['metadata']['category']}")

0.52: Allow Yourself To Get Another Pet And See What Happens: pets
0.49: A Mutual Rescue Mission: mental-health
0.45: A Cat Taught Me A Lesson About My Intense Need For Approval: pets
0.41: A Letter to My Boss on Why I Couldn’t Work Today: fiction
0.38: A Tale of Two Kitties: My Experience and Tips for Moving Abroad with Pets: pets
0.38: A Eulogy to my Dead Cat and Immortality: philosophy
0.36: A Day in the Catskills with Miriam: fiction
0.36: A Trip to the Westminster Dog Show: comics
0.35: Across The Pond: fiction
0.34: A Pet Named Anxiety: lifestyle


In [46]:
xc2 = index.query((model.encode('Which city is the most beautiful?')).tolist(), top_k=10, include_metadata=True)

In [47]:
for result in xc2['matches']:
    print(f"{round(result['score'], 2)}: {result['metadata']['title']}: {result['metadata']['category']}")

0.57: 3 Places Where You Can Find Beauty: photography
0.52: A Shining City on a Hill: politics
0.46: 6 Easy Reasons to Enjoy Exploring South Wales: travel
0.43: A City That’s Better for the Blind Is Better for Everyone: accessibility
0.42: Across The Pond: fiction
0.41: Ace Hotel: A UX Case Study: ux
0.41: A Most Beautiful Game: sports
0.41: 6 Literary Cities for Book Lovers To Visit This Year: travel
0.4: A city and its architecture: cities
0.38: Adaptive urban design: design
