In [31]:
import json
import pandas as pd
import random
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
import os
import time
from pinecone import Pinecone, ServerlessSpec

In [33]:
# Path to the input file; it is read line by line as JSON records further down.
json_filename = "data.json"

In [35]:
# Load data: the input is read as JSON Lines (one JSON object per line).
data = []

# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open(json_filename, 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (e.g. a trailing empty line) instead of letting
        # json.loads fail on an empty string.
        if line.strip():
            data.append(json.loads(line))

In [37]:
# Build a table from the parsed records (one row per loaded JSON object).
df = pd.DataFrame(data=data)

In [39]:
# Quick sanity check: show the first five rows of the loaded frame.
print(df.head(n=5))

                                                link  \
0  https://www.huffpost.com/entry/covid-boosters-...   
1  https://www.huffpost.com/entry/american-airlin...   
2  https://www.huffpost.com/entry/funniest-tweets...   
3  https://www.huffpost.com/entry/funniest-parent...   
4  https://www.huffpost.com/entry/amy-cooper-lose...   

                                            headline   category  \
0  Over 4 Million Americans Roll Up Sleeves For O...  U.S. NEWS   
1  American Airlines Flyer Charged, Banned For Li...  U.S. NEWS   
2  23 Of The Funniest Tweets About Cats And Dogs ...     COMEDY   
3  The Funniest Tweets From Parents This Week (Se...  PARENTING   
4  Woman Who Called Cops On Black Bird-Watcher Lo...  U.S. NEWS   

                                   short_description               authors  \
0  Health experts said it is too early to predict...  Carla K. Johnson, AP   
1  He was subdued by passengers and crew when he ...        Mary Papenfuss   
2  "Until you have a dog y

In [41]:
# Parse the 'date' column (YYYY-MM-DD strings) into datetime values.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

# Keep only the rows dated strictly after 2021-10-01.
threshold_date = pd.to_datetime('2021-10-01')
is_after_threshold = df['date'].gt(threshold_date)
filtered_df = df[is_after_threshold]

print(filtered_df.head())

                                                link  \
0  https://www.huffpost.com/entry/covid-boosters-...   
1  https://www.huffpost.com/entry/american-airlin...   
2  https://www.huffpost.com/entry/funniest-tweets...   
3  https://www.huffpost.com/entry/funniest-parent...   
4  https://www.huffpost.com/entry/amy-cooper-lose...   

                                            headline   category  \
0  Over 4 Million Americans Roll Up Sleeves For O...  U.S. NEWS   
1  American Airlines Flyer Charged, Banned For Li...  U.S. NEWS   
2  23 Of The Funniest Tweets About Cats And Dogs ...     COMEDY   
3  The Funniest Tweets From Parents This Week (Se...  PARENTING   
4  Woman Who Called Cops On Black Bird-Watcher Lo...  U.S. NEWS   

                                   short_description               authors  \
0  Health experts said it is too early to predict...  Carla K. Johnson, AP   
1  He was subdued by passengers and crew when he ...        Mary Papenfuss   
2  "Until you have a dog y

In [43]:
# Cast every column to text so the row values can be joined into one string.
filtered_df = filtered_df.astype(dtype=str)

In [45]:
# Concatenate all the source columns into one column (as a string) for embedding.
# The 'concatenated' column itself is excluded from the join so that re-running
# this cell does not fold the previous result back into the text.
source_columns = [col for col in filtered_df.columns if col != 'concatenated']
filtered_df['concatenated'] = filtered_df[source_columns].apply(lambda row: ' '.join(row), axis=1)

print(filtered_df)

                                                   link  \
0     https://www.huffpost.com/entry/covid-boosters-...   
1     https://www.huffpost.com/entry/american-airlin...   
2     https://www.huffpost.com/entry/funniest-tweets...   
3     https://www.huffpost.com/entry/funniest-parent...   
4     https://www.huffpost.com/entry/amy-cooper-lose...   
...                                                 ...   
1901  https://www.huffpost.com/entry/womens-march-ab...   
1902  https://www.huffpost.com/entry/biden-says-ever...   
1903  https://www.huffpost.com/entry/cyber-ninjas-ar...   
1904  https://www.huffpost.com/entry/jennifer-garner...   
1905  https://www.huffpost.com/entry/joro-spiders-in...   

                                               headline       category  \
0     Over 4 Million Americans Roll Up Sleeves For O...      U.S. NEWS   
1     American Airlines Flyer Charged, Banned For Li...      U.S. NEWS   
2     23 Of The Funniest Tweets About Cats And Dogs ...         COMED

In [47]:
# Collect the concatenated row texts as a plain list, in row order
texts_to_embed = filtered_df['concatenated'].tolist()

# Instantiate the sentence-embedding model by its hub name
model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
model = SentenceTransformer(model_name)

# Encode the texts in batches of 32 with a progress bar
embeddings = model.encode(
    texts_to_embed,
    batch_size=32,
    show_progress_bar=True,
)

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

In [49]:
# Create the Pinecone client. The API key is read from the PINECONE_API_KEY
# environment variable rather than being written into the notebook, so the
# secret never ends up in a saved or shared copy. Set the variable before
# running this cell.
pc = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))

In [51]:
# Name of the Pinecone index used by the cells below
index_name = 'news-embedding-stitching'

# Dimension passed to the index on creation
# NOTE(review): presumably equal to the encoder's output size -- confirm.
embed_dim = 768

# Deployment target: taken from the environment when set (and non-empty),
# otherwise falling back to AWS us-east-1
cloud = os.getenv('PINECONE_CLOUD') or 'aws'
region = os.getenv('PINECONE_REGION') or 'us-east-1'

spec = ServerlessSpec(cloud=cloud, region=region)

In [53]:
# Create the index only if it does not exist yet, so this cell can be re-run
# (e.g. after a kernel restart) without failing on an already-existing index.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=embed_dim,
        metric='euclidean',
        spec=spec
    )

# Wait until the index reports ready before later cells connect and upsert.
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

In [55]:
# Obtain a handle to the index; the upsert cell below writes through it.
index = pc.Index(name=index_name)

In [57]:
# Set a fixed random seed so the same vectors are held out on every run
random.seed(9)

# Convert the embeddings array into nested Python lists
embeddings_list = embeddings.tolist()

# Prepare metadata with raw_text_index (position in texts_to_embed) and original text
metadata = [{"raw_text_index": str(pos), "text": text} for pos, text in enumerate(texts_to_embed)]

# Reserve 10 random embeddings for querying using the fixed seed
reserved_indices = random.sample(range(len(embeddings_list)), 10)
# Set copy for constant-time membership checks; reserved_indices keeps its order
reserved_lookup = set(reserved_indices)

query_embeddings = [embeddings_list[i] for i in reserved_indices]
query_metadata = [metadata[i] for i in reserved_indices]

# Prepare the remaining (non-reserved) embeddings and metadata for upsert
remaining_embeddings = [vec for pos, vec in enumerate(embeddings_list) if pos not in reserved_lookup]
remaining_metadata = [meta for pos, meta in enumerate(metadata) if pos not in reserved_lookup]

# Prepare data for upsert. IDs are sequential over the remaining vectors;
# the original row position is kept in each metadata entry's raw_text_index.
upsert_data = {
    'id': [str(i) for i in range(len(remaining_embeddings))],
    'values': remaining_embeddings,
    'metadata': remaining_metadata
}

# One (id, values, metadata) tuple per vector
vectors_to_upsert = list(zip(upsert_data['id'], upsert_data['values'], upsert_data['metadata']))

# Upsert in batches, pausing between requests
batch_size = 100
sleep_time = 2
# Ceiling division: the previous `len // batch_size + 1` reported one batch too
# many whenever the vector count was an exact multiple of batch_size.
total_batches = -(-len(vectors_to_upsert) // batch_size)
for i in range(0, len(vectors_to_upsert), batch_size):
    batch = vectors_to_upsert[i:i+batch_size]
    index.upsert(vectors=batch)
    print(f"Upserted batch {i // batch_size + 1} of {total_batches}")
    time.sleep(sleep_time)

print("Upsert complete!")

Upserted batch 1 of 19
Upserted batch 2 of 19
Upserted batch 3 of 19
Upserted batch 4 of 19
Upserted batch 5 of 19
Upserted batch 6 of 19
Upserted batch 7 of 19
Upserted batch 8 of 19
Upserted batch 9 of 19
Upserted batch 10 of 19
Upserted batch 11 of 19
Upserted batch 12 of 19
Upserted batch 13 of 19
Upserted batch 14 of 19
Upserted batch 15 of 19
Upserted batch 16 of 19
Upserted batch 17 of 19
Upserted batch 18 of 19
Upserted batch 19 of 19
Upsert complete!
