In [3]:
import os
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import Pinecone

# Pull variables from a local .env file (if any) into the environment, then
# read the Pinecone API key.
load_dotenv()
pc_api_key = os.getenv("PINECONE_API_KEY")
if not pc_api_key:
    # Fail fast: the key is not used until the Pinecone client is created,
    # which only happens after the dataset download and the embedding pass.
    raise RuntimeError(
        "PINECONE_API_KEY is not set; define it in the environment or in a .env file"
    )

# Load the stock mapper CSV through `datasets.load_dataset` and convert its
# "train" split to a pandas DataFrame.
dataset = load_dataset(
    "Mozes721/stock-crypto-weather-dataset",
    data_files="stock_mapper_training.csv",
)
df = dataset["train"].to_pandas()

# Step 2: Create alias map (alias string -> upper-case ticker).
# NOTE(review): this treats the `text` column as the ticker symbol and the
# `label` column as the company name -- confirm against the CSV schema.
alias_to_ticker = {}

# Walk the two columns in lockstep instead of df.iterrows(), which builds a
# full Series for every row just to read two fields.
for raw_ticker, raw_name in zip(df['text'], df['label']):
    ticker = raw_ticker.upper()
    name = raw_name.lower()
    # Three aliases per row: upper-case ticker, lower-case name, lower-case
    # ticker. An alias already present from an earlier row is overwritten
    # with this row's ticker.
    alias_to_ticker[ticker] = ticker
    alias_to_ticker[name] = ticker
    alias_to_ticker[ticker.lower()] = ticker

# Step 3: Split the alias map into two index-aligned lists
# (dict keys and values iterate in the same insertion order).
aliases = list(alias_to_ticker)
tickers = list(alias_to_ticker.values())

# Step 4: Encode every alias string into a NumPy embedding array.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(aliases, convert_to_numpy=True)

# Step 5: Connect to Pinecone and open the target index.
pc = Pinecone(api_key=pc_api_key)
index = pc.Index("stock-index")

# Build one upsert record per alias. IDs are positional ("stock_0",
# "stock_1", ...), so they follow the iteration order of `aliases`.
vectors = [
    {
        "id": f"stock_{position}",
        "values": embedding.tolist(),  # ndarray row -> plain Python list
        "metadata": {"ticker": ticker_symbol, "alias": alias},
    }
    for position, (alias, ticker_symbol, embedding) in enumerate(
        zip(aliases, tickers, embeddings)
    )
]

# Upload in fixed-size chunks so that each upsert request stays small
# (the original note cites a 2MB limit).
batch_size = 50
total_batches = -(-len(vectors) // batch_size)  # ceiling division

# `batch_num` is the 1-based chunk counter; `start` is the chunk's first index.
for batch_num, start in enumerate(range(0, len(vectors), batch_size), start=1):
    batch = vectors[start:start + batch_size]
    index.upsert(vectors=batch)
    print(f"Batch {batch_num}/{total_batches} has been embedded and uploaded ({len(batch)} vectors)")

print("All batches completed!")

Batch 1/33 has been embedded and uploaded (50 vectors)
Batch 2/33 has been embedded and uploaded (50 vectors)
Batch 3/33 has been embedded and uploaded (50 vectors)
Batch 4/33 has been embedded and uploaded (50 vectors)
Batch 5/33 has been embedded and uploaded (50 vectors)
Batch 6/33 has been embedded and uploaded (50 vectors)
Batch 7/33 has been embedded and uploaded (50 vectors)
Batch 8/33 has been embedded and uploaded (50 vectors)
Batch 9/33 has been embedded and uploaded (50 vectors)
Batch 10/33 has been embedded and uploaded (50 vectors)
Batch 11/33 has been embedded and uploaded (50 vectors)
Batch 12/33 has been embedded and uploaded (50 vectors)
Batch 13/33 has been embedded and uploaded (50 vectors)
Batch 14/33 has been embedded and uploaded (50 vectors)
Batch 15/33 has been embedded and uploaded (50 vectors)
Batch 16/33 has been embedded and uploaded (50 vectors)
Batch 17/33 has been embedded and uploaded (50 vectors)
Batch 18/33 has been embedded and uploaded (50 vectors)
B

In [26]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import Pinecone

# Load the stock mapper CSV through `datasets.load_dataset` and convert its
# "train" split to a pandas DataFrame.
# NOTE(review): this cell repeats the load / alias-map steps of the earlier
# cell; consider reusing those results instead of recomputing them.
dataset = load_dataset(
    "Mozes721/stock-crypto-weather-dataset",
    data_files="stock_mapper_training.csv",
)
df = dataset["train"].to_pandas()

# Create alias map (alias string -> upper-case ticker).
# NOTE(review): this treats the `text` column as the ticker symbol and the
# `label` column as the company name -- confirm against the CSV schema.
alias_to_ticker = {}
# Walk the two columns in lockstep instead of df.iterrows(), which builds a
# full Series for every row just to read two fields.
for raw_ticker, raw_name in zip(df['text'], df['label']):
    ticker = raw_ticker.upper()
    name = raw_name.lower()
    # Three aliases per row; an alias already present from an earlier row is
    # overwritten with this row's ticker.
    alias_to_ticker[ticker] = ticker
    alias_to_ticker[name] = ticker
    alias_to_ticker[ticker.lower()] = ticker

# Split the alias map into two index-aligned lists
# (dict keys and values iterate in the same insertion order).
aliases = list(alias_to_ticker)
tickers = list(alias_to_ticker.values())

# Encode every alias string into a NumPy embedding array.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(aliases, convert_to_numpy=True)