In [3]:
import os
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import Pinecone

load_dotenv()
pc_api_key= os.getenv("PINECONE_API_KEY")
# Load dataset
dataset = load_dataset("Mozes721/stock-crypto-weather-dataset", data_files="stock_mapper_training.csv")
df = dataset["train"].to_pandas()

# Step 2: Create alias map
alias_to_ticker = {}

for _, row in df.iterrows():
    ticker = row['text'].upper()
    name = row['label'].lower()
    alias_to_ticker[ticker] = ticker
    alias_to_ticker[name] = ticker
    # Optional: add lowercase ticker too
    alias_to_ticker[ticker.lower()] = ticker

# Step 3: Prepare for embedding
aliases = list(alias_to_ticker.keys())
tickers = [alias_to_ticker[a] for a in aliases]

# Embed
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(aliases, convert_to_numpy=True)

# Step 5: Push to Pinecone
pc = Pinecone(api_key=pc_api_key)
index = pc.Index("stock-index")

# Prepare vectors in correct format
vectors = []
for i in range(len(aliases)):
    vectors.append({
        "id": f"stock_{i}",
        "values": embeddings[i].tolist(),
        "metadata": {"ticker": tickers[i], "alias": aliases[i]}
    })

# Batch upsert to avoid 2MB limit
batch_size = 50
total_batches = (len(vectors) + batch_size - 1) // batch_size

for i in range(0, len(vectors), batch_size):
    batch = vectors[i:i + batch_size]
    index.upsert(vectors=batch)
    batch_num = i // batch_size + 1
    print(f"Batch {batch_num}/{total_batches} has been embedded and uploaded ({len(batch)} vectors)")

print("All batches completed!")

Batch 1/33 has been embedded and uploaded (50 vectors)
Batch 2/33 has been embedded and uploaded (50 vectors)
Batch 3/33 has been embedded and uploaded (50 vectors)
Batch 4/33 has been embedded and uploaded (50 vectors)
Batch 5/33 has been embedded and uploaded (50 vectors)
Batch 6/33 has been embedded and uploaded (50 vectors)
Batch 7/33 has been embedded and uploaded (50 vectors)
Batch 8/33 has been embedded and uploaded (50 vectors)
Batch 9/33 has been embedded and uploaded (50 vectors)
Batch 10/33 has been embedded and uploaded (50 vectors)
Batch 11/33 has been embedded and uploaded (50 vectors)
Batch 12/33 has been embedded and uploaded (50 vectors)
Batch 13/33 has been embedded and uploaded (50 vectors)
Batch 14/33 has been embedded and uploaded (50 vectors)
Batch 15/33 has been embedded and uploaded (50 vectors)
Batch 16/33 has been embedded and uploaded (50 vectors)
Batch 17/33 has been embedded and uploaded (50 vectors)
Batch 18/33 has been embedded and uploaded (50 vectors)
B

In [2]:
import os
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import Pinecone

load_dotenv()
pc_api_key= os.getenv("PINECONE_API_KEY")

dataset = load_dataset("Mozes721/stock-crypto-weather-dataset", data_files="crypto_mapppings.csv")
df = dataset["train"].to_pandas()

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 12242 examples [00:00, 595655.20 examples/s]


In [3]:
# Step 2: Create alias map
alias_to_ticker = {}

for _, row in df.iterrows():
    ticker = row['text'].upper()
    name = row['label'].lower()
    alias_to_ticker[ticker] = ticker
    alias_to_ticker[name] = ticker
    # Optional: add lowercase ticker too
    alias_to_ticker[ticker.lower()] = ticker

# Step 3: Prepare for embedding
aliases = list(alias_to_ticker.keys())
tickers = [alias_to_ticker[a] for a in aliases]

# Embed
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(aliases, convert_to_numpy=True)

# Step 5: Push to Pinecone
pc = Pinecone(api_key=pc_api_key)
index = pc.Index("crypto-index")

# Prepare vectors in correct format
vectors = []
for i in range(len(aliases)):
    vectors.append({
        "id": f"crypto_{i}",
        "values": embeddings[i].tolist(),
        "metadata": {"ticker": tickers[i], "alias": aliases[i]}
    })

# Batch upsert to avoid 2MB limit
batch_size = 50
total_batches = (len(vectors) + batch_size - 1) // batch_size

for i in range(0, len(vectors), batch_size):
    batch = vectors[i:i + batch_size]
    index.upsert(vectors=batch)
    batch_num = i // batch_size + 1
    print(f"Batch {batch_num}/{total_batches} has been embedded and uploaded ({len(batch)} vectors)")

print("All batches completed!")

Batch 1/691 has been embedded and uploaded (50 vectors)
Batch 2/691 has been embedded and uploaded (50 vectors)
Batch 3/691 has been embedded and uploaded (50 vectors)
Batch 4/691 has been embedded and uploaded (50 vectors)
Batch 5/691 has been embedded and uploaded (50 vectors)
Batch 6/691 has been embedded and uploaded (50 vectors)
Batch 7/691 has been embedded and uploaded (50 vectors)
Batch 8/691 has been embedded and uploaded (50 vectors)
Batch 9/691 has been embedded and uploaded (50 vectors)
Batch 10/691 has been embedded and uploaded (50 vectors)
Batch 11/691 has been embedded and uploaded (50 vectors)
Batch 12/691 has been embedded and uploaded (50 vectors)
Batch 13/691 has been embedded and uploaded (50 vectors)
Batch 14/691 has been embedded and uploaded (50 vectors)
Batch 15/691 has been embedded and uploaded (50 vectors)
Batch 16/691 has been embedded and uploaded (50 vectors)
Batch 17/691 has been embedded and uploaded (50 vectors)
Batch 18/691 has been embedded and uploa

In [1]:
import os
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import Pinecone

load_dotenv()
pc_api_key= os.getenv("PINECONE_API_KEY")

dataset = load_dataset("Mozes721/stock-crypto-weather-dataset", data_files="city_mapppings.csv")
df = dataset["train"].to_pandas()

  from .autonotebook import tqdm as notebook_tqdm
Python(74330) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Generating train split: 209579 examples [00:00, 1258847.41 examples/s]


In [2]:

# Step 2: Create alias map for cities (simplified since text=label)
city_aliases = {}

for _, row in df.iterrows():
    city_name = row['text']
    city_aliases[city_name] = city_name
    # Add lowercase version too
    city_aliases[city_name.lower()] = city_name

# Step 3: Prepare for embedding
cities = list(city_aliases.keys())
city_labels = [city_aliases[c] for c in cities]

# Embed
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(cities, convert_to_numpy=True)

# Step 5: Push to Pinecone
pc = Pinecone(api_key=pc_api_key)
index = pc.Index("city-index")

# Prepare vectors in correct format
vectors = []
for i in range(len(cities)):
    vectors.append({
        "id": f"city_{i}",
        "values": embeddings[i].tolist(),
        "metadata": {"city": city_labels[i], "alias": cities[i]}
    })

# Batch upsert to avoid 2MB limit
batch_size = 50
total_batches = (len(vectors) + batch_size - 1) // batch_size

for i in range(0, len(vectors), batch_size):
    batch = vectors[i:i + batch_size]
    index.upsert(vectors=batch)
    batch_num = i // batch_size + 1
    print(f"Batch {batch_num}/{total_batches} has been embedded and uploaded ({len(batch)} vectors)")

print("All city batches completed!")

AttributeError: 'NoneType' object has no attribute 'lower'