In [41]:
from datasets import load_dataset
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load dataset
stock_ds = load_dataset("Mozes721/stock-crypto-weather-dataset", data_files="stock_mapper_training.csv")

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [42]:
# Create bidirectional training data with more variations
def create_enhanced_bidirectional_data(original_dataset):
    texts = []
    labels = []
    
    for example in original_dataset['train']:
        ticker = example['text']
        company = example['label']
        
        # Add multiple variations for better learning
        variations_per_pair = 3  # Repeat each mapping multiple times
        
        for _ in range(variations_per_pair):
            # Add ticker -> ticker mapping (identity)
            texts.append(ticker)
            labels.append(ticker)
            
            texts.append(ticker.lower())
            labels.append(ticker)
            
            # Add company -> ticker mapping
            texts.append(company)
            labels.append(ticker)
            
            # Add lowercase variations
            texts.append(company.lower())
            labels.append(ticker)
            
            # Add uppercase variations
            texts.append(company.upper())
            labels.append(ticker)
            
            # Add title case
            texts.append(company.title())
            labels.append(ticker)
    
    return Dataset.from_dict({"text": texts, "label": labels})

# Create enhanced bidirectional dataset
print("Creating enhanced bidirectional dataset...")
bidirectional_ds = create_enhanced_bidirectional_data(dataset)

print(f"Original dataset size: {len(dataset['train'])}")
print(f"Enhanced bidirectional dataset size: {len(bidirectional_ds)}")

Creating enhanced bidirectional dataset...
Original dataset size: 551
Enhanced bidirectional dataset size: 9918


In [None]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import (
    Pinecone,
    ServerlessSpec,
    CloudProvider,
    AwsRegion,
    VectorType
)

# Load dataset
dataset = load_dataset("Mozes721/stock-crypto-weather-dataset", data_files="stock_mapper_training.csv")
df = dataset["train"].to_pandas()

# Step 2: Create alias map
alias_to_ticker = {}

for _, row in df.iterrows():
    ticker = row['text'].upper()
    name = row['label'].lower()
    alias_to_ticker[ticker] = ticker
    alias_to_ticker[name] = ticker

# Optional: add lowercase ticker too
    alias_to_ticker[ticker.lower()] = ticker

# Step 3: Prepare for embedding
aliases = list(alias_to_ticker.keys())
tickers = [alias_to_ticker[a] for a in aliases]



# Embed
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(aliases, convert_to_numpy=True)


# Step 5: Push to Pinecone
pinecone.init(api_key="pcsk_2WAoZn_SnZJQepsKcJEK1qkpzJUCwgHUCuRhgmu3oJ7tih5xpEMDPnK2CrugA2hKw44XGU", environment="us-east-1")
index = pinecone.Index("stock-index")  # assuming it's a dense index

# Upsert vectors
to_upsert = [
    (aliases[i], embeddings[i], {"ticker": tickers[i]})
    for i in range(len(aliases))
]
index.upsert(vectors=to_upsert)


AttributeError: init is no longer a top-level attribute of the pinecone package.

Please create an instance of the Pinecone class instead.

Example:

    import os
    from pinecone import Pinecone, ServerlessSpec

    pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY")
    )

    # Now do stuff
    if 'my_index' not in pc.list_indexes().names():
        pc.create_index(
            name='my_index', 
            dimension=1536, 
            metric='euclidean',
            spec=ServerlessSpec(
                cloud='aws',
                region='us-west-2'
            )
        )



In [None]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import pinecone

# Load dataset
dataset = load_dataset("Mozes721/stock-crypto-weather-dataset", data_files="stock_mapper_training.csv")
df = dataset["train"].to_pandas()

# Step 2: Create alias map
alias_to_ticker = {}

for _, row in df.iterrows():
    ticker = row['text'].upper()
    name = row['label'].lower()
    alias_to_ticker[ticker] = ticker
    alias_to_ticker[name] = ticker

# Optional: add lowercase ticker too
    alias_to_ticker[ticker.lower()] = ticker

# Step 3: Prepare for embedding
aliases = list(alias_to_ticker.keys())
tickers = [alias_to_ticker[a] for a in aliases]



# Embed
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(aliases, convert_to_numpy=True)


# Step 5: Push to Pinecone
pinecone.init(api_key="pcsk_2WAoZn_SnZJQepsKcJEK1qkpzJUCwgHUCuRhgmu3oJ7tih5xpEMDPnK2CrugA2hKw44XGU", environment="us-east-1")
index = pinecone.Index("stock-index")  # assuming it's a dense index

# Upsert vectors
to_upsert = [
    (aliases[i], embeddings[i], {"ticker": tickers[i]})
    for i in range(len(aliases))
]
index.upsert(vectors=to_upsert)


AttributeError: init is no longer a top-level attribute of the pinecone package.

Please create an instance of the Pinecone class instead.

Example:

    import os
    from pinecone import Pinecone, ServerlessSpec

    pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY")
    )

    # Now do stuff
    if 'my_index' not in pc.list_indexes().names():
        pc.create_index(
            name='my_index', 
            dimension=1536, 
            metric='euclidean',
            spec=ServerlessSpec(
                cloud='aws',
                region='us-west-2'
            )
        )



In [45]:
# Create label mappings
unique_tickers = list(set(bidirectional_ds['label']))
label2id = {ticker: i for i, ticker in enumerate(unique_tickers)}
id2label = {i: ticker for i, ticker in enumerate(unique_tickers)}

print(f"Number of unique tickers: {len(unique_tickers)}")

# Filter and shuffle the dataset
ds = bidirectional_ds.filter(lambda x: x['text'] is not None and str(x['text']).strip() != "")
ds = ds.shuffle(seed=42)

Number of unique tickers: 551


Filter: 100%|██████████| 9918/9918 [00:00<00:00, 475130.57 examples/s]


In [47]:
# Preprocessing function
def preprocess_function(examples):
    tokenized = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=64)  # Reduced max_length
    tokenized["labels"] = [label2id[label] for label in examples["label"]]
    return tokenized

# Apply preprocessing
print("Tokenizing dataset...")
tokenized_ds = ds.map(preprocess_function, batched=True)
tokenized_ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Use a larger model for better capacity
print("Loading model...")
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",  # Using full BERT instead of DistilBERT
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

Tokenizing dataset...


Map: 100%|██████████| 9918/9918 [00:00<00:00, 53808.88 examples/s]

Loading model...



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
def predict_ticker_with_confidence(input_text):
    device = next(model.parameters()).device
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding="max_length", max_length=64)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(probabilities, dim=-1).item()
        confidence = probabilities[0][predicted_class].item()
    
    return id2label[predicted_class], confidence

In [None]:
from sentence_transformers import SentenceTransformer
from pinecone import (
    Pinecone,
    ServerlessSpec,
    CloudProvider,
    AwsRegion,
    VectorType
)

class EmbeddingStockMapper:
    def __init__(self, model_name="all-MiniLM-L6-v2", pinecone_api_key="pcsk_2WAoZn_SnZJQepsKcJEK1qkpzJUCwgHUCuRhgmu3oJ7tih5xpEMDPnK2CrugA2hKw44XGU"):
        # Initialize the embedding model
        self.model = SentenceTransformer(model_name)
        
        # Initialize Pinecone (old API)
        pc = Pinecone(api_key=pinecone_api_key)
        self.index = pc.Index("stock-index")
        
    def get_stock_ticker(self, query):
        # Get embedding for the query
        query_embedding = self.model.encode(query, convert_to_numpy=True)
        
        # Search in Pinecone
        results = self.index.query(
            vector=query_embedding.tolist(),
            top_k=1,
            include_metadata=True
        )
        
        if results.matches:
            return results.matches[0].metadata['ticker']
        return None

# Initialize the mapper
mapper = EmbeddingStockMapper()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
test_queries = ["AAPL", "Apple Inc.", "apple", "What is the current stock price of Tesla.", "Google", "google", "TSLA", "Tesla", "tesla", "Microsoft Corporation", "microsoft"]

for query in test_queries:
    ticker = mapper.get_stock_ticker(query)
    print(f"Query: {query} -> Ticker: {ticker}")

Query: AAPL -> Ticker: AAPL
Query: Apple Inc. -> Ticker: AAPL
Query: apple -> Ticker: AAPL
Query: What is the current stock price of Tesla -> Ticker: TSLA
Query: Google -> Ticker: GOOGL
Query: google -> Ticker: GOOGL
Query: TSLA -> Ticker: TSLA
Query: Tesla -> Ticker: TSLA
Query: tesla -> Ticker: TSLA
Query: Microsoft Corporation -> Ticker: MSFT
Query: microsoft -> Ticker: MSFT


In [None]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import pinecone

# Load dataset
dataset = load_dataset("Mozes721/stock-crypto-weather-dataset", data_files="stock_mapper_training.csv")
df = dataset["train"].to_pandas()

# Step 2: Create alias map
alias_to_ticker = {}

for _, row in df.iterrows():
    ticker = row['text'].upper()
    name = row['label'].lower()
    alias_to_ticker[ticker] = ticker
    alias_to_ticker[name] = ticker

# Optional: add lowercase ticker too
    alias_to_ticker[ticker.lower()] = ticker

# Step 3: Prepare for embedding
aliases = list(alias_to_ticker.keys())
tickers = [alias_to_ticker[a] for a in aliases]



# Embed
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(aliases, convert_to_numpy=True)


# Step 5: Push to Pinecone
pinecone.init(api_key="pcsk_2WAoZn_SnZJQepsKcJEK1qkpzJUCwgHUCuRhgmu3oJ7tih5xpEMDPnK2CrugA2hKw44XGU", environment="us-east-1")
index = pinecone.Index("stock-index")  # assuming it's a dense index

# Upsert vectors
to_upsert = [
    (aliases[i], embeddings[i], {"ticker": tickers[i]})
    for i in range(len(aliases))
]
index.upsert(vectors=to_upsert)


AttributeError: init is no longer a top-level attribute of the pinecone package.

Please create an instance of the Pinecone class instead.

Example:

    import os
    from pinecone import Pinecone, ServerlessSpec

    pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY")
    )

    # Now do stuff
    if 'my_index' not in pc.list_indexes().names():
        pc.create_index(
            name='my_index', 
            dimension=1536, 
            metric='euclidean',
            spec=ServerlessSpec(
                cloud='aws',
                region='us-west-2'
            )
        )



In [None]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import pinecone

# Load dataset
dataset = load_dataset("Mozes721/stock-crypto-weather-dataset", data_files="stock_mapper_training.csv")
df = dataset["train"].to_pandas()

# Step 2: Create alias map
alias_to_ticker = {}

for _, row in df.iterrows():
    ticker = row['text'].upper()
    name = row['label'].lower()
    alias_to_ticker[ticker] = ticker
    alias_to_ticker[name] = ticker

# Optional: add lowercase ticker too
    alias_to_ticker[ticker.lower()] = ticker

# Step 3: Prepare for embedding
aliases = list(alias_to_ticker.keys())
tickers = [alias_to_ticker[a] for a in aliases]



# Embed
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(aliases, convert_to_numpy=True)


# Step 5: Push to Pinecone
pinecone.init(api_key="pcsk_2WAoZn_SnZJQepsKcJEK1qkpzJUCwgHUCuRhgmu3oJ7tih5xpEMDPnK2CrugA2hKw44XGU", environment="us-east-1")
index = pinecone.Index("stock-index")  # assuming it's a dense index

# Upsert vectors
to_upsert = [
    (aliases[i], embeddings[i], {"ticker": tickers[i]})
    for i in range(len(aliases))
]
index.upsert(vectors=to_upsert)


AttributeError: init is no longer a top-level attribute of the pinecone package.

Please create an instance of the Pinecone class instead.

Example:

    import os
    from pinecone import Pinecone, ServerlessSpec

    pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY")
    )

    # Now do stuff
    if 'my_index' not in pc.list_indexes().names():
        pc.create_index(
            name='my_index', 
            dimension=1536, 
            metric='euclidean',
            spec=ServerlessSpec(
                cloud='aws',
                region='us-west-2'
            )
        )



In [None]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import pinecone

# Load dataset
dataset = load_dataset("Mozes721/stock-crypto-weather-dataset", data_files="stock_mapper_training.csv")
df = dataset["train"].to_pandas()

# Step 2: Create alias map
alias_to_ticker = {}

for _, row in df.iterrows():
    ticker = row['text'].upper()
    name = row['label'].lower()
    alias_to_ticker[ticker] = ticker
    alias_to_ticker[name] = ticker

# Optional: add lowercase ticker too
    alias_to_ticker[ticker.lower()] = ticker

# Step 3: Prepare for embedding
aliases = list(alias_to_ticker.keys())
tickers = [alias_to_ticker[a] for a in aliases]



# Embed
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(aliases, convert_to_numpy=True)


# Step 5: Push to Pinecone
pinecone.init(api_key="pcsk_2WAoZn_SnZJQepsKcJEK1qkpzJUCwgHUCuRhgmu3oJ7tih5xpEMDPnK2CrugA2hKw44XGU", environment="us-east-1")
index = pinecone.Index("stock-index")  # assuming it's a dense index

# Upsert vectors
to_upsert = [
    (aliases[i], embeddings[i], {"ticker": tickers[i]})
    for i in range(len(aliases))
]
index.upsert(vectors=to_upsert)


AttributeError: init is no longer a top-level attribute of the pinecone package.

Please create an instance of the Pinecone class instead.

Example:

    import os
    from pinecone import Pinecone, ServerlessSpec

    pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY")
    )

    # Now do stuff
    if 'my_index' not in pc.list_indexes().names():
        pc.create_index(
            name='my_index', 
            dimension=1536, 
            metric='euclidean',
            spec=ServerlessSpec(
                cloud='aws',
                region='us-west-2'
            )
        )



In [None]:
test_queries = ["AAPL", "Apple", "apple", "GOOGL", "Google", "google"]

for query in test_queries:
    ticker = mapper.get_stock_ticker(query)
    print(f"Query: {query} -> Ticker: {ticker}")

AttributeError: module 'pinecone' has no attribute 'Pinecone'