In [None]:
import importlib
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from collections import defaultdict
import torch
from sklearn.manifold import TSNE
import Database as db
import Config as c
importlib.reload(db)
importlib.reload(c)

from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

import plotly.express as px
from sklearn.manifold import TSNE
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm





In [167]:
def normalize_matrix(df):
    return (df - df.min()) / (df.max() - df.min())

In [168]:
raw_ticker_data = db.load_cached_ticker()
print(f"Loaded {len(raw_ticker_data)} tickers from cache.")


Loaded 748 tickers from cache.


In [143]:
cat_fields = ['exchange', 'sector', 'industry']
raw_ticker_data = pd.DataFrame(raw_ticker_data)
raw_ticker_data[cat_fields] = raw_ticker_data[cat_fields].fillna('')

# Step 1: Encode categorical features
encoder = OneHotEncoder(sparse_output=False)
cat_features = encoder.fit_transform(raw_ticker_data[cat_fields])

# Compute cosine similarity on categorical features
S_cat = cosine_similarity(cat_features)

# Step 2: Encode textual fields with Sentence-BERT
model = SentenceTransformer('all-MiniLM-L6-v2')

# Combine 'name' and 'description' for embedding (or do separately if preferred)
texts = (raw_ticker_data['name'] + ". " + raw_ticker_data['description']).tolist()

text_embeddings = model.encode(texts, convert_to_numpy=True)
S_text = cosine_similarity(text_embeddings)

# Step 3: Combine similarity matrices with weights
w_cat = 0.4
w_text = 0.6

S_combined = w_cat * S_cat + w_text * S_text


In [144]:

# Optional: row-normalize for use in label propagation or affinity matrices
#row_sums = S_combined.sum(axis=1, keepdims=True)
#S_combined_normalized = S_combined / np.maximum(row_sums, 1e-12)
similarity_df = pd.DataFrame(
    S_combined,
    index=raw_ticker_data['symbol'],
    columns=raw_ticker_data['symbol']
)

similarity_df = normalize_matrix(similarity_df)


In [145]:
print("scombined")
print(similarity_df)


scombined
symbol        AA       AAL      AAPL       ACB      ACGL      ACLX       ACM  \
symbol                                                                         
AA      1.000000  0.210675  0.160077  0.230139  0.127737  0.167571  0.320880   
AAL     0.169789  1.000000  0.256592  0.160001  0.341891  0.252162  0.348806   
AAPL    0.122286  0.261402  1.000000  0.133841  0.306991  0.232506  0.100482   
ACB     0.232369  0.203682  0.173536  1.000000  0.089308  0.342984  0.149561   
ACGL    0.099670  0.354168  0.315490  0.057274  1.000000  0.302700  0.177653   
...          ...       ...       ...       ...       ...       ...       ...   
ZG      0.138414  0.318474  0.284762  0.128168  0.363876  0.270975  0.180456   
ZH      0.190856  0.110572  0.092732  0.091668  0.018017  0.048746  0.247294   
ZLAB    0.159953  0.071648  0.116447  0.318570  0.080195  0.570108  0.114930   
ZM      0.051733  0.244400  0.402967  0.054453  0.207569  0.249475  0.184048   
ZS      0.064208  0.193286  0.

In [146]:
# Step 1: Convert similarity to distance for t-SNE (t-SNE works better on distances)
# Distance = 1 - similarity (clip to [0, 2] to be safe)
dist_matrix = 1 - similarity_df.values
dist_matrix = np.clip(dist_matrix, 0, 2)

# Step 2: t-SNE embedding (precomputed distance metric)
tsne = TSNE(
    n_components=2,
    perplexity=5,  # smaller perplexity since fewer points (tickers)
    metric='precomputed',
    init='random',  
    random_state=42
)
ticker_2d = tsne.fit_transform(dist_matrix)

# Step 3: Create DataFrame for plotly
plot_df = pd.DataFrame({
    'x': ticker_2d[:, 0],
    'y': ticker_2d[:, 1],
    'Ticker': similarity_df.index  # ticker symbols as labels
})

# Step 4: Plot interactive scatter with hover labels
fig = px.scatter(
    plot_df,
    x='x', y='y',
    hover_name='Ticker',
    color_discrete_sequence=['blue'],  # all points same color
    opacity=0.8
)

fig.update_traces(marker=dict(size=10), showlegend=False)
fig.update_layout(
    title="t-SNE projection of tickers by combined similarity",
    xaxis_title="t-SNE 1",
    yaxis_title="t-SNE 2"
)

fig.show()


In [169]:

raw_news=load_cached_news()

# Convert to DataFrame
df = pd.DataFrame(raw_news)

# Combine text fields into one column for embedding
df['text'] = df['headline'].fillna('') + '. ' + df['summary'].fillna('')

# Optional: ensure symbols is always a list
df['symbols'] = df['symbols'].apply(lambda x: x if isinstance(x, list) else [])

# Keep only the important columns
df = df[['date', 'author', 'source', 'text', 'symbols']]

In [170]:
# ======== Step 1: Embed articles ========
model = SentenceTransformer('all-MiniLM-L6-v2')  # Fast, 384-dim embeddings
article_embeddings = model.encode(df['text'].tolist(), convert_to_tensor=True)

In [171]:

# ======== Step 2: Build ticker embeddings ========
ticker_articles = defaultdict(list)

for idx, row in df.iterrows():
    for ticker in row['symbols']:
        ticker_articles[ticker].append(article_embeddings[idx])

ticker_embeddings = {}
for ticker, embeds in ticker_articles.items():
    if embeds:
        ticker_embeddings[ticker] = torch.mean(torch.stack(embeds), dim=0)
    else:
        ticker_embeddings[ticker] = None


In [174]:

# ======== Step 3: Calculate similarity (affiliation scores) ========
all_tickers = sorted(ticker_embeddings.keys())
"""affiliation_scores = np.zeros((len(df), len(all_tickers)))

for i, article_embed in enumerate(article_embeddings):
    for j, ticker in enumerate(all_tickers):
        if ticker_embeddings[ticker] is not None:
            score = torch.nn.functional.cosine_similarity(
                article_embed.unsqueeze(0),
                ticker_embeddings[ticker].unsqueeze(0)
            ).item()
        else:
            score = 0.0
        affiliation_scores[i, j] = score
    print(f"Processed article {i+1}/{len(df)} for affiliation scores")"""
embed_dim = next(v for v in ticker_embeddings.values() if v is not None).shape[0]

ticker_matrix = torch.stack([
    ticker_embeddings[t] if ticker_embeddings[t] is not None else torch.zeros(embed_dim)
    for t in all_tickers
])
# Normalize both article and ticker embeddings for cosine similarity
article_norm = torch.nn.functional.normalize(article_embeddings, p=2, dim=1)  # [num_articles, embed_dim]
ticker_norm = torch.nn.functional.normalize(ticker_matrix, p=2, dim=1)        # [num_tickers, embed_dim]

# Cosine similarity = dot product between normalized vectors
affiliation_scores = article_norm @ ticker_norm.T  # shape: [num_articles, num_tickers]

# Convert to NumPy if needed
affiliation_scores = affiliation_scores.cpu().numpy()

In [175]:

# ======== Step 4: Put into DataFrame ========
scores_df = pd.DataFrame(
    affiliation_scores,
    index=df.index,
    columns=all_tickers
)
print(scores_df)



               A        AA       AAC      AACG      AACI      AADI     AAGIY  \
0       0.219303  0.110575  0.215196  0.109859  0.222181  0.178119  0.175537   
1       0.330077  0.275797  0.331612  0.179276  0.275500  0.161103  0.138697   
2       0.250460  0.252494  0.110773  0.254689  0.203288  0.249698  0.033619   
3       0.263905  0.215315  0.112951  0.182082  0.161463  0.306461  0.127207   
4       0.265580  0.213226  0.168324  0.194391  0.211894  0.280215  0.051324   
...          ...       ...       ...       ...       ...       ...       ...   
129342  0.186190  0.229642  0.109114  0.091726  0.189000  0.204156  0.076231   
129343  0.087607  0.178706 -0.026309  0.089425  0.067973  0.155188  0.093885   
129344 -0.049319 -0.007598  0.072631 -0.050933  0.080825 -0.071487 -0.028496   
129345  0.222498  0.221314  0.262190  0.298831  0.285454  0.203429  0.174795   
129346  0.406313  0.446830  0.216445  0.419803  0.111875  0.427513  0.163330   

            AAIC       AAL      AAMC  .

In [176]:
ticker_similarity_from_news = pd.DataFrame(
    cosine_similarity(scores_df.T),
    index=all_tickers,
    columns=all_tickers
)
ticker_similarity_from_news = normalize_matrix(ticker_similarity_from_news)
print("ticker_similarity_from_news:", ticker_similarity_from_news)

ticker_similarity_from_news:              A        AA       AAC      AACG      AACI      AADI     AAGIY  \
A     1.000000  0.988281  0.939083  0.963850  0.869850  0.989720  0.958665   
AA    0.987919  1.000000  0.933515  0.969388  0.856958  0.987814  0.962049   
AAC   0.942892  0.939568  1.000000  0.931779  0.920868  0.931277  0.931419   
AACG  0.961965  0.968768  0.923430  1.000000  0.861203  0.962417  0.956314   
AACI  0.888224  0.880896  0.927505  0.886713  1.000000  0.881941  0.881855   
...        ...       ...       ...       ...       ...       ...       ...   
ZVSA  0.971397  0.978004  0.928095  0.977331  0.866175  0.976811  0.960647   
ZWS   0.985269  0.985838  0.919012  0.956535  0.847475  0.985598  0.949187   
ZYME  0.989114  0.984778  0.935292  0.957760  0.867679  0.989689  0.957715   
ZYNE  0.974753  0.977839  0.932961  0.979658  0.876223  0.982381  0.959049   
ZYXI  0.985902  0.986861  0.938433  0.977816  0.879603  0.987392  0.962946   

          AAIC       AAL      AAMC

In [None]:
# Number of tickers
num_tickers = ticker_similarity_from_news.shape[0]

# List of ticker names (column names)
ticker_names = list(ticker_similarity_from_news.columns)
# Step 1: Convert similarity to distance for t-SNE (t-SNE works better on distances)
# Distance = 1 - similarity (clip to [0, 2] to be safe)
dist_matrix = 1 - ticker_similarity_from_news
dist_matrix = np.clip(dist_matrix, 0, 2)

# Step 2: t-SNE embedding (precomputed distance metric)
tsne = TSNE(
    n_components=2,
    perplexity=5,  # smaller perplexity since fewer points (tickers)
    metric='precomputed',
    init='random',  
    random_state=42
)
ticker_2d = tsne.fit_transform(dist_matrix)

# Step 3: Create DataFrame for plotly
plot_df = pd.DataFrame({
    'x': ticker_2d[:, 0],
    'y': ticker_2d[:, 1],
    'Ticker': ticker_similarity_from_news.index  # ticker symbols as labels
})

# Step 4: Plot interactive scatter with hover labels
fig = px.scatter(
    plot_df,
    x='x', y='y',
    hover_name='Ticker',
    color_discrete_sequence=['blue'],  # all points same color
    opacity=0.8
)

fig.update_traces(marker=dict(size=10), showlegend=False)
fig.update_layout(
    title="t-SNE projection of tickers by combined similarity",
    xaxis_title="t-SNE 1",
    yaxis_title="t-SNE 2"
)

fig.show()


In [164]:
w_meta = 0.8
w_news = 0.2
S_meta_norm = normalize_matrix(similarity_df)*w_meta
print("S_meta_norm:", S_meta_norm)
ticker_sim_norm = normalize_matrix(ticker_similarity_from_news)*w_news
print("ticker_sim_norm:", ticker_sim_norm)

snormsqare = S_meta_norm.multiply(S_meta_norm, fill_value=1)
tickersqure = ticker_sim_norm.multiply(ticker_sim_norm, fill_value=1)


combined_ticker_similarity = (snormsqare.add(tickersqure, fill_value=0)).add(ticker_sim_norm.multiply(S_meta_norm, fill_value=1), fill_value=0)
combined_ticker_similarity = normalize_matrix(combined_ticker_similarity)

print(combined_ticker_similarity)

S_meta_norm: symbol        AA       AAL      AAPL       ACB      ACGL      ACLX       ACM  \
symbol                                                                         
AA      0.800000  0.168540  0.128061  0.184111  0.102190  0.134057  0.256704   
AAL     0.135832  0.800000  0.205274  0.128001  0.273513  0.201729  0.279044   
AAPL    0.097828  0.209121  0.800000  0.107073  0.245593  0.186005  0.080385   
ACB     0.185895  0.162946  0.138829  0.800000  0.071447  0.274387  0.119649   
ACGL    0.079736  0.283334  0.252392  0.045819  0.800000  0.242160  0.142122   
...          ...       ...       ...       ...       ...       ...       ...   
ZG      0.110731  0.254779  0.227809  0.102534  0.291101  0.216780  0.144365   
ZH      0.152685  0.088458  0.074185  0.073334  0.014413  0.038997  0.197835   
ZLAB    0.127962  0.057319  0.093157  0.254856  0.064156  0.456086  0.091944   
ZM      0.041386  0.195520  0.322374  0.043562  0.166055  0.199580  0.147238   
ZS      0.051366  0.154629 

In [165]:
# Step 1: Convert similarity to distance for t-SNE (t-SNE works better on distances)
# Distance = 1 - similarity (clip to [0, 2] to be safe)
dist_matrix = 1 - combined_ticker_similarity
dist_matrix = np.clip(dist_matrix, 0, 2)

# Step 2: t-SNE embedding (precomputed distance metric)
tsne = TSNE(
    n_components=2,
    perplexity=5,  # smaller perplexity since fewer points (tickers)
    metric='precomputed',
    init='random',  
    random_state=42
)
ticker_2d = tsne.fit_transform(dist_matrix)

# Step 3: Create DataFrame for plotly
plot_df = pd.DataFrame({
    'x': ticker_2d[:, 0],
    'y': ticker_2d[:, 1],
    'Ticker': combined_ticker_similarity.index  # ticker symbols as labels
})

# Step 4: Plot interactive scatter with hover labels
fig = px.scatter(
    plot_df,
    x='x', y='y',
    hover_name='Ticker',
    color_discrete_sequence=['blue'],  # all points same color
    opacity=0.8
)

fig.update_traces(marker=dict(size=10), showlegend=False)
fig.update_layout(
    title="t-SNE projection of tickers by combined similarity",
    xaxis_title="t-SNE 1",
    yaxis_title="t-SNE 2"
)

fig.show()