In [56]:
import json
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
from IPython.display import HTML
from sklearn.decomposition import PCA
import umap

ROOT = Path.cwd().resolve().parent if Path.cwd().name == "notebooks" else Path.cwd()
# Force project root = folder that contains "results"
RESULTS_PATH = ROOT / "results" / "embeddings.json"
print("Resolved root:", ROOT)
print("Looking for:", RESULTS_PATH)
assert RESULTS_PATH.exists(), f"embeddings.json not found in {RESULTS_PATH}"
ROOT = Path.cwd()
while ROOT != ROOT.parent and not (ROOT / "results").exists():
    ROOT = ROOT.parent
print("Project rootL:", ROOT)
print("Embeddings file:", RESULTS_PATH)

assert RESULTS_PATH.exists(), "Run the Day 5 embedding exponent export first."

with open(RESULTS_PATH, "r") as f:
    emb_data = json.load(f)

#emb_data: {family_name: [[d0,d1,...], ...]...}
families = list(emb_data.keys())
print(f"Loaded families: {len(families)}")



Resolved root: /Users/mderaznasr/Documents/GitHub/Protein-fewshot
Looking for: /Users/mderaznasr/Documents/GitHub/Protein-fewshot/results/embeddings.json
Project rootL: /Users/mderaznasr/Documents/GitHub/Protein-fewshot/notebooks
Embeddings file: /Users/mderaznasr/Documents/GitHub/Protein-fewshot/results/embeddings.json
Loaded families: 21


In [23]:
from pathlib import Path
import json

ROOT = Path.cwd().resolve().parent if Path.cwd().name == "notebooks" else Path.cwd()
emb_path = ROOT / "results" / "embeddings.json"
print("Embeddings path:", emb_path, "| exists:", emb_path.exists())

with open(emb_path, "r") as f:
    emb_data = json.load(f)

print("Number of families in embeddings.json:", len(emb_data))
for fam, vecs in emb_data.items():
    print(f"  {fam}: {len(vecs)} sequences")

Embeddings path: /Users/mderaznasr/Documents/GitHub/Protein-fewshot/results/embeddings.json | exists: True
Number of families in embeddings.json: 21
  Antenna: 47 sequences
  C2Domain: 100 sequences
  Fibronectin: 28 sequences
  Granulin: 17 sequences
  Guanine: 28 sequences
  Kringle: 96 sequences
  Melatonin: 25 sequences
  Metallothionein: 100 sequences
  OuterMembraneUsher: 39 sequences
  PPE: 97 sequences
  Phosphatase: 40 sequences
  Phosphocarrier: 81 sequences
  Phosphofructokinase: 100 sequences
  PotexCarlavirusCoat: 30 sequences
  Prepilin: 36 sequences
  PrepilinEndopeptidase: 36 sequences
  Retinoid: 33 sequences
  Retroviral: 80 sequences
  SsrABinding: 100 sequences
  UbiquitinE1: 31 sequences
  ZincFingerAN1: 64 sequences


In [None]:
#flatten embeddings into a Dataframe/matrix
#turning the embeddings into a table
# each row is a protein sequence 
#each column is a number from the embedding
import pandas as pd
import numpy as np
'''
    "familyA": [ [0.2, 0.1, 0.9, ...], [0.4, 0.3, 0.8, ...], ... ],
    "familyB": [ ... ],
'''
rows = []
for fam, vecs in emb_data.items():
    for v in vecs:
        arr = np.asarray(v, dtype=float)
        if arr.ndim > 1:
            arr = arr.reshape(-1)   # flatten any (1, D) / (D,1) etc
        row = {"family": fam}
        for i, val in enumerate(arr):
            row[f"d{i}"] = float(val)
        rows.append(row)
        # rows.append(
        #     {"family": fam, **{f"d{i}": v[i] for i in range(len(v))}}
        # )
df = pd.DataFrame(rows) #like a table
feat_cols = []
for c in df.columns:
    if c.startswith("d"):
        feat_cols.append(c)
# feat_cols = [c for c in df.columns if c.startswith("d")]

print("df shape:", df.shape)
print("number of embedding features:", len(feat_cols))
print("families (first 5):", df["family"].unique()[:5])
# df.head()


df shape: (1208, 129)
number of embedding features: 128
families (first 5): ['Antenna' 'C2Domain' 'Fibronectin' 'Granulin' 'Guanine']


In [42]:
#PCA/UMAP projection from 128D to 2D

X = df[feat_cols].to_numpy() #getting embedding columns

#PCA
#1.Learns the directions of maximum variance in your data
#2.	Turns each 128-dim embedding into 2D:

# - build numeric feature matrix
pca = PCA(n_components=2, random_state=42) #create PCA model
Z_pca = pca.fit_transform(X)
df["x_pca"] = Z_pca[:, 0]
df["y_pca"] = Z_pca[:, 1]

X = df[feat_cols].to_numpy(dtype=float)
n_samples, n_features = X.shape
print("PCA explained variance:", pca.explained_variance_ratio_[:2])
if n_samples < 2 or n_features < 2:
    print("⚠️ Not enough data for 2D PCA. Using dummy coordinates.")
    df["x_pca"] = np.zeros(n_samples)
    df["y_pca"] = np.arange(n_samples)
    HAS_PCA = False
try:
#try to create UMAP model
    reducer = umap.UMAP(
        n_neighbors=15,
        min_dist=0.1,
        metric="cosine",
        random_state=42,
    )
    Z_umap = reducer.fit_transform(X)
    '''
        Produces a 2D layout similar to PCA, but usually:
        •	tighter clusters
        •	curved / non-linear shapes
        •	more biologically meaningful visual separation
    '''
    df["x_umap"] = Z_umap[:, 0]
    df["y_umap"] = Z_umap[:, 1]
    HAS_UMAP = True
    print("UMAP projection computed.")
except Exception as e:
    HAS_UMAP = False
    print("UMAP failed, using PCA only:", e)
#prevents crashing if dependencies not installed or anything missing


PCA explained variance: [0.52169154 0.36370692]



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



UMAP projection computed.


In [None]:
#plotly scatter (PCA projection)
import ipywidgets as widgets
from IPython.display import display

use_umap = HAS_UMAP if "HAS_UMAP" in globals() else False
use_pca = True     # PCA is always available

if use_umap:
    xcol, ycol = "x_umap", "y_umap"
    title = "Protein Embeddings (UMAP projection, 2D)"
else:
    xcol, ycol = "x_pca", "y_pca"
    title = "Protein Embeddings (PCA projection, 2D)"

print(f"Using projection: {xcol}, {ycol}") 

# embedding L2 norm (just for fun / hover)
vecs = df[feat_cols].to_numpy()
df["norm"] = np.linalg.norm(vecs, axis=1)

family_list = sorted(df["family"].unique())
family_search = widgets.Text(
    value="",
    placeholder="Type to filter families...",
    description="Search:"
)
family_selector = widgets.SelectMultiple(
    options=family_list,
    value=tuple(family_list),
    description="Families",
    rows = 8
)

def filter_df():
    ''' Return filtered dataframe based on widget selections.'''
    if family_search.value.strip():
        filtered = [f for f in family_list if family_search.value.lower() in f.lower()]
        family_selector.options = filtered
        selected = filtered
    else:
        selected = list(family_selector.value)
        family_selector.options = family_list
    
    return df[df["family"].isin(selected)].copy()

def make_plot(_):
    dff = filter_df()  

    fig = px.scatter(
        dff,
        x=xcol,
        y=ycol,
        color="family",
        hover_data=["family", "norm"],
        title=f"{title} (Filtered: {len(dff)} proteins)",
        width=900,
        height=650,
        opacity=0.82,
    )

    fig.update_traces(marker=dict(size=6, line=dict(width=0)))
    fig.update_layout(legend_title_text="Protein Family")
    fig.show()

    html_path = ROOT / "results" / "plot_interactive.html"
    pio.write_html(fig, html_path, auto_open=False)
    print(f"Saved interactive HTML -> {html_path}")
    



ModuleNotFoundError: No module named 'ipywidgets'