In [None]:
!pip install -q transformers accelerate torch pyarrow

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import pandas as pd
import numpy as np

In [2]:
# --- Cell 1: bootstrap ---
# Colab-only step: mount Google Drive at /content/drive so the parquet paths
# under /content/drive/MyDrive used by the later cells can be opened.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# --- Cell 2: load merged Apple+FINRA data (from Data_Processing) ---
import os
import pandas as pd

DOCS_PATH   = "/content/drive/MyDrive/Projects/JPM Short Interest/outputs/merged_docs.parquet"
CHUNKS_PATH = "/content/drive/MyDrive/Projects/JPM Short Interest/outputs/merged_chunks_nltk.parquet"  # if you need chunks here

# (optional) sanity check: list the folder DOCS_PATH actually points into, so the
# listing describes the files that are about to be read and not another copy.
print(os.listdir(os.path.dirname(DOCS_PATH)))

df = pd.read_parquet(DOCS_PATH)  # this is the one with full_text + interval + FINRA cols
print(df.shape)
# Fail fast when the columns used further down the notebook are absent.
assert 'full_text' in df.columns, "full_text missing"
assert 'interval'  in df.columns, "interval missing"

['merged_docs.parquet', 'merged_chunks_nltk.parquet']
(283, 19)


In [4]:
import torch

In [5]:
# 1) Load model + tokenizer
MODEL_ID = "Ksu246/nolbert-classifier"   # 3-class

# Use the GPU when torch can see one, otherwise stay on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Tokenizer and classifier come from the same checkpoint.
tok = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
model.to(device)
model.eval()  # evaluation mode for inference

# 2) Hard-token chunker (IDs never exceed max_len after adding specials)
def token_windows(input_ids, max_len=512, stride=64):
    """
    Split a token-ID sequence into overlapping windows that fit the model.

    Each window holds at most ``max_len`` IDs *including* the special tokens
    added by the module-level tokenizer ``tok``; consecutive windows share
    ``stride`` IDs of context.

    Args:
        input_ids: token IDs encoded without special tokens.
        max_len: maximum window length after special tokens are added.
        stride: number of IDs repeated from the end of one window at the
            start of the next.

    Returns:
        List of ``(ids_with_specials, attention_mask)`` tuples; every mask is
        all ones. Empty input yields an empty list.

    Raises:
        ValueError: if ``max_len`` leaves no room for content tokens, or if
            more than one window is needed and ``stride`` is not smaller than
            the per-window content budget (the window could never advance).
    """
    n = len(input_ids)
    if n == 0:
        return []

    # Room left for real content once the tokenizer's special tokens are added.
    core_max = max_len - tok.num_special_tokens_to_add(pair=False)
    if core_max <= 0:
        raise ValueError(
            f"max_len={max_len} leaves no room for content tokens after special tokens"
        )
    # With several windows, each step advances by core_max - stride; a
    # non-positive step would re-emit the same window forever.
    if n > core_max and stride >= core_max:
        raise ValueError(
            f"stride={stride} must be smaller than the content window size {core_max}"
        )

    chunks = []
    start = 0
    while start < n:
        end = min(start + core_max, n)
        with_special = tok.build_inputs_with_special_tokens(input_ids[start:end])
        chunks.append((with_special, [1] * len(with_special)))
        if end == n:
            break
        # Step back by `stride` so neighbouring windows overlap.
        start = max(end - stride, 0)
    return chunks

# 3) Score one text safely (average probs across chunks, length-weighted)
@torch.no_grad()
def score_text_safe(text: str, max_len=512, stride=64, batch_size=32):
    """
    Score one document of arbitrary length with the module-level classifier.

    The text is tokenized without special tokens, split into overlapping
    windows by ``token_windows``, scored window by window, and the per-window
    softmax probabilities are averaged with weights equal to each window's
    attended token count.

    Args:
        text: raw document text.
        max_len: maximum window length in tokens, special tokens included.
        stride: token overlap between consecutive windows.
        batch_size: maximum number of windows sent through the model in one
            forward pass, so long documents do not build one unbounded batch.

    Returns:
        Dict with float values under "NEUTRAL", "POSITIVE" and "NEGATIVE".
        All three are NaN when ``text`` is not a string, is blank, or
        tokenizes to nothing.
    """
    if not isinstance(text, str) or not text.strip():
        return {"NEUTRAL": np.nan, "POSITIVE": np.nan, "NEGATIVE": np.nan}
    ids = tok(text, add_special_tokens=False)["input_ids"]
    if not ids:
        return {"NEUTRAL": np.nan, "POSITIVE": np.nan, "NEGATIVE": np.nan}

    windows = token_windows(ids, max_len=max_len, stride=stride)

    # Padded positions are masked out, so the pad value itself is irrelevant;
    # fall back to 0 when the tokenizer defines no pad token.
    pad_id = tok.pad_token_id if tok.pad_token_id is not None else 0
    step = max(int(batch_size), 1)

    prob_parts, weight_parts = [], []
    for lo in range(0, len(windows), step):
        batch = windows[lo:lo + step]
        input_ids = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(w[0], dtype=torch.long) for w in batch],
            batch_first=True, padding_value=pad_id)
        attention_mask = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(w[1], dtype=torch.long) for w in batch],
            batch_first=True, padding_value=0)
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        prob_parts.append(torch.softmax(logits, dim=-1).cpu().numpy())
        # weight of a window = number of attended (non-pad) tokens
        weight_parts.append(attention_mask.sum(dim=1).cpu().numpy())

    probs = np.concatenate(prob_parts, axis=0)  # one row of class probabilities per window
    w = np.maximum(np.concatenate(weight_parts), 1.0).astype(float)
    wp = (probs * w[:, None]).sum(axis=0) / w.sum()

    # map LABEL_* -> names (NoLBERT uses LABEL_0/1/2)
    # NOTE(review): index order 0/1/2 = NEUTRAL/POSITIVE/NEGATIVE is taken on
    # trust from the comment above — confirm against model.config.id2label.
    return {"NEUTRAL": float(wp[0]), "POSITIVE": float(wp[1]), "NEGATIVE": float(wp[2])}

# 4) Apply to your DataFrame df with a 'full_text' column
def add_nolbert_features_safe(df: pd.DataFrame, text_col="full_text"):
    """
    Append sentiment features to a copy of ``df``.

    Every value of ``df[text_col]`` is scored with ``score_text_safe``
    (missing texts are passed as empty strings) and the result gains the
    columns NEUTRAL, POSITIVE, NEGATIVE, ``sent_polarity``
    (POSITIVE - NEGATIVE) and ``sent_entropy`` (Shannon entropy, natural
    log, probabilities clipped to [1e-9, 1]).

    Args:
        df: input frame; it is not modified.
        text_col: name of the column holding the text to score.

    Returns:
        New DataFrame with a fresh 0..n-1 index and the feature columns
        added. Rows whose probabilities are NaN get NaN polarity and NaN
        entropy.
    """
    prob_cols = ["NEUTRAL", "POSITIVE", "NEGATIVE"]
    rows = [score_text_safe(s, max_len=512, stride=64) for s in df[text_col].fillna("").tolist()]
    # Naming the columns keeps them present even when `df` has no rows.
    probs = pd.DataFrame(rows, columns=prob_cols)
    out = pd.concat([df.reset_index(drop=True), probs], axis=1)
    out["sent_polarity"] = out["POSITIVE"] - out["NEGATIVE"]
    # Work on the raw array: reducing a pandas Series with np.sum skips NaN and
    # would report an entropy of 0 for rows that were never scored.
    p = out[prob_cols].to_numpy(dtype=float).clip(1e-9, 1.0)
    out["sent_entropy"] = -(p * np.log(p)).sum(axis=1)
    return out


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/692 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [None]:
# Repeats the Drive mount and the path constants from the earlier load cell.
# NOTE(review): the two copies of DOCS_PATH / CHUNKS_PATH must be kept in sync by hand.
drive.mount('/content/drive')

DOCS_PATH   = "/content/drive/MyDrive/Projects/JPM Short Interest/outputs/merged_docs.parquet"
CHUNKS_PATH = "/content/drive/MyDrive/Projects/JPM Short Interest/outputs/merged_chunks_nltk.parquet"

# `docs` is merged back on "doc_id" below; `chunks` must provide the
# "chunk_text", "chunk_word_len" and "doc_id" columns used below.
docs   = pd.read_parquet(DOCS_PATH)
chunks = pd.read_parquet(CHUNKS_PATH)

# Re-select the device and move the already loaded model onto it.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

@torch.no_grad()
def predict_batch(texts):
    """
    Classify a list of texts in a single forward pass.

    Texts are tokenized together (padded, truncated to 512 tokens), moved to
    the module-level ``device`` and run through the module-level ``model``.

    Returns:
        DataFrame with one row per input text and the softmax probabilities
        in the columns NEUTRAL, POSITIVE, NEGATIVE.
    """
    encoded = tok(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    class_probs = torch.softmax(model(**on_device).logits, dim=-1)
    return pd.DataFrame(class_probs.cpu().numpy(), columns=["NEUTRAL","POSITIVE","NEGATIVE"])

# Score the chunk texts BATCH rows at a time; each per-batch frame gets a
# fresh index before the pieces are stacked into one table.
BATCH=64
out = [
    predict_batch(chunks["chunk_text"].iloc[lo:lo+BATCH].tolist()).reset_index(drop=True)
    for lo in range(0, len(chunks), BATCH)
]
scores = pd.concat(out, ignore_index=True)

# Put the probabilities next to the chunk rows they were computed from
# (both sides carry a 0..n-1 index, so the join is positional).
scored_chunks = pd.concat([chunks.reset_index(drop=True), scores], axis=1)

# length-weighted aggregation back to doc (use words as weights here)
def _agg(g):
    w = g["chunk_word_len"].to_numpy().clip(1,None).astype(float)
    W = w.sum()
    neu = (g["NEUTRAL"].to_numpy()*w).sum()/W
    pos = (g["POSITIVE"].to_numpy()*w).sum()/W
    neg = (g["NEGATIVE"].to_numpy()*w).sum()/W
    return pd.Series({"NEUTRAL":neu,"POSITIVE":pos,"NEGATIVE":neg})

# Collapse the chunk scores to one row per document.
doc_sent = scored_chunks.groupby("doc_id", as_index=False).apply(_agg)
doc_sent["sent_polarity"] = doc_sent["POSITIVE"] - doc_sent["NEGATIVE"]
# Shannon entropy of the class distribution (natural log). skipna=False makes a
# document with missing probabilities come out as NaN; a NaN-skipping sum would
# silently report an entropy of 0 for it.
doc_sent["sent_entropy"]  = -(doc_sent[["NEUTRAL","POSITIVE","NEGATIVE"]]
                              .clip(1e-9,1)
                              .pipe(lambda p: p*np.log(p))
                              .sum(axis=1, skipna=False))

# Left merge keeps every document; documents without scored chunks get NaN features.
final_df = docs.merge(doc_sent, on="doc_id", how="left")

# Example modeling target
if {"interval","currentShortPositionQuantity"}.issubset(final_df.columns):
    final_df = final_df.sort_values("interval")
    # NOTE(review): shift(-1) takes the value from the next *row*. That equals
    # "next interval" only if there is exactly one row per interval — confirm.
    final_df["target_next"] = final_df["currentShortPositionQuantity"].shift(-1)
    # The last row has no successor, so it is dropped from the modeling table.
    model_df = final_df.dropna(subset=["target_next"]).reset_index(drop=True)
else:
    model_df = final_df.copy()

OUT = "/content/drive/MyDrive/Projects/JPM Short Interest/outputs/merged_with_nolbert.parquet"
model_df.to_parquet(OUT, index=False)
print("Saved:", OUT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Show the modeling table as this cell's output.
model_df