In [1]:
!pip install -q transformers accelerate torch pyarrow

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import pandas as pd
import numpy as np

In [3]:
# --- Cell 1: bootstrap ---
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# --- Cell 2: load merged Apple+FINRA data (from Data_Processing) ---
import os, pandas as pd

DOCS_PATH   = "/content/drive/MyDrive/Projects/JPM Short Interest/Wrapup/outputs/merged_docs_new.parquet"
CHUNKS_PATH = "/content/drive/MyDrive/Projects/JPM Short Interest/Wrapup/outputs/merged_chunks_nltk_new.parquet"  # if you need chunks here

# (optional) sanity check
print(os.listdir("/content/drive/MyDrive/Projects/JPM Short Interest/Wrapup/outputs"))

df = pd.read_parquet(DOCS_PATH)  # this is the one with full_text + interval + FINRA cols
print(df.shape)
assert 'full_text' in df.columns, "full_text missing"
assert 'interval'  in df.columns, "interval missing"

['merged_docs_new.parquet', 'merged_chunks_nltk_new.parquet']
(81, 19)


In [5]:
import torch

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 1) Load model + tokenizer
MODEL_ID = "Ksu246/nolbert-classifier"   # 3-class
tok = AutoTokenizer.from_pretrained(MODEL_ID)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token or tok.sep_token or tok.cls_token

model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# dynamic label map (robust if model label order differs)
id2label = model.config.id2label
label_names = [id2label[i] for i in range(len(id2label))]
# normalize to NEUTRAL/POSITIVE/NEGATIVE column names
label_alias = {ln.upper(): ln for ln in label_names}  # original casing
def to_std_cols(cols):
    mapped = {}
    for i, c in enumerate(cols):
        up = c.upper()
        if "NEU" in up: mapped[c] = "NEUTRAL"
        elif "POS" in up: mapped[c] = "POSITIVE"
        elif "NEG" in up: mapped[c] = "NEGATIVE"
        else: mapped[c] = c
    return mapped

# 2) Hard-token chunker (IDs never exceed 512 after adding specials)
def token_windows(input_ids, max_len=512, stride=64):
    """
    Slide over token IDs with overlap, add special tokens per chunk,
    and ensure each chunk length <= max_len.
    """
    core_max = max_len - tok.num_special_tokens_to_add(pair=False)
    start = 0
    n = len(input_ids)
    chunks = []
    while start < n:
        end = min(start + core_max, n)
        core = input_ids[start:end]
        with_special = tok.build_inputs_with_special_tokens(core)
        attn = [1]*len(with_special)
        chunks.append((with_special, attn))
        if end == n:
            break
        start = max(end - stride, 0)
    return chunks

@torch.no_grad()
def score_text_safe(text: str, max_len=512, stride=64):
    if not isinstance(text, str) or not text.strip():
        return {"NEUTRAL": np.nan, "POSITIVE": np.nan, "NEGATIVE": np.nan}
    enc = tok(text, add_special_tokens=False)
    ids = enc["input_ids"]
    if not ids:
        return {"NEUTRAL": np.nan, "POSITIVE": np.nan, "NEGATIVE": np.nan}

    chunks = token_windows(ids, max_len=max_len, stride=stride)

    # batch the chunks
    input_ids_list = [torch.tensor(c[0], dtype=torch.long) for c in chunks]
    attn_list      = [torch.tensor(c[1], dtype=torch.long) for c in chunks]
    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids_list, batch_first=True, padding_value=tok.pad_token_id)
    attention_mask = torch.nn.utils.rnn.pad_sequence(attn_list,      batch_first=True, padding_value=0)

    input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

    logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
    probs = torch.softmax(logits, dim=-1).cpu().numpy()  # [num_chunks, num_labels]

    weights = attention_mask.sum(dim=1).cpu().numpy()
    w = np.maximum(weights, 1.0).astype(float)
    wp = (probs * w[:, None]).sum(axis=0) / w.sum()

    # map to dynamic labels -> standardized columns
    prob_df = pd.DataFrame([wp], columns=label_names)
    prob_df = prob_df.rename(columns=to_std_cols(prob_df.columns))
    # ensure all three exist
    for col in ["NEUTRAL", "POSITIVE", "NEGATIVE"]:
        if col not in prob_df.columns:
            prob_df[col] = np.nan
    return {k: float(prob_df[k].iloc[0]) for k in ["NEUTRAL","POSITIVE","NEGATIVE"]}

# 3) Batch over rows (speeds up end-to-end) while keeping the safe scorer
def add_nolbert_features_safe(df: pd.DataFrame, text_col="full_text", max_len=512, stride=64, batch=64):
    vals = df[text_col].fillna("").tolist()
    out_rows = []
    for i in range(0, len(vals), batch):
        for s in vals[i:i+batch]:
            out_rows.append(score_text_safe(s, max_len=max_len, stride=stride))
    probs = pd.DataFrame(out_rows)
    out = pd.concat([df.reset_index(drop=True), probs], axis=1)
    out["sent_polarity"] = out["POSITIVE"] - out["NEGATIVE"]
    out["sent_entropy"]  = -(out[["NEUTRAL","POSITIVE","NEGATIVE"]]
                             .clip(1e-9,1).apply(lambda r: float(np.sum(r*np.log(r))), axis=1))
    return out


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/692 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [7]:
!pip install -q transformers torch pandas numpy pyarrow
import pandas as pd, numpy as np, torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from google.colab import drive
drive.mount('/content/drive')

DOCS_PATH   = "/content/drive/MyDrive/Projects/JPM Short Interest/Wrapup/outputs/merged_docs_new.parquet"
CHUNKS_PATH = "/content/drive/MyDrive/Projects/JPM Short Interest/Wrapup/outputs/merged_chunks_nltk_new.parquet"

docs   = pd.read_parquet(DOCS_PATH)
chunks = pd.read_parquet(CHUNKS_PATH)

MODEL_ID = "Ksu246/nolbert-classifier"
tok   = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID).eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

@torch.no_grad()
def predict_batch(texts):
    enc = tok(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
    enc = {k: v.to(device) for k, v in enc.items()}
    logits = model(**enc).logits
    probs = torch.softmax(logits, dim=-1).cpu().numpy()
    return pd.DataFrame(probs, columns=["NEUTRAL","POSITIVE","NEGATIVE"])

BATCH=64
out=[]
for i in range(0, len(chunks), BATCH):
    out.append(predict_batch(chunks["chunk_text"].iloc[i:i+BATCH].tolist()).reset_index(drop=True))
scores = pd.concat(out, ignore_index=True)

scored_chunks = pd.concat([chunks.reset_index(drop=True), scores], axis=1)

# length-weighted aggregation back to doc (use words as weights here)
def _agg(g):
    w = g["chunk_word_len"].to_numpy().clip(1,None).astype(float)
    W = w.sum()
    neu = (g["NEUTRAL"].to_numpy()*w).sum()/W
    pos = (g["POSITIVE"].to_numpy()*w).sum()/W
    neg = (g["NEGATIVE"].to_numpy()*w).sum()/W
    return pd.Series({"NEUTRAL":neu,"POSITIVE":pos,"NEGATIVE":neg})

doc_sent = scored_chunks.groupby("doc_id", as_index=False).apply(_agg)
doc_sent["sent_polarity"] = doc_sent["POSITIVE"] - doc_sent["NEGATIVE"]
doc_sent["sent_entropy"]  = -(doc_sent[["NEUTRAL","POSITIVE","NEGATIVE"]]
                              .clip(1e-9,1)
                              .apply(lambda r: float(np.sum(r*np.log(r))), axis=1))

final_df = docs.merge(doc_sent, on="doc_id", how="left")

# Example modeling target
if {"interval","currentShortPositionQuantity"}.issubset(final_df.columns):
    final_df = final_df.sort_values("interval")
    final_df["target_next"] = final_df["currentShortPositionQuantity"].shift(-1)
    model_df = final_df.dropna(subset=["target_next"]).reset_index(drop=True)
else:
    model_df = final_df.copy()

OUT = "/content/drive/MyDrive/Projects/JPM Short Interest/Wrapup/outputs/merged_with_nolbert_new.parquet"
model_df.to_parquet(OUT, index=False)
print("Saved:", OUT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved: /content/drive/MyDrive/Projects/JPM Short Interest/Wrapup/outputs/merged_with_nolbert_new.parquet


  doc_sent = scored_chunks.groupby("doc_id", as_index=False).apply(_agg)


In [8]:
model_df

Unnamed: 0,ticker,interval,event_dt,transcriptid,full_text,accountingYearMonthNumber,issueName,issuerServicesGroupExchangeCode,marketClassCode,currentShortPositionQuantity,...,changePercent,changePreviousNumber,settlementDate,doc_id,NEUTRAL,POSITIVE,NEGATIVE,sent_polarity,sent_entropy,target_next
0,GOOGL,0,2023-08-02 23:26:13,2866669.0,"Good afternoon. My name is Sarah, and I will b...",20230815,Alphabet Inc. Class A Common S,R,NNM,46172293,...,2.10,949021,2023-08-15,0,0.295190,0.574016,0.130794,0.443223,0.944859,81763698
1,TSLA,0,2023-08-16 17:31:29,2886847.0,Welcome to today's earnings call of the Elevin...,20230815,"Tesla, Inc. Common Stock",R,NNM,81763698,...,4.54,3554464,2023-08-15,44,0.381000,0.542145,0.076855,0.465289,0.896759,49039430
2,GOOGL,2,2023-08-30 22:54:08,2899513.0,"Hi. Good afternoon, everybody. We're going to ...",20230831,Alphabet Inc. Class A Common S,R,NNM,49039430,...,6.21,2867147,2023-08-31,1,0.776395,0.180263,0.043342,0.136921,0.641388,46080635
3,GOOGL,3,2023-09-13 20:57:57,2907954.0,All right. I'd like to welcome everyone this a...,20230915,Alphabet Inc. Class A Common S,R,NNM,46080635,...,-6.03,-2958795,2023-09-15,2,0.783510,0.196963,0.019527,0.177435,0.588026,84724119
4,TSLA,3,2023-09-13 22:48:13,2908022.0,How is it going everybody? Really happy to hav...,20230915,"Tesla, Inc. Common Stock",R,NNM,84724119,...,3.20,2630272,2023-09-15,45,0.732808,0.181976,0.085216,0.096760,0.747724,82428956
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,TSLA,48,2025-06-18 09:40:16,3488025.0,Welcome to Marvell's Custom AI Investor Event....,20250613,"Tesla, Inc. Common Stock",R,NNM,77096746,...,0.13,98667,2025-06-13,77,0.669312,0.297491,0.033197,0.264294,0.742447,80074233
76,TSLA,49,2025-07-01 02:20:56,3493543.0,"Good afternoon, and welcome to the Liontrust A...",20250630,"Tesla, Inc. Common Stock",R,NNM,80074233,...,3.86,2977487,2025-06-30,78,0.545582,0.372505,0.081913,0.290593,0.903374,76515659
77,GOOGL,49,2025-07-02 20:19:37,3494205.0,"Good afternoon, and welcome to Kits Eyecare Fi...",20250630,Alphabet Inc. Class A Common S,R,NNM,76515659,...,12.61,8570860,2025-06-30,42,0.423766,0.546529,0.029705,0.516824,0.798485,69962139
78,TSLA,50,2025-07-16 16:20:34,3497955.0,"Good morning, ladies and gentlemen. Thank you ...",20250715,"Tesla, Inc. Common Stock",R,NNM,69962139,...,-12.63,-10112094,2025-07-15,79,0.362863,0.492398,0.144739,0.347659,0.996448,57377636
