In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel, PeftConfig
import torch

# === Paths ===
base_model_name_or_path = "Qwen/Qwen2.5-1.5B-Instruct"   # or local path
adapter_path = "models/ablation-1/checkpoint-2508"              # your LoRA/PEFT adapter directory

# === Generation settings ===
GENERATION_SEED = 0      # sampling is stochastic; a fixed seed makes the printed output repeatable
MAX_NEW_TOKENS = 100
TEMPERATURE = 0.7

# === Load tokenizer ===
tokenizer = AutoTokenizer.from_pretrained(base_model_name_or_path)

# === Load base model ===
# The dtype and device placement are decided here, once, at load time.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name_or_path,
    torch_dtype=torch.float16,
    device_map="auto"
)

# === Load adapter ===
# Wrap the base model with the fine-tuned PEFT adapter and switch to eval mode.
model = PeftModel.from_pretrained(base_model, adapter_path)
model.eval()

# === Text generation pipeline ===
# The model object is already instantiated, cast and placed above. `torch_dtype`
# and `device_map` given to `pipeline()` are only forwarded to `from_pretrained`
# when the model is passed by name, so they are intentionally not repeated here.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# === Run inference ===
# NOTE(review): the base model is a chat/instruct variant but the prompt is sent
# as a raw string (no chat template). Confirm this matches how the adapter was
# trained before changing it.
prompt = '[ "BENT", "FACULTY", "FLAIR", "GIFT", "PLAYWRIGHT", "SWORD", "WRAP", "WREATH", "DEAN", "GABLE", "GARLAND", "TEMPLE", "HAY", "JACKPOT", "ROAD", "ROOF"]'

# Seed immediately before sampling so re-running this cell gives the same text.
torch.manual_seed(GENERATION_SEED)
outputs = generator(
    prompt,
    max_new_tokens=MAX_NEW_TOKENS,
    do_sample=True,
    temperature=TEMPERATURE,
)

print("\n=== Generated Output ===")
# `generated_text` is printed exactly as returned by the pipeline.
print(outputs[0]["generated_text"])


In [None]:
# Imports and GPU selection for the KMeans/BERTopic clustering evaluation cell.
import os

# NOTE(review): `hdbscan` is not referenced in the visible part of this cell.
import hdbscan

# Expose only GPU "2" to this process. The variable is read when CUDA is first
# initialised, so it is set ahead of the bertopic/transformers imports below.
# NOTE(review): if this cell runs in the same kernel after the generation cell
# above (which imports torch and loads a model with device_map="auto"), CUDA is
# presumably already initialised and this assignment would have no effect;
# confirm, or move it to the very top of the notebook.
# NOTE(review): the GPU index is machine-specific.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import json
from bertopic import BERTopic
from datasets import load_dataset
# NOTE(review): only KMeans is used in the visible part of this cell.
from sklearn.cluster import AgglomerativeClustering, KMeans
from tqdm import tqdm
# NOTE(review): `pipeline` is not referenced in the visible part of this cell.
from transformers.pipelines import pipeline
import numpy as np

# === Evaluation settings ===
N_GROUPS = 4                          # clusters fitted per puzzle == number of guess buckets
KMEANS_SEED = 0                       # fixes the KMeans initialisation between runs
RESULTS_PATH = "results/kmeans.json"

# Create the output directory *before* the long evaluation loop, so a missing
# folder cannot throw away every result at the final write.
os.makedirs(os.path.dirname(RESULTS_PATH), exist_ok=True)

# === Load data ===
dataset = load_dataset("ItsTYtan/nytimes-connections", split="test")

puzzles = [entry["puzzle"] for entry in dataset]
solutions = [entry["solution"] for entry in dataset]

# KMeans is handed to BERTopic in place of its default clustering step.
# NOTE(review): only KMeans is seeded here; BERTopic's default dimensionality
# reduction is presumably still stochastic, so runs may not be bit-for-bit
# identical. Pass a seeded reducer via `umap_model=` if that matters.
cluster_model = KMeans(
    n_clusters=N_GROUPS,
    random_state=KMEANS_SEED,
)

# === Evaluate: one independent BERTopic fit per puzzle ===
results = []
for puzzle, solution in tqdm(list(zip(puzzles, solutions)), desc="evalutating"):
    model = BERTopic(
        hdbscan_model=cluster_model,
    )

    # `topics[i]` is the topic id BERTopic assigned to `puzzle[i]`.
    topics, _ = model.fit_transform(puzzle)

    # Exploratory soft assignment: distance of every reduced embedding to every
    # KMeans centre, turned into a row-normalised Gaussian-kernel score.
    # NOTE(review): the columns of `dists` follow KMeans' own label order, which
    # does not appear to match the ids in `topics` (in the captured output below,
    # the nearest-centre column differs from the reported topic for several rows).
    # Map between the two before using `scores` to pick groups.
    # `scores` is currently not used for the guesses.
    X = model.umap_model.embedding_
    dists = model.hdbscan_model.transform(X)
    sigma = np.median(dists)  # scale; tune as needed
    scores = np.exp(-(dists**2)/(2*sigma**2))
    scores = scores / scores.sum(axis=1, keepdims=True)  # each row sums to 1
    print(puzzle)
    print(dists)

    # Bucket each word under the topic id it was assigned.
    guesses = [[] for _ in range(N_GROUPS)]
    for topic, doc in zip(topics, puzzle):
        guesses[topic].append(doc)

    results.append({
        "guesses": guesses,
        "solution": solution
    })

# === Persist results ===
with open(RESULTS_PATH, "w") as f:
    json.dump(results, f, indent=2)


  from .autonotebook import tqdm as notebook_tqdm
evalutating:   1%|▏         | 1/79 [00:17<22:40, 17.44s/it]

[3, 1, 1, 3, 2, 2, 0, 1, 0, 0, 0, 2, 2, 1, 3, 0]
[[1.0868645  1.2531993  1.4244772  0.5463282 ]
 [0.44047326 1.3152683  1.2589457  1.3773137 ]
 [0.58683294 1.1992122  1.4587659  1.0404034 ]
 [1.3636981  1.101607   1.179415   0.5283993 ]
 [1.372842   0.8458015  1.083147   1.3389418 ]
 [1.1126344  0.75525    0.9777239  1.5598032 ]
 [1.4512901  0.8520729  0.6075238  1.467134  ]
 [0.63098454 1.0728164  1.2743764  1.2957456 ]
 [1.590674   0.92649984 0.46643487 1.3670483 ]
 [1.1904771  1.1725125  0.6300164  1.2701013 ]
 [1.3345101  1.1603997  0.8056534  1.1365678 ]
 [1.3195542  0.39773247 1.0723674  1.2602313 ]
 [1.3822545  0.7044582  1.0423753  0.99131405]
 [0.56248975 1.3297431  1.1561518  1.355719  ]
 [1.2529556  1.2395666  1.1288656  0.31715196]
 [1.1217868  1.0213598  0.8273318  1.4438094 ]]


evalutating:   1%|▏         | 1/79 [00:18<24:11, 18.61s/it]


KeyboardInterrupt: 