In [None]:
!pip install unsloth
!pip install huggingface
!pip install tf-keras

In [5]:
import torch
from transformers import BitsAndBytesConfig
from unsloth import FastLanguageModel

# Reference list of Unsloth checkpoint names (more at
# https://huggingface.co/unsloth). Informational only: nothing in this cell
# reads the list; the checkpoint that gets loaded is `model_name` below.
fourbit_models = [
    # Qwen3 family, names carrying the "unsloth-bnb-4bit" suffix.
    "unsloth/Qwen3-1.7B-unsloth-bnb-4bit",
    "unsloth/Qwen3-4B-unsloth-bnb-4bit",
    "unsloth/Qwen3-8B-unsloth-bnb-4bit",
    "unsloth/Qwen3-14B-unsloth-bnb-4bit",
    "unsloth/Qwen3-32B-unsloth-bnb-4bit",
    # Other model families.
    # NOTE(review): entries without a "bnb-4bit" suffix are presumably not
    # pre-quantized repos -- confirm before relying on this list's name.
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/Phi-4",
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit",  # text-to-speech model
]

# Checkpoint to load (an alternative noted by the author: "unsloth/Phi-4").
model_name = "unsloth/Qwen3-32B-unsloth-bnb-4bit"

# bitsandbytes settings: 4-bit NF4 weights with double quantization and
# bfloat16 compute (use torch.float16 on hardware without bf16 support).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True,  # required for the CPU offloading path
)

# Arguments for the Unsloth loader, gathered in one place so they are easy to tune.
loader_kwargs = dict(
    model_name=model_name,
    quantization_config=bnb_config,
    device_map="auto",    # let the loader split modules between GPU and CPU
    max_seq_length=2048,  # consider lowering if loading still runs out of memory
)
model, tokenizer = FastLanguageModel.from_pretrained(**loader_kwargs)


In [7]:
import pandas as pd
import os
# Folder holding the project's input CSV files (note the trailing slash: file
# names are appended to it directly).
datafold = "/sbgenomics/project-files/"

# --- Score table -------------------------------------------------------------
# Alternative input: "RECOVERAdult_BiostatsDerived_202412_symptoms_deID.csv"
file = "y_pasc_score2024.csv"
result = pd.read_csv(datafold + file).rename(
    columns={"PARTICIPANT_ID": "id", "pasc_score_2024": "name"}
)
# Label every score row so it stays identifiable once stacked with the second table.
result["index"] = "pasc_score_2024"
# Positional reorder: the column at position 3 moves to second place.
# NOTE(review): this assumes the score CSV has exactly three columns, so that
# position 3 is the "index" column added above -- confirm.
result_reordered = result.iloc[:, [0, 3, 1, 2]]

# --- Second table --------------------------------------------------------------
# Alternative input: "all_methods_summary.csv"
file = "vaccine_2026.csv"
# Keep the columns at positions 1-4 (position 0 is dropped).
df = pd.read_csv(datafold + file).iloc[:, [1, 2, 3, 4]]

# Stack both tables row-wise with a fresh 0..n-1 row index, then shorten the
# score label from "pasc_score_2024" to "pasc".
combined_df = pd.concat([result_reordered, df], axis=0, ignore_index=True)
combined_df["index"] = combined_df["index"].replace("pasc_score_2024", "pasc")

# Keep every row except "pasc" rows whose value in "name" is missing.
rows_to_keep_mask = (combined_df["index"] != "pasc") | combined_df["name"].notna()
combined_df = combined_df.loc[rows_to_keep_mask]

In [8]:
import numpy as np
# Value of the "index" column whose per-participant row count drives the filter.
B = "pasc"  # set your target value
# A participant is kept only when it has MORE than this many rows labelled `B`.
min_target_rows = 4

# Count the `B` rows of each participant and broadcast that count back onto
# every row of the participant, then keep the rows whose count passes the
# threshold. This selects the same rows as
# groupby("id").filter(lambda g: (g["index"] == B).sum() > 4) without calling a
# Python function once per participant.
target_rows_per_id = (
    combined_df["index"].eq(B).groupby(combined_df["id"]).transform("sum")
)
out = combined_df[target_rows_per_id > min_target_rows]
print(out.shape)

# Sorted unique participant ids that passed the filter.
ids = np.unique(out["id"])
print(len(ids))

# Window of participants to put into the prompt: `n_selected` consecutive ids
# starting at position `index` of `ids` (set n_selected = 1 for one participant).
# A window that runs past the end of `ids` simply yields fewer participants.
index = 5
n_selected = 3
selected_ids = list(ids[index:index + n_selected])
output_ind_df = combined_df[combined_df["id"].isin(selected_ids)]
print(output_ind_df.shape)

# Plain-text rendering of the selected rows (row labels and header included);
# this string is embedded verbatim in the LLM prompt.
output_ind = output_ind_df.to_string(index=True, header=True)

(250430, 4)
11669
(57, 4)


In [9]:
# Instruction text placed ahead of the rendered patient table. It is kept
# byte-for-byte, because any change to this literal changes the model input.
# NOTE(review): the text contains typos ("several patient", "sStart",
# "longitudal"); correct them deliberately if the prompt wording is revised.
prompt_instructions = """You are a clinical, medical, physician, and statistical expert.

You will be given a table for several patient with columns:
- id: patient id
- date: date of record
- index: 0=enrollment; 1/2/3/4=vaccine dose number; "followup" or "followup_k"=follow-up visit, pasc=wellness score (lower is better). Threshold: >=12 = Long COVID (PASC), <12 = No PASC.
- name: free-text description (may include vaccine brand like pfizer/moderna/etc, and may contain notes, if it is value, then pasc)
- (optional) followup_2: "no" means no vaccine between this visit and the previous visit; "yes" means a vaccine occurred between visits.

Important notes:
- Use feature selection to find most important patterns, do pairwise comparison
- Data noise, vaccines may occur before enrollment (index=0).Dates may be out of order in the raw text; sort by date when building a timeline.
- If vaccine brand is ambiguous/misspelled (e.g., "pfzier"), normalize to the closest common brand and also keep the raw string.
- Consider fairness, treat each participant equally. consider feature selection, choose the dominant patterns, perform pairwise comparison

TASKS
Write a concise summary,sStart a section exactly titled: #Summary
   In #Summary include:
   - 3–5 sentences describing the patients’ course, key events, and strongest observed associations of the longitudal information

INPUT TABLE :
"""

# Full prompt: the instructions, then the table text, then one closing newline.
prompt = f"{prompt_instructions}{output_ind}\n"


In [3]:
from transformers import TextStreamer

# Send the prompt as a single user turn, rendered to text with the tokenizer's
# chat template. add_generation_prompt=True is required for generation.
messages = [{"role": "user", "content": prompt}]
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)

# Tokenize to PyTorch tensors and move them to the GPU.
model_inputs = tokenizer(text, return_tensors="pt").to("cuda")

# Print decoded text to the cell output while it is being generated;
# skip_prompt=False means the prompt (including the table) is echoed as well.
streamer = TextStreamer(tokenizer, skip_prompt=False)

# The result is only streamed: the returned token ids are discarded.
# NOTE(review): the model is loaded with max_seq_length=2048 while up to 3024
# new tokens are requested here -- confirm that prompt + output length is
# supported by the loaded model.
_ = model.generate(
    **model_inputs,
    streamer=streamer,
    max_new_tokens=3024,
    temperature=0.7,
)