# Install required libraries

In [None]:
!pip install sacremoses peft
! pip install -U transformers

# Import required libraries

In [None]:
from transformers import AutoModelForSequenceClassification, pipeline, AutoModel, AutoTokenizer
import torch
import pandas as pd
import numpy as np


# Import Data

In [None]:
# Load the dataset into a DataFrame. Later cells read its 'Consensus' and
# 'Chief Complaint' columns.
# NOTE(review): '---' looks like a redacted placeholder rather than a real
# path — point it at the actual CSV file before running.
df = pd.read_csv('---')

In [None]:
# Remove rows that have no usable consensus label ('-' marks unknown/unmarked).
# A vectorised comparison selects the affected index labels in one pass instead
# of scanning the frame row by row with iterrows().
rows_to_drop = df.index[df['Consensus'] == '-'].tolist()

# Drop the marked rows in place; the remaining rows keep their original index labels.
df.drop(rows_to_drop, inplace=True)

# Print the updated DataFrame
print(df)

# Small data analysis

In [None]:
# Load the pretrained "microsoft/biogpt" checkpoint as a plain base model.
# In the visible cells model2 is only displayed, not used for inference.
model2 = AutoModel.from_pretrained("microsoft/biogpt")


# Load model and tokenizer (BioGPT)

In [None]:
# Display the module structure of the plain BioGPT model.
model2

BioGptModel(
  (embed_tokens): Embedding(42384, 1024, padding_idx=1)
  (embed_positions): BioGptLearnedPositionalEmbedding(1026, 1024)
  (layers): ModuleList(
    (0-23): 24 x BioGptDecoderLayer(
      (self_attn): BioGptAttention(
        (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
      )
      (activation_fn): GELUActivation()
      (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=1024, out_features=4096, bias=True)
      (fc2): Linear(in_features=4096, out_features=1024, bias=True)
      (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    )
  )
  (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)

In [None]:
from peft import (
    LoraConfig,
    PeftType,
    PromptEncoderConfig,
    PeftConfig,
    PeftModel,
    PeftModelForFeatureExtraction,
    PeftModelForSequenceClassification
)

# Pick CUDA when available, otherwise CPU.
# NOTE(review): `device` is not referenced again in the visible cells (the
# pipeline below is created with device=-1) — confirm whether it is still needed.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Identifier of the fine-tuned adapter to load on top of BioGPT.
peft_model_id = "Lukee4/biogpt-2019_2labels"
# Load the adapter's configuration.
config = PeftConfig.from_pretrained(peft_model_id)
# Tokenizer of the base BioGPT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
# Override the auto-mapping so that it names BioGptModel as the base model class.
# NOTE(review): `config` is not passed to PeftModel.from_pretrained below, so
# this override may have no effect on the loaded model — confirm.
config.auto_mapping= {'base_model_class': 'BioGptModel',
                      'parent_library': 'transformers.models.biogpt.modeling_biogpt'}

# Base BioGPT model, configured to also return hidden states.
model = AutoModel.from_pretrained("microsoft/biogpt", output_hidden_states=True)
# NOTE(review): leftover class name, presumably an alternative loader that was tried:
#AutoModelForSequenceClassification

# Load the LoRA adapter on top of the base model.
# The repr of `model` captured later in this notebook contains lora_A/lora_B
# sub-modules, i.e. the adapter layers also show up in `model` itself.
inference_model = PeftModel.from_pretrained(model, peft_model_id, output_hidden_states=True)
# NOTE(review): leftover class name, presumably an alternative wrapper that was tried:
#PeftModelForSequenceClassification

In [None]:
# Display the adapter configuration object loaded for peft_model_id.
config

PeftConfig(peft_type='LORA', auto_mapping={'base_model_class': 'PeftModel', 'parent_library': 'peft.peft_model'}, base_model_name_or_path=None, revision=None, task_type=None, inference_mode=True)

In [None]:
# Display the base model after the adapter was loaded; the captured output
# shows LoRA sub-modules inside its attention projection layers.
model

BioGptModel(
  (embed_tokens): Embedding(42384, 1024, padding_idx=1)
  (embed_positions): BioGptLearnedPositionalEmbedding(1026, 1024)
  (layers): ModuleList(
    (0-23): 24 x BioGptDecoderLayer(
      (self_attn): BioGptAttention(
        (k_proj): Linear(
          in_features=1024, out_features=1024, bias=True
          (lora_dropout): ModuleDict(
            (default): Dropout(p=0.1, inplace=False)
          )
          (lora_A): ModuleDict(
            (default): Linear(in_features=1024, out_features=8, bias=False)
          )
          (lora_B): ModuleDict(
            (default): Linear(in_features=8, out_features=1024, bias=False)
          )
          (lora_embedding_A): ParameterDict()
          (lora_embedding_B): ParameterDict()
        )
        (v_proj): Linear(
          in_features=1024, out_features=1024, bias=True
          (lora_dropout): ModuleDict(
            (default): Dropout(p=0.1, inplace=False)
          )
          (lora_A): ModuleDict(
            (default):

In [None]:
# Display the PEFT-wrapped model that is handed to the pipeline.
inference_model

PeftModel(
  (base_model): LoraModel(
    (model): BioGptModel(
      (embed_tokens): Embedding(42384, 1024, padding_idx=1)
      (embed_positions): BioGptLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-23): 24 x BioGptDecoderLayer(
          (self_attn): BioGptAttention(
            (k_proj): Linear(
              in_features=1024, out_features=1024, bias=True
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.1, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=1024, out_features=8, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=8, out_features=1024, bias=False)
              )
              (lora_embedding_A): ParameterDict()
              (lora_embedding_B): ParameterDict()
            )
            (v_proj): Linear(
              in_features=1024, out_features=1024, bias=True
              (lora_dr

## Define the pipeline

In [None]:
# Create the feature-extraction pipeline around the adapter-wrapped model.
# It returns the model's hidden states for each input text instead of a
# task-specific prediction.
p = pipeline(
    task="feature-extraction",
    tokenizer=tokenizer,
    model=inference_model,
    framework="pt",
    # Select the device automatically instead of hard-coding CPU:
    # 0 = first CUDA GPU when one is available, -1 = CPU otherwise.
    device=0 if torch.cuda.is_available() else -1,
)

### Get the embeddings of the last token of the last hidden state

In [None]:
# Helper that keeps only the final-position vector of each sequence
def extract_last_token(last_hidden_states):
    """Return the slice at the last index of axis 1 for every entry along axis 0.

    Args:
        last_hidden_states: Nested list or array that can be indexed along
            three axes; it is copied into a NumPy array before slicing.
            Presumably laid out as (batch, tokens, features) — confirm
            against the pipeline output.

    Returns:
        A NumPy array with axis 1 removed, i.e. one vector per entry of axis 0.
    """
    states = np.array(last_hidden_states)
    final_position = -1
    return states[:, final_position, :]

# Run every "Chief Complaint" text through the feature-extraction pipeline
results = p(df["Chief Complaint"].tolist())

# Reduce each pipeline result to the vector of its last token
embeddings = list(map(extract_last_token, results))

In [None]:
# Store one embedding per row in a new "embeddings_biogpt_tuned" column.
# Each entry is exactly what extract_last_token returned; no reshaping is done here.
# NOTE(review): assigning a list is positional, so `embeddings` must contain
# one entry per row of df, in row order — confirm.
df["embeddings_biogpt_tuned"] = embeddings

# Print the resulting DataFrame
print(df)

In [None]:
# Write the DataFrame to a JSON file as a list of row records (orient='records').
# NOTE(review): hard-coded absolute path under /content/drive — presumably
# requires Google Drive to be mounted in Colab; confirm before running elsewhere.
df.to_json('/content/drive/MyDrive/Colab Notebooks/dissertation/data/2019_withFT.json', orient='records')